In [65]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from semantic_text_similarity.models import WebBertSimilarity
import multiprocessing as mp
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
from scipy import spatial
import parmap
import os
import swifter
from tqdm import tqdm
import sqlite3

In [66]:
topics_index_to_name_map = {
    0: 'Agriculture, animals, food and rural affairs',
    1: 'Asylum, immigration and nationality',
    2: 'Business, industry and consumers',
    3: 'Communities and families',
    4: 'Crime, civil law, justice and rights',
    5: 'Culture, media and sport',
    6: 'Defence',
    7: 'Economy and finance',
    8: 'Education',
    9: 'Employment and training',
    10: 'Energy and environment',
    11: 'European Union',
    12: 'Health services and medicine',
    13: 'Housing and planning',
    14: 'International affairs',
    15: 'Parliament, government and politics',
    16: 'Science and technology',
    17: 'Social security and pensions',
    18: 'Social services',
    19: 'Transport',
    20: 'Others'
}
topics_name_to_index_map = {y:x for x,y in topics_index_to_name_map.items()}

def strip_short2(text):
    return strip_short(text, minsize=4)


def preprocess_text(text):
    FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces, strip_tags, strip_punctuation,
                   strip_non_alphanum, strip_numeric, strip_short2]
    return preprocess_string(text, FILTERS)

def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        t = topics[0]
        t = t.strip()
        return topics_name_to_index_map[t]
        
    return topics_name_to_index_map[topic]

In [67]:
doc2vec_model = Doc2Vec.load('models/doc2vec/doc2vec_2010_2016_no_Others')

In [68]:
articles = pd.read_csv('./data/news_predictions/news_2016_predictions.csv')

In [69]:
articles = articles.dropna(subset=['transcript'])
articles = articles.drop(['Unnamed: 0'], axis=1)
articles = articles.loc[articles.month == 6]

In [70]:
vector = []
transcripts = articles.transcript.values

preprocessed_transcripts = parmap.map(preprocess_text, transcripts, pm_pbar=True)

155392it [00:14, 10971.79it/s]                           


In [71]:
vector_transcripts = parmap.map(doc2vec_model.infer_vector, preprocessed_transcripts, pm_pbar=True)
articles['vector'] = vector_transcripts

155392it [01:52, 1384.76it/s]                            


In [75]:
import datetime
articles['date'] = articles.apply(lambda x: datetime.date(x.year, x.month, x.day), axis=1)

In [87]:
print('preparing data!!')
df = pd.read_csv('./data/2016_debate.csv')
df = df.drop(df[df.topic == 'admin'].index)
df = df.drop(df[df.topic == 'Others'].index)
df = df.drop(df[df.transcript.str.split().map(len) < 10].index)

preparing data!!


In [88]:
df['vector'] = df.swifter.apply(lambda x: doc2vec_model.infer_vector(preprocess_text(x['transcript'])), axis=1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=2142.0, style=ProgressStyle(descriptio…




In [89]:
from dateutil import parser
df['date'] = df['date'].apply(lambda x: parser.parse(x).date())

In [90]:
df

Unnamed: 0,date,topic,transcript,vector
0,2016-01-05,Defence,"With permission, Mr Speaker, I will make a sta...","[0.0038627787, 0.16613455, 0.7865501, -0.02226..."
1,2016-01-05,European Union,"With permission, Mr Speaker, I would like to m...","[0.08787448, -1.0847125, 1.7855617, -0.9878115..."
2,2016-01-05,Energy and environment,"With permission, Mr Speaker, I would like to m...","[-0.54103595, -0.8347334, -0.19081135, 2.35676..."
3,2016-01-05,Economy and finance,I thank the many colleagues who have stayed so...,"[-1.5605081, -0.7581342, 1.0427865, 0.28244412..."
4,2016-01-05,Health services and medicine,14. What steps his Department is taking to in...,"[-0.5808947, 0.52790385, 0.5559714, 1.5502323,..."
...,...,...,...,...
2437,2016-12-20,Health services and medicine,13. How many patient days of delayed discharge...,"[-1.2311676, -0.5779405, -0.5349262, -0.414065..."
2438,2016-12-20,Health services and medicine,10. What assessment he has made of the potenti...,"[-0.8975457, -0.15285091, 0.819843, 0.14723383..."
2439,2016-12-20,Health services and medicine,T2. If he will make a statement on his depart...,"[-0.13278913, 1.2363504, 0.85807025, 1.251639,..."
2440,2016-12-20,Health services and medicine,15. What assessment he has made of the potenti...,"[0.66957015, 0.7701326, -2.025422, 1.739207, -..."


In [96]:
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime, timedelta
def predict(news):
    dt = news.date
    news_vector = np.array(news.vector)
    news_vector = news_vector.reshape(1, 100)
    df_vector = df.vector.values
    vec = [df_vector[i] for i in range(len(df_vector))]
    vec = np.array(vec)
    max_index = -1
    max_sim = -1
    sim = cosine_similarity(news_vector, vec)
    max_index = np.argmax(sim)
    
    return df.iloc[max_index]['topic']

In [97]:
predict(articles.iloc[4])

'Agriculture, animals, food and rural affairs'

In [98]:
preds = []

for index, row in tqdm(articles.iterrows(), total=len(articles)):
    preds.append(predict(row))
    
articles['prediction_cosine'] = preds

  1%|▏         | 2274/155293 [00:05<06:04, 420.32it/s]


KeyboardInterrupt: 

In [94]:
articles

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,parliament,top1_topic,top1_acc,top2_topic,top2_acc,top3_topic,top3_acc,vector,date,prediction_cosine
815059,163795,Belfast Telegraph,1,6,2016,North West 'needs task force on jobs to be rev...,"Gavin Killeen, who said more needs to be done ...",1,"Parliament, government and politics",72.42,"Business, industry and consumers",6.75,Economy and finance,4.93,"[0.21521333, -0.61631656, 1.1988628, 0.2426858...",2016-06-01,"Parliament, government and politics"
815060,163795,Belfast Telegraph,1,6,2016,Time to take your town centre to heart,"Rather than complain, residents can participat...",1,"Culture, media and sport",24.98,"Business, industry and consumers",23.64,"Agriculture, animals, food and rural affairs",10.36,"[0.09298251, 1.0889144, 2.142314, -1.4128298, ...",2016-06-01,"Agriculture, animals, food and rural affairs"
815061,163795,Belfast Telegraph,1,6,2016,Causeway Coast for food heaven,"Despite its infancy, the CCAG Food Network has...",1,"Culture, media and sport",52.31,"Agriculture, animals, food and rural affairs",17.63,"Business, industry and consumers",8.31,"[1.4593987, 1.3749042, 0.7192617, 1.3394088, 1...",2016-06-01,"Agriculture, animals, food and rural affairs"
815062,163795,Belfast Telegraph,1,6,2016,Here comes summer - time for Country Kitchen s...,Country Kitchen side salads are the perfect ac...,1,"Culture, media and sport",28.93,Others,24.34,"Business, industry and consumers",10.81,"[0.0028712775, 0.92605275, -0.25203443, -0.232...",2016-06-01,"Agriculture, animals, food and rural affairs"
815063,163795,Belfast Telegraph,1,6,2016,NI for the cream of the crop,"Comber Earlies - in season now, Lough Neagh Ee...",1,"Agriculture, animals, food and rural affairs",40.33,"Culture, media and sport",39.01,Others,4.54,"[0.3191508, 1.3862733, 0.52248347, 1.443869, 1...",2016-06-01,"Agriculture, animals, food and rural affairs"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970363,412338,Wales,30,6,2016,Amazon announce Second Prime Day shopping even...,"Prime Day will take place on July 12, with the...",1,"Business, industry and consumers",20.31,Others,15.73,Transport,6.81,"[1.3523831, -0.6249906, 2.4624922, 1.0074334, ...",2016-06-30,Communities and families
970364,412338,Wales,30,6,2016,Morning news headlines: Boris Johnson and Ther...,Tory heavyweights Boris Johnson and Theresa Ma...,1,"Parliament, government and politics",25.05,"Culture, media and sport",18.58,International affairs,17.69,"[-0.42023158, 0.48719677, -0.2652266, -0.17420...",2016-06-30,International affairs
970365,412338,Wales,30,6,2016,Inspection finds Welsh police force kept too m...,HM Inspectorate of Constabulary (HMIC) publish...,1,"Crime, civil law, justice and rights",39.50,Communities and families,11.34,"Parliament, government and politics",9.34,"[-2.9431205, 0.49691546, -2.0984313, 0.6602685...",2016-06-30,"Communities and families | Crime, civil law, j..."
970366,412338,Wales,30,6,2016,Inspection found that a Welsh police force kep...,HM Inspectorate of Constabulary (HMIC) publish...,1,"Crime, civil law, justice and rights",35.47,Communities and families,14.13,Health services and medicine,10.10,"[-3.0568829, 0.4597003, -1.9608382, 0.8233612,...",2016-06-30,"Communities and families | Crime, civil law, j..."


In [95]:
articles.to_csv('news_prediction_June_cosine_sim_no_Others.csv')