In [1]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from semantic_text_similarity.models import WebBertSimilarity
import multiprocessing as mp
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
from scipy import spatial
import parmap
import os
import swifter
from tqdm import tqdm
import sqlite3

  import pandas.util.testing as tm


In [2]:
# Forward lookup: integer topic index -> human-readable topic label.
# The position of a label in the list below is its index (0..20).
topics_index_to_name_map = dict(enumerate([
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]))
# Reverse lookup: topic label -> integer topic index.
topics_name_to_index_map = {
    label: position for position, label in topics_index_to_name_map.items()
}

def strip_short2(text, minsize=4):
    """Remove short words from ``text`` using gensim's ``strip_short``.

    This is a one-argument wrapper so the function can be dropped into a
    ``preprocess_string`` filter list, which calls each filter with the text only.

    Args:
        text: The string to filter.
        minsize: Length threshold forwarded to ``strip_short``. Defaults to 4,
            the value that was previously hard-coded.

    Returns:
        Whatever ``strip_short`` returns for ``text`` with the given ``minsize``.
    """
    return strip_short(text, minsize=minsize)


def preprocess_text(text):
    """Tokenise ``text`` by running it through a fixed gensim filter pipeline.

    The filters are applied in this order: lower-casing, whitespace collapsing,
    tag stripping, punctuation stripping, non-alphanumeric stripping, digit
    stripping and finally ``strip_short2``.

    Returns:
        The result of ``preprocess_string`` for ``text`` with that pipeline.
    """
    pipeline = [
        str.lower,
        strip_multiple_whitespaces,
        strip_tags,
        strip_punctuation,
        strip_non_alphanum,
        strip_numeric,
        strip_short2,
    ]
    return preprocess_string(text, pipeline)

def preprocess(topic):
    """Map a topic label (or a ``'|'``-separated list of labels) to its index.

    Only the first label of a ``'|'``-separated string is used; surrounding
    whitespace is ignored.

    Args:
        topic: A topic name, or several names joined with ``'|'``.

    Returns:
        The integer index of the (first) topic in ``topics_name_to_index_map``.

    Raises:
        KeyError: If the (first) label is not a key of ``topics_name_to_index_map``.
    """
    # split() on a string without '|' yields the whole string, so one
    # expression covers both the single-label and the multi-label case.
    first_label = topic.strip().split('|')[0].strip()
    return topics_name_to_index_map[first_label]

In [3]:
# Load the partitioned-transcript CSV and discard the saved index column plus
# the metadata columns that the rest of the notebook does not read.
df = pd.read_csv('./data/partitions/bert_partitions_54_2015.csv').drop(
    columns=['Unnamed: 0', 'Source', 'Program Name', 'Time']
)

In [4]:
# Parse the 'Date' column into pandas datetimes so it can be range-filtered later.
df = df.assign(Date=pd.to_datetime(df['Date']))

In [5]:
# Preview the loaded frame (rendered by the notebook as the cell's last expression).
df

Unnamed: 0,Date,Duration,Transcript,partitioned_transcript
0,2015-07-05,15 mins,# You’re stayin’ alive Stayin’ alive # Feel th...,# You’re stayin’ alive Stayin’ alive # Feel th...
1,2015-07-05,20 mins,Greece has voted overwhelmingly to reject the ...,Greece has voted overwhelmingly to reject the ...
2,2015-07-04,5 mins,"Thank wu warg nwuch. Formula One now, and Lewi...","Formula One now, and Lewis Hamilton delighted ..."
3,2015-07-04,20 mins,# Hold back the river Let me look in your eyes...,# Hold back the river Let me look in your eyes...
4,2015-07-07,30 mins,Ginger Baker is one of the most innovative dru...,Ginger Baker is one of the most innovative dru...
...,...,...,...,...
1356,2015-01-30,40 mins,"Ashley was to walk down here now, I’d justjump...","Ashley was to walk down here now, I’d justjump..."
1357,2015-01-30,30 mins,"Ten minutes! ..Six and a half! Sewers, you hav...","The blood, sweat and TEARS of The Great Britis..."
1358,2015-01-30,25 mins,The Church of England and the Catholic Church ...,The Church of England and the Catholic Church ...
1359,2015-01-25,15 mins,20 years of planning. 10 years experimenting. ...,The latest political defection from Ukip to th...


In [6]:
# Load the previously trained Doc2Vec model used to embed both partitions and articles.
# NOTE(review): gensim model files are pickle-based, so only load files from a trusted source.
doc2vec_model = Doc2Vec.load('models/doc2vec/doc2vec_news_2015_no_Others')

In [7]:
# Explode every row of df into one record per transcript partition. Partitions
# are separated inside 'partitioned_transcript' by a dashed divider line.
# Note that 'partition_id' stores the index of the source row in df, so all
# partitions cut from the same row share one id.
partitions = [
    (source_idx, broadcast_date, segment)
    for source_idx, broadcast_date, partitioned_text in zip(
        df.index, df['Date'], df['partitioned_transcript']
    )
    for segment in partitioned_text.split('\n---------------------\n')
]
partition_df = pd.DataFrame(partitions, columns=['partition_id', 'date', 'transcript'])

In [9]:
# Keep only the partitions dated 1 January - 30 June 2015 (both bounds inclusive).
mask = (partition_df['date'] >= '2015-1-1') & (partition_df['date'] <= '2015-6-30')
# .copy() detaches the filtered rows from the original frame, so the column
# assignments made on partition_df in later cells write to an independent
# frame instead of triggering pandas' SettingWithCopyWarning.
partition_df = partition_df.loc[mask].copy()

In [10]:
# Embed every partition: the transcript is tokenised with preprocess_text and the
# tokens are passed to the loaded Doc2Vec model's infer_vector. swifter wraps the
# row-wise apply (axis=1) and shows the progress bar.
# NOTE(review): no random seed is set anywhere in the visible cells; if infer_vector
# is stochastic the vectors will differ between runs — confirm.
partition_df['vector'] = partition_df.swifter.apply(lambda x: doc2vec_model.infer_vector(preprocess_text(x['transcript'])), axis=1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=31092.0, style=ProgressStyle(descripti…




In [11]:
# Preview the date-filtered partitions together with their inferred vectors.
partition_df

Unnamed: 0,partition_id,date,transcript,vector
3800,89,2015-04-06,"# The National Lottery - In It To Win It, feat...","[-0.015009172, 0.3270903, 0.45771274, 0.205266..."
3801,89,2015-04-06,You may depend on it for your everyday social ...,"[0.1401679, -0.084059745, -0.09622029, -1.1517..."
3802,89,2015-04-06,And cheers for Prince Harry as he arrives in A...,"[0.10837728, 0.20077448, -0.0011135074, -0.212..."
3803,89,2015-04-06,Sweeping changes to pension rules which will g...,"[0.43655208, 0.4061765, -0.007886911, -1.32476..."
3804,89,2015-04-06,The official start of the new tax year has spa...,"[-0.046451673, 0.21055464, 0.6847504, 0.336753..."
...,...,...,...,...
55268,1360,2015-01-25,"Football now, and Match of the Day - with Spor...","[-1.3146842, 0.54991305, -1.6694269, -1.116904..."
55269,1360,2015-01-25,"Back to our main story, the Greek collections ...","[-0.41744605, 0.34475803, -0.91543835, -0.8377..."
55270,1360,2015-01-25,I need to know how this economy will turn arou...,"[0.20171957, -0.35128188, -0.34262505, -0.0239..."
55271,1360,2015-01-25,Syriza up has promised much and after years of...,"[0.24183172, -0.54903287, -0.09783117, -0.1850..."


In [12]:
# The raw transcript frame has been expanded into partition_df; drop the name
# so the kernel can reclaim its memory before the articles are loaded.
del df

In [13]:
# Load the 2015 news articles together with their predicted topics
# (top1/top2/top3 topic and score columns).
articles = pd.read_csv('./data/news_predictions/news_2015_predictions.csv')

In [14]:
# Discard articles that have no transcript text, then those whose top
# predicted topic is the catch-all 'Others'.
articles = (
    articles
    .dropna(subset=['transcript'])
    .loc[lambda frame: frame['top1_topic'] != 'Others']
)

In [15]:
# Drop the saved CSV index column and keep only articles from January-June.
# The trailing .copy() detaches the result from the frame it was sliced from,
# so the columns assigned to `articles` in later cells ('vector', 'date') are
# written to an independent frame rather than raising SettingWithCopyWarning.
articles = (
    articles
    .drop(columns=['Unnamed: 0'])
    .loc[lambda frame: frame['month'].isin([1, 2, 3, 4, 5, 6])]
    .copy()
)
articles

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,parliament,top1_topic,top1_acc,top2_topic,top2_acc,top3_topic,top3_acc
0,163795,Belfast Telegraph,1,1,2015,Plans for site of former factory to be 12-acre...,"The Artnz Belting company,\nwhich had been a m...",1,Transport,48.68,"Parliament, government and politics",12.64,Communities and families,12.03
1,163795,Belfast Telegraph,1,1,2015,Councillors left red-faced over bid for new pr...,"However, it was then revealed that a database ...",1,Communities and families,28.51,Transport,15.06,Others,10.18
3,163795,Belfast Telegraph,1,1,2015,Driver who was three times legal limit jailed,Paul McCloskey (36) of Horace Street in London...,1,"Crime, civil law, justice and rights",35.05,Others,10.45,Defence,8.50
4,163795,Belfast Telegraph,1,1,2015,A oneness that unites us in Christ;\nMY VIEW,That means more than simply recording the numb...,1,"Parliament, government and politics",53.14,International affairs,12.30,"Culture, media and sport",12.19
5,163795,Belfast Telegraph,1,1,2015,Let 2015 be year our politicians display leade...,While accepting that politicians must be pragm...,1,"Parliament, government and politics",70.15,Others,9.72,"Culture, media and sport",7.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757255,412338,Wales,30,6,2015,Women have a key role to play in plugging the ...,"Lee Hopley, chief economist at EEF, the manufa...",1,"Business, industry and consumers",35.44,"Crime, civil law, justice and rights",24.78,"Parliament, government and politics",13.13
757256,412338,Wales,30,6,2015,Ant-Man star Michael Douglas enjoys Marvel rol...,The Basic Instinct star plays scientist Dr Han...,1,"Culture, media and sport",61.44,Others,16.30,"Business, industry and consumers",5.50
757257,412338,Wales,30,6,2015,How Greek financial crisis means you can get m...,This morning £1 could buy you (EURO)1.43 - thi...,1,Economy and finance,38.41,"Business, industry and consumers",33.63,Others,9.41
757258,412338,Wales,30,6,2015,Morning news headlines: Tunisia Tourists saw t...,Eyewitnesses have claimed the Tunisian beach m...,1,International affairs,59.93,Transport,6.67,"Culture, media and sport",6.54


In [16]:
# NOTE(review): `vector` is not read again in the visible cells — candidate for removal.
vector = []
# Raw article texts, in the current row order of `articles`.
transcripts = articles['transcript'].values

# Tokenise every article with preprocess_text via parmap.map, with a progress bar.
preprocessed_transcripts = parmap.map(
    preprocess_text,
    transcripts,
    pm_pbar=True,
)

604160it [00:42, 14243.48it/s]                            


In [17]:
# Infer a Doc2Vec vector for every tokenised article via parmap.map (with a
# progress bar).
vector_transcripts = parmap.map(doc2vec_model.infer_vector, preprocessed_transcripts, pm_pbar=True)
# Assigning a plain list is positional: this relies on the result order matching
# the row order of `articles` from which the transcripts were taken.
# NOTE(review): presumably parmap.map preserves input order — confirm.
articles['vector'] = vector_transcripts

604160it [07:35, 1327.46it/s]                            


In [18]:
# The token lists were only needed to infer the vectors; drop the name to free memory.
del preprocessed_transcripts

In [19]:
# Preview the articles frame, now including the inferred 'vector' column.
articles

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,parliament,top1_topic,top1_acc,top2_topic,top2_acc,top3_topic,top3_acc,vector
0,163795,Belfast Telegraph,1,1,2015,Plans for site of former factory to be 12-acre...,"The Artnz Belting company,\nwhich had been a m...",1,Transport,48.68,"Parliament, government and politics",12.64,Communities and families,12.03,"[0.13620135, 1.4393163, -1.5840325, 2.8394022,..."
1,163795,Belfast Telegraph,1,1,2015,Councillors left red-faced over bid for new pr...,"However, it was then revealed that a database ...",1,Communities and families,28.51,Transport,15.06,Others,10.18,"[-1.3871644, 2.246499, -0.14144617, 1.5915717,..."
3,163795,Belfast Telegraph,1,1,2015,Driver who was three times legal limit jailed,Paul McCloskey (36) of Horace Street in London...,1,"Crime, civil law, justice and rights",35.05,Others,10.45,Defence,8.50,"[-1.5667067, 1.1605889, -0.14279681, -0.030182..."
4,163795,Belfast Telegraph,1,1,2015,A oneness that unites us in Christ;\nMY VIEW,That means more than simply recording the numb...,1,"Parliament, government and politics",53.14,International affairs,12.30,"Culture, media and sport",12.19,"[0.331254, -0.122718506, -1.3796539, 0.3864345..."
5,163795,Belfast Telegraph,1,1,2015,Let 2015 be year our politicians display leade...,While accepting that politicians must be pragm...,1,"Parliament, government and politics",70.15,Others,9.72,"Culture, media and sport",7.04,"[-0.52562904, -0.6122626, -0.11283386, 0.01387..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757255,412338,Wales,30,6,2015,Women have a key role to play in plugging the ...,"Lee Hopley, chief economist at EEF, the manufa...",1,"Business, industry and consumers",35.44,"Crime, civil law, justice and rights",24.78,"Parliament, government and politics",13.13,"[2.3693411, 0.098471396, -3.047504, 1.6969368,..."
757256,412338,Wales,30,6,2015,Ant-Man star Michael Douglas enjoys Marvel rol...,The Basic Instinct star plays scientist Dr Han...,1,"Culture, media and sport",61.44,Others,16.30,"Business, industry and consumers",5.50,"[-1.6098089, -0.6280092, 1.5385658, -0.6426866..."
757257,412338,Wales,30,6,2015,How Greek financial crisis means you can get m...,This morning £1 could buy you (EURO)1.43 - thi...,1,Economy and finance,38.41,"Business, industry and consumers",33.63,Others,9.41,"[-2.1319876, -0.032823455, 0.9358969, -1.34467..."
757258,412338,Wales,30,6,2015,Morning news headlines: Tunisia Tourists saw t...,Eyewitnesses have claimed the Tunisian beach m...,1,International affairs,59.93,Transport,6.67,"Culture, media and sport",6.54,"[1.1468259, -0.56938094, -1.1343379, 0.0406346..."


In [20]:
#articles.to_csv('./data/news_2016_predictions_vectored.csv')

In [21]:
import datetime
# Assemble the article date from the year/month/day columns in one vectorised
# call. Compared with building datetime.date objects row by row this
#   * avoids a Python-level apply over every article,
#   * does not depend on the name `datetime`, which a later cell rebinds from
#     the module to the class (so re-running this cell would otherwise fail),
#   * yields a datetime64 column, which compares cleanly against the pandas
#     Timestamps stored in partition_df['date'] when the date window is built.
articles['date'] = pd.to_datetime(articles[['year', 'month', 'day']])

In [22]:
# articles = articles.drop(['day', 'month', 'year'], axis=1)

In [23]:
#articles['vector'] = articles.vector.apply(lambda x: list(x))

In [24]:
# sample = partition_df.sample(n=1000, weights='partition_id', random_state=1).reset_index(drop=True)

In [25]:
# def smoothing(df):
#     n = len(df)
#     rows = []
#     for i in tqdm(range(n-1)):
#         row1 = df.iloc[i]
#         row2 = df.iloc[i+1]
#         dt = row1.date
#         start_dt = dt - timedelta(days=2)
#         end_dt = dt + timedelta(days=2)
#         articles_window = articles.loc[(articles['date'] >= start_dt) & (articles['date'] <= end_dt)]
#         row1_vector = np.array(row1.vector)
#         row1_vector = row1_vector.reshape(1, 100)
#         articles_vector = articles_window.vector.values
#         vec = [articles_vector[i] for i in range(len(articles_vector))]
#         vec = np.array(vec)
#         sim = cosine_similarity(row1_vector, vec)[0]
#         max_index = np.argmax(sim)
#         topic1 = articles_window.iloc[max_index]['top1_topic']
        
#         combined_transcript = row1.transcript + ' ' + row2.transcript
#         combined_vector = doc2vec_model.infer_vector(preprocess_text(combined_transcript))
#         combined_vector = np.array(combined_vector)
#         combined_vector = combined_vector.reshape(1, 100)
        
#         combined_sim = cosine_similarity(combined_vector, vec)[0]
#         max_index = np.argmax(combined_sim)
#         topic2 = articles_window.iloc[max_index]['top1_topic']
        
#         row = [row1.partition_id, row1.date, row1.transcript, max(sim), topic1, max(combined_sim), topic2]
#         rows.append(row)
    
#     columns = ['partition_id', 'date', 'transcript', 'similarity', 'topic', 'merged_similarity', 'merged_topic']
#     res_df = pd.DataFrame(rows, columns=columns)
#     return res_df

# res_df = smoothing(partition_df)
# res_df.to_csv('partition_similarity_with_merging_June_21-25_2016.csv')
# res_df

In [28]:
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime, timedelta
# Sentence-pair similarity model used by predict_with_bert. Per the original
# author's note the library defaults to GPU prediction; device='cpu' overrides
# that here, scoring pairs in batches of 10.
web_model = WebBertSimilarity(device='cpu', batch_size=10)

def predict(partition, window_days=2):
    """Predict a partition's topic from its most similar nearby news article.

    Articles whose date lies within ``window_days`` of the partition's date
    (inclusive on both sides) are compared to the partition by cosine
    similarity of their vectors; the ``top1_topic`` of the best match is
    returned.

    Args:
        partition: Object exposing ``date`` and ``vector`` attributes, e.g. a
            row of ``partition_df``.
        window_days: Half-width of the date window in days. Defaults to 2,
            the value that was previously hard-coded.

    Returns:
        The ``top1_topic`` of the most similar article in the window, or
        ``None`` when no article falls inside the window.
    """
    half_window = timedelta(days=window_days)
    centre = partition.date
    in_window = (
        (articles['date'] >= centre - half_window)
        & (articles['date'] <= centre + half_window)
    )
    articles_window = articles.loc[in_window]
    if articles_window.empty:
        # Nothing to compare against; cosine_similarity would fail on an empty matrix.
        return None

    # reshape(1, -1) keeps this independent of the embedding dimensionality.
    query = np.asarray(partition.vector).reshape(1, -1)
    # Stack the per-article vectors into an (n_articles, dim) matrix.
    candidates = np.stack(articles_window['vector'].to_numpy())
    sim = cosine_similarity(query, candidates)
    # sim has a single row, so the flat argmax is the column (article) position.
    best = int(np.argmax(sim))
    return articles_window.iloc[best]['top1_topic']

def predict_with_bert(partition, window_days=2):
    """Predict a partition's topic using the BERT sentence-pair similarity model.

    Every article dated within ``window_days`` of the partition's date
    (inclusive on both sides) is paired with the partition transcript and
    scored by ``web_model.predict``; the ``top1_topic`` of the highest-scoring
    article is returned.

    Args:
        partition: Object exposing ``date`` and ``transcript`` attributes,
            e.g. a row of ``partition_df``.
        window_days: Half-width of the date window in days. Defaults to 2,
            the value that was previously hard-coded.

    Returns:
        The ``top1_topic`` of the best-scoring article in the window, or
        ``None`` when no article falls inside the window.
    """
    half_window = timedelta(days=window_days)
    centre = partition.date
    in_window = (
        (articles['date'] >= centre - half_window)
        & (articles['date'] <= centre + half_window)
    )
    articles_window = articles.loc[in_window]
    if articles_window.empty:
        # No candidate pairs to score.
        return None

    # Pass the transcript as a plain string; wrapping it in np.array() produced
    # a 0-d array rather than text for the model to tokenise.
    query_text = str(partition.transcript)
    input_data = [
        (query_text, candidate)
        for candidate in articles_window['transcript'].to_numpy()
    ]
    sim = web_model.predict(input_data)
    best = int(np.argmax(sim))
    return articles_window.iloc[best]['top1_topic']

In [29]:
# Spot-check: predicted topic for the partition at position 4.
predict(partition_df.iloc[4])

'Economy and finance'

In [30]:
# Show the transcript of that same partition so the prediction can be judged by eye.
partition_df.iloc[4].transcript

'The official start of the new tax year has sparked a fresh battle between the two main parties over their respective economic plans. The Conservatives say new tax measures mean millions of people are better off. The Shadow chancellor Ed Balls said Britain could not afford five more years of Tory policies. Both parties say their rivals have secret plans to raise taxes if they win the general election.'

In [31]:
# Run predict() on every partition row (tqdm shows progress) and store the
# resulting topic labels, in row order, as the 'prediction' column.
preds = [
    predict(partition_row)
    for _row_label, partition_row in tqdm(partition_df.iterrows(), total=len(partition_df))
]

partition_df['prediction'] = preds

100%|██████████| 31092/31092 [3:16:55<00:00,  2.63it/s]  


In [32]:
# Preview the partitions with their predicted topics.
partition_df

Unnamed: 0,partition_id,date,transcript,vector,prediction
3800,89,2015-04-06,"# The National Lottery - In It To Win It, feat...","[-0.015009172, 0.3270903, 0.45771274, 0.205266...","Parliament, government and politics"
3801,89,2015-04-06,You may depend on it for your everyday social ...,"[0.1401679, -0.084059745, -0.09622029, -1.1517...","Parliament, government and politics"
3802,89,2015-04-06,And cheers for Prince Harry as he arrives in A...,"[0.10837728, 0.20077448, -0.0011135074, -0.212...","Parliament, government and politics"
3803,89,2015-04-06,Sweeping changes to pension rules which will g...,"[0.43655208, 0.4061765, -0.007886911, -1.32476...",Economy and finance
3804,89,2015-04-06,The official start of the new tax year has spa...,"[-0.046451673, 0.21055464, 0.6847504, 0.336753...",Economy and finance
...,...,...,...,...,...
55268,1360,2015-01-25,"Football now, and Match of the Day - with Spor...","[-1.3146842, 0.54991305, -1.6694269, -1.116904...","Parliament, government and politics"
55269,1360,2015-01-25,"Back to our main story, the Greek collections ...","[-0.41744605, 0.34475803, -0.91543835, -0.8377...","Parliament, government and politics"
55270,1360,2015-01-25,I need to know how this economy will turn arou...,"[0.20171957, -0.35128188, -0.34262505, -0.0239...","Parliament, government and politics"
55271,1360,2015-01-25,Syriza up has promised much and after years of...,"[0.24183172, -0.54903287, -0.09783117, -0.1850...",International affairs


In [33]:
# Persist the per-partition predictions (the frame's index is written as well).
# NOTE(review): the 'vector' column holds array objects, which CSV stores as their
# string form — confirm downstream readers do not need to parse them back.
partition_df.to_csv('./window_topic_predictions_Jan-June_2015.csv')