In [1]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from semantic_text_similarity.models import WebBertSimilarity
import multiprocessing as mp
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
from scipy import spatial
import parmap
import os
import swifter
from tqdm import tqdm
import sqlite3

  import pandas.util.testing as tm


In [2]:
topics_index_to_name_map = {
    0: 'Agriculture, animals, food and rural affairs',
    1: 'Asylum, immigration and nationality',
    2: 'Business, industry and consumers',
    3: 'Communities and families',
    4: 'Crime, civil law, justice and rights',
    5: 'Culture, media and sport',
    6: 'Defence',
    7: 'Economy and finance',
    8: 'Education',
    9: 'Employment and training',
    10: 'Energy and environment',
    11: 'European Union',
    12: 'Health services and medicine',
    13: 'Housing and planning',
    14: 'International affairs',
    15: 'Parliament, government and politics',
    16: 'Science and technology',
    17: 'Social security and pensions',
    18: 'Social services',
    19: 'Transport',
    20: 'Others'
}
topics_name_to_index_map = {y:x for x,y in topics_index_to_name_map.items()}

def strip_short2(text):
    return strip_short(text, minsize=4)


def preprocess_text(text):
    FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces, strip_tags, strip_punctuation,
                   strip_non_alphanum, strip_numeric, strip_short2]
    return preprocess_string(text, FILTERS)

def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        t = topics[0]
        t = t.strip()
        return topics_name_to_index_map[t]
        
    return topics_name_to_index_map[topic]

In [3]:
df = pd.read_csv('./data/bert_partitions_2014.csv')
df = df.drop(['Unnamed: 0', 'Source', 'Program Name', 'Time'], axis=1)

In [4]:
df['Date'] = pd.to_datetime(df['Date'])

In [5]:
df

Unnamed: 0,Date,Duration,Transcript,partitioned_transcript
0,2014-12-06,20 mins,"#Ah... # Dreaming ofthe days # APPLAUSE Hiya, ...","# Oh, yeah # I see your smiling face # Like I’..."
1,2014-12-07,20 mins,Alex Salmond unveils plans to seek a Westminst...,Alex Salmond unveils plans to seek a Westminst...
2,2014-12-07,10 mins,against the illegal trade in wildlife. He’s ve...,"At the World Bank, he will be making a speech ..."
3,2014-12-05,30 mins,# The way I love you. # Making Christmas speci...,Serious mistakes are made by England’s health ...
4,2014-12-05,25 mins,Two British men are jailed for travelling to S...,Two British men are jailed for travelling to S...
...,...,...,...,...
1238,2014-06-26,30 mins,"Hello, Glastonbury. Hello, Glastonbury! Hello,...",Use the BBC Weather App to stay one step ahead...
1239,2014-06-26,15 mins,A little bit more cloud around but some of us ...,A little bit more cloud around but some of us ...
1240,2014-06-26,25 mins,Yet more shocking re-lations about the extent ...,Yet more shocking re-lations about the extent ...
1241,2014-06-26,30 mins,The most detailed picture yet ofthe The most d...,The most detailed picture yet ofthe The most d...


In [6]:
from datetime import datetime, timedelta
dt = df.iloc[0]['Date']
start_dt = dt - timedelta(days=2)
end_dt = dt + timedelta(days=2)
print(start_dt, ' ', end_dt)

2014-12-04 00:00:00   2014-12-08 00:00:00


In [7]:
doc2vec_model = Doc2Vec.load('models/doc2vec/doc2vec_news')

In [8]:
partitions = []
for index, row in df.iterrows():
    partition_string = row['partitioned_transcript']
    partition_date = row['Date']
    all_partitions = partition_string.split('\n---------------------\n')
    for partition in all_partitions:
        partitions.append((index, partition_date, partition))
partition_df = pd.DataFrame(partitions, columns=['partition_id', 'date', 'transcript'])

In [9]:
partition_df['vector'] = partition_df.swifter.apply(lambda x: doc2vec_model.infer_vector(preprocess_text(x['transcript'])), axis=1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=38244.0, style=ProgressStyle(descripti…




In [10]:
partition_df

Unnamed: 0,partition_id,date,transcript,vector
0,0,2014-12-06,"# Oh, yeah # I see your smiling face # Like I’...","[-0.4654743, -1.0576028, -0.4447788, 0.1935643..."
1,0,2014-12-06,Two hostages are killed in Yemen during a fail...,"[0.6032044, -0.80156326, 0.65443814, -0.146032..."
2,0,2014-12-06,"Our hearts are full of sorrow tonight, our pra...","[0.32675767, -0.44931835, 0.22331822, 0.181003..."
3,0,2014-12-06,Ferocious winds and torrential rain in the eas...,"[0.33944637, 0.3982145, -0.78446454, -0.215373..."
4,0,2014-12-06,"President Obama has condemned as ""barbaric"" th...","[1.0871935, -0.5206851, 1.609484, -0.28606078,..."
...,...,...,...,...
38239,1242,2014-06-26,It is what is on the table at the moment. It i...,"[1.0127413, -2.9796925, 1.8330736, 0.8077801, ..."
38240,1242,2014-06-26,"Tonight, we are in Wolverhampton, and welcome ...","[-0.06638427, -0.7492183, 0.048772216, 0.60429..."
38241,1242,2014-06-26,"Conservative Defence Minister Anna Soubry, Lab...","[-1.3325678, 1.0542467, -0.03578683, 0.3904657..."
38242,1242,2014-06-26,"But these can be close calls, and with hindsig...","[-0.54162824, 0.3936673, 0.27738285, 0.4509352..."


In [11]:
min_date = partition_df.date.min() - timedelta(days=2)
max_date = partition_df.date.max() + timedelta(days=2)

print('min date: ', min_date)
print('max date: ', max_date)

min date:  2013-12-30 00:00:00
max date:  2015-01-02 00:00:00


In [12]:
del df

In [13]:
articles = pd.read_csv('./data/news_2014_predictions.csv')

In [14]:
articles = articles.dropna(subset=['transcript'])

In [15]:
articles = articles.drop(['Unnamed: 0'], axis=1)
articles

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,parliament,top1_topic,top1_acc,top2_topic,top2_acc,top3_topic,top3_acc
0,163795,Belfast Telegraph,1,1,2014,Hunt begins for team to secure cultural legacy,"As the clock struck midnight last night, Londo...",,"Parliament, government and politics",61.99,Others,27.25,"Culture, media and sport",7.38
1,163795,Belfast Telegraph,1,1,2014,Contractor to pay for power cut,Northern Ireland Electricity was flooded with ...,,Energy and environment,31.75,"Business, industry and consumers",22.29,Transport,10.36
2,163795,Belfast Telegraph,1,1,2014,Well-known city hotelier celebrates MBE,Almost a quarter-ofa-century spent transformin...,,Others,19.76,Communities and families,15.66,"Parliament, government and politics",9.68
3,163795,Belfast Telegraph,1,1,2014,"Derry treated as poor relation, say rail campa...",Campaigners from Into The West argued that Der...,,Transport,86.85,"Parliament, government and politics",2.95,"Culture, media and sport",1.89
4,163795,Belfast Telegraph,1,1,2014,Ulster are not good enough;\nTONY WARD'S DAMNI...,The Dublin-based columnist blasted Mark Anscom...,,Others,35.12,"Parliament, government and politics",31.31,"Culture, media and sport",10.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1653116,412338,Wales,31,12,2014,Paper talk: Manchester City and Chelsea in for...,"Bony would command a fee of around £30m, with ...",,"Culture, media and sport",35.40,Others,32.83,"Parliament, government and politics",23.47
1653117,412338,Wales,31,12,2014,Your Gallery: Christopher loves combining medi...,"Christopher Langley is 52. He says: ""I aim to ...",,"Parliament, government and politics",85.74,Others,4.07,"Culture, media and sport",3.59
1653118,412338,Wales,31,12,2014,Seven-up for Cardiff Devils earns ninth succes...,Elite Ice Hockey League ...,,Others,69.38,"Culture, media and sport",4.63,Science and technology,4.35
1653119,412338,Wales,31,12,2014,"Man, 21, arrested on suspicion of violent Boxi...",Wesley Walters and his girlfriend Angharad Col...,,Others,38.66,"Crime, civil law, justice and rights",24.67,"Parliament, government and politics",5.85


In [16]:
vector = []
transcripts = articles.transcript.values

preprocessed_transcripts = parmap.map(preprocess_text, transcripts, pm_pbar=True)

1653120it [02:48, 9814.49it/s]                              


In [17]:
vector_transcripts = parmap.map(doc2vec_model.infer_vector, preprocessed_transcripts, pm_pbar=True)
articles['vector'] = vector_transcripts

1653120it [19:52, 1386.27it/s]                             


In [18]:
articles

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,parliament,top1_topic,top1_acc,top2_topic,top2_acc,top3_topic,top3_acc,vector
0,163795,Belfast Telegraph,1,1,2014,Hunt begins for team to secure cultural legacy,"As the clock struck midnight last night, Londo...",,"Parliament, government and politics",61.99,Others,27.25,"Culture, media and sport",7.38,"[1.2243358, -2.1193075, 2.6058152, 2.4895468, ..."
1,163795,Belfast Telegraph,1,1,2014,Contractor to pay for power cut,Northern Ireland Electricity was flooded with ...,,Energy and environment,31.75,"Business, industry and consumers",22.29,Transport,10.36,"[-0.48997223, -2.2193837, 0.91578805, -0.03129..."
2,163795,Belfast Telegraph,1,1,2014,Well-known city hotelier celebrates MBE,Almost a quarter-ofa-century spent transformin...,,Others,19.76,Communities and families,15.66,"Parliament, government and politics",9.68,"[-0.5006421, -0.60977817, -0.13471842, 2.19769..."
3,163795,Belfast Telegraph,1,1,2014,"Derry treated as poor relation, say rail campa...",Campaigners from Into The West argued that Der...,,Transport,86.85,"Parliament, government and politics",2.95,"Culture, media and sport",1.89,"[-1.0925293, -1.7030581, 2.4701743, -0.65294, ..."
4,163795,Belfast Telegraph,1,1,2014,Ulster are not good enough;\nTONY WARD'S DAMNI...,The Dublin-based columnist blasted Mark Anscom...,,Others,35.12,"Parliament, government and politics",31.31,"Culture, media and sport",10.75,"[0.50785047, -1.3902558, 2.2514608, 0.24673429..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1653116,412338,Wales,31,12,2014,Paper talk: Manchester City and Chelsea in for...,"Bony would command a fee of around £30m, with ...",,"Culture, media and sport",35.40,Others,32.83,"Parliament, government and politics",23.47,"[-0.23806001, -1.4472854, 1.1219152, 0.4961517..."
1653117,412338,Wales,31,12,2014,Your Gallery: Christopher loves combining medi...,"Christopher Langley is 52. He says: ""I aim to ...",,"Parliament, government and politics",85.74,Others,4.07,"Culture, media and sport",3.59,"[-3.5835943, -0.8215955, 0.34762818, 2.311558,..."
1653118,412338,Wales,31,12,2014,Seven-up for Cardiff Devils earns ninth succes...,Elite Ice Hockey League ...,,Others,69.38,"Culture, media and sport",4.63,Science and technology,4.35,"[3.5257237, -0.92042464, 2.2676115, 0.7478787,..."
1653119,412338,Wales,31,12,2014,"Man, 21, arrested on suspicion of violent Boxi...",Wesley Walters and his girlfriend Angharad Col...,,Others,38.66,"Crime, civil law, justice and rights",24.67,"Parliament, government and politics",5.85,"[-1.3327614, -0.50032455, 0.49406138, 0.796026..."


In [19]:
#articles.to_csv('./data/news_2014_predictions_vectored.csv')

In [20]:
import datetime
articles['date'] = articles.apply(lambda x: datetime.date(x.year, x.month, x.day), axis=1)

In [21]:
articles = articles.drop(['day', 'month', 'year'], axis=1)

In [22]:
#articles['vector'] = articles.vector.apply(lambda x: list(x))

In [24]:
sample = partition_df.sample(n=1000, weights='partition_id', random_state=1).reset_index(drop=True)

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
def predict(partition):
    dt = partition.date
    start_dt = dt - timedelta(days=2)
    end_dt = dt + timedelta(days=2)
    articles_window = articles.loc[(articles['date'] >= start_dt) & (articles['date'] <= end_dt)]
    partition_vector = np.array(partition.vector)
    partition_vector = partition_vector.reshape(1, 100)
    articles_vector = articles_window.vector.values
    max_index = -1
    max_sim = -1
    for i in range(len(articles_vector)):
        sim = cosine_similarity(partition_vector, [articles_vector[i]])
        if max_sim < sim:
            max_sim = sim
            max_index = i
    return articles_window.iloc[max_index]['top1_topic']

In [27]:
predict(sample.iloc[4])

'Transport'

In [28]:
sample.iloc[4].transcript

'But there are varying opinions on economic benefits of faster rail services to Birmingham and Crewe as well as Manchester and Leeds. The debate will continue over whether high-speed rail link really will provide a boost to economic growth long-term in the North of England, orwhether growth long-term in the North of England, or whether it will simply offer more opportunities for commuters to get to London more quickly. I don’t think it will boost the economies of towns in the north. For starters, they face the big tax bill to pay for the project and also the numerous examples both in the North and the Midlands of towns which already have vast buildings to London but fail to be transformed. Today’s report also says the planned link between HS2 and the high-speed rail link to the Channel tunnel should be dropped. Instead, Euston should be redeveloped at the end of the line.'

In [29]:
preds = []

for index, row in tqdm(sample.iterrows(), total=len(sample)):
    preds.append(predict(row))
    
sample['prediction'] = preds

100%|██████████| 1000/1000 [1:28:50<00:00,  5.33s/it]


In [30]:
sample.to_csv('./window_topic_predictions_2014.csv')