In [1]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from semantic_text_similarity.models import WebBertSimilarity
import multiprocessing as mp
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
from scipy import spatial
import parmap
import os
import swifter
from tqdm import tqdm

  import pandas.util.testing as tm


In [2]:
topics_index_to_name_map = {
    0: 'Agriculture, animals, food and rural affairs',
    1: 'Asylum, immigration and nationality',
    2: 'Business, industry and consumers',
    3: 'Communities and families',
    4: 'Crime, civil law, justice and rights',
    5: 'Culture, media and sport',
    6: 'Defence',
    7: 'Economy and finance',
    8: 'Education',
    9: 'Employment and training',
    10: 'Energy and environment',
    11: 'European Union',
    12: 'Health services and medicine',
    13: 'Housing and planning',
    14: 'International affairs',
    15: 'Parliament, government and politics',
    16: 'Science and technology',
    17: 'Social security and pensions',
    18: 'Social services',
    19: 'Transport',
    20: 'Others'
}
topics_name_to_index_map = {y:x for x,y in topics_index_to_name_map.items()}

def strip_short2(text):
    return strip_short(text, minsize=4)


def preprocess_text(text):
    FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces, strip_tags, strip_punctuation,
                   strip_non_alphanum, strip_numeric, strip_short2]
    return preprocess_string(text, FILTERS)

def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        t = topics[0]
        t = t.strip()
        return topics_name_to_index_map[t]
        
    return topics_name_to_index_map[topic]

In [3]:
path = './data/2016_June_transcripts.csv'
df = pd.read_csv(path)
df = df.drop(['Unnamed: 0'], axis=1)
df

Unnamed: 0,source,date,program,Ibm transcript,transcript
0,BBC News 24,2016-06-01,BBC News at Six,What do you want to be a police but one of the...,"spent God knows how many man hours, and you th..."
1,BBC1 London,2016-06-03,BBC News at One,Yeah the BBC news one cancel button. 20 years ...,Twenty years on the coroner at an inquest into...
2,BBC News 24,2016-06-14,BBC News at Six,The European Union Britain and Europe immigrat...,MUSIC: Mr Blue Sky by ELO # Sun is shinin’ in ...
3,BBC News 24,2016-06-09,BBC News at Ten,His stats I don't need anyone with a gun stand...,They don’t need anyone with a gun standing in ...
4,BBC1 London,2016-06-21,BBC News at One,Now Hey own BBC 17 macaulay has the BBC knees....,"Guilty of murder, a father who battered his si..."
5,BBC News 24,2016-06-02,BBC News at Six,Anyone's performs live room breakfast room 6 w...,BIRDSONG ROCK MUSIC PLAYS Come on now! Squeaky...
6,BBC1 London,2016-06-20,BBC News at One,At. Thank you for everything. Starts next Mond...,"MUSIC: jupiter, The Bringer Ofjollity by Holst..."
7,BBC News 24,2016-06-02,BBC News at Ten,I think it's about getting ready for surgery w...,"I can’t hear anything. OK, we need to intubate..."
8,BBC1 London,2016-06-09,BBC News at One,I only see one so if you write with the BBC ne...,Two former prime ministers join forces to warn...
9,BBC1 London,2016-06-01,BBC News at One,130 on BBC 1. With so many claims being made o...,"It’s notjust about getting rid of cancer, it’s..."


In [4]:
def filter_short_sentence(sentences):
    n_sent = len(sentences)
    filtered_sentences = []
    for i in range(n_sent):
        if len(sentences[i].split()) >= 10:
            filtered_sentences.append(sentences[i])
    return filtered_sentences

def partition_transcript_into_topics(transcript):
    cluster = []
    sentences = nltk.sent_tokenize(transcript)
    sentences = filter_short_sentence(sentences)
    if (len(sentences) == 0):
        return ''
    n_sent = len(sentences)
    current_cluster = [sentences[0]]
    n_cluster = 1
    for i in range(1, n_sent):
        sim = web_model.predict([(sentences[i], sentences[i-1])])[0]/5
        
#         for sent in current_cluster:
#             sim+= web_model.predict([(sent, sentences[i])])[0]/5
            
#         avg_sim = sim/len(current_cluster)
        
        if sim >= 0.1:
            current_cluster.append(sentences[i])
            if i == n_sent - 1:
                cluster.append(' '.join(current_cluster))
        else:
            n_cluster += 1
            cluster.append(' '.join(current_cluster))
            current_cluster = [sentences[i]]
    return '\n---------------------\n'.join(cluster)

In [5]:
web_model = WebBertSimilarity(device='cpu', batch_size=10) #defaults to GPU prediction

In [8]:
transcripts = df['transcript'].values
ibm_transcripts = df['Ibm transcript'].values

In [9]:
partitions = []
for transcript in tqdm(ibm_transcripts):
    partitions.append(partition_transcript_into_topics(transcript))
                      
df['ibm_partitioned_transcript'] = partitions

100%|██████████| 10/10 [01:27<00:00,  8.74s/it]


In [10]:
partitions = []
for transcript in tqdm(transcripts):
    partitions.append(partition_transcript_into_topics(transcript))
                      
df['partitioned_transcript'] = partitions

100%|██████████| 10/10 [02:16<00:00, 13.68s/it]


In [11]:
df

Unnamed: 0,source,date,program,Ibm transcript,transcript,ibm_partitioned_transcript,partitioned_transcript
0,BBC News 24,2016-06-01,BBC News at Six,What do you want to be a police but one of the...,"spent God knows how many man hours, and you th...",What do you want to be a police but one of the...,"spent God knows how many man hours, and you th..."
1,BBC1 London,2016-06-03,BBC News at One,Yeah the BBC news one cancel button. 20 years ...,Twenty years on the coroner at an inquest into...,20 years on the corner investigating the death...,Twenty years on the coroner at an inquest into...
2,BBC News 24,2016-06-14,BBC News at Six,The European Union Britain and Europe immigrat...,MUSIC: Mr Blue Sky by ELO # Sun is shinin’ in ...,The European Union Britain and Europe immigrat...,MUSIC: Mr Blue Sky by ELO # Sun is shinin’ in ...
3,BBC News 24,2016-06-09,BBC News at Ten,His stats I don't need anyone with a gun stand...,They don’t need anyone with a gun standing in ...,His stats I don't need anyone with a gun stand...,They don’t need anyone with a gun standing in ...
4,BBC1 London,2016-06-21,BBC News at One,Now Hey own BBC 17 macaulay has the BBC knees....,"Guilty of murder, a father who battered his si...",Now Hey own BBC 17 macaulay has the BBC knees....,"Guilty of murder, a father who battered his si..."
5,BBC News 24,2016-06-02,BBC News at Six,Anyone's performs live room breakfast room 6 w...,BIRDSONG ROCK MUSIC PLAYS Come on now! Squeaky...,Anyone's performs live room breakfast room 6 w...,The end of a high street era - BHS is to close...
6,BBC1 London,2016-06-20,BBC News at One,At. Thank you for everything. Starts next Mond...,"MUSIC: jupiter, The Bringer Ofjollity by Holst...",At 1:00 the BBC news now on BBC one with so if...,"MUSIC: jupiter, The Bringer Ofjollity by Holst..."
7,BBC News 24,2016-06-02,BBC News at Ten,I think it's about getting ready for surgery w...,"I can’t hear anything. OK, we need to intubate...",I think it's about getting ready for surgery w...,"OK, we need to intubate, get him ready for sur..."
8,BBC1 London,2016-06-09,BBC News at One,I only see one so if you write with the BBC ne...,Two former prime ministers join forces to warn...,I only see one so if you write with the BBC ne...,Two former prime ministers join forces to warn...
9,BBC1 London,2016-06-01,BBC News at One,130 on BBC 1. With so many claims being made o...,"It’s notjust about getting rid of cancer, it’s...",With so many claims being made on both sides t...,"It’s notjust about getting rid of cancer, it’s..."


In [12]:
df.to_csv('2016_June_transcripts_with_partition.csv')

In [14]:
partitions = []
for index, row in df.iterrows():
    partition_string = row['partitioned_transcript']
    partition_date = row['date']
    all_partitions = partition_string.split('\n---------------------\n')
    for partition in all_partitions:
        partitions.append((index, partition_date, partition))
partition_df = pd.DataFrame(partitions, columns=['partition_id', 'date', 'transcript'])

In [15]:
partition_df

Unnamed: 0,partition_id,date,transcript
0,0,2016-06-01,"spent God knows how many man hours, and you th..."
1,0,2016-06-01,Four decades after the Birmingham pub bombings...
2,0,2016-06-01,One of the country’s most prolific paedophiles...
3,0,2016-06-01,"After finding debris, investigators detect sig..."
4,0,2016-06-01,And the oldest hand-written document ever foun...
...,...,...,...
366,9,2016-06-01,The governing body wants more control over the...
367,9,2016-06-01,A signal has been detected that is likely to b...
368,9,2016-06-01,Donald Trump - who’s set to be the Republican ...
369,9,2016-06-01,75 members of the japanese Self-Defence Forces...


In [16]:
partition_df.to_csv('2016_June_partition.csv')

In [17]:
partitions = []
for index, row in df.iterrows():
    partition_string = row['ibm_partitioned_transcript']
    partition_date = row['date']
    all_partitions = partition_string.split('\n---------------------\n')
    for partition in all_partitions:
        partitions.append((index, partition_date, partition))
ibm_partition_df = pd.DataFrame(partitions, columns=['partition_id', 'date', 'transcript'])

In [18]:
ibm_partition_df

Unnamed: 0,partition_id,date,transcript
0,0,2016-06-01,What do you want to be a police but one of the...
1,0,2016-06-01,4 decades after the Birmingham pub bombings ki...
2,0,2016-06-01,One of the country's most prolific paedophiles...
3,0,2016-06-01,After finding debris investigators detect sign...
4,0,2016-06-01,The oldest handwritten documents ever found in...
...,...,...,...
189,9,2016-06-01,A signal has been detected that is likely to b...
190,9,2016-06-01,Donald Trump who's set to be the Republican pr...
191,9,2016-06-01,75 members of the Japanese self defense forces...
192,9,2016-06-01,The first ever mention of London Britain's old...


In [19]:
ibm_partition_df.to_csv('2016_June_ibm_partition.csv')

In [20]:
year = 2016
excluding = '_no_Others'

# Topic Prediction with NN Classifier

In [21]:
from joblib import dump, load

enc = load('./models/doc2vec/encoder_{}{}.joblib'.format(year, excluding))

def predict(X, topn=3):
    preprocessed_X = preprocess_text(X)
    vec_X = doc2vec_model.infer_vector(preprocessed_X)
    vec_X = vec_X.reshape(1, vec_X.shape[0])
    pred = classifier.predict(vec_X)
    pred_i = []
    for i, p in enumerate(pred[0]):
        one_hot = np.zeros(len(pred[0]))
        one_hot[i] = 1
        pred_i += [(enc.inverse_transform([one_hot])[0][0], p)]
    pred_sorted = sorted(pred_i, key=lambda x: x[1], reverse=True)
    return pred_sorted[:topn]


def build_network():
    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(100)),
        tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.1)),
        tf.keras.layers.Dense(len(enc.categories_[0]), activation='softmax')
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0, name='categorical_crossentropy')

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=['accuracy'])
    
    return model

In [22]:
doc2vec_model = Doc2Vec.load('./models/doc2vec/doc2vec_news_{}{}'.format(year, excluding))
classifier = build_network()
classifier.load_weights('./models/doc2vec/news_classifier_{}{}'.format(year, excluding))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f00c752eb90>

In [23]:
transcripts = partition_df.transcript.values
predict(transcripts[5])

[('Crime, civil law, justice and rights', 0.7038356),
 ('Parliament, government and politics', 0.1446062),
 ('Culture, media and sport', 0.11053462)]

In [24]:
predictions = []
from tqdm import tqdm
for transcript in tqdm(transcripts):
    predictions.append(predict(transcript))

100%|██████████| 371/371 [00:19<00:00, 19.19it/s]


In [25]:
partition_df['topic'] = predictions

In [26]:
partition_df

Unnamed: 0,partition_id,date,transcript,topic
0,0,2016-06-01,"spent God knows how many man hours, and you th...","[(Parliament, government and politics, 0.40760..."
1,0,2016-06-01,Four decades after the Birmingham pub bombings...,"[(Parliament, government and politics, 0.62804..."
2,0,2016-06-01,One of the country’s most prolific paedophiles...,"[(Crime, civil law, justice and rights, 0.5972..."
3,0,2016-06-01,"After finding debris, investigators detect sig...","[(Parliament, government and politics, 0.58782..."
4,0,2016-06-01,And the oldest hand-written document ever foun...,"[(Culture, media and sport, 0.38548765), (Parl..."
...,...,...,...,...
366,9,2016-06-01,The governing body wants more control over the...,"[(Culture, media and sport, 0.71138096), (Parl..."
367,9,2016-06-01,A signal has been detected that is likely to b...,"[(Parliament, government and politics, 0.72483..."
368,9,2016-06-01,Donald Trump - who’s set to be the Republican ...,"[(Parliament, government and politics, 0.57083..."
369,9,2016-06-01,75 members of the japanese Self-Defence Forces...,"[(International affairs, 0.41441783), (Culture..."


In [27]:
transcripts = ibm_partition_df.transcript.values
predict(transcripts[5])

[('Transport', 0.8905398),
 ('Business, industry and consumers', 0.068960205),
 ('European Union', 0.013720543)]

In [28]:
predictions = []
from tqdm import tqdm
for transcript in tqdm(transcripts):
    predictions.append(predict(transcript))

100%|██████████| 194/194 [00:12<00:00, 15.23it/s]


In [29]:
ibm_partition_df['topic'] = predictions

In [31]:
ibm_partition_df

Unnamed: 0,partition_id,date,transcript,topic
0,0,2016-06-01,What do you want to be a police but one of the...,"[(Parliament, government and politics, 0.63155..."
1,0,2016-06-01,4 decades after the Birmingham pub bombings ki...,"[(Parliament, government and politics, 0.54178..."
2,0,2016-06-01,One of the country's most prolific paedophiles...,"[(Crime, civil law, justice and rights, 0.6025..."
3,0,2016-06-01,After finding debris investigators detect sign...,"[(Parliament, government and politics, 0.64695..."
4,0,2016-06-01,The oldest handwritten documents ever found in...,"[(European Union, 0.46035329), (Parliament, go..."
...,...,...,...,...
189,9,2016-06-01,A signal has been detected that is likely to b...,"[(Parliament, government and politics, 0.78883..."
190,9,2016-06-01,Donald Trump who's set to be the Republican pr...,"[(Parliament, government and politics, 0.58243..."
191,9,2016-06-01,75 members of the Japanese self defense forces...,"[(International affairs, 0.6331176), (Parliame..."
192,9,2016-06-01,The first ever mention of London Britain's old...,"[(Business, industry and consumers, 0.5929644)..."


In [32]:
partition_df.to_csv('2016_June_partition_prediction.csv')
ibm_partition_df.to_csv('2016_June_ibm_partition_prediction.csv')

# Window Prediction: Cosine Similarity

In [33]:
partition_df['vector'] = partition_df.swifter.apply(lambda x: doc2vec_model.infer_vector(preprocess_text(x['transcript'])), axis=1)
ibm_partition_df['vector'] = ibm_partition_df.swifter.apply(lambda x: doc2vec_model.infer_vector(preprocess_text(x['transcript'])), axis=1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=371.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=194.0, style=ProgressStyle(description…




In [34]:
articles = pd.read_csv('./data/news_predictions/news_2016_predictions.csv')

In [35]:
articles = articles.dropna(subset=['transcript'])
articles = articles.loc[articles.top1_topic != 'Others']
articles = articles.drop(['Unnamed: 0'], axis=1)
articles = articles.loc[articles.month.isin([6])]
articles

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,parliament,top1_topic,top1_acc,top2_topic,top2_acc,top3_topic,top3_acc
815059,163795,Belfast Telegraph,1,6,2016,North West 'needs task force on jobs to be rev...,"Gavin Killeen, who said more needs to be done ...",1,"Parliament, government and politics",72.42,"Business, industry and consumers",6.75,Economy and finance,4.93
815060,163795,Belfast Telegraph,1,6,2016,Time to take your town centre to heart,"Rather than complain, residents can participat...",1,"Culture, media and sport",24.98,"Business, industry and consumers",23.64,"Agriculture, animals, food and rural affairs",10.36
815061,163795,Belfast Telegraph,1,6,2016,Causeway Coast for food heaven,"Despite its infancy, the CCAG Food Network has...",1,"Culture, media and sport",52.31,"Agriculture, animals, food and rural affairs",17.63,"Business, industry and consumers",8.31
815062,163795,Belfast Telegraph,1,6,2016,Here comes summer - time for Country Kitchen s...,Country Kitchen side salads are the perfect ac...,1,"Culture, media and sport",28.93,Others,24.34,"Business, industry and consumers",10.81
815063,163795,Belfast Telegraph,1,6,2016,NI for the cream of the crop,"Comber Earlies - in season now, Lough Neagh Ee...",1,"Agriculture, animals, food and rural affairs",40.33,"Culture, media and sport",39.01,Others,4.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970363,412338,Wales,30,6,2016,Amazon announce Second Prime Day shopping even...,"Prime Day will take place on July 12, with the...",1,"Business, industry and consumers",20.31,Others,15.73,Transport,6.81
970364,412338,Wales,30,6,2016,Morning news headlines: Boris Johnson and Ther...,Tory heavyweights Boris Johnson and Theresa Ma...,1,"Parliament, government and politics",25.05,"Culture, media and sport",18.58,International affairs,17.69
970365,412338,Wales,30,6,2016,Inspection finds Welsh police force kept too m...,HM Inspectorate of Constabulary (HMIC) publish...,1,"Crime, civil law, justice and rights",39.50,Communities and families,11.34,"Parliament, government and politics",9.34
970366,412338,Wales,30,6,2016,Inspection found that a Welsh police force kep...,HM Inspectorate of Constabulary (HMIC) publish...,1,"Crime, civil law, justice and rights",35.47,Communities and families,14.13,Health services and medicine,10.10


In [36]:
vector = []
transcripts = articles.transcript.values

preprocessed_transcripts = parmap.map(preprocess_text, transcripts, pm_pbar=True)

vector_transcripts = parmap.map(doc2vec_model.infer_vector, preprocessed_transcripts, pm_pbar=True)
articles['vector'] = vector_transcripts

del preprocessed_transcripts

articles

123776it [00:29, 4245.83it/s]                            
123776it [04:04, 505.79it/s]                            


Unnamed: 0,source_id,source,day,month,year,program_name,transcript,parliament,top1_topic,top1_acc,top2_topic,top2_acc,top3_topic,top3_acc,vector
815059,163795,Belfast Telegraph,1,6,2016,North West 'needs task force on jobs to be rev...,"Gavin Killeen, who said more needs to be done ...",1,"Parliament, government and politics",72.42,"Business, industry and consumers",6.75,Economy and finance,4.93,"[0.3408227, -0.0209849, 0.17903826, -1.7234977..."
815060,163795,Belfast Telegraph,1,6,2016,Time to take your town centre to heart,"Rather than complain, residents can participat...",1,"Culture, media and sport",24.98,"Business, industry and consumers",23.64,"Agriculture, animals, food and rural affairs",10.36,"[-0.71105456, -0.4305179, -0.5447584, -0.42238..."
815061,163795,Belfast Telegraph,1,6,2016,Causeway Coast for food heaven,"Despite its infancy, the CCAG Food Network has...",1,"Culture, media and sport",52.31,"Agriculture, animals, food and rural affairs",17.63,"Business, industry and consumers",8.31,"[0.6972058, -1.0441506, 2.6348343, 0.16490413,..."
815062,163795,Belfast Telegraph,1,6,2016,Here comes summer - time for Country Kitchen s...,Country Kitchen side salads are the perfect ac...,1,"Culture, media and sport",28.93,Others,24.34,"Business, industry and consumers",10.81,"[-1.3153038, 2.404461, 1.6879399, -1.8530523, ..."
815063,163795,Belfast Telegraph,1,6,2016,NI for the cream of the crop,"Comber Earlies - in season now, Lough Neagh Ee...",1,"Agriculture, animals, food and rural affairs",40.33,"Culture, media and sport",39.01,Others,4.54,"[0.80128914, -0.14410646, 0.5001334, -0.420445..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970363,412338,Wales,30,6,2016,Amazon announce Second Prime Day shopping even...,"Prime Day will take place on July 12, with the...",1,"Business, industry and consumers",20.31,Others,15.73,Transport,6.81,"[-1.4910322, 0.32347322, 1.4588598, -0.1260592..."
970364,412338,Wales,30,6,2016,Morning news headlines: Boris Johnson and Ther...,Tory heavyweights Boris Johnson and Theresa Ma...,1,"Parliament, government and politics",25.05,"Culture, media and sport",18.58,International affairs,17.69,"[0.49234232, -1.7949353, 0.83398604, 0.3449338..."
970365,412338,Wales,30,6,2016,Inspection finds Welsh police force kept too m...,HM Inspectorate of Constabulary (HMIC) publish...,1,"Crime, civil law, justice and rights",39.50,Communities and families,11.34,"Parliament, government and politics",9.34,"[1.4860406, -0.58440536, -0.90384376, 0.650212..."
970366,412338,Wales,30,6,2016,Inspection found that a Welsh police force kep...,HM Inspectorate of Constabulary (HMIC) publish...,1,"Crime, civil law, justice and rights",35.47,Communities and families,14.13,Health services and medicine,10.10,"[1.2201128, -0.52487105, -0.9235491, 0.6955690..."


In [37]:
import datetime
articles['date'] = articles.apply(lambda x: datetime.date(x.year, x.month, x.day), axis=1)

In [45]:
from datetime import datetime, timedelta
from sklearn.metrics.pairwise import cosine_similarity

def predict_with_window(partition):
    dt = partition.date
    start_dt = dt - timedelta(days=2)
    end_dt = dt + timedelta(days=2)
    articles_window = articles.loc[(articles['date'] >= start_dt) & (articles['date'] <= end_dt)]
    partition_vector = np.array(partition.vector)
    partition_vector = partition_vector.reshape(1, 100)
    articles_vector = articles_window.vector.values
    vec = [articles_vector[i] for i in range(len(articles_vector))]
    vec = np.array(vec)
    sim = cosine_similarity(partition_vector, vec)
    max_index = np.argmax(sim)
    
    return articles_window.iloc[max_index]['top1_topic']

In [46]:
partition_df['date'] = pd.to_datetime(partition_df['date'])
preds = []

for index, row in tqdm(partition_df.iterrows(), total=len(partition_df)):
    preds.append(predict_with_window(row))
    
partition_df['window prediction'] = preds

100%|██████████| 371/371 [00:38<00:00,  9.74it/s]


In [49]:
ibm_partition_df['date'] = pd.to_datetime(ibm_partition_df['date'])

preds = []

for index, row in tqdm(ibm_partition_df.iterrows(), total=len(ibm_partition_df)):
    preds.append(predict_with_window(row))
    
ibm_partition_df['window prediction'] = preds

100%|██████████| 194/194 [00:22<00:00,  8.62it/s]


In [52]:
partition_df = partition_df.drop(['vector'], axis=1)
ibm_partition_df = ibm_partition_df.drop(['vector'], axis=1)

In [53]:
partition_df.to_csv('2016_June_partition_prediction.csv')
ibm_partition_df.to_csv('2016_June_ibm_partition_prediction.csv')

# TFIDF WINDOW Prediction

In [88]:
vector = []
transcripts = articles.transcript.values

from gensim import corpora, models, similarities
preprocessed_transcripts = parmap.map(preprocess_text, transcripts, pm_pbar=True)

dictionary = corpora.Dictionary(preprocessed_transcripts)
dictionary.filter_extremes(no_below=5, no_above=0.7)







  0%|          | 0/123677 [00:00<?, ?it/s][A[A[A[A[A[A





 20%|█▉        | 24175/123677 [00:12<00:51, 1928.86it/s][A[A[A[A[A[A





 38%|███▊      | 46416/123677 [00:14<00:30, 2562.29it/s][A[A[A[A[A[A





 56%|█████▋    | 69624/123677 [00:16<00:16, 3320.16it/s][A[A[A[A[A[A





 72%|███████▏  | 88964/123677 [00:31<00:13, 2516.57it/s][A[A[A[A[A[A





 72%|███████▏  | 88964/123677 [00:31<00:13, 2516.57it/s][A[A[A[A[A[A





 92%|█████████▏| 114106/123677 [00:33<00:03, 2965.93it/s][A[A[A[A[A[A





123776it [00:34, 3613.66it/s]                            [A[A[A[A[A[A


In [89]:
raw_corpus = parmap.map(dictionary.doc2bow, preprocessed_transcripts, pm_pbar=True)

tfidf = models.TfidfModel(raw_corpus)

def tfidf_transform(t):
    return tfidf[dictionary.doc2bow(preprocess_text(t))]

vector_transcripts = parmap.map(tfidf_transform, transcripts, pm_pbar=True)
articles['vector'] = vector_transcripts

articles







  0%|          | 0/123677 [00:00<?, ?it/s][A[A[A[A[A[A





  2%|▏         | 2901/123677 [00:02<01:24, 1422.33it/s][A[A[A[A[A[A





  9%|▉         | 11604/123677 [00:04<01:03, 1769.94it/s][A[A[A[A[A[A





 16%|█▌        | 19340/123677 [00:06<00:49, 2091.47it/s][A[A[A[A[A[A





 25%|██▌       | 30944/123677 [00:08<00:37, 2506.02it/s][A[A[A[A[A[A





 33%|███▎      | 40614/123677 [00:10<00:28, 2878.89it/s][A[A[A[A[A[A





 39%|███▉      | 48350/123677 [00:13<00:24, 3096.93it/s][A[A[A[A[A[A





 47%|████▋     | 58020/123677 [00:15<00:19, 3347.71it/s][A[A[A[A[A[A





 56%|█████▋    | 69624/123677 [00:17<00:14, 3683.94it/s][A[A[A[A[A[A





 64%|██████▍   | 79294/123677 [00:20<00:11, 3846.78it/s][A[A[A[A[A[A





 72%|███████▏  | 88964/123677 [00:22<00:08, 3945.49it/s][A[A[A[A[A[A





 80%|███████▉  | 98634/123677 [00:24<00:06, 3971.44it/s][A[A[A[A[A[A





 83%|████████▎ | 102502/123677 [00:31<00:14, 1503

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,parliament,top1_topic,top1_acc,top2_topic,top2_acc,top3_topic,top3_acc,vector,date
815059,163795,Belfast Telegraph,1,6,2016,North West 'needs task force on jobs to be rev...,"Gavin Killeen, who said more needs to be done ...",1,"Parliament, government and politics",72.42,"Business, industry and consumers",6.75,Economy and finance,4.93,"[(0, 0.009768523601801414), (1, 0.037194480658...",2016-06-01
815060,163795,Belfast Telegraph,1,6,2016,Time to take your town centre to heart,"Rather than complain, residents can participat...",1,"Culture, media and sport",24.98,"Business, industry and consumers",23.64,"Agriculture, animals, food and rural affairs",10.36,"[(0, 0.008816292574312965), (12, 0.00649942546...",2016-06-01
815061,163795,Belfast Telegraph,1,6,2016,Causeway Coast for food heaven,"Despite its infancy, the CCAG Food Network has...",1,"Culture, media and sport",52.31,"Agriculture, animals, food and rural affairs",17.63,"Business, industry and consumers",8.31,"[(55, 0.02630912819204411), (56, 0.03031247511...",2016-06-01
815062,163795,Belfast Telegraph,1,6,2016,Here comes summer - time for Country Kitchen s...,Country Kitchen side salads are the perfect ac...,1,"Culture, media and sport",28.93,Others,24.34,"Business, industry and consumers",10.81,"[(11, 0.01850950104966622), (65, 0.08106747721...",2016-06-01
815063,163795,Belfast Telegraph,1,6,2016,NI for the cream of the crop,"Comber Earlies - in season now, Lough Neagh Ee...",1,"Agriculture, animals, food and rural affairs",40.33,"Culture, media and sport",39.01,Others,4.54,"[(0, 0.015123308361842233), (15, 0.01624536985...",2016-06-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970363,412338,Wales,30,6,2016,Amazon announce Second Prime Day shopping even...,"Prime Day will take place on July 12, with the...",1,"Business, industry and consumers",20.31,Others,15.73,Transport,6.81,"[(11, 0.02671843797247299), (59, 0.04164182682...",2016-06-30
970364,412338,Wales,30,6,2016,Morning news headlines: Boris Johnson and Ther...,Tory heavyweights Boris Johnson and Theresa Ma...,1,"Parliament, government and politics",25.05,"Culture, media and sport",18.58,International affairs,17.69,"[(0, 0.007007584773078375), (5, 0.032375998337...",2016-06-30
970365,412338,Wales,30,6,2016,Inspection finds Welsh police force kept too m...,HM Inspectorate of Constabulary (HMIC) publish...,1,"Crime, civil law, justice and rights",39.50,Communities and families,11.34,"Parliament, government and politics",9.34,"[(2, 0.018038846339345438), (7, 0.038170161277...",2016-06-30
970366,412338,Wales,30,6,2016,Inspection found that a Welsh police force kep...,HM Inspectorate of Constabulary (HMIC) publish...,1,"Crime, civil law, justice and rights",35.47,Communities and families,14.13,Health services and medicine,10.10,"[(2, 0.018038846339345438), (7, 0.038170161277...",2016-06-30


In [90]:
partition_df['vector'] = partition_df.swifter.apply(lambda x: tfidf_transform(x['transcript']), axis=1)
ibm_partition_df['vector'] = ibm_partition_df.swifter.apply(lambda x: tfidf_transform(x['transcript']), axis=1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=371.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=194.0, style=ProgressStyle(description…




In [115]:
index = similarities.MatrixSimilarity(articles.vector)

In [108]:
def predict_with_tfidf_window(partition):
    dt = partition.date
    start_dt = dt - timedelta(days=2)
    end_dt = dt + timedelta(days=2)
    indices = articles.loc[(articles['date'] >= start_dt) & (articles['date'] <= end_dt)].index
    partition_vector = partition.vector
    
    sim = index[partition_vector]
    max_index = np.argmax(sim)
    
    return articles.iloc[max_index]['top1_topic']

In [111]:
preds = []
for _, row in tqdm(partition_df.iterrows(), total=len(partition_df)):
    preds.append(predict_with_tfidf_window(row))
    
partition_df['tfidf_window prediction'] = preds









  0%|          | 0/371 [00:00<?, ?it/s][A[A[A[A[A[A[A[A







  0%|          | 1/371 [00:02<12:22,  2.01s/it][A[A[A[A[A[A[A[A







  1%|          | 2/371 [00:04<12:19,  2.00s/it][A[A[A[A[A[A[A[A







  1%|          | 3/371 [00:05<11:43,  1.91s/it][A[A[A[A[A[A[A[A







  1%|          | 4/371 [00:07<11:10,  1.83s/it][A[A[A[A[A[A[A[A







  1%|▏         | 5/371 [00:08<10:46,  1.77s/it][A[A[A[A[A[A[A[A







  2%|▏         | 6/371 [00:10<10:29,  1.72s/it][A[A[A[A[A[A[A[A







  2%|▏         | 7/371 [00:12<10:18,  1.70s/it][A[A[A[A[A[A[A[A







  2%|▏         | 8/371 [00:13<10:10,  1.68s/it][A[A[A[A[A[A[A[A







  2%|▏         | 9/371 [00:15<10:07,  1.68s/it][A[A[A[A[A[A[A[A







  3%|▎         | 10/371 [00:17<10:02,  1.67s/it][A[A[A[A[A[A[A[A







  3%|▎         | 11/371 [00:18<09:56,  1.66s/it][A[A[A[A[A[A[A[A







  3%|▎         | 12/371 [00:20<09:51,  1.65s/it][

 27%|██▋       | 102/371 [02:47<07:19,  1.63s/it][A[A[A[A[A[A[A[A







 28%|██▊       | 103/371 [02:49<07:16,  1.63s/it][A[A[A[A[A[A[A[A







 28%|██▊       | 104/371 [02:50<07:13,  1.62s/it][A[A[A[A[A[A[A[A







 28%|██▊       | 105/371 [02:52<07:11,  1.62s/it][A[A[A[A[A[A[A[A







 29%|██▊       | 106/371 [02:54<07:08,  1.62s/it][A[A[A[A[A[A[A[A







 29%|██▉       | 107/371 [02:55<07:06,  1.62s/it][A[A[A[A[A[A[A[A







 29%|██▉       | 108/371 [02:57<07:04,  1.62s/it][A[A[A[A[A[A[A[A







 29%|██▉       | 109/371 [02:58<07:03,  1.62s/it][A[A[A[A[A[A[A[A







 30%|██▉       | 110/371 [03:00<07:01,  1.61s/it][A[A[A[A[A[A[A[A







 30%|██▉       | 111/371 [03:02<06:59,  1.61s/it][A[A[A[A[A[A[A[A







 30%|███       | 112/371 [03:03<06:58,  1.61s/it][A[A[A[A[A[A[A[A







 30%|███       | 113/371 [03:05<06:57,  1.62s/it][A[A[A[A[A[A[A[A







 31%|███       | 114/371 [03

 55%|█████▍    | 203/371 [05:31<04:31,  1.62s/it][A[A[A[A[A[A[A[A







 55%|█████▍    | 204/371 [05:32<04:30,  1.62s/it][A[A[A[A[A[A[A[A







 55%|█████▌    | 205/371 [05:34<04:28,  1.62s/it][A[A[A[A[A[A[A[A







 56%|█████▌    | 206/371 [05:36<04:26,  1.62s/it][A[A[A[A[A[A[A[A







 56%|█████▌    | 207/371 [05:37<04:25,  1.62s/it][A[A[A[A[A[A[A[A







 56%|█████▌    | 208/371 [05:39<04:23,  1.62s/it][A[A[A[A[A[A[A[A







 56%|█████▋    | 209/371 [05:40<04:22,  1.62s/it][A[A[A[A[A[A[A[A







 57%|█████▋    | 210/371 [05:42<04:20,  1.62s/it][A[A[A[A[A[A[A[A







 57%|█████▋    | 211/371 [05:44<04:18,  1.62s/it][A[A[A[A[A[A[A[A







 57%|█████▋    | 212/371 [05:45<04:19,  1.63s/it][A[A[A[A[A[A[A[A







 57%|█████▋    | 213/371 [05:47<04:20,  1.65s/it][A[A[A[A[A[A[A[A







 58%|█████▊    | 214/371 [05:49<04:18,  1.64s/it][A[A[A[A[A[A[A[A







 58%|█████▊    | 215/371 [05

 82%|████████▏ | 304/371 [08:16<01:49,  1.63s/it][A[A[A[A[A[A[A[A







 82%|████████▏ | 305/371 [08:18<01:47,  1.63s/it][A[A[A[A[A[A[A[A







 82%|████████▏ | 306/371 [08:19<01:46,  1.63s/it][A[A[A[A[A[A[A[A







 83%|████████▎ | 307/371 [08:21<01:44,  1.63s/it][A[A[A[A[A[A[A[A







 83%|████████▎ | 308/371 [08:22<01:43,  1.64s/it][A[A[A[A[A[A[A[A







 83%|████████▎ | 309/371 [08:24<01:41,  1.64s/it][A[A[A[A[A[A[A[A







 84%|████████▎ | 310/371 [08:26<01:39,  1.64s/it][A[A[A[A[A[A[A[A







 84%|████████▍ | 311/371 [08:27<01:38,  1.64s/it][A[A[A[A[A[A[A[A







 84%|████████▍ | 312/371 [08:29<01:36,  1.64s/it][A[A[A[A[A[A[A[A







 84%|████████▍ | 313/371 [08:31<01:34,  1.63s/it][A[A[A[A[A[A[A[A







 85%|████████▍ | 314/371 [08:32<01:33,  1.63s/it][A[A[A[A[A[A[A[A







 85%|████████▍ | 315/371 [08:34<01:31,  1.64s/it][A[A[A[A[A[A[A[A







 85%|████████▌ | 316/371 [08

In [112]:
len(preds)

371

In [None]:
preds = []
for _, row in tqdm(ibm_partition_df.iterrows(), total=len(ibm_partition_df)):
    preds.append(predict_with_tfidf_window(row))
    
ibm_partition_df['tfidf_window prediction'] = preds









  0%|          | 0/194 [00:00<?, ?it/s][A[A[A[A[A[A[A[A







  1%|          | 1/194 [00:01<05:16,  1.64s/it][A[A[A[A[A[A[A[A







  1%|          | 2/194 [00:03<05:14,  1.64s/it][A[A[A[A[A[A[A[A







  2%|▏         | 3/194 [00:04<05:12,  1.64s/it][A[A[A[A[A[A[A[A







  2%|▏         | 4/194 [00:06<05:10,  1.63s/it][A[A[A[A[A[A[A[A







  3%|▎         | 5/194 [00:08<05:08,  1.63s/it][A[A[A[A[A[A[A[A







  3%|▎         | 6/194 [00:09<05:07,  1.63s/it][A[A[A[A[A[A[A[A







  4%|▎         | 7/194 [00:11<05:05,  1.63s/it][A[A[A[A[A[A[A[A







  4%|▍         | 8/194 [00:13<05:03,  1.63s/it][A[A[A[A[A[A[A[A







  5%|▍         | 9/194 [00:14<05:01,  1.63s/it][A[A[A[A[A[A[A[A







  5%|▌         | 10/194 [00:16<05:00,  1.63s/it][A[A[A[A[A[A[A[A







  6%|▌         | 11/194 [00:17<04:58,  1.63s/it][A[A[A[A[A[A[A[A







  6%|▌         | 12/194 [00:19<04:57,  1.63s/it][

 53%|█████▎    | 102/194 [02:47<02:32,  1.65s/it][A[A[A[A[A[A[A[A







 53%|█████▎    | 103/194 [02:49<02:30,  1.65s/it][A[A[A[A[A[A[A[A







 54%|█████▎    | 104/194 [02:50<02:28,  1.65s/it][A[A[A[A[A[A[A[A







 54%|█████▍    | 105/194 [02:52<02:27,  1.65s/it][A[A[A[A[A[A[A[A







 55%|█████▍    | 106/194 [02:54<02:25,  1.65s/it][A[A[A[A[A[A[A[A







 55%|█████▌    | 107/194 [02:55<02:23,  1.65s/it][A[A[A[A[A[A[A[A







 56%|█████▌    | 108/194 [02:57<02:21,  1.65s/it][A[A[A[A[A[A[A[A







 56%|█████▌    | 109/194 [02:59<02:20,  1.65s/it][A[A[A[A[A[A[A[A







 57%|█████▋    | 110/194 [03:00<02:18,  1.65s/it][A[A[A[A[A[A[A[A







 57%|█████▋    | 111/194 [03:02<02:16,  1.65s/it][A[A[A[A[A[A[A[A







 58%|█████▊    | 112/194 [03:04<02:15,  1.65s/it][A[A[A[A[A[A[A[A







 58%|█████▊    | 113/194 [03:05<02:13,  1.65s/it][A[A[A[A[A[A[A[A







 59%|█████▉    | 114/194 [03

In [None]:
partition_df

In [None]:
partition_df.to_csv('2016_June_partition_prediction.csv')
ibm_partition_df.to_csv('2016_June_ibm_partition_prediction.csv')