In [88]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing as mp
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import parmap
import swifter

In [89]:
# Topic labels in class-index order: a label's position in this list is the
# integer class index it is mapped to.
topics_index_to_name_map = dict(enumerate([
    'Agriculture, animals, food and rural affairs',  # 0
    'Asylum, immigration and nationality',           # 1
    'Business, industry and consumers',              # 2
    'Communities and families',                      # 3
    'Crime, civil law, justice and rights',          # 4
    'Culture, media and sport',                      # 5
    'Defence',                                       # 6
    'Economy and finance',                           # 7
    'Education',                                     # 8
    'Employment and training',                       # 9
    'Energy and environment',                        # 10
    'European Union',                                # 11
    'Health services and medicine',                  # 12
    'Housing and planning',                          # 13
    'International affairs',                         # 14
    'Parliament, government and politics',           # 15
    'Science and technology',                        # 16
    'Social security and pensions',                  # 17
    'Social services',                               # 18
    'Transport',                                     # 19
    'Others',                                        # 20
]))

# Reverse lookup: topic label -> class index.
topics_name_to_index_map = dict(
    zip(topics_index_to_name_map.values(), topics_index_to_name_map.keys())
)

def strip_short2(text):
    """Apply gensim's ``strip_short`` to ``text`` with ``minsize=4``.

    Exists so the 4-character threshold can be used as a single-argument
    filter in a ``preprocess_string`` filter list.
    """
    min_word_length = 4
    return strip_short(text, minsize=min_word_length)


def preprocess_text(text):
    """Tokenise ``text`` with the cleaning pipeline used for Doc2Vec inference.

    The filters run in order: lowercase, collapse repeated whitespace, strip
    tags, strip punctuation, strip non-alphanumeric characters, strip digits,
    and finally drop short words via ``strip_short2``. No stemming or
    stop-word removal is applied.

    Returns whatever ``preprocess_string`` returns for these filters
    (a list of tokens).
    """
    cleaning_steps = (
        str.lower,
        strip_multiple_whitespaces,
        strip_tags,
        strip_punctuation,
        strip_non_alphanum,
        strip_numeric,
        strip_short2,
    )
    return preprocess_string(text, cleaning_steps)

def preprocess(topic):
    """Convert a raw topic label into its integer class index.

    ``topic`` may contain several labels separated by ``|``; only the first
    label is used. Surrounding whitespace is ignored.

    Args:
        topic: Topic label string, e.g. ``'Defence'`` or ``'Defence | Education'``.

    Returns:
        The index stored for the (first) label in ``topics_name_to_index_map``.

    Raises:
        KeyError: If the (first) label is not a key of ``topics_name_to_index_map``.
    """
    # Splitting unconditionally covers both cases: without a '|' the whole
    # string is the first (and only) piece.
    primary_topic = topic.split('|', 1)[0].strip()
    return topics_name_to_index_map[primary_topic]

In [90]:
# Read the transcript table and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv - TODO confirm).
transcripts = pd.read_csv('./data/bert_partitions_2016.csv').drop(columns=['Unnamed: 0'])

In [91]:
# Display the loaded frame to check its columns and row count.
transcripts

Unnamed: 0,Source,Date,Program Name,Time,Duration,Transcript
0,BBC1 London,4-jun-2016,Joins BBC News,01:40,260 mins,we would have seen during this weekend and the...
1,BBC1 London,4-jun-2016,BBC Weekend News,12:00,15 mins,"Muhammad Ali, the boxing legend and giant of 2..."
2,BBC1 London,4-jun-2016,BBC Weekend News,18:30,10 mins,# just gimme the love just gimme the love... #...
3,BBC1 London,4-jun-2016,BBC Weekend News,22:50,20 mins,But BUT This BMT This mas his BUT This was his...
4,BBC1 London,2-jun-2016,Joins BBC News,00:20,340 mins,It looks as though the northwest seeing the be...
...,...,...,...,...,...,...
1502,BBC1 London,4-dec-2016,BBC Weekend News,22:00,20 mins,when Mark Duggan arrived at the scene at Ferry...
1503,BBC1 London,30-dec-2016,Joins BBC News,01:20,280 mins,"By midnight, it’s probably across northern Eng..."
1504,BBC1 London,30-dec-2016,BBC News,13:10,15 mins,Will it be happily ever after for the Mitchell...
1505,BBC1 London,30-dec-2016,BBC News,18:30,15 mins,Whoop-whoop-whoop. Whoop-whoop-whoop! Do you t...


In [92]:
# Column expected to hold each transcript's pre-segmented text, and the
# divider placed between consecutive segments inside that text.
PARTITION_COLUMN = 'partitioned_transcript'
PARTITION_SEPARATOR = '\n---------------------\n'

# Fail early with an actionable message instead of a bare KeyError from
# deep inside the loop when the loaded CSV lacks the partition column.
if PARTITION_COLUMN not in transcripts.columns:
    raise KeyError(
        "{!r} column not found in `transcripts`; available columns: {}. "
        "Load a CSV that contains the partitioned transcripts before running this cell."
        .format(PARTITION_COLUMN, list(transcripts.columns))
    )

# One (transcript row index, broadcast date, segment text) tuple per segment.
# Iterating the columns directly avoids building a Series per row (iterrows).
partitions = [
    (row_index, broadcast_date, segment)
    for row_index, broadcast_date, partitioned_text in zip(
        transcripts.index, transcripts['Date'], transcripts[PARTITION_COLUMN]
    )
    for segment in partitioned_text.split(PARTITION_SEPARATOR)
]

KeyError: 'partitioned_transcript'

In [77]:
# Spot-check one entry: a (transcript row index, date, partition text) tuple.
partitions[110]

(3,
 '5-dec-2014',
 'It is late but it is far from stoppage time in terms of Fifa and Canada soccer having plenty of time to do the right thing.')

In [78]:
def predict(X, topn=3):
    """Return the ``topn`` most probable topics for a piece of text.

    Relies on the module-level ``doc2vec_model`` and ``classifier`` objects,
    which must be defined before this function is called.

    Args:
        X: Raw text to classify.
        topn: Number of leading (topic, probability) pairs to keep.

    Returns:
        A list of ``(topic name, probability)`` tuples ordered from most to
        least probable, truncated to ``topn`` entries.

    NOTE(review): results do not appear to be reproducible between calls -
    the same text scored 0.720 and 0.749 for its top class in two cells of
    this notebook. Presumably this comes from ``infer_vector``; confirm.
    """
    tokens = preprocess_text(X)
    embedding = doc2vec_model.infer_vector(tokens)

    # The classifier expects a batch, so wrap the single vector as one row.
    single_row_batch = embedding.reshape(1, -1)
    class_probabilities = classifier.predict(single_row_batch)[0]

    scored_topics = [
        (topics_index_to_name_map[class_index], probability)
        for class_index, probability in enumerate(class_probabilities)
    ]
    scored_topics.sort(key=lambda pair: pair[1], reverse=True)
    return scored_topics[:topn]
    

In [79]:
def build_network(input_dim=100, hidden_units=32, num_classes=21, l2_strength=0.1):
    """Build and compile the topic classifier that sits on top of document vectors.

    The defaults reproduce the original fixed architecture, so calling
    ``build_network()`` with no arguments yields a model compatible with
    weights saved from it.

    Args:
        input_dim: Length of each input vector (presumably the Doc2Vec
            vector size - TODO confirm against the saved model).
        hidden_units: Width of the single hidden ReLU layer.
        num_classes: Number of softmax outputs; 21 matches the entries of
            ``topics_index_to_name_map``.
        l2_strength: L2 penalty factor applied to the hidden layer's kernel.

    Returns:
        A compiled ``tf.keras`` Sequential model (Adam optimiser, categorical
        cross-entropy loss, accuracy metric).
    """
    model = tf.keras.models.Sequential([
        # The shape must be a tuple: ``(100)`` is just the int 100, which some
        # Keras versions tolerate and others reject.
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(
            hidden_units,
            activation='relu',
            # Passed positionally because the keyword name of this argument
            # differs between Keras releases (`l` vs `l2`).
            kernel_regularizer=tf.keras.regularizers.l2(l2_strength),
        ),
        tf.keras.layers.Dense(num_classes, activation='softmax'),
    ])

    # The final layer already applies softmax, hence from_logits=False.
    loss_fn = tf.keras.losses.CategoricalCrossentropy(
        from_logits=False, label_smoothing=0, name='categorical_crossentropy'
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=loss_fn,
        metrics=['accuracy'],
    )

    return model

In [80]:
# Load the saved Doc2Vec model used by predict() to embed text.
doc2vec_model = Doc2Vec.load('models/doc2vec/doc2vec_news')
# Recreate the classifier architecture, then restore its saved weights.
# The weights are only compatible with the layer layout built by build_network().
classifier = build_network()
classifier.load_weights('./models/doc2vec/doc2vec_news_classifier')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f8637768d90>

In [81]:
# One row per partition. Note that 'partition_id' holds the index of the source
# transcript row, so every partition of the same transcript shares one value.
partition_df = pd.DataFrame(partitions, columns=['partition_id', 'date', 'transcript'])

In [82]:
# Display the partition table to check its shape and contents.
partition_df

Unnamed: 0,partition_id,date,transcript
0,0,6-dec-2014,"# Oh, yeah # I see your smiling face # Like I’..."
1,0,6-dec-2014,Two hostages are killed in Yemen during a fail...
2,0,6-dec-2014,"Our hearts are full of sorrow tonight, our pra..."
3,0,6-dec-2014,Ferocious winds and torrential rain in the eas...
4,0,6-dec-2014,"President Obama has condemned as ""barbaric"" th..."
...,...,...,...
38239,1242,26-jun-2014,It is what is on the table at the moment. It i...
38240,1242,26-jun-2014,"Tonight, we are in Wolverhampton, and welcome ..."
38241,1242,26-jun-2014,"Conservative Defence Minister Anna Soubry, Lab..."
38242,1242,26-jun-2014,"But these can be close calls, and with hindsig..."


In [83]:
# NOTE(review): this rebinds `transcripts` from the DataFrame loaded earlier to
# an array of partition texts, so the partition-building cell cannot be re-run
# afterwards without reloading the CSV. Consider a distinct name.
transcripts = partition_df.transcript.values
# Sanity-check the classifier on a single partition.
predict(transcripts[4])

[('Defence', 0.72018236),
 ('International affairs', 0.12466005),
 ('Crime, civil law, justice and rights', 0.07066607)]

In [84]:
# Show the text of the partition that was just classified.
transcripts[4]

'President Obama has condemned as "barbaric" the killing of two hostages in Yemen by al-Qaeda militants during a failed rescue attempt by US special forces. Mr Obama had authorised the operation after concluding that the life of British-born American journalist Luke Somers was in "imminent danger." It’s understood that prior to the raid there’d been an agreement that his fellow captive, South African aid worker Pierre Korkie, would be released.'

In [85]:
# Classify every partition: each 'topic' cell receives the list of
# (topic name, probability) pairs returned by predict().
# NOTE(review): predict() runs once per row, so the classifier is called with
# a single-row batch each time; batching the vectors would likely be faster.
partition_df['topic'] = partition_df.swifter.apply(lambda x: predict(x['transcript']), axis=1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=38244.0, style=ProgressStyle(descripti…




In [86]:
# Display the table again, now with the predicted 'topic' column.
partition_df

Unnamed: 0,partition_id,date,transcript,topic
0,0,6-dec-2014,"# Oh, yeah # I see your smiling face # Like I’...","[(Others, 0.50809795), (Parliament, government..."
1,0,6-dec-2014,Two hostages are killed in Yemen during a fail...,"[(Parliament, government and politics, 0.26035..."
2,0,6-dec-2014,"Our hearts are full of sorrow tonight, our pra...","[(Parliament, government and politics, 0.51417..."
3,0,6-dec-2014,Ferocious winds and torrential rain in the eas...,"[(Culture, media and sport, 0.4472414), (Parli..."
4,0,6-dec-2014,"President Obama has condemned as ""barbaric"" th...","[(Defence, 0.74868727), (International affairs..."
...,...,...,...,...
38239,1242,26-jun-2014,It is what is on the table at the moment. It i...,"[(Culture, media and sport, 0.81954867), (Parl..."
38240,1242,26-jun-2014,"Tonight, we are in Wolverhampton, and welcome ...","[(Parliament, government and politics, 0.44672..."
38241,1242,26-jun-2014,"Conservative Defence Minister Anna Soubry, Lab...","[(Crime, civil law, justice and rights, 0.8296..."
38242,1242,26-jun-2014,"But these can be close calls, and with hindsig...","[(Parliament, government and politics, 0.73541..."


In [87]:
# Persist the predictions. The row index is written as an unnamed first column,
# and the 'topic' lists are presumably stored as their string representation,
# so readers of this file need to parse them back - verify with consumers.
partition_df.to_csv('topics_pred_on_bert_partitioned_bbc_with_news_classifier.csv')