In [27]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing as mp
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
from scipy import spatial
import parmap
import os
import swifter

In [28]:
# Topic labels in class-index order: a label's position in the list is its
# integer class id (0 .. 20).
topics_index_to_name_map = dict(enumerate([
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]))
# Reverse lookup: topic label -> integer class id.
topics_name_to_index_map = {
    name: index for index, name in topics_index_to_name_map.items()
}

def strip_short2(text):
    """Apply gensim's ``strip_short`` to ``text`` with ``minsize`` fixed at 4.

    Exists so the fixed-size variant can be used as a one-argument filter.
    """
    min_token_length = 4
    return strip_short(text, minsize=min_token_length)


def preprocess_text(text):
    """Clean ``text`` with a fixed filter chain and return gensim's result.

    The filters are handed to ``preprocess_string`` in this order: lower-case,
    collapse whitespace, strip tags, strip punctuation, strip
    non-alphanumerics, strip numerics, then ``strip_short2``.
    """
    def to_lowercase(raw):
        return raw.lower()

    pipeline = [
        to_lowercase,
        strip_multiple_whitespaces,
        strip_tags,
        strip_punctuation,
        strip_non_alphanum,
        strip_numeric,
        strip_short2,
    ]
    return preprocess_string(text, pipeline)

def preprocess(topic):
    """Map a topic label to its integer class index.

    ``topic`` may hold several labels separated by ``'|'``; only the first one
    is used. Surrounding whitespace is ignored.

    Args:
        topic: Topic label string, optionally ``'|'``-separated.

    Returns:
        The index stored in ``topics_name_to_index_map`` for the first label.

    Raises:
        KeyError: If the first label is not a key of ``topics_name_to_index_map``.
    """
    # Splitting on '|' also covers the single-label case: the split then yields
    # one element, so no separate branch is needed.
    primary_label = topic.split('|')[0].strip()
    return topics_name_to_index_map[primary_label]

In [29]:
path = './data/bbc/2014/transcripts'
# os.listdir returns entries in arbitrary order, so sort them: the row order of
# `df` (and everything derived from it) is then the same on every run.
# Sub-directories and hidden entries are skipped because read_csv would fail
# on them.
files = sorted(
    name for name in os.listdir(path)
    if not name.startswith('.') and os.path.isfile(os.path.join(path, name))
)

# One frame per source file, stacked row-wise. Each file keeps its own row
# labels, so the index of `df` contains duplicates until it is reset.
data = [pd.read_csv(os.path.join(path, source)) for source in files]
df = pd.concat(data)
# Drop the bookkeeping columns that are not used further down.
df = df.drop(['Unnamed: 0', 'Has Transcript', 'Unavailable link', 'Unavailable reason'], axis=1)
df

Unnamed: 0,Source,Date,Program Name,Time,Duration,Transcript
0,BBC1 London,6-dec-2014,BBC News,22:00,20 mins,"#Ah... # Dreaming ofthe days # APPLAUSE Hiya, ..."
1,BBC1 London,7-dec-2014,BBC News,22:00,20 mins,Alex Salmond unveils plans to seek a Westminst...
2,BBC1 London,7-dec-2014,BBC London News,22:20,10 mins,against the illegal trade in wildlife. He’s ve...
3,BBC1 London,5-dec-2014,BBC News at One,13:00,30 mins,# The way I love you. # Making Christmas speci...
4,BBC1 London,5-dec-2014,BBC News at Ten,22:00,25 mins,Two British men are jailed for travelling to S...
...,...,...,...,...,...,...
107,BBC1 London,26-jun-2014,BBC News at One,13:00,30 mins,"Hello, Glastonbury. Hello, Glastonbury! Hello,..."
108,BBC1 London,26-jun-2014,BBC London News,13:30,15 mins,A little bit more cloud around but some of us ...
109,BBC1 London,26-jun-2014,BBC News at Six,19:10,25 mins,Yet more shocking re-lations about the extent ...
110,BBC1 London,26-jun-2014,BBC News at Ten,22:00,30 mins,The most detailed picture yet ofthe The most d...


In [30]:
# Raw transcript texts as a NumPy array, one entry per row of `df`.
transcripts = df.loc[:, 'Transcript'].values

In [31]:
# Sentence-tokenise every transcript: one list of sentence strings per transcript.
sentences = list(map(nltk.sent_tokenize, transcripts))

In [32]:
# Flatten the per-transcript sentence lists into one flat list, keeping order.
X = []
for transcript_sentences in sentences:
    X.extend(transcript_sentences)

In [7]:
# Tokenise every sentence via parmap, with a progress bar.
preprocessed_X = parmap.map(preprocess_text, X, pm_pbar=True)
# Wrap each token list in a TaggedDocument whose single tag is the sentence's
# position in `preprocessed_X`.
tagged_X = []
for sentence_id, tokens in enumerate(preprocessed_X):
    tagged_X.append(TaggedDocument(words=tokens, tags=[sentence_id]))

362496it [00:01, 225434.20it/s]           


In [35]:
# Spot-check the tokens produced for the sentence at index 4.
preprocessed_X[4]

['yeah',
 'your',
 'smiling',
 'face',
 'like',
 'never',
 'seen',
 'before',
 'wonderful']

In [171]:
print('training doc2vec')
# Hyper-parameters are collected in one place so they are easy to spot and tweak.
doc2vec_params = dict(
    vector_size=100,
    window=3,
    workers=mp.cpu_count(),
    epochs=40,
)
doc2vec_model = Doc2Vec(**doc2vec_params)
# The vocabulary has to be built before training; build_vocab also sets
# corpus_count, which train() is given as total_examples.
doc2vec_model.build_vocab(tagged_X)
doc2vec_model.train(
    tagged_X,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)

training doc2vec


In [9]:
# Create the target directory first: on a fresh checkout it does not exist and
# the save would fail.
os.makedirs('./models/doc2vec', exist_ok=True)
doc2vec_model.save('./models/doc2vec/sent2vec')

In [8]:
# Reload the Doc2Vec model from the same path the save cell writes to, so the
# cells below can run without retraining.
doc2vec_model = Doc2Vec.load('./models/doc2vec/sent2vec')

In [85]:
from sklearn.metrics.pairwise import cosine_similarity

def filter_short_sentence(sentences, min_words=10):
    """Keep only the sentences that have at least ``min_words`` words.

    Words are counted by splitting on whitespace. Order is preserved.

    Args:
        sentences: Sequence of sentence strings.
        min_words: Minimum number of whitespace-separated words a sentence
            needs in order to be kept. Defaults to 10.

    Returns:
        A new list with the sentences that are long enough.
    """
    return [sentence for sentence in sentences if len(sentence.split()) >= min_words]

def partition_transcript_into_topics(transcript, threshold=0.4):
    """Split a transcript into runs of consecutive, similar sentences.

    The transcript is sentence-tokenised and passed through
    ``filter_short_sentence``. Each remaining sentence is embedded with the
    global ``doc2vec_model``. A sentence joins the current cluster when its
    mean cosine similarity to the sentences already in that cluster is at
    least ``threshold``; otherwise the current cluster is closed and the
    sentence starts a new one.

    Args:
        transcript: Raw transcript text.
        threshold: Minimum mean cosine similarity for a sentence to stay in
            the current cluster. Defaults to 0.4.

    Returns:
        The clusters (sentences joined by a space) joined by a dashed
        separator line, or ``''`` when no sentence survives the length filter.
    """
    sentences = filter_short_sentence(nltk.sent_tokenize(transcript))
    if not sentences:
        # Nothing to cluster; indexing sentences[0] below would raise.
        return ''

    def embed(sentence):
        return doc2vec_model.infer_vector(preprocess_text(sentence))

    clusters = []
    current_sentences = [sentences[0]]
    # Vectors of the current cluster are kept alongside its sentences so each
    # sentence is embedded once rather than on every comparison.
    current_vectors = [embed(sentences[0])]

    for sentence in sentences[1:]:
        vector = embed(sentence)
        similarities = cosine_similarity([vector], current_vectors)[0]
        avg_sim = sum(similarities) / len(similarities)
        if avg_sim >= threshold:
            current_sentences.append(sentence)
            current_vectors.append(vector)
        else:
            clusters.append(' '.join(current_sentences))
            current_sentences = [sentence]
            current_vectors = [vector]

    # Always flush the open cluster. This covers a transcript with a single
    # sentence and a last sentence that started a new cluster.
    clusters.append(' '.join(current_sentences))
    return '\n---------------------\n'.join(clusters)

In [16]:
# Replace the duplicated per-file row labels with a fresh 0..n-1 index; the old
# labels are kept in an 'index' column. The guard makes the cell safe to
# re-run: an unguarded second run would add a 'level_0' column and a third
# would raise.
if 'index' not in df.columns:
    df.reset_index(inplace=True)

In [337]:
def _partition_row(row):
    """Partition the 'Transcript' field of one DataFrame row."""
    return partition_transcript_into_topics(row['Transcript'])


# Row-wise apply through swifter; the result is stored in a new 'partitions' column.
df['partitions'] = df.swifter.apply(_partition_row, axis=1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1243.0, style=ProgressStyle(descriptio…

215
0.8310677260160446
0.5629817545413971
0.59916819135348
0.7156999688595533
0.5207093566656112
0.6719989522049824
0.9847413514341626
0.9916586196050048
0.539669543504715
0.6362369507551193
0.6660276452700297
0.6056173890829086
0.7701626002788544
0.8159753341848651
0.7166747248598507
0.6941114515066147
0.8103604879644182
0.5169069945812226
0.7502360083162785
0.8510854677297175
0.91628955795358
0.5543610762272563
0.8946740665783485
0.6917838179506361
0.9200764732325778
1.0581592693924904
0.8327228128910065
0.841384906321764
0.8296404716869196
0.8041429165750742
0.8663453623652458
0.9507665280252695
0.8918930366635323
0.9188507534563541
0.7822711020708084
0.712328914552927
0.936956737563014
0.9624238833785057
0.9137035235762596
0.9162227362394333
0.9467099184791247
1.0225630272179842
0.9971644734032452
0.9894961575046182
0.606303483247757
0.8012752085924149
1.034554875145356
1.0164541564881802
0.9638351052999496
1.2204315811395645
1.0877333208918571
1.000034620617953
1.0925231128931046


0.8474755808711052
0.8674654811620712
0.8081858058770498
0.8762097986681121
0.8433472961187363
0.9330228798919253
0.7980816662311554
0.89095763489604
1.111539401113987
1.0869233831763268
1.0844817608594894
0.754339873790741
0.9229605011641979
1.001172777498141
0.33513981103897095
0.48075278103351593
0.9622223650415739
0.9293306022882462
0.8230206072330475
0.8309323042631149
0.8603624117871126
0.722078487277031
0.6819855332374573
0.7484746476014456
0.7709993890353611
0.8122266306891106
0.7802484755714735
0.7153907975181937
0.7569124468348243
0.7930394030020883
0.7682915083490885
0.8049772618604558


KeyboardInterrupt: 

In [81]:
# Show the raw text of the transcript at index 2.
transcripts[2]

'against the illegal trade in wildlife. He’s very passionate about that. At the World Bank, he will be making a speech which we are told will significant. A critical visit in a country which is of such critical importance to the United Kingdom. Time now for some sport. Premier League football on the way but we start with rugby union where there was a big upset in the European champions cup as Leicester secured a famous victory over the defending champions Toulon. Leicester are not often underdogs but against Toulon, very few are able to complete with rugby’s Galactica ‘s. What they cannot defend is errors. Although Bryan Habana showed why he is the brightest star in the Toulon constellation. The French side had no answer to the boot of Owain Williams as Leicester gave Welford Road a European night to remember. Earlier, Chris Robshaw marked his 200th harlequins appearance in the style that we earlier saw for England. This was the first time harlequins had met Leinster since the informan

In [82]:
# Partition a single transcript (index 2) as a quick manual check.
x = partition_transcript_into_topics(transcripts[2])

In [83]:
# print() rather than a bare expression so the newline-separated clusters are
# rendered on separate lines instead of as an escaped repr.
print(x)

At the World Bank, he will be making a speech which we are told will significant. A critical visit in a country which is of such critical importance to the United Kingdom. Premier League football on the way but we start with rugby union where there was a big upset in the European champions cup as Leicester secured a famous victory over the defending champions Toulon. Leicester are not often underdogs but against Toulon, very few are able to complete with rugby’s Galactica ‘s.
---------------------
Although Bryan Habana showed why he is the brightest star in the Toulon constellation.
---------------------
The French side had no answer to the boot of Owain Williams as Leicester gave Welford Road a European night to remember.
---------------------
Earlier, Chris Robshaw marked his 200th harlequins appearance in the style that we earlier saw for England. This was the first time harlequins had met Leinster since the informants game where the use of fake blood rocked rugby.
-----------------

In [56]:
# Write the frame to CSV in the working directory. The row index is written as
# well (pandas default), so reading the file back without `index_col` yields an
# extra unnamed first column.
df.to_csv('./partitioned_transcripts.csv')

In [62]:
def build_network(input_dim=100, n_classes=21):
    """Build and compile the dense topic classifier.

    Architecture: input of size ``input_dim`` -> Dense(32, ReLU, L2 penalty
    0.1) -> Dense(``n_classes``, softmax). Compiled with Adam, categorical
    cross-entropy on probabilities (``from_logits=False``) and an accuracy
    metric.

    Args:
        input_dim: Length of the input feature vector. Defaults to 100.
        n_classes: Number of output classes. Defaults to 21.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    model = tf.keras.models.Sequential([
        # `shape` must be a tuple. `(100)` without a trailing comma is just the
        # int 100, which not every Keras release accepts.
        tf.keras.Input(shape=(input_dim,)),
        # The L2 factor is passed positionally because the keyword name differs
        # between Keras releases (`l` in older ones, `l2` in newer ones).
        tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.1)),
        tf.keras.layers.Dense(n_classes, activation='softmax')
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0, name='categorical_crossentropy')

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=['accuracy'])

    return model

def predict(X, topn=3):
    """Return the ``topn`` most probable topics for a piece of text.

    The text is tokenised with ``preprocess_text``, embedded with the global
    ``doc2vec_model`` and scored by the global ``classifier``.

    Args:
        X: Raw text to classify.
        topn: Number of (topic name, probability) pairs to return.

    Returns:
        A list of ``(topic name, probability)`` tuples, highest probability first.
    """
    tokens = preprocess_text(X)
    embedding = doc2vec_model.infer_vector(tokens)
    # The classifier expects a batch, so turn the single vector into one row.
    batch = embedding.reshape(1, -1)
    probabilities = classifier.predict(batch)[0]

    scored_topics = []
    for topic_index, probability in enumerate(probabilities):
        scored_topics.append((topics_index_to_name_map[topic_index], probability))
    scored_topics.sort(key=lambda pair: pair[1], reverse=True)
    return scored_topics[:topn]

In [63]:
# NOTE(review): this rebinds the global `doc2vec_model`, which earlier cells
# loaded from './models/doc2vec/sent2vec'. Functions that read that global
# (partition_transcript_into_topics, predict) use this model from here on.
doc2vec_model = Doc2Vec.load('./models/doc2vec/doc2vec_12_13')
# Rebuild the network and restore its weights from the given path.
classifier = build_network()
classifier.load_weights('./models/doc2vec/classifier_12_13')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f469b1cbd50>

In [84]:
# Try the classifier on a single sentence; returns the top three topics with their scores.
predict('That is cleared by Williams, only as far as Song.')

[('Parliament, government and politics', 0.40107185),
 ('Others', 0.11620629),
 ('Culture, media and sport', 0.064145476)]