In [64]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from semantic_text_similarity.models import WebBertSimilarity
import multiprocessing as mp
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
from scipy import spatial
import parmap
import os
import swifter
from tqdm import tqdm

In [65]:
# Topic names in index order: the position of a name in this list is the
# integer index it is stored under (0 through 20).
topics_index_to_name_map = dict(enumerate([
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]))

# Reverse lookup: topic name -> integer index.
topics_name_to_index_map = {
    name: index for index, name in topics_index_to_name_map.items()
}

def strip_short2(text):
    """Apply gensim's ``strip_short`` to ``text`` with ``minsize`` fixed at 4.

    Binding the argument here lets the function be used as a
    single-argument entry in the filter list of ``preprocess_text``.
    """
    minimum_length = 4
    return strip_short(text, minsize=minimum_length)


def preprocess_text(text):
    """Run ``text`` through a fixed chain of gensim preprocessing filters.

    The filters are handed to ``preprocess_string`` in the order listed
    below, and its result is returned as-is.
    """
    def lowercase(value):
        return value.lower()

    filter_chain = [
        lowercase,
        strip_multiple_whitespaces,
        strip_tags,
        strip_punctuation,
        strip_non_alphanum,
        strip_numeric,
        strip_short2,
    ]
    return preprocess_string(text, filter_chain)

def preprocess(topic):
    """Map a topic label to its integer index.

    Surrounding whitespace is ignored. When the label holds several topics
    separated by ``'|'``, only the first one is used.

    Args:
        topic: A topic name, or several names joined with ``'|'``.

    Returns:
        The value stored for the (first) topic name in
        ``topics_name_to_index_map``.

    Raises:
        KeyError: If the (first) topic name is not a key of
            ``topics_name_to_index_map``.
    """
    # str.split returns the whole string as the only element when there is no
    # '|', so a single lookup covers both the single- and multi-topic cases.
    primary_topic = topic.strip().split('|')[0].strip()
    return topics_name_to_index_map[primary_topic]

In [67]:
# Directory whose files are each read as one CSV of transcripts.
path = './data/bbc/2016/transcripts'
# os.listdir returns entries in arbitrary order. Sort them so the row order of
# the concatenated frame (and positional look-ups such as transcripts[3]
# further down) is the same on every run and every machine.
files = sorted(os.listdir(path))

# Read each file into its own frame, then stack them in one concat call.
data = []
for source in files:
    data.append(pd.read_csv(os.path.join(path, source)))
df = pd.concat(data)
# Remove the columns that are not used in the rest of the notebook.
df = df.drop(['Unnamed: 0', 'Has Transcript', 'Unavailable link', 'Unavailable reason'], axis=1)
# Each file carries its own 0..n index; replace it with one continuous index.
df = df.reset_index(drop=True)
df

Unnamed: 0,Source,Date,Program Name,Time,Duration,Transcript
0,BBC1 London,4-jun-2016,Joins BBC News,01:40,260 mins,we would have seen during this weekend and the...
1,BBC1 London,4-jun-2016,BBC Weekend News,12:00,15 mins,"Muhammad Ali, the boxing legend and giant of 2..."
2,BBC1 London,4-jun-2016,BBC Weekend News,18:30,10 mins,# just gimme the love just gimme the love... #...
3,BBC1 London,4-jun-2016,BBC Weekend News,22:50,20 mins,But BUT This BMT This mas his BUT This was his...
4,BBC1 London,2-jun-2016,Joins BBC News,00:20,340 mins,It looks as though the northwest seeing the be...
...,...,...,...,...,...,...
1502,BBC1 London,4-dec-2016,BBC Weekend News,22:00,20 mins,when Mark Duggan arrived at the scene at Ferry...
1503,BBC1 London,30-dec-2016,Joins BBC News,01:20,280 mins,"By midnight, it’s probably across northern Eng..."
1504,BBC1 London,30-dec-2016,BBC News,13:10,15 mins,Will it be happily ever after for the Mitchell...
1505,BBC1 London,30-dec-2016,BBC News,18:30,15 mins,Whoop-whoop-whoop. Whoop-whoop-whoop! Do you t...


In [68]:
def filter_short_sentence(sentences, min_words=10):
    """Return the sentences that contain at least ``min_words`` words.

    Words are counted by splitting each sentence on whitespace.

    Args:
        sentences: Iterable of sentence strings.
        min_words: Minimum number of whitespace-separated tokens a sentence
            must have to be kept. Defaults to 10, the cut-off that was
            previously hard-coded.

    Returns:
        A new list holding the qualifying sentences in their original order.
    """
    return [
        sentence for sentence in sentences
        if len(sentence.split()) >= min_words
    ]

def partition_transcript_into_topics(transcript, threshold=0.1):
    """Split a transcript into segments based on adjacent-sentence similarity.

    The transcript is sentence-tokenized with ``nltk.sent_tokenize`` and
    passed through ``filter_short_sentence``. Each remaining sentence is then
    scored against the sentence immediately before it using the module-level
    ``web_model``. A sentence whose scaled score reaches ``threshold`` joins
    the current segment; otherwise it starts a new one.

    Args:
        transcript: The transcript text to segment.
        threshold: Minimum scaled similarity (model score divided by 5) for a
            sentence to stay in the current segment. Defaults to 0.1, the
            value that was previously hard-coded.

    Returns:
        One string in which the sentences of a segment are joined by a single
        space and segments are separated by
        ``'\\n---------------------\\n'``. An empty string is returned when no
        sentence survives the short-sentence filter.
    """
    separator = '\n---------------------\n'

    sentences = filter_short_sentence(nltk.sent_tokenize(transcript))
    # Nothing left to segment (empty transcript or only short sentences).
    if not sentences:
        return ''

    clusters = []
    current_cluster = [sentences[0]]
    for previous, sentence in zip(sentences, sentences[1:]):
        # NOTE(review): dividing by 5 presumably rescales the model's score to
        # the 0-1 range; confirm against the model's documented output range.
        sim = web_model.predict([(sentence, previous)])[0] / 5

        if sim >= threshold:
            current_cluster.append(sentence)
        else:
            clusters.append(' '.join(current_cluster))
            current_cluster = [sentence]

    # Always flush the segment that is still open when the loop ends. Doing
    # this only inside the loop lost the last sentence whenever it started a
    # new segment, and lost everything for a one-sentence transcript.
    clusters.append(' '.join(current_cluster))
    return separator.join(clusters)

In [69]:
# Sentence-pair similarity model used by partition_transcript_into_topics.
# device='cpu' is passed explicitly because, per the original note, the model
# otherwise defaults to GPU prediction.
web_model = WebBertSimilarity(device='cpu', batch_size=10)

In [70]:
# Pull the 'Transcript' column out of the frame as an array for positional access.
transcripts = df['Transcript'].values

In [71]:
# Try the segmentation on a single transcript and print the result for inspection.
print(partition_transcript_into_topics(transcripts[3]))

But BUT This BMT This mas his BUT This was his sTuge, stage. But this was his stage, where the one and only Muhammad Ali reigns supreme. I’m of a real champion of the world, do you hear me? This man is a megastar, add some more mega Toure onto that. My only fault is that I don’t realise how great I really am. You don’t like me because my big mouth, that was only publicity. Time flies when you are in good company, don’t it? I think it was the greatest betrayal in Swedish history. Wallander concludes... Tributes are paid around the world to Muhammad Ali, who’s died at the age of 74. I’ll whip any man in the world and I want everybody out there on TV to know it.
---------------------
The way he moved - the speed, the grace, the power- I knew it was something magical. The Kentucky boy rose from humble beginnings to become three-time World Heavyweight Champion.
---------------------
Politically, he was controversial because of his conversion to Islam, his stand on civil rights and his refus

In [None]:
# Segment every transcript. The function expects one transcript string, so it
# must be applied to the 'Transcript' column; applying it to the whole
# DataFrame would hand it entire columns (Series) instead of strings.
df['partitioned_transcript'] = df['Transcript'].swifter.apply(partition_transcript_into_topics)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1507.0, style=ProgressStyle(descriptio…

In [None]:
# Display the frame to inspect the partitioned_transcript column.
df

In [None]:
# Write the frame, including the partitioned_transcript column, to a CSV file.
df.to_csv('./bert_partitions_2016.csv')