In [102]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
import pandas as pd
from pathlib import Path
import calendar
import ast
import os
import nltk

In [103]:
# Stemmed bigram patterns, written as "<stem>.<stem>", that contains_keywords()
# looks for in a transcript.
keywords = ['control.border', 'control.immigr', '350.million']
# NOTE(review): WebBertSimilarity is not imported in any cell shown here, so this
# line fails with NameError on a fresh kernel unless the import lives elsewhere —
# confirm. web_model is also not referenced by the cells visible in this notebook.
web_model = WebBertSimilarity(device='cpu', batch_size=10) #defaults to GPU prediction

# Stopword sets keyed by language, filled on first use so the corpus is not
# re-read for every transcript when this runs inside a DataFrame.apply.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def contains_keywords(transcript):
    """Report whether a transcript contains any of the module-level ``keywords``.

    The text is lower-cased, tokenized, stripped of English stopwords and
    single-character tokens, and Porter-stemmed. Adjacent stems are then joined
    as ``"<stem>.<stem>"`` bigrams and compared against ``keywords``.

    Parameters
    ----------
    transcript : str
        Raw transcript text. Any non-string value (for example the NaN pandas
        uses for a missing cell) is treated as having no keywords.

    Returns
    -------
    bool
        True if at least one entry of ``keywords`` occurs as a stemmed bigram.
    """
    # Missing transcripts arrive as float NaN from pandas; they cannot match.
    if not isinstance(transcript, str):
        return False

    stop_words = _english_stopwords()
    ps = PorterStemmer()

    tokens = word_tokenize(transcript.lower())
    # Drop stopwords and single characters before stemming, as before.
    stems = [ps.stem(token) for token in tokens
             if token not in stop_words and len(token) > 1]

    # A set gives constant-time lookups for each keyword.
    phrases = {first + '.' + second for first, second in nltk.bigrams(stems)}
    return any(keyword in phrases for keyword in keywords)

def get_transcripts(bbc_id, year, start_month, end_month):
    """Load and concatenate the transcript CSVs for a range of months.

    Files are read from ``./data/bbc/<bbc_id>/<year>/transcripts``. A file is
    selected when the last whitespace-separated word before the first ``-`` in
    its name is a lower-cased month abbreviation within the requested range.

    Parameters
    ----------
    bbc_id, year
        Values substituted into the directory path.
    start_month, end_month : int
        Inclusive month numbers used to index ``calendar.month_abbr``.

    Returns
    -------
    pandas.DataFrame
        The selected files concatenated in sorted filename order, without the
        ``Unnamed: 0`` column when it is present. Each file keeps its own row
        index, so index values may repeat across files.

    Raises
    ------
    ValueError
        If no file in the directory matches the requested months.
    """
    print('preparing data!!')
    path = './data/bbc/{}/{}/transcripts'.format(bbc_id, year)
    months = {calendar.month_abbr[i].lower() for i in range(start_month, end_month+1)}

    def _month_token(file_name):
        # Names with nothing before the first '-' have no month token at all.
        words = file_name.split('-')[0].split()
        return words[-1] if words else None

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible between runs and machines.
    transcript_files = sorted(tf for tf in os.listdir(path) if _month_token(tf) in months)
    if not transcript_files:
        raise ValueError(
            'No transcript files found in {} for months {}-{}'.format(path, start_month, end_month))

    dataframes = [pd.read_csv(os.path.join(path, tf)) for tf in transcript_files]
    df = pd.concat(dataframes)
    # The column only exists when the CSVs were saved with their index.
    return df.drop(columns=['Unnamed: 0'], errors='ignore')

def match_partition_id_with_transcript(Date, Transcript):
    """Flag whether a transcript's matching partition has EU keyword segments.

    Looks at the rows of the module-level ``partition_df`` for ``Date`` and
    picks the first partition id with a segment whose text occurs inside
    ``Transcript``. Returns 1 when that partition holds at least one row with
    ``has_keywords`` set and topic ``'European_Union'``, otherwise 0 (including
    when no partition matches).
    """
    same_day = partition_df.loc[partition_df.date == Date]

    def _first_matching_partition():
        # Scan partitions in order of first appearance; stop at the first hit.
        for candidate in same_day.partition_id.unique():
            segments = same_day.loc[same_day.partition_id == candidate]
            for _, segment in segments.iterrows():
                if segment['transcript'] in Transcript:
                    return candidate
        return None

    matched_id = _first_matching_partition()
    # When nothing matched, comparing against None selects no rows.
    matched_rows = same_day.loc[same_day.partition_id == matched_id]
    european_hits = matched_rows.loc[
        (matched_rows.has_keywords == True) & (matched_rows.topic == 'European_Union')
    ]
    return 1 if len(european_hits) > 0 else 0
        

# Config

In [104]:
# Run parameters shared by the cells in this notebook.
# bbc_id and year are substituted into the transcript directory path and into
# the partition-prediction CSV filename.
bbc_id = 54
year = 2016
# Inclusive month range (as month numbers) passed to get_transcripts() and
# used in the output CSV filename.
month_start = 7
month_end = 12
# Suffix substituted into the partition-prediction CSV filename.
# NOTE(review): presumably names the classifier variant that excludes the
# "Others" class — confirm.
excluding = 'no_Others'

In [105]:
# Load the per-partition topic predictions for the configured channel and year.
partition_csv = './data/partition_predictions/topics_pred_on_bert_partitioned_bbc_{}_{}_with_news_classifier_{}.csv'.format(bbc_id, year, excluding)
partition_df = pd.read_csv(partition_csv).drop(['Unnamed: 0'], axis=1)


def _top_topic_label(raw_topic):
    """Parse a stored topic literal and return its first label, underscore-joined."""
    parsed = ast.literal_eval(raw_topic)
    label = parsed[0][0]
    # Strip commas and replace single spaces with underscores.
    return "_".join(label.replace(',', '').split(" "))


partition_df['topic'] = partition_df['topic'].apply(_top_topic_label)
partition_df['has_keywords'] = partition_df['transcript'].apply(contains_keywords)
# Segment length in whitespace-separated words.
partition_df['length'] = partition_df['transcript'].apply(lambda text: len(text.split()))
partition_df

Unnamed: 0,partition_id,date,transcript,topic,has_keywords,length
0,0,4-jun-2016,we would have seen during this weekend and the...,Culture_media_and_sport,False,44
1,0,4-jun-2016,manage that and make sure we stay within the g...,Culture_media_and_sport,False,14
2,0,4-jun-2016,We start at the desk where you have the three ...,Culture_media_and_sport,False,83
3,0,4-jun-2016,It helps to correct this atmosphere which is v...,Culture_media_and_sport,False,65
4,0,4-jun-2016,Twitter’s live streaming video service announc...,Culture_media_and_sport,False,74
...,...,...,...,...,...,...
102489,1506,30-dec-2016,"The former Bishop of Liverpool, JamesJones, ch...",Parliament_government_and_politics,False,80
102490,1506,30-dec-2016,Southern rail passengers have been warned that...,Transport,False,177
102491,1506,30-dec-2016,"So on the eve of the new year, | make this cha...",Culture_media_and_sport,False,50
102492,1506,30-dec-2016,"It too says it’s willing to talk but, once aga...",Transport,False,145


In [None]:
# Load the transcripts for the configured month range and flag each one that
# contains any of the keyword bigrams.
transcript_df = get_transcripts(bbc_id, year, start_month=month_start, end_month=month_end)
transcript_df['has_keywords'] = transcript_df['Transcript'].apply(contains_keywords)
transcript_df

preparing data!!


In [83]:
# Keep only the transcripts flagged as containing keywords, renumbering rows.
keyword_mask = transcript_df.has_keywords == True
transcript_df_with_keywords = transcript_df.loc[keyword_mask].reset_index(drop=True)

# For each kept transcript, record whether its matching partition has
# keyword-bearing segments with topic European_Union (1) or not (0).
transcript_df_with_keywords['keywords_in_european'] = transcript_df_with_keywords.apply(
    lambda record: match_partition_id_with_transcript(record['Date'], record['Transcript']),
    axis=1,
)
transcript_df_with_keywords

Unnamed: 0,Source,Date,Program Name,Time,Duration,Has Transcript,Transcript,Unavailable link,Unavailable reason,has_keywords
0,BBC1 London,7-aug-2016,BBC Weekend News,12:45,15 mins,True,Good afternoon. People living in areas affecte...,,,True
1,BBC1 London,7-aug-2016,BBC Weekend News,18:00,20 mins,True,Russia is banned from next month’s Paralympic ...,,,True
2,BBC1 London,4-aug-2016,Joins BBC News,01:10,290 mins,True,and that’s been sitting across the country. Fa...,,,True
3,BBC1 London,5-aug-2016,Joins BBC News,00:05,355 mins,True,and temperatures still managing to get up to a...,,,True
4,BBC1 London,25-aug-2016,BBC News at One,13:00,30 mins,True,The death toll in the Italian earthquake rises...,,,True
...,...,...,...,...,...,...,...,...,...,...
69,BBC News 24,20-dec-2016,BBC News at Ten,22:00,30 mins,True,Police in Berlin say the driver of the lorry i...,,,True
70,BBC1 London,21-dec-2016,Joins BBC News,00:50,310 mins,True,light which is unconscionable. Every light whi...,,,True
71,BBC1 London,22-dec-2016,Joins BBC News,01:25,275 mins,True,he continued to attack the remaining giant sno...,,,True
72,BBC1 London,23-dec-2016,Joins BBC News,00:50,310 mins,True,I like young men. I like their company. I want...,,,True


In [77]:
# Write the keyword-flagged transcripts to the working directory; the file
# name encodes the channel id, year and the month range.
first_month_abbr = calendar.month_abbr[month_start].lower()
last_month_abbr = calendar.month_abbr[month_end].lower()
output_csv = 'bbc_transcripts_with_keywords_{}_{}_{}-{}.csv'.format(
    bbc_id, year, first_month_abbr, last_month_abbr)
transcript_df_with_keywords.to_csv(output_csv)