In [33]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
import pandas as pd
from pathlib import Path
import calendar
import ast
import os
import nltk

In [41]:
# Target phrases, written the way the bigram pipeline emits them:
# two stemmed tokens joined by a dot.
keywords = [
    'control.border',
    'control.immigr',
    '350.million',
]

def get_bigrams(transcript):
    """Turn a transcript into a list of stemmed bigram strings.

    Pipeline: lower-case -> ``word_tokenize`` -> drop English stopwords and
    single-character tokens -> Porter-stem -> pair every token with its
    successor and join the pair with ``'.'`` (e.g. ``'control.border'``).

    Stopwords are removed *before* pairing, so two words that were separated
    only by stopwords in the original text end up in the same bigram.

    Parameters
    ----------
    transcript : str
        Raw transcript text.

    Returns
    -------
    list of str
        One ``'<stem>.<stem>'`` string per adjacent token pair, in text
        order; empty when fewer than two tokens survive the filtering.
    """
    # The corpus reader returns a list; a set makes the per-token membership
    # test constant-time instead of a scan over every stopword.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    tokens = word_tokenize(transcript.lower())
    stems = [
        stemmer.stem(token)
        for token in tokens
        if len(token) > 1 and token not in stop_words
    ]

    return ['.'.join(pair) for pair in nltk.bigrams(stems)]

def contains_keywords(transcript):
    """Report whether a transcript contains any of the keyword bigrams.

    The transcript is run through ``get_bigrams`` (lower-casing, tokenising,
    stopword / single-character removal, stemming, dot-joined pairing) and
    the result is checked against the module-level ``keywords`` list.

    Parameters
    ----------
    transcript : str
        Raw transcript text.

    Returns
    -------
    bool
        ``True`` if at least one entry of ``keywords`` is among the
        transcript's bigrams, ``False`` otherwise.
    """
    # Delegate to get_bigrams so the preprocessing exists in one place only;
    # a set keeps each keyword lookup constant-time on long transcripts.
    phrases = set(get_bigrams(transcript))
    return any(keyword in phrases for keyword in keywords)

def get_transcripts(bbc_id, year, start_month, end_month):
    """Load and concatenate the transcript CSVs for a range of months.

    Files are read from ``../data/bbc/<bbc_id>/<year>/transcripts``. A file
    is selected when the last whitespace-separated token before the first
    ``'-'`` in its name is a lower-case month abbreviation inside the
    requested range.

    Parameters
    ----------
    bbc_id : int or str
        Identifier used as a directory name in the data path.
    year : int or str
        Year used as a directory name in the data path.
    start_month, end_month : int
        First and last month (1-12, both inclusive).

    Returns
    -------
    pandas.DataFrame
        All selected files stacked in file-name order, without the
        ``'Unnamed: 0'`` column. The original per-file row index is kept.

    Raises
    ------
    ValueError
        If no file in the directory matches the requested months.
    """
    print('preparing data!!')
    path = '../data/bbc/{}/{}/transcripts'.format(bbc_id, year)
    months = [calendar.month_abbr[i].lower() for i in range(start_month, end_month + 1)]

    def _month_token(file_name):
        # Names with nothing before the first '-' have no month token.
        parts = file_name.split('-')[0].split()
        return parts[-1] if parts else None

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible across runs and machines.
    transcript_files = sorted(
        tf for tf in os.listdir(path) if _month_token(tf) in months
    )
    if not transcript_files:
        # Same exception type pd.concat would raise on an empty list, but
        # with a message that says what was looked for and where.
        raise ValueError(
            'no transcript files found in {} for months {}'.format(path, months)
        )

    dataframes = [pd.read_csv(os.path.join(path, tf)) for tf in transcript_files]
    df = pd.concat(dataframes)
    # 'Unnamed: 0' only exists when the CSV was written with its index;
    # tolerate files saved without it.
    return df.drop(columns=['Unnamed: 0'], errors='ignore')

def match_partition_id_with_transcript(Date, Transcript):
    """Find the partition group belonging to a transcript and summarise it.

    Looks only at the rows of the global ``partition_df`` whose ``date``
    equals ``Date``. Partition ids are tried in order of first appearance;
    the first id that has at least one partition text occurring verbatim
    inside ``Transcript`` is taken as the match.

    Parameters
    ----------
    Date : str
        Date value compared against ``partition_df.date``.
    Transcript : str
        Full transcript text searched for partition texts.

    Returns
    -------
    tuple (int, str)
        ``1`` if the matched group has a partition flagged ``has_keywords``
        with topic ``'European_Union'`` (else ``0``), and those partitions'
        texts joined by ``'\\n------------'`` (empty string when there are
        none, or when no partition id matched).
    """
    same_day = partition_df.loc[partition_df.date == Date]

    # First id with a partition text contained in the transcript; None when
    # nothing matches, which makes the selection below come out empty.
    matching_id = next(
        (
            candidate
            for candidate in same_day.partition_id.unique()
            if any(
                segment in Transcript
                for segment in same_day.loc[same_day.partition_id == candidate, 'transcript']
            )
        ),
        None,
    )

    matched_group = same_day.loc[same_day.partition_id == matching_id]
    is_eu_keyword_hit = (matched_group.has_keywords == True) & (matched_group.topic == 'European_Union')
    eu_keyword_hits = matched_group.loc[is_eu_keyword_hit]

    keywords_in_european = 1 if len(eu_keyword_hits) > 0 else 0
    partitions = '\n------------'.join(eu_keyword_hits.transcript.values)
    return keywords_in_european, partitions
        

# Config

In [42]:
# Channel id and year: both are substituted into the transcript directory
# path and into the partition-prediction file name.
bbc_id, year = 54, 2016

# Month range handed to get_transcripts (both ends inclusive).
month_start, month_end = 1, 6

# Last component of the partition-prediction file name.
excluding = 'no_Others'

In [43]:
# Load the per-partition topic predictions for the configured channel/year.
partition_df = pd.read_csv(
    '../data/partition_predictions/topics_pred_on_bert_partitioned_bbc_{}_{}_with_news_classifier_{}.csv'.format(
        bbc_id, year, excluding
    )
)
partition_df = partition_df.drop(['Unnamed: 0'], axis=1)

# 'topic' is stored as the text of a nested Python literal. Parse it, keep
# element [0][0], strip commas and turn spaces into underscores. One
# column-wise pass replaces three row-wise DataFrame.apply(axis=1) passes.
partition_df['topic'] = partition_df['topic'].map(
    lambda raw: ast.literal_eval(raw)[0][0].replace(',', '').replace(' ', '_')
)

# Flag partitions containing a keyword bigram and record their word count.
partition_df['has_keywords'] = partition_df['transcript'].map(contains_keywords)
partition_df['length'] = partition_df['transcript'].map(lambda text: len(text.split()))
partition_df

Unnamed: 0,partition_id,date,transcript,topic,has_keywords,length
0,0,4-jun-2016,we would have seen during this weekend and the...,Culture_media_and_sport,False,44
1,0,4-jun-2016,manage that and make sure we stay within the g...,Culture_media_and_sport,False,14
2,0,4-jun-2016,We start at the desk where you have the three ...,Culture_media_and_sport,False,83
3,0,4-jun-2016,It helps to correct this atmosphere which is v...,Culture_media_and_sport,False,65
4,0,4-jun-2016,Twitter’s live streaming video service announc...,Culture_media_and_sport,False,74
...,...,...,...,...,...,...
102489,1506,30-dec-2016,"The former Bishop of Liverpool, JamesJones, ch...",Parliament_government_and_politics,False,80
102490,1506,30-dec-2016,Southern rail passengers have been warned that...,Transport,False,177
102491,1506,30-dec-2016,"So on the eve of the new year, | make this cha...",Culture_media_and_sport,False,50
102492,1506,30-dec-2016,"It too says it’s willing to talk but, once aga...",Transport,False,145


In [44]:
# Read every transcript in the configured month range, then flag the
# programmes whose full transcript contains one of the keyword bigrams.
transcript_df = get_transcripts(bbc_id, year, month_start, month_end)
transcript_df['has_keywords'] = transcript_df['Transcript'].apply(contains_keywords)

preparing data!!


In [45]:
# Keep only the programmes whose transcript was flagged, with a fresh index.
keyword_rows = transcript_df['has_keywords'] == True
transcript_df_with_keywords = transcript_df.loc[keyword_rows].reset_index(drop=True)

# For each flagged programme, look up its partition group: did a keyword
# fall in a 'European_Union' partition, and what was that partition's text?
partition_matches = [
    match_partition_id_with_transcript(date, text)
    for date, text in zip(
        transcript_df_with_keywords['Date'],
        transcript_df_with_keywords['Transcript'],
    )
]
in_eu = [match[0] for match in partition_matches]
keyword_partition = [match[1] for match in partition_matches]

transcript_df_with_keywords['keywords_in_european'] = in_eu
transcript_df_with_keywords['partition_with_keywords'] = keyword_partition
transcript_df_with_keywords

Unnamed: 0,Source,Date,Program Name,Time,Duration,Has Transcript,Transcript,Unavailable link,Unavailable reason,has_keywords,keywords_in_european,partition_with_keywords
0,BBC1 London,4-jun-2016,Joins BBC News,01:40,260 mins,True,we would have seen during this weekend and the...,,,True,0,
1,BBC1 London,4-jun-2016,BBC Weekend News,22:50,20 mins,True,But BUT This BMT This mas his BUT This was his...,,,True,1,Now to some of the rest of the day’s news. Lor...
2,BBC News 24,2-jun-2016,BBC News at Six,18:00,30 mins,True,BIRDSONG ROCK MUSIC PLAYS Come on now! Squeaky...,,,True,0,
3,BBC News 24,2-jun-2016,BBC News at Ten,22:00,30 mins,True,"I can’t hear anything. OK, we need to intubate...",,,True,1,The British taxpayer’s money is now just being...
4,BBC1 London,5-jun-2016,Joins BBC News,02:35,205 mins,True,at least we get our borders back. It’s a nonse...,,,True,1,Beattie said the UK could win the referendum o...
...,...,...,...,...,...,...,...,...,...,...,...,...
89,BBC News 24,26-may-2016,BBC News at Six,18:00,30 mins,True,it’s great to know that someone from Buccaneer...,,,True,0,
90,BBC News 24,25-may-2016,BBC News at Ten,22:00,30 mins,True,My kids are afraid ofthe police. Islam needs t...,,,True,0,
91,BBC1 London,27-may-2016,BBC News at Ten,22:00,25 mins,True,PS: Do you think that men’s names are harder t...,,,True,1,The committee pointed out two main figures it ...
92,BBC1 London,28-may-2016,BBC Weekend News,22:30,20 mins,True,An appeal for the Rio Olympics to be postponed...,,,True,1,"In it they said, voters were promised it they ..."


In [46]:
# The output name encodes channel id, year and the first/last month
# abbreviation of the configured range; the file lands in the working directory.
first_month = calendar.month_abbr[month_start].lower()
last_month = calendar.month_abbr[month_end].lower()
output_csv = 'bbc_transcripts_with_keywords_{}_{}_{}-{}.csv'.format(bbc_id, year, first_month, last_month)
transcript_df_with_keywords.to_csv(output_csv)

In [22]:
# Spot check: pull out the flagged programmes broadcast on one date.
# NOTE(review): '2-nov-2016' is outside the month range set in the config
# cell (month_start=1, month_end=6), so on a fresh run this selection is
# presumably empty — confirm the intended date / range.
on_spot_check_date = transcript_df_with_keywords['Date'] == '2-nov-2016'
df_dump = transcript_df_with_keywords[on_spot_check_date]

In [23]:
# Show the rows selected into df_dump (flagged programmes dated '2-nov-2016').
df_dump


Unnamed: 0,Source,Date,Program Name,Time,Duration,Has Transcript,Transcript,Unavailable link,Unavailable reason,has_keywords,keywords_in_european,partition_with_keywords
45,BBC1 London,2-nov-2016,Joins BBC News,00:45,315 mins,True,"and the crisp, sunny afternoons, that is what ...",,,True,0,


In [24]:
# contains_keywords calls .lower() on its argument, so it needs the
# transcript text itself rather than the whole row.
contains_keywords(df_dump.iloc[0]['Transcript'])

2-nov-2016
350.million


True

In [26]:
# Bigram list of the first selected programme, for manual inspection.
big = get_bigrams(df_dump['Transcript'].iloc[0])

In [31]:
# Position of the first '350.million' bigram in the list built above;
# list.index raises ValueError when the bigram is absent.
big.index('350.million')


11646

In [32]:
# Raw transcript text of the first selected programme.
df_dump['Transcript'].iloc[0]

