In [969]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing as mp
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import parmap
import swifter

In [970]:
topics_index_to_name_map = {
    0: 'Agriculture, animals, food and rural affairs',
    1: 'Asylum, immigration and nationality',
    2: 'Business, industry and consumers',
    3: 'Communities and families',
    4: 'Crime, civil law, justice and rights',
    5: 'Culture, media and sport',
    6: 'Defence',
    7: 'Economy and finance',
    8: 'Education',
    9: 'Employment and training',
    10: 'Energy and environment',
    11: 'European Union',
    12: 'Health services and medicine',
    13: 'Housing and planning',
    14: 'International affairs',
    15: 'Parliament, government and politics',
    16: 'Science and technology',
    17: 'Social security and pensions',
    18: 'Social services',
    19: 'Transport',
    20: 'Others'
}
topics_name_to_index_map = {y:x for x,y in topics_index_to_name_map.items()}

def strip_short2(text):
    return strip_short(text, minsize=4)


def preprocess_text(text):
    FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces, strip_tags, strip_punctuation,
                   strip_non_alphanum, strip_numeric, strip_short2]
    return preprocess_string(text, FILTERS)

def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        t = topics[0]
        t = t.strip()
        return topics_name_to_index_map[t]
        
    return topics_name_to_index_map[topic]

In [1140]:
bbc_id = 54
year = 2014
excluding = '_no_Others'

In [1141]:
df1 = pd.read_csv('../data/partition_predictions/window_topic_predictions_Jan-June_54_{}.csv'.format(year))
df2 = pd.read_csv('../data/partition_predictions/window_topic_predictions_Jul-Dec_54_{}.csv'.format(year))
partition_df = pd.concat([df1, df2])
partition_df = partition_df.drop(['Unnamed: 0', 'vector'], axis=1)

In [1142]:
partition_df

Unnamed: 0,partition_id,date,transcript,type,prediction
0,435,2014-03-01,"Don’t mind, do you? Shame for both of us to l...",p,"Agriculture, animals, food and rural affairs"
1,435,2014-03-01,"Yeah. Cool. Sorry, who are you? Lofty. KNOCK ...",s,"Parliament, government and politics"
2,435,2014-03-01,Today I was led to believe that a patient I h...,p,"Crime, civil law, justice and rights"
3,435,2014-03-01,This is potentially a grave threat to the ter...,p,International affairs
4,435,2014-03-01,"Also tonight, Ed Miliband’s plans to reform L...",p,"Parliament, government and politics"
...,...,...,...,...,...
28741,730,2014-11-25,That’s all from us.,s,"Parliament, government and politics"
28742,730,2014-11-25,Now it’s time for the news where you are.,p,"Parliament, government and politics"
28743,730,2014-11-25,Good night. -- achievements. Good evening fro...,s,"Parliament, government and politics"
28744,730,2014-11-25,A coroner has ruled that a problem with the f...,p,Health services and medicine


In [1143]:
partition_df['date'] = pd.to_datetime(partition_df['date'])

In [1144]:
partition_df['word_len'] = partition_df['transcript'].apply(lambda x: len(x.split()))

In [1145]:
partition_df

Unnamed: 0,partition_id,date,transcript,type,prediction,word_len
0,435,2014-03-01,"Don’t mind, do you? Shame for both of us to l...",p,"Agriculture, animals, food and rural affairs",65
1,435,2014-03-01,"Yeah. Cool. Sorry, who are you? Lofty. KNOCK ...",s,"Parliament, government and politics",23
2,435,2014-03-01,Today I was led to believe that a patient I h...,p,"Crime, civil law, justice and rights",106
3,435,2014-03-01,This is potentially a grave threat to the ter...,p,International affairs,21
4,435,2014-03-01,"Also tonight, Ed Miliband’s plans to reform L...",p,"Parliament, government and politics",19
...,...,...,...,...,...,...
28741,730,2014-11-25,That’s all from us.,s,"Parliament, government and politics",4
28742,730,2014-11-25,Now it’s time for the news where you are.,p,"Parliament, government and politics",9
28743,730,2014-11-25,Good night. -- achievements. Good evening fro...,s,"Parliament, government and politics",13
28744,730,2014-11-25,A coroner has ruled that a problem with the f...,p,Health services and medicine,68


In [1146]:
partition_df.iloc[6]

partition_id                                         435
date                                 2014-03-01 00:00:00
transcript       The FA is to investigate. Good evening.
type                                                   s
prediction          Crime, civil law, justice and rights
word_len                                               7
Name: 6, dtype: object

In [1147]:
partition_df = partition_df.reset_index(drop=True)

In [1148]:
partition_df.columns = ['partition_id', 'date', 'transcript', 'type', 'topic', 'word_len']

In [1149]:
from tqdm import tqdm
n_part = len(partition_df)
drop_indices = []
for i in tqdm(range(n_part)):
    row = partition_df.iloc[i]
    if row['type'] != 's':
        continue
    start_index = i
    end_index = i
    
    row1 = partition_df.iloc[i-1]
    
    while end_index < n_part:
        row2 = partition_df.iloc[end_index + 1]
        if row2['date'] != row1['date']:
            i = end_index+1
            drop_indices.extend(range(start_index, end_index+1))
            break
        if row2['type'] == 's':
            end_index += 1
            continue
        else:
            if row1['topic'] == row2['topic']:
                i = end_index + 1
                break
            else:
                i = end_index+1
                drop_indices.extend(range(start_index, end_index+1))
                break

100%|██████████| 56423/56423 [00:12<00:00, 4528.69it/s]


In [1150]:
dropped_df = partition_df.iloc[drop_indices]

In [1151]:
dropped_df

Unnamed: 0,partition_id,date,transcript,type,topic,word_len
1,435,2014-03-01,"Yeah. Cool. Sorry, who are you? Lofty. KNOCK ...",s,"Parliament, government and politics",23
6,435,2014-03-01,The FA is to investigate. Good evening.,s,"Crime, civil law, justice and rights",7
25,435,2014-03-01,That’s all the sport for now. That’s it.,s,"Parliament, government and politics",8
34,436,2014-03-02,I -- language he is talking about. There are ...,s,"Business, industry and consumers",16
40,436,2014-03-02,"Daniella Relph reports. It can be addictive, ...",s,"Crime, civil law, justice and rights",16
...,...,...,...,...,...,...
56407,730,2014-11-25,"Ben Wright, BBC News, Walthamstow Academy, Lo...",s,"Parliament, government and politics",7
56409,730,2014-11-25,"Yes. Absolutely, yes.",s,"Parliament, government and politics",3
56411,730,2014-11-25,"Gavin Hewitt, BBC News Strasbourg.",s,"Culture, media and sport",5
56418,730,2014-11-25,That’s all from us.,s,"Parliament, government and politics",4


In [1152]:
dropped_df.to_csv('dropped_partitions_54_{}.csv'.format(year))

In [1153]:
partition_df2 = partition_df.drop(partition_df.index[drop_indices])

In [1154]:
partition_df2

Unnamed: 0,partition_id,date,transcript,type,topic,word_len
0,435,2014-03-01,"Don’t mind, do you? Shame for both of us to l...",p,"Agriculture, animals, food and rural affairs",65
2,435,2014-03-01,Today I was led to believe that a patient I h...,p,"Crime, civil law, justice and rights",106
3,435,2014-03-01,This is potentially a grave threat to the ter...,p,International affairs,21
4,435,2014-03-01,"Also tonight, Ed Miliband’s plans to reform L...",p,"Parliament, government and politics",19
5,435,2014-03-01,"And Newcastle’s manager, Alan Pardew, is sent...",p,"Parliament, government and politics",13
...,...,...,...,...,...,...
56416,730,2014-11-25,"joe Wilson, BBC News.",s,"Parliament, government and politics",4
56417,730,2014-11-25,Manchester City have kept alive their slim ch...,p,"Culture, media and sport",298
56419,730,2014-11-25,Now it’s time for the news where you are.,p,"Parliament, government and politics",9
56421,730,2014-11-25,A coroner has ruled that a problem with the f...,p,Health services and medicine,68


In [1155]:
partition_df2.to_csv('window_prediction_unmerged_{}.csv'.format(year))

In [1156]:
len(dropped_df)

10732

In [1157]:
len(partition_df2)

45691

In [1158]:
len(partition_df2) + len(dropped_df)

56423

In [1159]:
len(partition_df)

56423

In [1160]:
len(drop_indices)

10732

In [1161]:
partition_df2.loc[partition_df2.type == 's']

Unnamed: 0,partition_id,date,transcript,type,topic,word_len
10,435,2014-03-01,"Ina moment, in Ukraine are next.",s,"Parliament, government and politics",6
20,435,2014-03-01,Northampton’s Pisi had to run through shadows...,s,Employment and training,12
27,435,2014-03-01,Tomorrow morning starts off pretty grey. Not ...,s,"Parliament, government and politics",15
31,436,2014-03-02,Emily Buchanan is at RAF Northolt. What are W...,s,"Parliament, government and politics",11
37,436,2014-03-02,This latest rampage was far more bloody.,s,"Parliament, government and politics",7
...,...,...,...,...,...,...
56341,728,2014-11-25,Prince Telemon of Aegina! Am I too ambitious?...,s,"Parliament, government and politics",19
56367,729,2014-11-25,Thank you.,s,"Parliament, government and politics",2
56395,730,2014-11-25,I was wrong.,s,"Parliament, government and politics",3
56399,730,2014-11-25,Daniela Relph reports.,s,"Agriculture, animals, food and rural affairs",3


In [1162]:
rows = []

n_part = len(partition_df2)
i = 0
while i < n_part:
    row = partition_df2.iloc[i]
    if row['type'] == 'p':
        i+=1
        rows.append([row['partition_id'], row['date'], row['transcript'], row['type'], row['topic']])
        continue
    else:
#         j = i
#         while j < i:
#             row2 = partition_df2.iloc[j]
#             if row2['type'] == 'p':
#                 rows[-1][2] = rows[-1][2] + ' ' + row2['transcript']
#                 i = j + 1
#                 break
            
#             rows[-1][2] = rows[-1][2] + ' ' + row2['transcript']
#             j = j + 1
        rows[-1][2] = rows[-1][2] + ' ' + row['transcript']
        rows[-1][2] = rows[-1][2] + ' ' + partition_df2.iloc[i+1]['transcript']
        i+=1
    i+=1
        

In [1163]:
res_df = pd.DataFrame(rows, columns=partition_df2.columns[0:-1])

In [1164]:
res_df

Unnamed: 0,partition_id,date,transcript,type,topic
0,435,2014-03-01,"Don’t mind, do you? Shame for both of us to l...",p,"Agriculture, animals, food and rural affairs"
1,435,2014-03-01,Today I was led to believe that a patient I h...,p,"Crime, civil law, justice and rights"
2,435,2014-03-01,This is potentially a grave threat to the ter...,p,International affairs
3,435,2014-03-01,"Also tonight, Ed Miliband’s plans to reform L...",p,"Parliament, government and politics"
4,435,2014-03-01,"And Newcastle’s manager, Alan Pardew, is sent...",p,"Parliament, government and politics"
...,...,...,...,...,...
36986,730,2014-11-25,"A legitimate, aggressive delivery from a fast...",p,Health services and medicine
36987,730,2014-11-25,"Fast bowling, with an intention to intimidate...",p,"Culture, media and sport"
36988,730,2014-11-25,Now it’s time for the news where you are.,p,"Parliament, government and politics"
36989,730,2014-11-25,A coroner has ruled that a problem with the f...,p,Health services and medicine


In [1165]:
res_df.to_csv('window_topic_prediction_with_short_sentences_merged_54_{}.csv'.format(year))

In [1166]:
dropped_df['month'] = dropped_df['date'].apply(lambda x: x.month)
res_df['month'] = res_df['date'].apply(lambda x: x.month)
partition_df['month'] = partition_df['date'].apply(lambda x: x.month)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [1167]:
dropped_df['word_len'] = dropped_df['transcript'].apply(lambda x: len(x.split()))
res_df['word_len'] = res_df['transcript'].apply(lambda x: len(x.split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [1168]:
others_month_count = dropped_df[['word_len', 'month']].groupby(['month'], as_index=False).sum()

In [1169]:
total_month_count = partition_df[['word_len', 'month']].groupby(['month'], as_index=False).sum()

In [1170]:
others_share = []

for month in range(1, 13):
    total = total_month_count.loc[total_month_count.month == month].iloc[0].word_len
    others = others_month_count.loc[others_month_count.month == month].iloc[0].word_len
    others_share.append([month, others/total])

In [1171]:
share_df = pd.DataFrame(others_share, columns=['month', 'share'])

In [1172]:
share_df

Unnamed: 0,month,share
0,1,0.020068
1,2,0.020351
2,3,0.021735
3,4,0.018279
4,5,0.019027
5,6,0.019727
6,7,0.020268
7,8,0.020325
8,9,0.0191
9,10,0.021267


In [1173]:
share_df.to_csv('Others_share_{}.csv'.format(year))