In [48]:
import ast
import multiprocessing as mp
import re

import nltk
import numpy as np
import pandas as pd
import parmap
import tensorflow as tf
from gensim import models, corpora
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.utils import class_weight

In [49]:
# Topic labels keyed by integer class index. The position of each name in the
# list below is its index (0 through 20).
topics_index_to_name_map = dict(enumerate([
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]))

# Reverse lookup: topic name -> integer class index.
topics_name_to_index_map = {
    name: index for index, name in topics_index_to_name_map.items()
}

def strip_short2(text):
    """Apply gensim's ``strip_short`` to ``text`` with ``minsize=4``.

    Single-argument wrapper so the call can be used as one entry of a
    ``preprocess_string`` filter list (see ``preprocess_text``).
    """
    min_token_length = 4
    return strip_short(text, minsize=min_token_length)


def preprocess_text(text):
    """Run ``text`` through a fixed chain of gensim filters via ``preprocess_string``.

    Filters are applied in this order: lower-casing, whitespace collapsing,
    tag stripping, punctuation stripping, non-alphanumeric stripping, digit
    stripping, and ``strip_short2``. No stop-word removal or stemming is
    included in the chain.

    Returns whatever ``preprocess_string`` returns for the filtered text.
    """
    def to_lower(value):
        return value.lower()

    filter_chain = [
        to_lower,
        strip_multiple_whitespaces,
        strip_tags,
        strip_punctuation,
        strip_non_alphanum,
        strip_numeric,
        strip_short2,
    ]
    return preprocess_string(text, filter_chain)

def preprocess(topic):
    """Map a topic label string to its integer class index.

    ``topic`` may hold several topic names separated by ``'|'``; only the
    first name is used. Leading/trailing whitespace is ignored.

    Returns:
        The index found in ``topics_name_to_index_map``. For a string without
        ``'|'`` that is not a known topic name, ``None`` is returned.

    Raises:
        KeyError: if the string contains ``'|'`` and its first name is not a
            known topic name.
    """
    topic = topic.strip()

    if '|' in topic:
        first_name = topic.split('|', 1)[0].strip()
        return topics_name_to_index_map[first_name]

    # Single-topic string: the function used to fall off the end here and
    # return None even for a valid name. Unknown names still yield None.
    return topics_name_to_index_map.get(topic)
        

In [50]:
# Matches, on word boundaries: a standalone "eu", the abbreviation "e.u"
# (with or without a trailing dot), or any word starting with "european"
# (which also covers "european union" and "europeanunion").
_EU_MENTION_RE = re.compile(r'\b(?:eu\b|e\.u\b|european)')


def contains_europen(x):
    """Return True if ``x`` mentions the EU / European Union.

    Matching is case-insensitive and anchored on word boundaries. A plain
    substring test for ``'eu'`` also fires on unrelated words such as
    "museum", "queue" or "neutral"; those no longer count as mentions.
    Note that the bare word "Europe" is not treated as a mention either.

    Args:
        x: text to inspect (must be a ``str``).

    Returns:
        bool: whether any of the EU terms occurs in ``x``.
    """
    return _EU_MENTION_RE.search(x.lower()) is not None

In [51]:
# Identifier and year interpolated into the input and output CSV file names.
bbc_id, year = 279, 2015

In [52]:
# Load the topic predictions for the configured bbc_id / year and drop the
# 'Unnamed: 0' column that comes with the CSV.
df = (
    pd.read_csv(
        './data/partition_predictions/topics_pred_on_bert_partitioned_bbc_{}_{}_with_news_classifier_no_Others.csv'.format(bbc_id, year)
    )
    .drop(columns=['Unnamed: 0'])
)
# 'topic' is stored as the text of a Python literal; parse it back into objects.
df['topic'] = df['topic'].apply(ast.literal_eval)
# Flag every transcript for which contains_europen returns True.
df['european_union'] = df['transcript'].apply(contains_europen)

In [53]:
# Word count of each transcript (whitespace-separated tokens). Mapping over the
# single column avoids building a Series per row, which DataFrame.apply with
# axis=1 does, and yields the same values.
df['length'] = df['transcript'].map(lambda text: len(text.split()))

In [54]:
# Display the loaded predictions together with the derived 'european_union'
# and 'length' columns.
df

Unnamed: 0,partition_id,date,transcript,topic,european_union,length
0,0,4-oct-2015,An eight-year old boy and a pensioner die in a...,"[(Parliament, government and politics, 0.64659...",False,14
1,0,4-oct-2015,Paid leave for grandparents - the Chancellor o...,"[(Parliament, government and politics, 0.88700...",False,19
2,0,4-oct-2015,President Obama promises a full investigation ...,"[(International affairs, 0.7953888), (Parliame...",False,39
3,0,4-oct-2015,Disappointment for fans as hosts England are d...,"[(Culture, media and sport, 0.699106), (Parlia...",False,24
4,1,4-oct-2015,Breaking news - at least 13 dead in flash floo...,"[(Parliament, government and politics, 0.56243...",False,40
...,...,...,...,...,...,...
10780,472,31-aug-2015,But we start with a new blockade causing miser...,"[(Transport, 0.8362127), (Parliament, governme...",False,61
10781,472,31-aug-2015,"It is confusing, nobody knows what is happenin...","[(Parliament, government and politics, 0.41144...",False,45
10782,472,31-aug-2015,The toilets are full and dirty and there is no...,"[(Parliament, government and politics, 0.55218...",False,21
10783,472,31-aug-2015,"At the port’s entrance, lifeboats had been pos...","[(Parliament, government and politics, 0.48233...",False,28


In [37]:
# Three-letter lower-case month abbreviations, used to select rows by date string.
months = 'jan feb mar apr may jun jul aug sep oct nov dec'.split()

# Topic names for which per-month shares are computed.
topics = [
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]

In [38]:
# For every month, compute the share of transcript words attributed to each
# topic, separately for the 1st, 2nd and 3rd entry of each row's 'topic' list.
rows = []
for month in months:
    # Rows whose date string contains the month abbreviation.
    df_month = df[df['date'].str.contains(month)]
    total_length = df_month['length'].sum()

    # Word totals per topic, one accumulator per position in the 'topic' list.
    top1 = dict.fromkeys(topics, 0)
    top2 = dict.fromkeys(topics, 0)
    top3 = dict.fromkeys(topics, 0)
    for length, ranked in zip(df_month['length'], df_month['topic']):
        # Each entry is a (topic name, score) pair; only the name is used.
        t1, t2, t3 = ranked[0][0], ranked[1][0], ranked[2][0]
        top1[t1] += length
        top2[t2] += length
        top3[t3] += length

    # NOTE(review): a month with no rows has total_length == 0, so its shares
    # presumably come out as NaN here (a later cell applies fillna(0)).
    for topic in topics:
        rows.append([
            month,
            topic,
            top1[topic] / total_length,
            top2[topic] / total_length,
            top3[topic] / total_length,
        ])



In [39]:
# Sanity check: the loop appends one row per (month, topic) pair.
len(rows)

252

In [40]:
# Tabulate the per-month, per-topic word shares collected in `rows`.
res = pd.DataFrame(
    data=rows,
    columns=['month', 'topic', 'top1', 'top2', 'top3'],
)

In [45]:
# Replace missing share values with 0.
res = res.fillna(value=0)

In [46]:
# Display the per-month, per-topic share table.
res

Unnamed: 0,month,topic,top1,top2,top3
0,jan,"Agriculture, animals, food and rural affairs",0.000000,0.000000,0.000000
1,jan,"Asylum, immigration and nationality",0.000000,0.000000,0.000000
2,jan,"Business, industry and consumers",0.000000,0.000000,0.000000
3,jan,Communities and families,0.000000,0.000000,0.000000
4,jan,"Crime, civil law, justice and rights",0.000000,0.000000,0.000000
...,...,...,...,...,...
247,dec,Science and technology,0.000000,0.000000,0.000000
248,dec,Social security and pensions,0.000000,0.000000,0.000000
249,dec,Social services,0.000000,0.000000,0.000000
250,dec,Transport,0.122547,0.108267,0.112028


In [47]:
# Write the share table to a CSV named after the configured bbc_id / year.
# The DataFrame index is written too (to_csv default).
res.to_csv(
    'bbc_partition_topic_month_words_share_{}_{}_no_Others.csv'.format(bbc_id, year)
)