In [34]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.utils import class_weight
import multiprocessing as mp
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import parmap
import ast

In [35]:
topics_index_to_name_map = {
    0: 'Agriculture, animals, food and rural affairs',
    1: 'Asylum, immigration and nationality',
    2: 'Business, industry and consumers',
    3: 'Communities and families',
    4: 'Crime, civil law, justice and rights',
    5: 'Culture, media and sport',
    6: 'Defence',
    7: 'Economy and finance',
    8: 'Education',
    9: 'Employment and training',
    10: 'Energy and environment',
    11: 'European Union',
    12: 'Health services and medicine',
    13: 'Housing and planning',
    14: 'International affairs',
    15: 'Parliament, government and politics',
    16: 'Science and technology',
    17: 'Social security and pensions',
    18: 'Social services',
    19: 'Transport',
    20: 'Others'
}
topics_name_to_index_map = {y:x for x,y in topics_index_to_name_map.items()}

def strip_short2(text):
    return strip_short(text, minsize=4)


def preprocess_text(text):
    FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces, strip_tags, strip_punctuation,
                   strip_non_alphanum, strip_numeric, strip_short2]
    return preprocess_string(text, FILTERS)

def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        t = topics[0]
        t = t.strip()
        return topics_name_to_index_map[t]
        

In [47]:
def contains_europen(x):
    y = x.lower()
    flag = 'e.u' in y or 'eu' in y or 'european' in y or 'european union' in y or 'europeanunion' in y
    return flag

In [48]:
df = pd.read_csv('./topics_pred_on_bert_partitioned_bbc_2016_with_news_classifier_no_Others.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df['topic'] = df['topic'].apply(lambda x: ast.literal_eval(x))
df['european_union'] = df['transcript'].apply(lambda x: contains_europen(x))

In [49]:
df['length'] = df.apply(lambda x: len(x.transcript.split()), axis=1)

In [50]:
df

Unnamed: 0,partition_id,date,transcript,topic,european_union,length
0,0,4-jun-2016,we would have seen during this weekend and the...,"[(Culture, media and sport, 0.7764439), (Parli...",False,44
1,0,4-jun-2016,manage that and make sure we stay within the g...,"[(Culture, media and sport, 0.5958313), (Parli...",False,14
2,0,4-jun-2016,We start at the desk where you have the three ...,"[(Culture, media and sport, 0.8847824), (Parli...",False,83
3,0,4-jun-2016,It helps to correct this atmosphere which is v...,"[(Culture, media and sport, 0.9530382), (Parli...",False,65
4,0,4-jun-2016,Twitter’s live streaming video service announc...,"[(Culture, media and sport, 0.4875487), (Parli...",False,74
...,...,...,...,...,...,...
102489,1506,30-dec-2016,"The former Bishop of Liverpool, JamesJones, ch...","[(Parliament, government and politics, 0.71647...",False,80
102490,1506,30-dec-2016,Southern rail passengers have been warned that...,"[(Transport, 0.8351138), (Culture, media and s...",False,177
102491,1506,30-dec-2016,"So on the eve of the new year, | make this cha...","[(Culture, media and sport, 0.68709666), (Parl...",False,50
102492,1506,30-dec-2016,"It too says it’s willing to talk but, once aga...","[(Transport, 0.91989994), (Business, industry ...",False,145


In [51]:
df['european_union'].value_counts()

False    89245
True     13249
Name: european_union, dtype: int64

In [28]:
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

topics = ['Agriculture, animals, food and rural affairs', 'Asylum, immigration and nationality', 'Business, industry and consumers', 'Communities and families', 'Crime, civil law, justice and rights', 'Culture, media and sport', 'Defence', 'Economy and finance', 'Education', 'Employment and training', 'Energy and environment', 'European Union', 'Health services and medicine', 'Housing and planning', 'International affairs', 'Parliament, government and politics', 'Science and technology', 'Social security and pensions', 'Social services', 'Transport', 'Others'
]

In [29]:
rows = []
for month in months:
    df_month = df.loc[df.date.str.contains(month)]
    total_length = df_month['length'].sum()
    top1 = {topic:0 for topic in topics}
    top2 = {topic:0 for topic in topics}
    top3 = {topic:0 for topic in topics}
    for index, row in df_month.iterrows():
        length = row['length']
        t1 = row['topic'][0][0]
        t2 = row['topic'][1][0]
        t3 = row['topic'][2][0]
        top1[t1] += length
        top2[t2] += length
        top3[t3] += length
    for topic in topics:
        row = [month, topic, top1[topic]/total_length, top2[topic]/total_length, top3[topic]/total_length]
        rows.append(row)

In [30]:
len(rows)

252

In [31]:
res = pd.DataFrame(rows, columns=['month', 'topic', 'top1', 'top2', 'top3'])

In [32]:
res

Unnamed: 0,month,topic,top1,top2,top3
0,jan,"Agriculture, animals, food and rural affairs",0.023694,0.029057,0.033320
1,jan,"Asylum, immigration and nationality",0.000000,0.000000,0.000836
2,jan,"Business, industry and consumers",0.062213,0.069864,0.092238
3,jan,Communities and families,0.001204,0.003908,0.006193
4,jan,"Crime, civil law, justice and rights",0.077572,0.055262,0.093012
...,...,...,...,...,...
247,dec,Science and technology,0.000000,0.000000,0.000000
248,dec,Social security and pensions,0.000000,0.000000,0.000000
249,dec,Social services,0.000000,0.000000,0.000000
250,dec,Transport,0.094225,0.060006,0.079362


In [33]:
res.to_csv('bbc_partition_topic_month_words_share_2016_no_Others.csv')