In [1]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.utils import class_weight
import multiprocessing as mp
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import parmap

In [2]:
# Topic labels keyed by numeric index; a label's position in the list below
# is its index (0-20).
topics_index_to_name_map = dict(enumerate([
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]))

# Inverse lookup: topic label -> numeric index.
topics_name_to_index_map = {
    name: idx for idx, name in topics_index_to_name_map.items()
}

def strip_short2(text):
    """Apply gensim's ``strip_short`` to ``text`` with ``minsize`` fixed at 4.

    A single-argument wrapper, so it can be placed directly in the filter
    list passed to ``preprocess_string``.
    """
    min_token_length = 4
    return strip_short(text, minsize=min_token_length)


def preprocess_text(text):
    """Run ``text`` through this notebook's gensim filter chain.

    The chain handed to ``preprocess_string`` is, in order: lower-casing,
    ``strip_multiple_whitespaces``, ``strip_tags``, ``strip_punctuation``,
    ``strip_non_alphanum``, ``strip_numeric`` and ``strip_short2``.

    Returns whatever ``preprocess_string`` produces for that chain.
    """
    def to_lower(value):
        return value.lower()

    pipeline = [
        to_lower,
        strip_multiple_whitespaces,
        strip_tags,
        strip_punctuation,
        strip_non_alphanum,
        strip_numeric,
        strip_short2,
    ]
    return preprocess_string(text, pipeline)

def preprocess(topic):
    """Map a topic label to its numeric index.

    ``topic`` may contain several labels joined with ``'|'``; only the first
    one is used. Whitespace around the label is ignored.

    Args:
        topic: Topic label string, optionally ``'|'``-separated.

    Returns:
        The value stored for the (first) label in ``topics_name_to_index_map``.

    Raises:
        KeyError: If the (first) label is not a key of
            ``topics_name_to_index_map``.
    """
    # split('|') returns a one-element list when no separator is present, so a
    # single expression covers both the single- and multi-label cases.
    first_label = topic.strip().split('|')[0].strip()
    return topics_name_to_index_map[first_label]

In [3]:
# Load the predictions CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/news_2016_predictions.csv')

In [5]:
# Remove the 'Unnamed: 0' column (presumably a row index saved when the CSV
# was written - confirm). errors='ignore' keeps this cell safe to re-run:
# without it a second execution raises KeyError because the column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
# Show the loaded frame as the cell output.
df

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,parliament,top1_topic,top1_acc,top2_topic,top2_acc,top3_topic,top3_acc
0,163795,Belfast Telegraph,1,1,2016,My pride and joy;\nTrimble delight at record 2...,Having already become Ulster's most-capped pla...,,"Parliament, government and politics",47.93,Others,11.57,"Culture, media and sport",9.18
1,163795,Belfast Telegraph,1,1,2016,SWEDE TALKER;\nIf you don't recognise Alicia V...,"""It's always there,'' says the rising star, of...",,Others,35.08,"Business, industry and consumers",19.82,"Culture, media and sport",13.38
2,163795,Belfast Telegraph,1,1,2016,Ricky Warwick & Damon Johnson,"Most of Warwick's time is spent on the road, e...",,"Business, industry and consumers",22.21,"Culture, media and sport",15.08,Others,14.46
3,163795,Belfast Telegraph,1,1,2016,PICK OF THE WEEK,The five-piece play a multitude of instruments...,,"Culture, media and sport",81.15,"Parliament, government and politics",8.58,Others,5.41
4,163795,Belfast Telegraph,1,1,2016,Folk royalty are on the one road to a great show,The multi-platinum selling High Kings - Finbar...,,"Parliament, government and politics",32.26,Others,15.07,"Culture, media and sport",8.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863167,412338,Wales,31,12,2016,New Year's Eve rail delays as services disrupt...,Arriva Trains Wales services are being further...,,Transport,97.54,"Crime, civil law, justice and rights",0.43,"Parliament, government and politics",0.41
1863168,412338,Wales,31,12,2016,Has 2016 been all bad? Of course not! And here...,It's the year Britain was torn in two by Brexi...,,"Culture, media and sport",30.67,Others,20.27,"Parliament, government and politics",13.23
1863169,412338,Wales,31,12,2016,Advice for driving in fog: when to use your fo...,Fog can significantly impact driving condition...,,Transport,31.38,Others,15.98,"Parliament, government and politics",12.02
1863170,412338,Wales,31,12,2016,Driver left with 'serious' injuries following ...,"The single-vehicle road traffic collision, inv...",,Others,24.51,"Parliament, government and politics",17.96,Transport,15.37


In [34]:
# Keep rows whose top-1 topic is not 'Others' and whose top-1 score is at
# least 40 (top1_acc looks like a percentage - confirm).
# Both masks are built on `df` and combined before indexing, so the selection
# does not rely on aligning a mask from one frame against another.
# regex=False makes the match a plain substring test.
is_others = df['top1_topic'].str.contains('Others', regex=False)
is_confident = df['top1_acc'] >= 40
df_filtered = df.loc[~is_others & is_confident]

In [35]:
# Show the filtered frame as the cell output.
df_filtered

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,parliament,top1_topic,top1_acc,top2_topic,top2_acc,top3_topic,top3_acc
0,163795,Belfast Telegraph,1,1,2016,My pride and joy;\nTrimble delight at record 2...,Having already become Ulster's most-capped pla...,,"Parliament, government and politics",47.93,Others,11.57,"Culture, media and sport",9.18
3,163795,Belfast Telegraph,1,1,2016,PICK OF THE WEEK,The five-piece play a multitude of instruments...,,"Culture, media and sport",81.15,"Parliament, government and politics",8.58,Others,5.41
5,163795,Belfast Telegraph,1,1,2016,music round-up,The fun starts at 7pm in the Black Box Green R...,,"Culture, media and sport",59.40,Others,9.63,"Business, industry and consumers",8.74
6,163795,Belfast Telegraph,1,1,2016,Tuned in ...;\nQ Radio DJ Kathryn B Wilson che...,"However, the unmistakable vocals and club beat...",,"Culture, media and sport",73.97,Others,8.25,"Parliament, government and politics",7.70
12,163795,Belfast Telegraph,1,1,2016,74 killed on Northern Ireland's roads in 2015,"The PSNI said 34 drivers, 17 passengers, 19 pe...",,Transport,59.53,"Parliament, government and politics",11.05,Health services and medicine,6.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863158,412338,Wales,31,12,2016,Who is Swansea City contender Gary Rowett? The...,The 42-year-old was sacked by Birmingham City ...,,"Parliament, government and politics",42.08,"Culture, media and sport",32.99,"Business, industry and consumers",11.30
1863162,412338,Wales,31,12,2016,Welsh rugby's winners and losers of 2016 as a ...,Former wing wizard Shane Williams succinctly s...,,"Culture, media and sport",42.15,Others,21.01,"Parliament, government and politics",14.77
1863163,412338,Wales,31,12,2016,Is Jools Holland's Hootenanny live on New Year...,"Don't say we didn't warn you. Well, it turns o...",,Transport,51.73,Others,11.45,"Culture, media and sport",6.56
1863165,412338,Wales,31,12,2016,'Swansea City should appoint Roy Hodgson after...,But the Sky Sports pundit is now backing axed ...,,"Culture, media and sport",51.00,Others,10.19,"Parliament, government and politics",7.49


In [47]:
# Three-letter month abbreviations in calendar order.
months = 'jan feb mar apr may jun jul aug sep oct nov dec'.split()

# Topic labels, one per line.
topics = [
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]

# Month number (1-12) -> abbreviation.
months_map = dict(enumerate(months, start=1))

In [45]:
# Three independent {month: {topic: 0}} tables, one per prediction rank,
# covering months 1-12 and every label in `topics`.
top1_counts, top2_counts, top3_counts = (
    {month: dict.fromkeys(topics, 0) for month in range(1, 13)}
    for _ in range(3)
)
# Per-month tally, starting at zero for months 1-12.
count_month = dict.fromkeys(range(1, 13), 0)

In [46]:
# Tally rows per month and per (month, topic) for each prediction rank.
# Grouped counts replace a Python-level iterrows() loop over every row; the
# totals added to the dicts are the same, and they are still accumulated with
# += so the behavior on an already-populated dict is unchanged.
# NOTE: groupby/value_counts skip missing keys, so a row with a NaN month or
# topic is ignored here rather than raising KeyError.
for month, n_rows in df_filtered['month'].value_counts().items():
    count_month[month] += int(n_rows)

for counts, column in (
    (top1_counts, 'top1_topic'),
    (top2_counts, 'top2_topic'),
    (top3_counts, 'top3_topic'),
):
    pair_sizes = df_filtered.groupby(['month', column]).size()
    for (month, topic), n_rows in pair_sizes.items():
        # int() keeps plain Python ints in the dicts, as the row loop did.
        # An unknown month or topic still raises KeyError.
        counts[month][topic] += int(n_rows)

In [49]:
# Build one record per (month, topic): the month abbreviation, the topic, and
# the topic's share of that month's rows at rank 1, 2 and 3.
rows = []
for month in range(1, 13):
    month_total = count_month[month]
    for topic in topics:
        # A month with no rows has no defined share; emit NaN instead of
        # raising ZeroDivisionError.
        shares = [
            counts[month][topic] / month_total if month_total else float('nan')
            for counts in (top1_counts, top2_counts, top3_counts)
        ]
        rows.append([months_map[month], topic, *shares])

In [51]:
# One row per (month, topic); top1/top2/top3 hold the share values computed for that pair.
res = pd.DataFrame(rows, columns=['month', 'topic', 'top1', 'top2', 'top3'])

In [52]:
# Show the share table as the cell output.
res

Unnamed: 0,month,topic,top1,top2,top3
0,jan,"Agriculture, animals, food and rural affairs",0.022827,0.015314,0.035419
1,jan,"Asylum, immigration and nationality",0.000383,0.007111,0.023402
2,jan,"Business, industry and consumers",0.070588,0.094680,0.134525
3,jan,Communities and families,0.001936,0.008490,0.015678
4,jan,"Crime, civil law, justice and rights",0.095772,0.037987,0.045960
...,...,...,...,...,...
247,dec,Science and technology,0.000000,0.000793,0.009491
248,dec,Social security and pensions,0.000000,0.000480,0.000855
249,dec,Social services,0.000000,0.000000,0.000021
250,dec,Transport,0.073345,0.024219,0.036380


In [53]:
# Write the share table to the working directory.
# NOTE(review): to_csv also writes the row index as a leading unnamed column by
# default; pass index=False if downstream readers should not see it - confirm.
res.to_csv('news_topic_month_shares_2016_no_Others.csv')