In [317]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.utils import class_weight
import multiprocessing as mp
import calendar
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import parmap
import ast

In [318]:
topics_index_to_name_map = {
    0: 'Agriculture, animals, food and rural affairs',
    1: 'Asylum, immigration and nationality',
    2: 'Business, industry and consumers',
    3: 'Communities and families',
    4: 'Crime, civil law, justice and rights',
    5: 'Culture, media and sport',
    6: 'Defence',
    7: 'Economy and finance',
    8: 'Education',
    9: 'Employment and training',
    10: 'Energy and environment',
    11: 'European Union',
    12: 'Health services and medicine',
    13: 'Housing and planning',
    14: 'International affairs',
    15: 'Parliament, government and politics',
    16: 'Science and technology',
    17: 'Social security and pensions',
    18: 'Social services',
    19: 'Transport',
    20: 'Others'
}
topics_name_to_index_map = {y:x for x,y in topics_index_to_name_map.items()}

def strip_short2(text):
    return strip_short(text, minsize=4)


def preprocess_text(text):
    FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces, strip_tags, strip_punctuation,
                   strip_non_alphanum, strip_numeric, strip_short2]
    return preprocess_string(text, FILTERS)

def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        t = topics[0]
        t = t.strip()
        return topics_name_to_index_map[t]
        

In [319]:
# df = pd.read_csv('./bbc_predictions_News_at_One_Six_Ten.csv')
# df = df.drop(['Unnamed: 0'], axis=1)
# df['topic'] = df['topic'].apply(lambda x: ast.literal_eval(x)[0][0])

In [320]:
years = list(range(2014, 2019))
df_list = []
for year in years:
    df_list.append(pd.read_csv('../data/partition_predictions/window_topic_prediction_with_short_sentences_merged_54_{}.csv'.format(year)))
df = pd.concat(df_list)
df = df.loc[df.source.isin(['BBC News at One', 'BBC News at Six', 'BBC News at Ten'])]
df = df.drop(['Unnamed: 0'], axis=1)
df['length'] = df['transcript'].apply(lambda x: len(x.split()))
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].apply(lambda x: x.year)
df['month'] = df['date'].apply(lambda x: x.month)
df['day'] = df['date'].apply(lambda x: x.day)
df

Unnamed: 0,partition_id,date,source,transcript,type,topic,length,year,month,day
133,442,2014-03-07,BBC News at Six,The head ofthe The head of the Metropolitan P...,p,"Parliament, government and politics",92,2014,3,7
134,442,2014-03-07,BBC News at Six,Vladimir Putin opens the Winter Paralympics i...,p,"Culture, media and sport",15,2014,3,7
135,442,2014-03-07,BBC News at Six,Network Rail apologises unreservedly for what...,p,"Business, industry and consumers",20,2014,3,7
136,442,2014-03-07,BBC News at Six,"The cost of a pint of milk going down, as the...",p,"Parliament, government and politics",20,2014,3,7
137,442,2014-03-07,BBC News at Six,And trenches where the First World War’s sold...,p,"Parliament, government and politics",18,2014,3,7
...,...,...,...,...,...,...,...,...,...,...
100257,1581,2018-10-31,BBC News at Ten,Racing’s allure stems in part from studying t...,p,"Culture, media and sport",30,2018,10,31
100258,1581,2018-10-31,BBC News at Ten,"Tonight, are jokes about vegans off limits, a...",p,"Agriculture, animals, food and rural affairs",19,2018,10,31
100259,1581,2018-10-31,BBC News at Ten,"We discuss what’s fair game, so to speak, and...",p,"Parliament, government and politics",48,2018,10,31
100260,1581,2018-10-31,BBC News at Ten,Now a pharmaceutical company is facing crimin...,p,Health services and medicine,370,2018,10,31


In [321]:
topics = list(df['topic'].unique())
topics

['Parliament, government and politics',
 'Culture, media and sport',
 'Business, industry and consumers',
 'Transport',
 'Crime, civil law, justice and rights',
 'International affairs',
 'Education',
 'Defence',
 'Energy and environment',
 'Communities and families',
 'Economy and finance',
 'Asylum, immigration and nationality',
 'Health services and medicine',
 'Agriculture, animals, food and rural affairs',
 'European Union',
 'Employment and training',
 'Housing and planning',
 'Science and technology',
 'Social services',
 'Social security and pensions']

In [322]:
years = [2014, 2015, 2016, 2017, 2018]
rows = []
rows_std = []
rows_min = []
rows_max = []

for year in years:
        print(year)
        df_year = df.loc[df.year == year].reset_index(drop=True)
        for month in range(1, 13):
            df_month = df_year.loc[df_year.month == month].reset_index(drop=True)
            
            for day in range(1, calendar.monthrange(year, month)[1] + 1):
                df_day = df_month.loc[df_month.day == day].reset_index(drop=True)
                partition_id = None
                pos = 0
                topic_ranks_for_day = {topic:[] for topic in topics}
                last_topic = None
                for index, row in df_day.iterrows():
                    if partition_id is None:
                        partition_id = row['partition_id']
                        pos = 1
                        last_topic = row['topic']
                    elif partition_id == row['partition_id']:
                        if last_topic == row['topic']:
                            continue
                        last_topic = row['topic']
                        pos += 1
                    else:
                        pos = 1
                        partition_id = row['partition_id']
                        last_topic = row['topic']
                    topic_ranks_for_day[row['topic']].append(pos)
                row = [year, month, day]
                row_std = [year, month, day]
                row_min = [year, month, day]
                row_max = [year, month, day]

                row += [np.nanmean(topic_ranks_for_day[topic]) for topic in topics]
                row_std += [np.nanstd(topic_ranks_for_day[topic]) for topic in topics]
                row_min += [min(topic_ranks_for_day[topic], default=0) for topic in topics]
                row_max += [max(topic_ranks_for_day[topic], default=0) for topic in topics]

                rows_min.append(row_min)
                rows_max.append(row_max)
                rows_std.append(row_std)
                rows.append(row)

2014


  keepdims=keepdims)


2015
2016
2017
2018


In [323]:
# topics = list(topics_name_to_index_map.keys())
header = ['year', 'month', 'day']
header += list(topics)
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

In [324]:
res = pd.DataFrame(rows, columns=header)
res

Unnamed: 0,year,month,day,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,...,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
0,2014,1,1,,,,,,,,...,,,,,,,,,,
1,2014,1,2,16.882353,23.5,18.000000,21.3,11.5,17.75,,...,,23.0,13.666667,,,,,,,
2,2014,1,3,15.222222,25.5,12.666667,29.0,,,23.5,...,13.0,20.0,,6.75,,19.0,,,,
3,2014,1,4,,,,,,,,...,,,,,,,,,,
4,2014,1,5,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,2018,12,27,3.333333,,11.000000,9.0,,,,...,,,3.000000,,8.0,,,,,
1822,2018,12,28,8.666667,,9.000000,15.0,18.0,6.00,,...,,3.5,20.000000,17.00,11.5,,,,11.0,
1823,2018,12,29,,,,,,,,...,,,,,,,,,,
1824,2018,12,30,,,,,,,,...,,,,,,,,,,


In [325]:
res_std_df = pd.DataFrame(rows_std, columns=header)
res_std_df

Unnamed: 0,year,month,day,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,...,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
0,2014,1,1,,,,,,,,...,,,,,,,,,,
1,2014,1,2,10.587908,0.5,6.164414,10.344564,4.112988,5.494315,,...,,0.0,4.189935,,,,,,,
2,2014,1,3,9.052658,1.5,7.408704,1.000000,,,1.5,...,0.0,0.0,,3.76663,,0.0,,,,
3,2014,1,4,,,,,,,,...,,,,,,,,,,
4,2014,1,5,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,2018,12,27,2.054805,,1.000000,2.000000,,,,...,,,1.000000,,0.0,,,,,
1822,2018,12,28,5.405758,,0.000000,8.000000,4.546061,0.000000,,...,,1.5,0.000000,5.00000,7.5,,,,0.0,
1823,2018,12,29,,,,,,,,...,,,,,,,,,,
1824,2018,12,30,,,,,,,,...,,,,,,,,,,


In [326]:
res_min_df = pd.DataFrame(rows_min, columns=header)
res_min_df

Unnamed: 0,year,month,day,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,...,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
0,2014,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2014,1,2,2,23,11,3,3,9,0,...,0,23,8,0,0,0,0,0,0,0
2,2014,1,3,2,24,3,28,0,0,22,...,13,20,0,1,0,19,0,0,0,0
3,2014,1,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2014,1,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,2018,12,27,1,0,10,7,0,0,0,...,0,0,2,0,8,0,0,0,0,0
1822,2018,12,28,1,0,9,7,13,6,0,...,0,2,20,12,4,0,0,0,11,0
1823,2018,12,29,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1824,2018,12,30,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [327]:
res_max_df = pd.DataFrame(rows_max, columns=header)
res_max_df

Unnamed: 0,year,month,day,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,...,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
0,2014,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2014,1,2,36,24,26,34,16,24,0,...,0,23,18,0,0,0,0,0,0,0
2,2014,1,3,29,27,21,30,0,0,25,...,13,20,0,11,0,19,0,0,0,0
3,2014,1,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2014,1,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1821,2018,12,27,6,0,12,11,0,0,0,...,0,0,4,0,8,0,0,0,0,0
1822,2018,12,28,16,0,9,23,24,6,0,...,0,5,20,22,19,0,0,0,11,0
1823,2018,12,29,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1824,2018,12,30,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [328]:
res.to_csv('topic_ranks_news_at_1_6_10_by_month_day.csv')
res_std_df.to_csv('topic_ranks_std_news_at_1_6_10_by_month_day.csv')
res_min_df.to_csv('topic_ranks_min_news_at_1_6_10_by_month_day.csv')
res_max_df.to_csv('topic_ranks_max_news_at_1_6_10_by_month_day.csv')

In [329]:
res = res.drop(['day'], axis=1)
final_res = res.groupby(['year', 'month'], as_index=False).mean()
final_res

Unnamed: 0,year,month,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,Defence,...,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
0,2014,1,13.849128,19.153505,15.013333,17.775556,13.398183,13.191378,17.0,17.122807,...,9.322685,16.0,16.990278,14.767857,16.333333,14.5,19.0,,,
1,2014,2,14.788249,17.459792,14.761786,18.774831,14.309863,13.19537,18.287037,15.964103,...,17.485417,4.5,20.99,15.367857,16.407407,14.1,14.0,,,
2,2014,3,14.892898,17.294434,15.25822,13.698485,16.1165,12.076417,19.087143,14.395739,...,13.94582,20.666667,14.838542,20.6875,7.1,15.488095,31.0,,,
3,2014,4,12.359295,15.180123,13.096491,15.825728,12.860903,11.548485,13.516667,14.435714,...,14.971429,25.5,14.705357,14.866667,13.611111,11.25,,,,
4,2014,5,14.68165,18.986293,12.578563,20.339706,14.334553,13.040985,15.505556,14.854976,...,14.934314,17.333333,19.468421,21.590909,14.785185,20.375,24.5,,,
5,2014,6,13.616138,17.462524,14.442143,20.012976,15.059636,11.335676,14.953505,15.138778,...,11.908254,14.0,14.184259,15.151515,12.62,13.485714,,,,
6,2014,7,14.81137,17.301779,15.2956,17.695296,15.331571,9.841139,16.552941,13.338095,...,13.382353,,17.162755,20.681818,18.0,14.666667,,,,
7,2014,8,12.846431,14.883658,13.822361,17.452941,13.372044,9.842328,13.837946,13.510196,...,12.611111,8.0,12.292361,15.55,21.875,18.5,,,,
8,2014,9,11.86778,13.198531,13.109051,14.907857,12.343501,11.092108,15.022727,11.541176,...,13.407158,17.777778,12.545238,14.163636,18.0,10.333333,,,,
9,2014,10,13.91532,16.349892,15.710582,18.972933,13.258244,12.872293,17.054444,16.454678,...,14.796296,14.0,14.283223,21.930556,15.928571,15.316667,,,13.0,


In [330]:
res_std_df = res_std_df.drop(['day'], axis=1)
res_std_df = res_std_df.groupby(['year', 'month'], as_index=False).mean()
res_std_df

Unnamed: 0,year,month,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,Defence,...,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
0,2014,1,8.230017,5.590045,5.795167,4.876354,5.375219,5.018023,1.85736,2.499154,...,2.431358,0.0,3.673558,2.154822,0.0,0.0,0.0,,,
1,2014,2,8.917217,6.404066,5.141061,7.927078,6.708357,5.263432,3.016205,3.130924,...,3.031141,1.5,2.579852,2.611471,1.623727,0.7,0.0,,,
2,2014,3,8.789966,6.753986,6.095727,7.061927,6.331568,4.987746,2.841871,4.133993,...,3.417754,0.333333,2.901786,2.0625,1.9,1.127085,0.0,,,
3,2014,4,7.314587,5.692723,4.3769,5.092765,5.535679,4.871147,3.931906,3.668067,...,1.983788,2.160247,2.34376,1.77998,1.624819,1.75,,,,
4,2014,5,8.705774,8.800569,6.570228,3.485289,7.946091,6.288266,3.1175,4.734575,...,4.280994,0.333333,3.567795,2.917207,1.996582,0.625,0.0,,,
5,2014,6,8.382115,7.389446,6.276261,2.99115,7.26964,6.397866,3.700957,4.503766,...,3.076617,1.471405,3.354051,1.41976,1.604099,0.649018,,,,
6,2014,7,8.70539,7.804603,6.149931,5.774493,6.921993,6.651559,5.187908,3.910825,...,1.517524,,4.576063,1.938948,1.0,0.0,,,,
7,2014,8,7.563102,6.223117,5.908615,3.741635,5.690758,5.392611,4.807983,4.131275,...,1.45095,0.0,4.394244,1.544392,4.189338,0.0,,,,
8,2014,9,7.23407,6.115502,5.372905,4.165682,4.823374,4.712212,3.065193,2.75376,...,4.001993,1.930905,2.661296,2.556189,1.982821,0.833333,,,,
9,2014,10,8.271383,7.787491,6.536992,4.377015,7.047882,6.161776,2.882857,4.769379,...,2.971623,0.0,6.873486,2.347556,3.917393,2.446987,,,0.0,


In [331]:
res_min_df = res_min_df.drop(['day'], axis=1)
res_min_df = res_min_df.groupby(['year', 'month'], as_index=False).mean()
res_min_df

Unnamed: 0,year,month,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,Defence,...,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
0,2014,1,1.225806,6.580645,5.483871,5.354839,4.129032,3.935484,5.774194,8.806452,...,3.741935,2.064516,7.709677,5.483871,1.580645,1.870968,0.612903,0.0,0.0,0.0
1,2014,2,0.928571,4.571429,6.107143,4.928571,4.178571,3.714286,4.785714,5.714286,...,7.678571,0.107143,9.678571,6.285714,1.5,2.392857,0.5,0.0,0.0,0.0
2,2014,3,1.096774,5.129032,5.354839,3.290323,5.064516,3.774194,4.935484,5.709677,...,4.935484,1.967742,5.709677,4.806452,0.83871,3.16129,1.0,0.0,0.0,0.0
3,2014,4,1.2,5.6,4.866667,5.5,4.466667,3.966667,4.433333,4.733333,...,5.9,1.566667,5.666667,3.733333,2.366667,1.266667,0.0,0.0,0.0,0.0
4,2014,5,0.870968,2.935484,2.83871,8.354839,2.096774,2.548387,6.806452,5.451613,...,5.419355,1.645161,8.83871,6.580645,6.064516,2.548387,1.580645,0.0,0.0,0.0
5,2014,6,0.8,3.466667,4.666667,10.933333,3.4,2.033333,5.9,4.833333,...,4.166667,1.233333,6.0,4.933333,1.666667,2.933333,0.0,0.0,0.0,0.0
6,2014,7,0.967742,3.0,5.129032,7.451613,4.967742,1.258065,5.322581,5.0,...,6.419355,0.0,7.806452,6.612903,1.096774,2.83871,0.0,0.0,0.0,0.0
7,2014,8,0.903226,3.516129,4.193548,6.935484,3.419355,1.419355,3.83871,5.0,...,3.258065,0.258065,4.516129,4.483871,1.0,1.193548,0.0,0.0,0.0,0.0
8,2014,9,0.933333,3.4,3.566667,6.333333,4.133333,3.4,4.133333,4.966667,...,3.466667,1.566667,5.933333,4.1,3.666667,1.9,0.0,0.0,0.0,0.0
9,2014,10,0.935484,3.322581,5.16129,8.0,3.0,2.935484,6.516129,6.419355,...,6.806452,0.903226,3.129032,7.451613,2.806452,1.903226,0.0,0.0,0.83871,0.0


In [332]:
res_max_df = res_max_df.drop(['day'], axis=1)
res_max_df = res_max_df.groupby(['year', 'month'], as_index=False).mean()
res_max_df

Unnamed: 0,year,month,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,Defence,...,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
0,2014,1,19.967742,18.419355,14.967742,11.096774,14.935484,13.741935,7.419355,12.096774,...,7.483871,2.064516,13.548387,7.83871,1.580645,1.870968,0.612903,0.0,0.0,0.0
1,2014,2,22.785714,18.714286,15.678571,17.857143,16.5,13.142857,7.035714,9.0,...,11.857143,0.214286,13.071429,9.428571,2.0,2.642857,0.5,0.0,0.0,0.0
2,2014,3,20.645161,17.483871,15.645161,14.483871,17.225806,13.709677,7.16129,12.032258,...,9.903226,2.032258,9.258065,5.870968,1.451613,3.806452,1.0,0.0,0.0,0.0
3,2014,4,18.566667,17.2,11.6,13.533333,15.4,13.433333,8.866667,8.8,...,8.1,1.9,8.433333,5.033333,3.133333,1.733333,0.0,0.0,0.0,0.0
4,2014,5,22.225806,20.645161,15.774194,13.354839,18.580645,16.419355,10.903226,13.387097,...,11.064516,1.709677,14.741935,8.741935,8.516129,2.709677,1.580645,0.0,0.0,0.0
5,2014,6,20.533333,19.9,15.466667,15.733333,17.6,16.433333,11.8,10.666667,...,7.833333,1.533333,11.1,6.133333,2.433333,3.366667,0.0,0.0,0.0,0.0
6,2014,7,23.483871,21.870968,17.709677,18.548387,19.258065,17.870968,12.580645,11.83871,...,8.451613,0.0,15.806452,8.064516,1.225806,2.83871,0.0,0.0,0.0,0.0
7,2014,8,18.096774,15.612903,14.064516,12.258065,13.935484,13.645161,9.903226,11.774194,...,4.290323,0.258065,12.677419,5.516129,1.645161,1.193548,0.0,0.0,0.0,0.0
8,2014,9,18.966667,15.766667,12.5,13.266667,13.666667,14.2,6.666667,8.666667,...,7.966667,2.033333,10.866667,6.166667,4.8,2.233333,0.0,0.0,0.0,0.0
9,2014,10,20.387097,18.677419,16.516129,15.870968,17.258065,15.677419,9.806452,13.967742,...,10.870968,0.903226,17.806452,9.387097,4.677419,2.935484,0.0,0.0,0.83871,0.0


In [333]:
final_res.to_csv('topic_ranks_news_at_1_6_10_by_year_month.csv')
res_std_df.to_csv('topic_ranks_std_news_at_1_6_10_by_year_month.csv')
res_min_df.to_csv('topic_ranks_min_news_at_1_6_10_by_year_month.csv')
res_max_df.to_csv('topic_ranks_max_news_at_1_6_10_by_year_month.csv')

# Topic Ranks for all

In [8]:
years = [2015, 2016, 2017]
bbc_ids = [54, 106, 107, 175, 279]
exclude = 'no_Others'

In [14]:
rows = []
for year in years:
    for bbc_id in bbc_ids:
        print(year, bbc_id)
        df = pd.read_csv('../data/partition_predictions/topics_pred_on_bert_partitioned_bbc_{}_{}_with_news_classifier_{}.csv'.format(bbc_id, year, exclude))
        df = df.drop(['Unnamed: 0'], axis=1)
        df['topic'] = df['topic'].apply(lambda x: ast.literal_eval(x))
        df['topic'] = df['topic'].apply(lambda x: x[0][0])
        df['length'] = df['transcript'].apply(lambda x: len(x.split()))
        df
        for month in range(1, 13):
            df_month = df.loc[df.date.str.contains(months[month-1])]

            partition_id = None
            pos = 0
            topic_ranks_for_month = {topic:[] for topic in topics}
            last_topic = None
            for index, row in df_month.iterrows():
                if partition_id is None:
                    partition_id = row['partition_id']
                    pos = 1
                    last_topic = row['topic']
                elif partition_id == row['partition_id']:
                    if last_topic == row['topic']:
                        continue
                    last_topic = row['topic']
                    pos += 1
                else:
                    pos = 1
                    partition_id = row['partition_id']
                    last_topic = row['topic']
                topic_ranks_for_month[row['topic']].append(pos)
            row = [year, month, bbc_id]
            row += [np.nanmean(topic_ranks_for_month[topic]) for topic in topics]
            rows.append(row)

2015 54




2015 106
2015 107
2015 175
2015 279
2016 54
2016 106
2016 107
2016 175
2016 279
2017 54
2017 106
2017 107
2017 175
2017 279


In [15]:
res = pd.DataFrame(rows, columns=header)

In [16]:
res

Unnamed: 0,year,month,bbc_id,"Agriculture, animals, food and rural affairs","Asylum, immigration and nationality","Business, industry and consumers",Communities and families,"Crime, civil law, justice and rights","Culture, media and sport",Defence,...,European Union,Health services and medicine,Housing and planning,International affairs,"Parliament, government and politics",Science and technology,Social security and pensions,Social services,Transport,Others
0,2015,1,54,15.210526,,21.150000,3.000000,13.023077,15.417062,12.000000,...,2.000000,12.547619,,16.170347,14.199005,,,,15.297521,
1,2015,2,54,61.107143,,41.321168,,25.737255,35.067024,19.888889,...,60.333333,30.661017,,38.699758,33.057265,,,,34.092652,
2,2015,3,54,54.485714,,51.276119,39.333333,27.631868,36.436433,49.452830,...,109.333333,34.557047,,50.253378,37.722001,,,,42.400000,
3,2015,4,54,53.900000,,33.553846,6.000000,22.686869,31.476501,28.411765,...,42.866667,23.136986,,39.393443,29.233687,,,,30.746479,
4,2015,5,54,32.131579,,34.819672,16.500000,17.326633,24.128125,44.000000,...,26.538462,15.655556,,36.459807,25.860558,,,,25.664093,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,2017,8,279,34.154762,52.0,27.028571,26.142857,30.586895,32.776931,32.080808,...,26.487500,35.391566,37.800000,29.200000,32.434870,,,,37.000000,
176,2017,9,279,35.967213,44.0,29.528302,25.695652,27.386018,29.629744,21.134328,...,25.806763,31.413333,16.000000,26.235849,30.118151,,,,27.022472,
177,2017,10,279,23.945946,7.0,25.676471,40.142857,30.840532,29.225138,36.588235,...,24.693333,35.266667,38.000000,26.538462,27.959596,,,,29.714286,
178,2017,11,279,46.411111,76.0,50.549020,50.466667,41.973333,45.995757,40.569620,...,44.082840,47.000000,69.750000,40.942500,45.899787,,,,53.112676,


In [17]:
res.to_csv('partition_topic_ranks_by_month_{}_{}.csv'.format(years[0], years[-1]))

In [19]:
header = ['year', 'bbc_id']
header += topics

In [21]:
rows = []
for year in years:
    for bbc_id in bbc_ids:
        print(year, bbc_id)
        df = pd.read_csv('../data/partition_predictions/topics_pred_on_bert_partitioned_bbc_{}_{}_with_news_classifier_{}.csv'.format(bbc_id, year, exclude))
        df = df.drop(['Unnamed: 0'], axis=1)
        df['topic'] = df['topic'].apply(lambda x: ast.literal_eval(x))
        df['topic'] = df['topic'].apply(lambda x: x[0][0])
        df['length'] = df['transcript'].apply(lambda x: len(x.split()))
        df['date'] = pd.to_datetime(df['date'])
        mask = (df['date'] >= '2015-5-1') & (df['date'] <= '2017-5-31')
        df = df[mask]

        partition_id = None
        pos = 0
        topic_ranks_for_month = {topic:[] for topic in topics}
        last_topic = None
        for index, row in df.iterrows():
            if partition_id is None:
                partition_id = row['partition_id']
                pos = 1
                last_topic = row['topic']
            elif partition_id == row['partition_id']:
                if last_topic == row['topic']:
                    continue
                last_topic = row['topic']
                pos += 1
            else:
                pos = 1
                partition_id = row['partition_id']
                last_topic = row['topic']
            topic_ranks_for_month[row['topic']].append(pos)
        row = [year, bbc_id]
        row += [np.nanmean(topic_ranks_for_month[topic]) for topic in topics]
        rows.append(row)

2015 54




2015 106
2015 107
2015 175
2015 279
2016 54
2016 106
2016 107
2016 175
2016 279
2017 54
2017 106
2017 107
2017 175
2017 279


In [22]:
res = pd.DataFrame(rows, columns=header)
res

Unnamed: 0,year,bbc_id,"Agriculture, animals, food and rural affairs","Asylum, immigration and nationality","Business, industry and consumers",Communities and families,"Crime, civil law, justice and rights","Culture, media and sport",Defence,Economy and finance,...,European Union,Health services and medicine,Housing and planning,International affairs,"Parliament, government and politics",Science and technology,Social security and pensions,Social services,Transport,Others
0,2015,54,34.473118,,27.649502,13.090909,19.071279,26.667328,21.516667,18.416149,...,28.337079,23.231563,,31.977753,26.135474,,,,29.415996,
1,2015,106,21.414286,,17.652672,,15.495708,18.430566,12.540984,15.633333,...,13.56,18.122905,,12.693368,15.561523,,,,17.938947,
2,2015,107,14.962025,,11.987879,,9.688017,12.757348,9.27027,7.809524,...,6.8,10.339853,,7.302554,11.189713,,,,12.495591,
3,2015,175,16.616279,,12.484412,17.6,13.350638,14.529379,11.156716,8.975,...,8.532258,12.335025,,9.890323,12.810626,,,,15.282609,
4,2015,279,9.876923,,8.849765,24.5,8.135965,9.83227,7.144144,6.319149,...,5.918919,9.265403,,7.639463,8.738192,,,,9.560855,
5,2016,54,69.144056,,70.162986,23.571429,43.500178,56.759548,54.504065,61.798621,...,50.840888,48.258268,,66.522819,56.756708,,,,56.498967,
6,2016,106,16.80303,,12.876364,20.0,13.524416,13.796976,11.0,12.592233,...,12.6917,15.137931,,10.586124,12.682808,,,,14.744186,
7,2016,107,12.246753,,9.140541,11.0,8.902711,10.587509,7.825,7.727273,...,8.12069,9.59944,,7.732143,10.010212,,,,11.673016,
8,2016,175,13.719101,,9.731844,7.6,9.382716,10.199633,8.915385,9.034091,...,8.029674,10.016845,,8.718182,9.720202,,,,10.416603,
9,2016,279,29.580357,,20.009804,16.5,18.682256,19.710173,17.084906,16.92053,...,19.156733,29.456311,,19.225148,18.528853,,,,26.686147,


In [23]:
res = res.drop(['year'], axis=1)
final_res = res.groupby(['bbc_id'], as_index=False).mean()
final_res

Unnamed: 0,bbc_id,"Agriculture, animals, food and rural affairs","Asylum, immigration and nationality","Business, industry and consumers",Communities and families,"Crime, civil law, justice and rights","Culture, media and sport",Defence,Economy and finance,Education,...,European Union,Health services and medicine,Housing and planning,International affairs,"Parliament, government and politics",Science and technology,Social security and pensions,Social services,Transport,Others
0,54,56.991723,86.95,59.572666,25.057143,40.762557,49.175048,47.325292,49.826362,39.850216,...,46.456273,41.967848,19.56,56.406343,48.628796,118.5,,,46.345935,
1,106,18.127994,16.333333,14.837164,18.375,13.406445,14.732616,12.368207,13.692786,15.008818,...,12.045888,15.537316,21.307692,11.485826,13.003871,,,,14.057341,
2,107,13.054887,4.0,10.198362,11.586207,8.678271,10.723378,8.495721,7.778031,10.416599,...,7.163667,9.592619,3.333333,7.116671,9.783149,,,,11.543044,
3,175,13.756721,,10.788048,11.482305,10.7127,11.507565,9.906573,9.257983,10.981377,...,8.240492,10.856898,9.75,9.117745,10.880042,,,,11.826246,
4,279,25.450323,9.2,18.660342,23.097778,17.970432,18.931832,16.660655,15.741942,22.292194,...,16.92548,22.389135,23.428571,17.543412,17.920092,,,,24.396827,


In [24]:
final_res.to_csv('partition_topic_ranks_by_2015_may_2017_may.csv')

# Method2: Word ranks weighing

In [266]:
# topics = list(topics_name_to_index_map.keys())
year = 2018
bbc_id = 54
header = ['day', 'month']
header += topics
rows = []
rows_std = []
rows_min = []
rows_max = []
lrows = []

In [267]:
df = pd.read_csv('../data/partition_predictions/window_topic_prediction_with_short_sentences_merged_54_{}.csv'.format(year))
df = df.drop(['Unnamed: 0'], axis=1)
df['length'] = df['transcript'].apply(lambda x: len(x.split()))
df['date'] = pd.to_datetime(df['date'])
df['date'] = df['date'].apply(lambda x: '{}-{}-{}'.format(x.day, calendar.month_abbr[x.month].lower(), x.year))

In [268]:
df

Unnamed: 0,partition_id,date,source,transcript,type,topic,length
0,0,2-jun-2018,BBC News,"but also gathering storm clouds, some further...",p,Energy and environment,607
1,0,2-jun-2018,BBC News,"Jotheremy Thorpe, I’m arresting you for consp...",p,"Parliament, government and politics",142
2,0,2-jun-2018,BBC News,Canada hasjoined the European Union in filing...,p,European Union,94
3,0,2-jun-2018,BBC News,After the closure of its main steel plant 20 ...,p,"Business, industry and consumers",229
4,0,2-jun-2018,BBC News,This is further weakening the transatlantic r...,p,Transport,85
...,...,...,...,...,...,...,...
100273,1582,31-oct-2018,BBC London News,John is one of a group of Hackney families ta...,p,Education,278
100274,1582,31-oct-2018,BBC London News,The Duchess of Cambridge has paid a visit to ...,p,Defence,66
100275,1582,31-oct-2018,BBC London News,"Well, I’m off with the team — to do some late...",p,"Parliament, government and politics",1507
100276,1582,31-oct-2018,BBC London News,"CHEERING AND APPLAUSE Hello, and welcome to A...",p,"Parliament, government and politics",48


In [269]:
def score(start_pos, end_pos):
#     print(start_pos, end_pos)
    x = np.sum(range(start_pos, end_pos))/(end_pos - start_pos + 1)
    return x

In [270]:
for month in range(1, 13):
    for day in range(1, calendar.monthrange(year, month)[1]): # calendar.monthrange(year, month)[1]
        date_string = '{}-{}-{}'.format(day, calendar.month_abbr[month].lower(), year)
        print(date_string)
        df_day = df.loc[df.date == date_string].reset_index(drop=True)
        partition_ids = df_day.partition_id.unique()
        topic_ranks_for_day = {topic:[] for topic in topics}
        lengths_by_topic = {topic: [] for topic in topics}
        
        for id_ in partition_ids:
            df_partition = df_day.loc[df_day.partition_id == id_].reset_index(drop=True)
#             display(df_partition)
            length = df_partition['length'].sum()
            last_topic = None
            start_pos = 1
            end_pos = 1
            total_length_now = 0
            
            for index, row in df_partition.iterrows():
                
                if total_length_now >= 500:
                    break
                    
                if last_topic is None:
                    start_pos = 1
                    end_pos = start_pos + row['length'] - 1
                    last_topic = row['topic']
                elif last_topic == row['topic']:
                    end_pos = end_pos + row['length']
                    if index == len(df_partition) - 1:
                        topic_ranks_for_day[last_topic].append(score(start_pos, end_pos)) 
                        lengths_by_topic[last_topic].append(end_pos - start_pos + 1)
                else:
                    topic_ranks_for_day[last_topic].append(score(start_pos, end_pos))
                    lengths_by_topic[last_topic].append(end_pos - start_pos + 1)
                    start_pos = end_pos + 1
                    end_pos = start_pos + row['length'] - 1
                    last_topic = row['topic']
                total_length_now += row['length']
#                 print('start: ', start_pos)
#                 print('end: ', end_pos)
#         for topic in topics:
#             print(topic, topic_ranks_for_day[topic])
        row = [day, month]
        row_std = [day, month]
        lrow = [day, month]
        row_min = [day, month]
        row_max = [day, month]
        lrow += [np.mean(lengths_by_topic[topic]) for topic in topics]
        row += [np.mean(topic_ranks_for_day[topic]) for topic in topics]
        row_std += [np.nanstd(topic_ranks_for_day[topic]) for topic in topics]
        row_min += [min(topic_ranks_for_day[topic], default=0) for topic in topics]
        row_max += [max(topic_ranks_for_day[topic], default=0) for topic in topics]
        rows.append(row)
        rows_std.append(row_std)
        rows_min.append(row_min)
        rows_max.append(row_max)
        lrows.append(lrow)

1-jan-2018
2-jan-2018
3-jan-2018
4-jan-2018
5-jan-2018
6-jan-2018


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)


7-jan-2018
8-jan-2018
9-jan-2018
10-jan-2018
11-jan-2018
12-jan-2018
13-jan-2018
14-jan-2018
15-jan-2018
16-jan-2018
17-jan-2018
18-jan-2018
19-jan-2018
20-jan-2018
21-jan-2018
22-jan-2018
23-jan-2018
24-jan-2018
25-jan-2018
26-jan-2018
27-jan-2018
28-jan-2018
29-jan-2018
30-jan-2018
1-feb-2018
2-feb-2018
3-feb-2018
4-feb-2018
5-feb-2018
6-feb-2018
7-feb-2018
8-feb-2018
9-feb-2018
10-feb-2018
11-feb-2018
12-feb-2018
13-feb-2018
14-feb-2018
15-feb-2018
16-feb-2018
17-feb-2018
18-feb-2018
19-feb-2018
20-feb-2018
21-feb-2018
22-feb-2018
23-feb-2018
24-feb-2018
25-feb-2018
26-feb-2018
27-feb-2018
1-mar-2018
2-mar-2018
3-mar-2018
4-mar-2018
5-mar-2018
6-mar-2018
7-mar-2018
8-mar-2018
9-mar-2018
10-mar-2018
11-mar-2018
12-mar-2018
13-mar-2018
14-mar-2018
15-mar-2018
16-mar-2018
17-mar-2018
18-mar-2018
19-mar-2018
20-mar-2018
21-mar-2018
22-mar-2018
23-mar-2018
24-mar-2018
25-mar-2018
26-mar-2018
27-mar-2018
28-mar-2018
29-mar-2018
30-mar-2018
1-apr-2018
2-apr-2018
3-apr-2018
4-apr-2018
5-apr

In [271]:
res = pd.DataFrame(rows, columns=header)
res = res.fillna(0)
res.to_csv('partition_topic_ranks_by_day_month_words_weight_{}_{}.csv'.format(bbc_id, year))
res

Unnamed: 0,day,month,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,Defence,...,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
0,1,1,150.602309,64.246667,0.000000,260.048843,0.000000,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,186.000000,0.0,0.0,0.0,0.000000,0.0
1,2,1,199.734452,183.997494,0.000000,107.261734,0.000000,188.400627,0.0,0.000000,...,0.0,0.000000,139.827688,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
2,3,1,222.881454,114.269231,0.000000,0.000000,0.000000,305.708333,0.0,0.000000,...,0.0,0.000000,119.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
3,4,1,115.250390,181.731336,0.000000,153.205686,0.000000,121.875000,0.0,0.000000,...,0.0,0.000000,42.250000,11.500000,0.000000,0.0,0.0,0.0,0.000000,0.0
4,5,1,168.558306,150.599185,146.106643,0.000000,135.065763,197.029412,0.0,0.000000,...,0.0,0.000000,176.659574,399.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,26,12,137.760314,21.448718,52.847947,90.066667,13.500000,0.000000,0.0,130.921875,...,0.0,0.000000,60.000000,120.240509,0.000000,0.0,0.0,0.0,74.848485,0.0
349,27,12,108.913896,0.000000,0.000000,136.429688,286.631420,282.708333,0.0,339.375000,...,0.0,0.000000,51.469388,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0
350,28,12,129.411883,267.422222,100.852800,53.215517,125.416667,114.352941,0.0,0.000000,...,0.0,129.455623,154.590426,113.790698,141.086957,0.0,0.0,0.0,0.000000,0.0
351,29,12,129.304691,170.354175,178.132275,220.606061,0.000000,0.000000,0.0,0.000000,...,0.0,189.146249,0.000000,95.555556,0.000000,0.0,0.0,0.0,0.000000,0.0


In [272]:
res = pd.DataFrame(rows_std, columns=header)
res = res.fillna(0)
res.to_csv('partition_topic_ranks_std_by_day_month_words_weight_{}_{}.csv'.format(bbc_id, year))
res

Unnamed: 0,day,month,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,Defence,...,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
0,1,1,104.460250,10.246667,0.000000,126.565618,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,2,1,119.776449,104.481720,0.000000,55.302971,0.000000,169.281442,0.0,0.0,...,0.0,0.000000,48.431136,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1,87.402898,69.213466,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,54.659171,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,4,1,63.341282,113.242207,0.000000,112.307135,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,1.750000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,5,1,121.122610,74.848972,1.970280,0.000000,101.227557,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,26,12,132.938440,1.448718,11.690846,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,133.782873,0.0,0.0,0.0,0.0,0.0,0.0
349,27,12,130.102786,0.000000,0.000000,64.103712,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
350,28,12,95.330743,0.000000,62.316214,18.715517,0.000000,0.000000,0.0,0.0,...,0.0,80.030091,107.590426,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
351,29,12,104.061584,90.247277,107.846561,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,82.953878,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [273]:
res2 = res.groupby(['month']).agg('mean').drop(['day'], axis=1)
res2

Unnamed: 0_level_0,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,Defence,Energy and environment,Communities and families,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,105.926386,66.390224,16.265077,20.316967,21.847659,17.6265,0.0,4.770278,11.686996,0.0,13.938533,0.0,21.030601,5.365848,0.89569,0.0,0.0,0.0,0.163042,0.0
2,89.392847,64.806719,5.421141,17.07173,30.906867,32.531012,4.504674,0.0,10.983957,2.565502,7.140674,0.0,5.367736,5.187377,10.097567,0.0,0.0,0.0,0.0,0.0
3,107.130213,67.107394,2.384639,35.308384,27.592772,28.627175,1.12533,31.898533,13.964877,2.450304,0.65508,0.0,22.772206,4.532872,0.584615,0.0,0.0,0.0,0.0,0.0
4,107.343865,72.592407,2.459444,12.858204,10.765396,41.41199,0.105933,10.863074,7.1575,3.51444,5.962195,15.319828,13.735561,0.0,6.781749,0.0,0.0,0.0,0.0,0.0
5,88.93453,50.693074,5.337737,20.471756,12.565843,31.944445,3.950613,0.756472,10.652825,0.227846,2.282917,0.0,13.506105,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,107.779389,53.662067,6.61687,10.938269,32.395907,28.908822,4.607124,9.797331,2.610285,4.554258,5.18077,0.0,21.356432,0.0,11.687417,0.0,0.0,0.0,3.680049,0.0
7,110.749708,54.278535,0.0,36.588153,15.952125,15.383692,3.553125,9.284428,23.648211,0.0,0.0,0.0,18.931789,7.593129,22.420992,0.0,0.0,0.0,0.0,0.0
8,90.559544,50.016703,0.598652,14.632281,31.398708,26.709477,0.0,0.0,9.817434,3.739307,0.507591,0.0,11.085998,5.522041,6.274156,0.0,0.0,0.0,0.0,0.0
9,96.770551,68.061199,2.190606,3.622471,15.754913,15.802437,0.0,3.953648,10.400141,0.0,3.983397,0.0,9.734016,7.012232,31.257602,0.0,0.0,0.0,0.0,0.0
10,110.520944,59.481943,14.627855,24.506401,32.709106,32.932603,1.986738,7.693366,17.301695,0.741182,23.739548,0.0,4.511494,1.692045,18.143417,0.983333,0.0,0.0,0.0,0.0


In [274]:
res2.to_csv('partition_topic_ranks_std_by_month_words_weight_{}_{}.csv'.format(bbc_id, year))

In [275]:
res_min = pd.DataFrame(rows_min, columns=header)
res_min = res_min.fillna(0)
res_min.to_csv('partition_topic_ranks_min_by_day_month_words_weight_{}_{}.csv'.format(bbc_id, year))
res_min

Unnamed: 0,day,month,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,Defence,...,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
0,1,1,3.0,54.000000,0.000000,42.402778,0.000000,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,186.000000,0.0,0.0,0,0.000000,0
1,2,1,1.5,97.500000,0.000000,51.958763,0.000000,31.500000,0.0,0.000000,...,0.0,0.000000,91.396552,0.000000,0.000000,0.0,0.0,0,0.000000,0
2,3,1,95.5,36.000000,0.000000,0.000000,0.000000,305.708333,0.0,0.000000,...,0.0,0.000000,76.500000,0.000000,0.000000,0.0,0.0,0,0.000000,0
3,4,1,3.0,13.500000,0.000000,40.898551,0.000000,121.875000,0.0,0.000000,...,0.0,0.000000,40.500000,11.500000,0.000000,0.0,0.0,0,0.000000,0
4,5,1,1.0,56.962617,144.136364,0.000000,45.342857,197.029412,0.0,0.000000,...,0.0,0.000000,176.659574,399.000000,0.000000,0.0,0.0,0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,26,12,1.5,20.000000,36.416667,90.066667,13.500000,0.000000,0.0,130.921875,...,0.0,0.000000,60.000000,12.500000,0.000000,0.0,0.0,0,74.848485,0
349,27,12,1.0,0.000000,0.000000,60.500000,286.631420,282.708333,0.0,339.375000,...,0.0,0.000000,51.469388,0.000000,0.000000,0.0,0.0,0,0.000000,0
350,28,12,9.0,267.422222,38.536585,34.500000,125.416667,114.352941,0.0,0.000000,...,0.0,49.425532,47.000000,113.790698,141.086957,0.0,0.0,0,0.000000,0
351,29,12,12.5,85.000000,70.285714,220.606061,0.000000,0.000000,0.0,0.000000,...,0.0,97.350000,0.000000,95.555556,0.000000,0.0,0.0,0,0.000000,0


In [276]:
res_max = pd.DataFrame(rows_max, columns=header)
res_max = res_max.fillna(0)
res_max.to_csv('partition_topic_ranks_max_by_day_month_words_weight_{}_{}.csv'.format(bbc_id, year))
res_max

Unnamed: 0,day,month,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,Defence,...,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
0,1,1,367.350000,74.493333,0.000000,428.921569,0.000000,0.000000,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,186.000000,0.0,0.0,0,0.000000,0
1,2,1,439.687500,330.992481,0.000000,162.564706,0.000000,423.442623,0.0,0.000000,...,0.0,0.000000,188.258824,0.000000,0.000000,0.0,0.0,0,0.000000,0
2,3,1,376.656566,204.307692,0.000000,0.000000,0.000000,305.708333,0.0,0.000000,...,0.0,0.000000,211.500000,0.000000,0.000000,0.0,0.0,0,0.000000,0
3,4,1,223.076923,384.455026,0.000000,265.512821,0.000000,121.875000,0.0,0.000000,...,0.0,0.000000,44.000000,11.500000,0.000000,0.0,0.0,0,0.000000,0
4,5,1,351.521008,275.086957,148.076923,0.000000,291.263736,197.029412,0.0,0.000000,...,0.0,0.000000,176.659574,399.000000,0.000000,0.0,0.0,0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,26,12,438.866667,22.897436,62.652174,90.066667,13.500000,0.000000,0.0,130.921875,...,0.0,0.000000,60.000000,308.798450,0.000000,0.0,0.0,0,74.848485,0
349,27,12,323.468085,0.000000,0.000000,217.289062,286.631420,282.708333,0.0,339.375000,...,0.0,0.000000,51.469388,0.000000,0.000000,0.0,0.0,0,0.000000,0
350,28,12,332.425532,267.422222,163.169014,71.931034,125.416667,114.352941,0.0,0.000000,...,0.0,209.485714,262.180851,113.790698,141.086957,0.0,0.0,0,0.000000,0
351,29,12,361.214286,314.335793,285.978836,220.606061,0.000000,0.000000,0.0,0.000000,...,0.0,298.306569,0.000000,95.555556,0.000000,0.0,0.0,0,0.000000,0


In [277]:
for topic in topics:
    res[topic] = 500 - res[topic]
    res_min[topic] = 500 - res_min[topic]
    res_max[topic] = 500 - res_max[topic]

In [278]:
res2 = res.groupby(['month']).agg('mean').drop(['day'], axis=1)
res2.to_csv('partition_topic_ranks_by_month_words_weight_{}_{}.csv'.format(bbc_id, year))
res2

Unnamed: 0_level_0,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,Defence,Energy and environment,Communities and families,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,394.073614,433.609776,483.734923,479.683033,478.152341,482.3735,500.0,495.229722,488.313004,500.0,486.061467,500.0,478.969399,494.634152,499.10431,500.0,500.0,500.0,499.836958,500.0
2,410.607153,435.193281,494.578859,482.92827,469.093133,467.468988,495.495326,500.0,489.016043,497.434498,492.859326,500.0,494.632264,494.812623,489.902433,500.0,500.0,500.0,500.0,500.0
3,392.869787,432.892606,497.615361,464.691616,472.407228,471.372825,498.87467,468.101467,486.035123,497.549696,499.34492,500.0,477.227794,495.467128,499.415385,500.0,500.0,500.0,500.0,500.0
4,392.656135,427.407593,497.540556,487.141796,489.234604,458.58801,499.894067,489.136926,492.8425,496.48556,494.037805,484.680172,486.264439,500.0,493.218251,500.0,500.0,500.0,500.0,500.0
5,411.06547,449.306926,494.662263,479.528244,487.434157,468.055555,496.049387,499.243528,489.347175,499.772154,497.717083,500.0,486.493895,500.0,500.0,500.0,500.0,500.0,500.0,500.0
6,392.220611,446.337933,493.38313,489.061731,467.604093,471.091178,495.392876,490.202669,497.389715,495.445742,494.81923,500.0,478.643568,500.0,488.312583,500.0,500.0,500.0,496.319951,500.0
7,389.250292,445.721465,500.0,463.411847,484.047875,484.616308,496.446875,490.715572,476.351789,500.0,500.0,500.0,481.068211,492.406871,477.579008,500.0,500.0,500.0,500.0,500.0
8,409.440456,449.983297,499.401348,485.367719,468.601292,473.290523,500.0,500.0,490.182566,496.260693,499.492409,500.0,488.914002,494.477959,493.725844,500.0,500.0,500.0,500.0,500.0
9,403.229449,431.938801,497.809394,496.377529,484.245087,484.197563,500.0,496.046352,489.599859,500.0,496.016603,500.0,490.265984,492.987768,468.742398,500.0,500.0,500.0,500.0,500.0
10,389.479056,440.518057,485.372145,475.493599,467.290894,467.067397,498.013262,492.306634,482.698305,499.258818,476.260452,500.0,495.488506,498.307955,481.856583,499.016667,500.0,500.0,500.0,500.0


In [279]:
res_min2 = res_min.groupby(['month']).agg('mean').drop(['day'], axis=1)
res_min2.to_csv('partition_topic_ranks_min_by_month_words_weight_{}_{}.csv'.format(bbc_id, year))
res_min2

Unnamed: 0_level_0,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,Defence,Energy and environment,Communities and families,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,488.866667,427.268159,446.056511,431.61778,424.904978,406.61362,500.0,473.622413,412.424605,464.6213,444.37887,500.0,415.757455,479.289167,474.846221,497.066296,488.635627,500.0,486.087548,500.0
2,476.445156,395.732432,425.059348,444.255436,427.177803,463.305307,474.003006,488.511628,475.141273,478.652958,465.048589,500.0,465.673842,479.779632,479.611342,488.948336,500.0,500.0,500.0,500.0
3,482.304722,421.635851,455.827349,416.447157,414.283061,435.707084,476.17672,436.583245,436.199595,484.484891,475.275488,488.45619,415.78405,464.362386,469.45826,494.301984,500.0,500.0,487.005669,500.0
4,485.77194,420.006214,428.550092,428.655198,429.429226,413.774507,488.021364,457.831719,445.91022,481.835657,478.737262,481.113745,466.397495,471.829232,444.362056,500.0,500.0,500.0,489.431837,500.0
5,468.736852,427.769282,455.955694,418.02534,428.293837,464.933101,484.040589,458.471651,440.96729,489.762222,479.215754,497.611111,408.452923,467.495811,463.265779,500.0,500.0,500.0,490.241064,500.0
6,484.595402,401.917885,442.404633,432.325053,465.11214,399.594286,497.772414,491.947664,414.638068,486.439907,483.07846,478.389692,419.649331,475.26132,436.406246,494.884813,472.292305,500.0,483.408411,500.0
7,476.136264,417.167196,471.238513,409.247005,433.624269,385.140195,498.95,480.7463,441.026408,497.104274,459.648341,500.0,424.406527,457.931185,454.665586,500.0,500.0,500.0,485.90689,500.0
8,485.329841,427.694011,451.267025,421.797183,433.548175,423.583193,474.791967,475.395753,440.537442,478.720006,451.492669,497.613636,436.554315,426.391221,487.029012,492.814082,500.0,500.0,500.0,500.0
9,483.340873,419.302974,466.789131,436.033944,413.133666,422.104558,485.579073,434.515903,414.816618,478.700756,454.861457,492.815774,421.175513,472.329357,475.015016,497.172414,500.0,500.0,484.62061,500.0
10,467.239386,415.33127,460.071746,402.675866,409.645954,410.573209,486.938068,449.988492,428.833358,485.954167,433.766938,500.0,437.771719,453.049028,465.116705,490.695238,495.727922,500.0,482.089462,500.0


In [280]:
res_max2 = res_max.groupby(['month']).agg('mean').drop(['day'], axis=1)
res_max2.to_csv('partition_topic_ranks_max_by_month_words_weight_{}_{}.csv'.format(bbc_id, year))
res_max2

Unnamed: 0_level_0,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,Defence,Energy and environment,Communities and families,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,180.734823,254.770576,406.520318,381.98652,375.005452,364.096886,500.0,463.838602,387.85546,464.6213,412.732375,500.0,365.982476,468.557471,473.054842,497.066296,488.635627,500.0,485.761465,500.0
2,215.341528,233.640715,414.217067,405.534709,353.216024,388.372179,464.993658,488.511628,451.982778,473.521954,450.767241,500.0,454.93837,468.752843,456.832973,488.948336,500.0,500.0,500.0,500.0
3,165.785823,254.281487,450.070654,331.257674,352.487038,368.045685,473.240394,362.001609,405.739663,478.548329,473.701849,488.45619,366.385889,455.296641,468.289029,494.301984,500.0,500.0,487.005669,500.0
4,182.503566,236.352015,423.631204,400.645901,407.495623,320.485717,487.809498,433.576575,428.368738,474.806776,466.812871,447.219358,437.549068,471.829232,429.241476,500.0,500.0,500.0,489.431837,500.0
5,208.069633,294.253168,445.185376,369.289549,401.297487,389.391202,476.139362,456.703328,418.454882,489.278889,474.649921,497.611111,380.359151,467.495811,463.265779,500.0,500.0,500.0,490.241064,500.0
6,171.612893,281.706337,428.736786,407.278306,394.626493,325.375534,488.558166,469.987103,408.345854,475.286687,470.108563,478.389692,369.424514,475.26132,412.040482,494.884813,472.292305,500.0,476.048312,500.0
7,156.390216,280.714511,471.238513,326.590852,396.921472,348.981058,491.84375,460.24336,385.307486,497.104274,459.648341,500.0,376.43693,440.185881,403.978391,500.0,500.0,500.0,485.90689,500.0
8,217.13885,309.279022,450.069721,387.800843,364.659585,361.063838,474.791967,475.395753,418.831925,471.241392,450.412477,497.613636,413.48481,415.347139,473.2104,492.814082,500.0,500.0,500.0,500.0
9,202.178841,242.969044,462.215476,428.789001,379.93336,386.913735,485.579073,424.493649,390.400579,478.700756,445.691618,492.815774,400.478694,457.751887,404.791268,497.172414,500.0,500.0,484.62061,500.0
10,151.707011,270.033562,429.344183,343.795861,338.321038,333.974409,482.116826,434.601761,391.312529,484.471802,381.421737,500.0,428.748731,449.664938,423.792039,488.728571,495.727922,500.0,482.089462,500.0


In [281]:
res = pd.DataFrame(lrows, columns=header)
res

Unnamed: 0,day,month,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,Defence,...,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
0,1,1,96.500000,92.000000,,97.333333,,,,,...,,,,,373.0,,,,,
1,2,1,51.625000,192.333333,,91.000000,,50.666667,,,...,,,71.5,,,,,,,
2,3,1,83.142857,114.666667,,,,24.000000,,,...,,,135.5,,,,,,,
3,4,1,26.222222,88.666667,,54.000000,,40.000000,,,...,,,85.5,24.000000,,,,,,
4,5,1,25.187500,111.500000,24.0,,87.4,34.000000,,,...,,,47.0,58.000000,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,26,12,61.714286,40.000000,29.0,15.000000,28.0,,,64.0,...,,,25.0,103.333333,,,,,66.0,
349,27,12,21.250000,,,171.333333,331.0,24.000000,,16.0,...,,,98.0,,,,,,,
350,28,12,42.444444,45.000000,56.0,49.500000,36.0,17.000000,,,...,,76.000000,94.5,43.000000,23.0,,,,,
351,29,12,43.400000,175.250000,115.5,66.000000,,,,,...,,99.333333,,9.000000,,,,,,


In [282]:
res2 = res.groupby(['month']).agg('mean').drop(['day'], axis=1)
res2

Unnamed: 0_level_0,"Parliament, government and politics","Culture, media and sport","Business, industry and consumers",Transport,"Crime, civil law, justice and rights",International affairs,Education,Defence,Energy and environment,Communities and families,Economy and finance,"Asylum, immigration and nationality",Health services and medicine,"Agriculture, animals, food and rural affairs",European Union,Employment and training,Housing and planning,Science and technology,Social services,Social security and pensions
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,50.271885,135.489984,77.833333,76.916667,108.192105,64.807018,,129.555556,187.938596,144.25,125.857778,,106.770833,55.416667,136.583333,90.0,219.5,,112.25,
2,48.320682,122.881041,58.181818,79.053114,82.137255,104.65625,42.0,172.0,110.5,117.625,101.888889,,146.7,137.25,135.909524,70.0,,,,
3,52.566296,95.149453,45.229167,103.020089,82.810185,80.796,90.4,119.901961,101.666667,90.266667,71.866667,35.0,121.395833,109.071429,69.833333,84.0,,,147.0,
4,50.010779,123.575862,69.192308,88.482143,115.3125,118.048718,75.666667,119.357143,140.483333,64.333333,66.4,133.203704,64.8,59.142857,48.222222,,,,249.0,
5,76.924523,130.415035,67.018519,101.229167,61.071429,90.652211,123.125,93.722222,140.321429,95.555556,79.833333,87.0,96.921569,43.2,79.285714,,,,401.0,
6,55.085784,94.297619,95.075,60.452381,77.819444,69.692619,55.5,102.666667,184.988889,74.0,71.7,99.0,94.941176,104.285714,128.177778,134.5,181.333333,,150.666667,
7,64.246007,93.832908,167.25,77.735185,81.597436,57.301136,40.0,45.333333,98.468487,78.0,94.666667,,102.606481,77.136364,115.980769,,,,135.5,
8,46.555044,117.434401,72.125,87.162963,75.165789,75.155797,71.2,194.25,168.277778,65.166667,117.388889,22.0,63.222222,94.208333,155.25,22.5,,,,
9,61.077299,105.664031,99.714286,41.083333,63.676471,82.215556,134.333333,57.604167,109.593137,47.5,105.62963,181.0,71.821429,64.095238,126.115385,165.0,,,177.5,
10,52.6646,104.775595,53.547619,127.331349,99.198333,78.193333,121.333333,79.722222,96.107843,30.75,63.423077,,60.269231,67.142857,109.566667,131.333333,154.0,,165.0,


In [283]:
res.to_csv('average_partition_lengths_{}_{}.csv'.format(bbc_id, year))
res2.to_csv('average_partition_lengths_month_{}_{}.csv'.format(bbc_id, year))