In [16]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.utils import class_weight
import multiprocessing as mp
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import parmap

In [17]:
# Topic labels in index order: a label's position in the list is the integer
# id that replaces it in the `topic` column.
topics_index_to_name_map = dict(enumerate([
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]))
# Reverse lookup: topic label -> integer id.
topics_name_to_index_map = {
    name: idx for idx, name in topics_index_to_name_map.items()
}

def strip_short2(text):
    """Apply gensim's ``strip_short`` with ``minsize=4``.

    Wrapping the call gives a one-argument callable, which is the shape the
    filter list in ``preprocess_text`` needs.
    """
    min_token_length = 4
    return strip_short(text, minsize=min_token_length)


def preprocess_text(text):
    """Run ``text`` through the notebook's cleaning filters via gensim.

    The filters are applied in list order: lower-casing, whitespace
    collapsing, tag stripping, punctuation stripping, non-alphanumeric
    stripping, digit stripping, and finally ``strip_short2``. Stop-word
    removal and stemming are not part of this pipeline.

    Returns whatever ``preprocess_string`` returns for these filters.
    """
    def _to_lower(value):
        return value.lower()

    cleaning_steps = [
        _to_lower,
        strip_multiple_whitespaces,
        strip_tags,
        strip_punctuation,
        strip_non_alphanum,
        strip_numeric,
        strip_short2,
    ]
    return preprocess_string(text, cleaning_steps)

def preprocess(topic):
    """Map a raw topic label to its integer topic index.

    ``topic`` may contain several labels separated by ``|``; only the first
    one is used. Whitespace around the string and around the first label is
    ignored.

    Args:
        topic: Raw topic string taken from the ``topic`` column.

    Returns:
        The integer stored for the label in ``topics_name_to_index_map``.

    Raises:
        KeyError: If the (first) label is not a key of
            ``topics_name_to_index_map``.
    """
    # str.split returns the whole string as its only element when no '|' is
    # present, so one expression handles single- and multi-label input alike.
    primary_label = topic.strip().split('|')[0].strip()
    return topics_name_to_index_map[primary_label]

In [38]:
# Calendar years whose per-year debate CSVs are loaded (2010 through 2015).
years = range(2010, 2015 + 1)
# Bounds of the date window applied to the `date` column (both inclusive).
session_start_dt, session_end_dt = '2010-5-1', '2015-5-31'
# Window labels embedded in the name of the exported CSV.
start_str, end_str = 'May-2010', 'May-2015'

In [39]:
# Read one debate CSV per year in `years`; the frames are combined afterwards.
df_list = []
for year in years:
    print(year)  # progress marker, one line per file
    csv_path = '../data/{}_debate.csv'.format(year)
    df_list.append(pd.read_csv(csv_path))

2010
2011
2012
2013
2014
2015


In [40]:
# Stack the per-year frames row-wise into a single frame. `ignore_index` is
# not set, so every frame keeps its own row labels at this point.
df = pd.concat(df_list)

In [41]:
# Convert the `date` column to datetimes before comparing it to the bounds.
df['date'] = pd.to_datetime(df['date'])
# Keep rows dated inside [session_start_dt, session_end_dt], then renumber
# the surviving rows from 0.
mask = df['date'].ge(session_start_dt) & df['date'].le(session_end_dt)
df = df[mask].reset_index(drop=True)

In [42]:
# Drop rows whose topic is 'admin'. `.copy()` detaches the filtered frame so
# the column assignments below do not trigger SettingWithCopyWarning.
df = df.loc[df.topic != 'admin'].copy()
# Two optional filters are intentionally left disabled here: dropping
# transcripts shorter than 10 words, and dropping topic 20 ('Others').
# Replace each raw topic label by its integer index (see `preprocess`).
# Series.map is used because only the `topic` column is needed, which avoids
# a row-wise DataFrame.apply.
df['topic'] = df['topic'].map(preprocess)
# Number of whitespace-separated tokens in each transcript.
df['word_count'] = df['transcript'].apply(lambda x: len(x.split()))

In [43]:
# Display the filtered frame for a visual check of the cleaning steps.
df

Unnamed: 0,date,topic,transcript,labour,conservative,others,word_count
0,2010-05-18,15,"Under the terms of Standing Order No. 1A, I am...",845,1667,462,2974
5,2010-05-25,15,May I begin by saying how grateful I am to you...,1205,6180,3675,11060
7,2010-05-26,20,"I am grateful, Mr Deputy Speaker, to be able t...",1434,1013,0,2447
8,2010-05-26,14,It is a privilege to open this year’s foreign ...,25595,23872,8824,58291
9,2010-05-26,15,(Urgent Question): To ask the Chancellor of th...,1602,877,4829,7308
...,...,...,...,...,...,...,...
12522,2015-05-27,9,I am pleased to have the opportunity to introd...,0,4386,220,4606
12524,2015-05-28,6,I should first thank the Justice Secretary. In...,139,2056,3183,5378
12525,2015-05-28,4,The Gracious Speech we heard yesterday set out...,18454,28301,5816,52571
12526,2015-05-28,20,(Urgent Question): To ask the Secretary of Sta...,774,4477,0,5251


In [44]:
# Filled by the following cell: topic name -> per-column word totals.
word_counts = {}
# Sum the overall and per-party word-count columns for every topic index.
word_counts_df = (
    df[['topic', 'word_count', 'labour', 'conservative', 'others']]
    .groupby(['topic'])
    .sum()
)
# Mirror the group key into a regular column so it can be read per row.
word_counts_df['topic'] = word_counts_df.index
word_counts_df

Unnamed: 0_level_0,word_count,labour,conservative,others,topic
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,961742,293561,502443,165738,0
1,866390,308003,452039,106348,1
2,1854418,712691,680584,461143,2
3,1956163,707533,933222,315408,3
4,4278057,1446524,2264766,566767,4
5,1041267,302520,643255,95492,5
6,1967133,591011,1125656,250466,6
7,5614979,2378935,2470329,765715,7
8,1599154,544839,847034,207281,8
9,2067952,888209,874132,305611,9


In [45]:
# Re-key the per-topic totals by topic name: {name: {column: total}}.
for index, row in word_counts_df.iterrows():
    topic_name = topics_index_to_name_map[row['topic']]
    word_counts[topic_name] = {
        column: row[column]
        for column in ('word_count', 'labour', 'conservative', 'others')
    }

word_counts

{'Agriculture, animals, food and rural affairs': {'word_count': 961742,
  'labour': 293561,
  'conservative': 502443,
  'others': 165738},
 'Asylum, immigration and nationality': {'word_count': 866390,
  'labour': 308003,
  'conservative': 452039,
  'others': 106348},
 'Business, industry and consumers': {'word_count': 1854418,
  'labour': 712691,
  'conservative': 680584,
  'others': 461143},
 'Communities and families': {'word_count': 1956163,
  'labour': 707533,
  'conservative': 933222,
  'others': 315408},
 'Crime, civil law, justice and rights': {'word_count': 4278057,
  'labour': 1446524,
  'conservative': 2264766,
  'others': 566767},
 'Culture, media and sport': {'word_count': 1041267,
  'labour': 302520,
  'conservative': 643255,
  'others': 95492},
 'Defence': {'word_count': 1967133,
  'labour': 591011,
  'conservative': 1125656,
  'others': 250466},
 'Economy and finance': {'word_count': 5614979,
  'labour': 2378935,
  'conservative': 2470329,
  'others': 765715},
 'Educati

In [46]:
# Number of debates per topic index.
counts = df['topic'].value_counts()
# Same counts keyed by topic name instead of integer index.
topic_counts = {}
for key in counts.index:
    topic_counts[topics_index_to_name_map[key]] = counts[key]
counts_df = pd.DataFrame.from_dict(topic_counts, orient='index', columns=['count'])
total = counts_df['count'].sum()
# 'fraction' holds the share of all debates as a percentage, rounded to 2 dp.
counts_df['fraction'] = counts_df.apply(
    lambda rec: round(rec['count'] / total * 100, 2),
    axis=1,
)

In [47]:
# Attach the per-topic word totals (overall and per party group) to
# counts_df by looking every topic name up in `word_counts`.
for column in ('word_count', 'conservative', 'labour', 'others'):
    counts_df[column] = [word_counts[name][column] for name in counts_df.index]

# Express each total as a percentage of its own column sum, rounded to 2 dp.
# One loop replaces four copy-pasted computations that shared a single,
# repeatedly overwritten `total_word_count` variable.
for column, fraction_column in (
    ('word_count', 'word fraction'),
    ('conservative', 'conservative fraction'),
    ('labour', 'labour fraction'),
    ('others', 'others fraction'),
):
    column_total = counts_df[column].sum()
    counts_df[fraction_column] = (counts_df[column] / column_total * 100).round(2)

# Grand total of words across all topics (now matches its name; it used to
# end up holding the 'others' column total).
total_word_count = counts_df['word_count'].sum()
counts_df

Unnamed: 0,count,fraction,word_count,conservative,labour,others,word fraction,conservative fraction,labour fraction,others fraction
"Parliament, government and politics",1794,15.36,5087676,2300689,1759350,1027637,11.65,10.68,11.39,15.43
"Crime, civil law, justice and rights",1388,11.89,4278057,2264766,1446524,566767,9.8,10.51,9.36,8.51
International affairs,886,7.59,3153998,1951594,914524,287880,7.22,9.06,5.92,4.32
Economy and finance,759,6.5,5614979,2470329,2378935,765715,12.86,11.47,15.4,11.49
Health services and medicine,758,6.49,2658580,1383007,912577,362996,6.09,6.42,5.91,5.45
Others,726,6.22,4269369,2188010,1436444,644915,9.78,10.16,9.3,9.68
Communities and families,639,5.47,1956163,933222,707533,315408,4.48,4.33,4.58,4.73
Energy and environment,635,5.44,1758166,725280,598109,434777,4.03,3.37,3.87,6.53
Education,627,5.37,1599154,847034,544839,207281,3.66,3.93,3.53,3.11
"Business, industry and consumers",616,5.28,1854418,680584,712691,461143,4.25,3.16,4.61,6.92


In [48]:
# counts_df.sort_values(by=[''])

In [49]:
# Write the per-topic summary to the working directory; the file name embeds
# the window labels `start_str` and `end_str`.
counts_df.to_csv('./topic_counts_debate_{}-{}.csv'.format(start_str, end_str))

# AGG COUNTS

In [31]:
# Years covered by the monthly aggregation (2006 through 2019).
# NOTE: this rebinds `years`, which the per-session cells earlier in the
# notebook also read, so their result depends on which assignment ran last.
years = range(2006, 2020)

In [32]:
# Build one record per (year, month):
#   [year, month, debate_count, word_count, labour, conservative, others]
# NOTE: the loop rebinds the global `df`, replacing any frame of that name
# built earlier in the notebook.
rows = []
for year in years:
    print(year)
    df = pd.read_csv('../data/{}_debate.csv'.format(year))
    # Drop 'admin' rows; `.copy()` avoids SettingWithCopyWarning on the
    # column assignments that follow.
    df = df.loc[df.topic != 'admin'].copy()
    # Topic indices are not aggregated below, but the mapping still raises
    # on any label missing from the topic map, as it did before.
    df['topic'] = df['topic'].map(preprocess)
    df['word_count'] = df['transcript'].apply(lambda x: len(x.split()))
    # Parse the dates once and derive the month from the parsed column.
    df['date'] = pd.to_datetime(df['date'])
    df['month'] = df['date'].dt.month
    month_groups = df.groupby('month')
    df_month_count = (
        month_groups[['word_count', 'labour', 'conservative', 'others']]
        .sum()
        .reset_index()
    )
    # Debates per month from the same grouping, instead of re-filtering the
    # whole frame once per month.
    debates_per_month = month_groups.size()
    for index, row in df_month_count.iterrows():
        debate_count = debates_per_month[row['month']]
        rows.append([year, row['month'], debate_count, row['word_count'], row['labour'], row['conservative'], row['others']])


2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019


In [33]:
# Turn the collected per-(year, month) records into a frame; the column
# names follow the order in which each record's values were appended.
res_df = pd.DataFrame(rows, columns=['year', 'month', 'debate_count', 'word_count', 'labour', 'conservative', 'others'])

In [34]:
# Display the monthly aggregate for a visual check.
res_df

Unnamed: 0,year,month,debate_count,word_count,labour,conservative,others
0,2006,6,255,956086,484028,324523,147535
1,2006,7,242,859735,471218,268191,120326
2,2006,10,219,901992,476665,256846,168481
3,2006,11,180,726120,342481,244985,138654
4,2006,12,146,614281,346587,162300,105394
...,...,...,...,...,...,...,...
133,2018,12,199,831422,251211,414788,165423
134,2019,1,278,1117126,357309,578515,181302
135,2019,2,309,1075456,328261,580131,167064
136,2019,3,317,1213871,333605,643734,236532


In [35]:
# Save the monthly aggregate to the working directory. `index` is left at
# its default, so the row index is written as the first CSV column.
res_df.to_csv('debates_counts_agg.csv')