In [279]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric, stem_text
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.utils import class_weight
from sklearn.preprocessing import OneHotEncoder
import multiprocessing as mp
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import parmap
import os

In [265]:
topics_index_to_name_map = {
    0: 'Agriculture, animals, food and rural affairs',
    1: 'Asylum, immigration and nationality',
    2: 'Business, industry and consumers',
    3: 'Communities and families',
    4: 'Crime, civil law, justice and rights',
    5: 'Culture, media and sport',
    6: 'Defence',
    7: 'Economy and finance',
    8: 'Education',
    9: 'Employment and training',
    10: 'Energy and environment',
    11: 'European Union',
    12: 'Health services and medicine',
    13: 'Housing and planning',
    14: 'International affairs',
    15: 'Parliament, government and politics',
    16: 'Science and technology',
    17: 'Social security and pensions',
    18: 'Social services',
    19: 'Transport',
    20: 'Others'
}
topics_name_to_index_map = {y:x for x,y in topics_index_to_name_map.items()}

def strip_short2(text):
    return strip_short(text, minsize=4)


def preprocess_text(text):
    FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces, strip_tags, strip_punctuation,
                   strip_non_alphanum, strip_numeric, strip_short2]
    return preprocess_string(text, FILTERS)

def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        if 'Parliament' in topics[0]:
            t = topics[1]
        else:
            t = topics[0]
        t = t.strip()
        return topics_name_to_index_map[t]
        
    return topics_name_to_index_map[topic]

In [281]:
year_start = 2006
year_end = 2017
exclude = 'None'

In [282]:
hierarchy = {
    'level1':{
            'group1': [0, 1, 2, 4, 5, 6, 7, 8, 10, 11, 14, 15, 16, 19, 20],
            'group2': [3, 12, 13, 18],
            'group3': [9, 17]
    },
    'level2':{
            'group4': [0, 10, 19],
            'group5': [1, 4, 5, 6, 7, 8, 15, 20],
            'group6': [2, 16],
            'group7': [11, 14]
    }
}

In [283]:
def get_group_level1(topic):
    if topic in hierarchy['level1']['group1']:
        return 1
    elif topic in hierarchy['level1']['group2']:
        return 2
    return 3

def get_group_level2(topic):
    if topic in hierarchy['level2']['group4']:
        return 4
    elif topic in hierarchy['level2']['group5']:
        return 5
    elif topic in hierarchy['level2']['group6']:
        return 6
    return 7

In [284]:
print('preparing data!!')
dataframes = []
for year in range(year_start, year_end + 1):
    dataframes.append(pd.read_csv('./data/{}_debate.csv'.format(year)))

df = pd.concat(dataframes)

df = df.drop(df[df.topic == 'admin'].index)
df = df.drop(df[df.transcript.str.split().map(len) < 10].index)
df['topic'] = df.apply(lambda row: preprocess(row['topic']), axis=1)
df['level1'] = df['topic'].apply(lambda x: get_group_level1(x))
df['level2'] = df['topic'].apply(lambda x: get_group_level2(x))

preparing data!!


In [285]:
df

Unnamed: 0,topic,transcript,level1,level2
1,6,"With permission, Mr. Speaker, I should like to...",1,5
2,20,To ask the Home Secretary to make a statement ...,1,5
3,6,I inform the House that privilege is involved ...,1,5
4,17,"With permission, Mr. Speaker, I wish to make a...",3,7
5,14,To ask the Secretary of State for Defence if h...,1,7
...,...,...,...,...
1775,9,9. What estimate his Department has made of th...,3,7
1776,9,14. What assessment he has made of the level o...,3,7
1777,9,3. What assessment his Department has made of ...,3,7
1778,9,7. What recent assessment he has made of trend...,3,7


In [286]:
df.level1.value_counts()

1    12327
2     2260
3      845
Name: level1, dtype: int64

In [287]:
df.level2.value_counts()

5    8016
7    4489
4    1984
6     943
Name: level2, dtype: int64

In [288]:
counts = df['topic'].value_counts()
topic_counts = {topics_index_to_name_map[key]: counts[key] for key in counts.keys()}
counts_df = pd.DataFrame.from_dict(topic_counts, orient='index', columns=['count'])
total = counts_df['count'].sum()
counts_df['fraction'] = counts_df.apply(lambda x: round(x['count']/total*100, 2), axis=1)
counts_df

Unnamed: 0,count,fraction
"Parliament, government and politics",2142,13.88
"Crime, civil law, justice and rights",1853,12.01
International affairs,1106,7.17
Health services and medicine,1093,7.08
Economy and finance,1016,6.58
Communities and families,993,6.43
Transport,813,5.27
"Business, industry and consumers",803,5.2
Defence,791,5.13
Employment and training,788,5.11


In [289]:
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,topic,transcript,level1,level2
0,3,T1. If she will make a statement on her depar...,2,7
1,4,8. What recent discussions he has had with th...,1,5
2,14,3. What recent assessment she has made of the...,1,7
3,8,6. What progress her Department is making on t...,1,5
4,12,1. What steps the Government is taking to enco...,2,7
...,...,...,...,...
15427,0,14. What recent representations she has recei...,1,4
15428,12,1. How many (a) health visitors and (b) nurse...,2,7
15429,11,"Thank you, Mr Deputy Speaker, for the opportun...",1,7
15430,12,13. What recent estimate he has made of the n...,2,7


In [298]:
X = df['transcript'].values
Y = df['level1'].values

In [299]:
print('preprocessing data!!')
preprocessed_X = parmap.map(preprocess_text, X, pm_pbar=True)
tagged_X = [models.doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(preprocessed_X)]

preprocessing data!!


15488it [00:11, 1402.70it/s]                          


In [292]:
if os.path.isfile('./models/doc2vec/doc2vec_{}_{}'.format(year_start, year_end)):
    print('doc2vec already trained. Loading!!!')
    doc2vec_model = Doc2Vec.load('./models/doc2vec/doc2vec_{}_{}'.format(year_start, year_end))
else:
    print('training doc2vec')
    doc2vec_model = Doc2Vec(vector_size=100, window=3, workers=mp.cpu_count(), epochs=40)
    doc2vec_model.build_vocab(tagged_X)
    doc2vec_model.train(tagged_X, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)
    doc2vec_model.save('./models/doc2vec/doc2vec_{}_{}'.format(year_start, year_end))

training doc2vec


In [300]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
input_Y = Y.reshape(-1,1)
enc.fit(input_Y)
input_Y = enc.transform(input_Y).toarray()

In [301]:
print('preparing inputs')

def get_doc_vec(doc):
    return doc2vec_model.infer_vector(doc.words)

inputs = parmap.map(get_doc_vec, tagged_X, pm_pbar=True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs, input_Y, stratify=input_Y, test_size=0.2, random_state=42)
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
print('X_train: ', X_train.shape)
print('y_train: ', y_train.shape)
print('X_test: ', X_test.shape)
print('y_test: ', y_test.shape)

preparing inputs


15488it [01:02, 249.47it/s]                           


X_train:  (12345, 100)
y_train:  (12345, 3)
X_test:  (3087, 100)
y_test:  (3087, 3)


In [302]:
def build_network():
    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(100)),
        tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.1)),
        tf.keras.layers.Dense(len(enc.categories_[0]), activation='softmax')
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0, name='categorical_crossentropy')

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=['accuracy'])
    
    return model

In [303]:
model = build_network()
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
class_weights = class_weight.compute_class_weight('balanced', np.unique(Y), Y)
model.fit(X_train, y_train, batch_size=32, epochs=200, callbacks=[callback], class_weight=class_weights,
          validation_data=(X_test, y_test))



Train on 12345 samples, validate on 3087 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200


<tensorflow.python.keras.callbacks.History at 0x7fbf103f71d0>

In [304]:
model.save_weights('./models/doc2vec/classifier_0_{}_{}'.format(year_start, year_end))

In [305]:
model.predict(X_test[0].reshape(1,100))

array([[0.9586857 , 0.02861211, 0.01270225]], dtype=float32)

In [263]:
dump(enc, './models/doc2vec/encoder_1_{}_{}.joblib'.format(year_start, year_end))

['./models/doc2vec/encoder_1_2010_2016.joblib']

In [310]:
def get_doc_vec(doc):
    return doc2vec_model.infer_vector(doc.words)

def build_network(enc):
    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(100)),
        tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.1)),
        tf.keras.layers.Dense(len(enc.categories_[0]), activation='softmax')
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0, name='categorical_crossentropy')

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=['accuracy'])

    return model

def train(X, Y, model_name):
    print('preprocessing data!!')
    preprocessed_X = parmap.map(preprocess_text, X, pm_pbar=True)
    tagged_X = [models.doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(preprocessed_X)]
    
    enc = OneHotEncoder(handle_unknown='ignore')
    input_Y = Y.reshape(-1,1)
    enc.fit(input_Y)
    input_Y = enc.transform(input_Y).toarray()
    dump(enc, './models/doc2vec/encoder_{}_{}_{}.joblib'.format(model_name, year_start, year_end))
    
    print('preparing inputs')
    inputs = parmap.map(get_doc_vec, tagged_X, pm_pbar=True)

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(inputs, input_Y, stratify=input_Y, test_size=0.2, random_state=42)
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    y_test = np.array(y_test)
    print('X_train: ', X_train.shape)
    print('y_train: ', y_train.shape)
    print('X_test: ', X_test.shape)
    print('y_test: ', y_test.shape)
    
    model = build_network(enc)
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
    class_weights = class_weight.compute_class_weight('balanced', np.unique(Y), Y)
    model.fit(X_train, y_train, batch_size=32, epochs=200, callbacks=[callback], class_weight=class_weights,
              validation_data=(X_test, y_test))
    
    model.save_weights('./models/doc2vec/classifier_{}_{}_{}'.format(model_name, year_start, year_end))

In [312]:
df_group2 = df.loc[df.level1 == 2]
df_group2

Unnamed: 0,topic,transcript,level1,level2
0,3,T1. If she will make a statement on her depar...,2,7
4,12,1. What steps the Government is taking to enco...,2,7
6,3,1. What assessment she has made of the potenti...,2,7
8,12,"I beg to move, That the Bill be now read a sec...",2,7
10,12,I sought this Adjournment debate in an attempt...,2,7
...,...,...,...,...
15406,3,"I beg to move, That this House has considered...",2,7
15418,3,10. If he will take steps to increase the num...,2,7
15424,12,1. How much funding is planned for mental heal...,2,7
15428,12,1. How many (a) health visitors and (b) nurse...,2,7


In [313]:
df_group2.topic.value_counts()

12    1093
3      993
13     126
18      48
Name: topic, dtype: int64

In [314]:
X = df_group2['transcript'].values
Y = df_group2['topic'].values

In [315]:
train(X, Y, '2')

preprocessing data!!


2268it [00:03, 689.44it/s]                           


preparing inputs


2268it [00:13, 173.39it/s]                         


X_train:  (1808, 100)
y_train:  (1808, 4)
X_test:  (452, 100)
y_test:  (452, 4)
Train on 1808 samples, validate on 452 samples
Epoch 1/200




Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
