# Import Libraries

In [1]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from joblib import dump, load
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import parmap
import swifter
from tqdm import tqdm

  import pandas.util.testing as tm


# Preprocessing Function

In [2]:
topics_index_to_name_map = {
    0: 'Agriculture, animals, food and rural affairs',
    1: 'Asylum, immigration and nationality',
    2: 'Business, industry and consumers',
    3: 'Communities and families',
    4: 'Crime, civil law, justice and rights',
    5: 'Culture, media and sport',
    6: 'Defence',
    7: 'Economy and finance',
    8: 'Education',
    9: 'Employment and training',
    10: 'Energy and environment',
    11: 'European Union',
    12: 'Health services and medicine',
    13: 'Housing and planning',
    14: 'International affairs',
    15: 'Parliament, government and politics',
    16: 'Science and technology',
    17: 'Social security and pensions',
    18: 'Social services',
    19: 'Transport',
    20: 'Others'
}
topics_name_to_index_map = {y:x for x,y in topics_index_to_name_map.items()}

def strip_short2(text):
    return strip_short(text, minsize=4)


def preprocess_text(text):
    FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces, strip_tags, strip_punctuation,
                   strip_non_alphanum, strip_numeric, strip_short2]
    return preprocess_string(text, FILTERS)

def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        t = topics[0]
        t = t.strip()
        return topics_name_to_index_map[t]
        
    return topics_name_to_index_map[topic]

In [3]:
start_year = 2009
end_year = 2018
exclude = '_no_Others'

# Predict Function

In [4]:
def predict(X, topn=3):
    preprocessed_X = preprocess_text(X)
    vec_X = doc2vec_model.infer_vector(preprocessed_X)
    vec_X = vec_X.reshape(1, vec_X.shape[0])
    pred = classifier.predict(vec_X)
    pred_i = [(topics_index_to_name_map[i], p) for i,p in enumerate(pred[0])]
    pred_sorted = sorted(pred_i, key=lambda x: x[1], reverse=True)
    return pred_sorted[:topn]

# Network Architecture

In [5]:
def build_network():
    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(100)),
        tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.1)),
        tf.keras.layers.Dense(len(enc.categories_[0]), activation='softmax')
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0, name='categorical_crossentropy')

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=['accuracy'])
    
    return model

# Load Saved Models

In [6]:
doc2vec_model = Doc2Vec.load('../models/doc2vec/doc2vec_{}_{}{}'.format(start_year, end_year, exclude))
enc = load('../models/doc2vec/encoder_{}_{}{}.joblib'.format(start_year, end_year, exclude))
classifier = build_network()
classifier.load_weights('../models/doc2vec/classifier_{}_{}{}'.format(start_year, end_year, exclude))


Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.core.Dense object at 0x7f8a5d1e4910> and <tensorflow.python.keras.layers.core.Dense object at 0x7f8a5d1e4fd0>).


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f8a5d221310>

# Test for parliament debates

In [7]:
df = pd.read_csv('../data/2015_commons.csv')
df = df.drop(df[df.topic == 'admin'].index)
df = df.drop(['Unnamed: 0'], axis=1)
df = df.groupby(['date', 'topic'])['transcript'].apply(lambda x: ' '.join(x)).reset_index()
# df = df.drop(df[df.transcript.str.split().map(len) < 10].index).reset_index()
# df['topic'] = df.apply(lambda row: preprocess(row['topic']), axis=1)

In [8]:
df

Unnamed: 0,date,topic,transcript
0,2015-01-05,"Autism: Diagnosis of Children, Hull","First, I would like to thank you, Mr Speaker, ..."
1,2015-01-05,Home Department >> Border Exit Checks,1. What progress her Department has made on i...
2,2015-01-05,Home Department >> Border Security (Calais),11. What recent discussions she has had with ...
3,2015-01-05,Home Department >> Chief Inspector of Borders ...,5. When she next plans to meet the independen...
4,2015-01-05,Home Department >> Citizenship Applications,20. How many applicants have been granted cit...
...,...,...,...
2200,2015-12-17,"Environment, Food and Rural Affairs >> Wine Pr...",5. What steps her Department is taking to pro...
2201,2015-12-17,Local Government Finance,I believe that our gloriously diverse country ...
2202,2015-12-17,Points of Order,"On a point of order, Madam Deputy Speaker. A n..."
2203,2015-12-17,Sexual Exploitation: Protection of 16 and 17-y...,"I beg to move, That this House notes the find..."


In [9]:
X = df['transcript'].values
Y = df['topic'].values
i = 45
print(predict(X[i]))
print('true: ', Y[i], Y[i])

[('Parliament, government and politics', 0.6222695), ('Economy and finance', 0.16140893), ('International affairs', 0.068858415)]
true:  Cabinet Office >> Topical Questions Cabinet Office >> Topical Questions


In [10]:
X[i]

'T1.   If  he will make a statement on his departmental responsibilities. [906798]  My responsibilities are for efficiency and reform, civil service issues, public sector industrial relations strategy, Government transparency, civil contingencies, civil society and cyber-security.  The Minister for the Cabinet Office stated in October 2010 that public bodies would be made more meaningfully accountable. Specifically, what new mechanisms has he put in place to make public bodies more meaningfully accountable to this House and, indeed, to the public?  Our concern with public body reform has always been to ensure that accountability is improved. A number of functions have been brought within Government to make them directly accountable to this House through Ministers. A number of other activities have been discontinued completely. The number of public bodies has been reduced by about a third. When we came into office, there were no data about the actual number of public bodies. In addition

In [11]:
df['topic'] = df.apply(lambda row: topics_index_to_name_map[row['topic']], axis=1)

KeyError: 'Autism: Diagnosis of Children, Hull'