In [17]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf

In [18]:
# Class index -> topic label. The index of each label is its position in the
# list below (0-based), so the order of the entries must not change.
topics_index_to_name_map = dict(enumerate([
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]))

# Reverse lookup: topic label -> class index.
topics_name_to_index_map = {
    name: index for index, name in topics_index_to_name_map.items()
}

def strip_short2(text):
    """Apply gensim's ``strip_short`` to ``text`` with ``minsize=4``.

    A single-argument wrapper so the filter can be placed in a
    ``preprocess_string`` filter list, where each filter is called with
    the text only.
    """
    min_token_length = 4
    return strip_short(text, minsize=min_token_length)

def remove_non_nouns(text):
    """Keep only the tokens of ``text`` whose NLTK POS tag is ``NN`` or ``VB``.

    The text is tokenized with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; surviving tokens are joined with single spaces.

    Note: despite the function name, tokens tagged ``VB`` are kept as well.
    Tags are compared by exact equality, so any other tag (for example
    ``NNS``) is dropped.
    """
    kept_tags = ("NN", "VB")
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return ' '.join(word for word, tag in tagged_tokens if tag in kept_tags)


# gensim's STOPWORDS extended with extra words to drop. Built once here rather
# than on every call: the function below runs once per document.
_CUSTOM_STOPWORDS = STOPWORDS.union({
    'time', 'year', 'number', 'today', 'week', 'month', 'night', 'world', 'home',
    'place', 'yesterday', 'life', 'wife',
})


def remove_custom_stopwords(s):
    """Remove stop words from a whitespace-separated string.

    Args:
        s: Text to filter. It is split on whitespace and each token is
            compared as-is (case-sensitively) against the stop-word set,
            so callers should lowercase the text first.

    Returns:
        The remaining tokens joined with single spaces.
    """
    return " ".join(w for w in s.split() if w not in _CUSTOM_STOPWORDS)


def preprocess_text_for_lda(text):
    """Run ``text`` through the cleaning pipeline and return gensim's result.

    The filters are applied in the order listed: lowercase, collapse
    whitespace, strip tags, strip punctuation, drop stop words, strip
    non-alphanumerics, strip digits, then drop short tokens via
    ``strip_short2``. The cleaned text is returned as produced by
    ``gensim.parsing.preprocessing.preprocess_string``.
    """
    pipeline = (
        str.lower,
        strip_multiple_whitespaces,
        strip_tags,
        strip_punctuation,
        remove_custom_stopwords,
        strip_non_alphanum,
        strip_numeric,
        strip_short2,
    )
    return preprocess_string(text, pipeline)

def filter_multiple_topics(topic):
    """Reduce a ``'|'``-separated topic string to its first topic.

    When ``topic`` contains ``'|'``, the text before the first ``'|'`` is
    returned with surrounding whitespace stripped. A string without ``'|'``
    is returned exactly as given (it is not stripped).
    """
    if '|' not in topic:
        return topic
    first_topic, _, _ = topic.partition('|')
    return first_topic.strip()

def preprocess(topic):
    """Convert a raw topic string into its integer class index.

    The string is first reduced to a single topic with
    ``filter_multiple_topics``; the label ``'admin'`` is treated as
    ``'Others'``. The result is looked up in ``topics_name_to_index_map``,
    so a label missing from that map raises ``KeyError``.
    """
    label = filter_multiple_topics(topic)
    lookup_key = 'Others' if label == 'admin' else label
    return topics_name_to_index_map[lookup_key]

In [19]:
# Load the speeches, discard rows labelled exactly 'admin' and transcripts
# with fewer than 10 whitespace-separated words, then renumber the rows.
# reset_index() keeps the old row labels in an 'index' column.
df = pd.read_csv('./data/2014_speech.csv')
df = df[df['topic'] != 'admin']
df = df[df['transcript'].str.split().map(len) >= 10].reset_index()

# Replace each topic string with its integer class index.
df['topic'] = df['topic'].map(preprocess)

In [20]:
# Show the cleaned frame; `topic` now holds integer class indices.
df

Unnamed: 0,index,date,topic,transcript
0,0,2014-01-06,8,1. What assessment he has made of the perform...
1,1,2014-01-06,8,I associate myself and those on the Front Benc...
2,2,2014-01-06,8,"I wish you, Mr Speaker, and the whole House al..."
3,3,2014-01-06,8,My hon. Friend is absolutely right. Parents ar...
4,4,2014-01-06,8,Does not the evidence show that the most impor...
...,...,...,...,...
53055,60274,2014-12-18,9,We are not going to get into that. It is Chris...
53056,60275,2014-12-18,9,"Thank you, Mr Deputy Speaker. As the hon. Gent..."
53057,60276,2014-12-18,9,"On a point of order, Mr Deputy Speaker. If you..."
53058,60277,2014-12-18,9,I do not want this debate to deteriorate any f...


In [21]:
# Raw transcript texts and their integer topic indices as parallel arrays
# (same row order as `df`).
X = df['transcript'].values
Y = df['topic'].values

In [24]:
def build_network(input_dim=300, num_classes=21):
    """Build and compile the dense topic classifier.

    Architecture: Input -> Dense(512, relu, L2) -> Dropout(0.1)
    -> Dense(256, relu, L2) -> Dropout(0.1) -> Dense(num_classes, softmax).

    Args:
        input_dim: Length of each input feature vector. Defaults to 300,
            the value previously hard-coded.
        num_classes: Number of output units. Defaults to 21, the value
            previously hard-coded.

    Returns:
        A compiled ``tf.keras`` Sequential model using the Adam optimizer,
        categorical cross-entropy loss and the ``accuracy`` metric.
    """
    # One shared L2 penalty configuration for both hidden layers.
    l2_penalty = 0.001

    model = tf.keras.models.Sequential([
        # The shape must be a tuple: `(300)` is just the int 300, whereas
        # `(300,)` is the 1-tuple the Keras API documents for `shape`.
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(512, activation='relu',
                              kernel_regularizer=tf.keras.regularizers.l2(l=l2_penalty)),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(256, activation='relu',
                              kernel_regularizer=tf.keras.regularizers.l2(l=l2_penalty)),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

    # The final layer already applies softmax, hence from_logits=False.
    # NOTE(review): CategoricalCrossentropy expects one-hot targets; confirm
    # the training code encodes the integer labels accordingly.
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0, name='categorical_crossentropy')

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=['accuracy'])

    return model

In [25]:
# Load the saved Doc2Vec model, rebuild the classifier architecture and
# restore its weights from the './topics_classifier' checkpoint.
# NOTE(review): the status object returned by load_weights is only displayed,
# not checked — confirm the checkpoint matches the architecture above.
doc2vec_model = Doc2Vec.load('doc2vec')
classifier = build_network()
classifier.load_weights('./topics_classifier')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f59005821d0>

In [26]:
def predict(X, topn=3):
    """Return the ``topn`` highest-scoring topics for one text.

    Args:
        X: A single raw text; it is cleaned with ``preprocess_text_for_lda``
            and embedded with ``doc2vec_model.infer_vector``.
        topn: How many of the top-scoring topics to return.

    Returns:
        A list of ``(topic name, score)`` tuples taken from the classifier's
        softmax output, sorted by score in descending order and cut to
        ``topn`` entries.
    """
    tokens = preprocess_text_for_lda(X)
    embedding = doc2vec_model.infer_vector(tokens)
    # The classifier takes a batch, so wrap the vector as a single-row batch.
    batch = embedding.reshape(1, len(embedding))
    scores = classifier.predict(batch)[0]
    ranked = [(topics_index_to_name_map[idx], score) for idx, score in enumerate(scores)]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:topn]

In [27]:
# Position (in X / Y) of the example speech inspected in the next cells.
i = 15686

In [28]:
# Top-3 predicted topics (name, score) for the example speech.
predict(X[i])

[('Communities and families', 0.400531),
 ('Education', 0.3681568),
 ('Social services', 0.06354658)]

In [29]:
# Labelled topic of the example speech: class index and its name.
Y[i], topics_index_to_name_map[Y[i]]

(8, 'Education')

In [30]:
# Raw transcript of the example speech.
X[i]

'The right hon. Gentleman is correct to raise that issue, of which I am conscious from the statistics that he shared with the House. That is why we have made it clear—not only through the Children and Families Act 2014—that although ethnicity is an important consideration when matching for adoption, it should not be the single guiding principle that determines whether prospective adopters take on a child with a different ethnic mix from theirs. It is also why we are helping to fund local authorities, in partnership with independent fostering agencies, to examine how they can recruit more widely across our communities so that we ensure that we have a good cross-section of people coming forward to adopt.  We need to make people aware that some of the myths and barriers that they think prevent them from adopting do not exist. We want more people to come forward, so we should do everything that we can to encourage them to do so. '

In [31]:
# Replace the integer class indices in `topic` with their topic names.
# The `.get(t, t)` fallback leaves any value that is not an index key
# untouched (e.g. names written by an earlier run of this cell), so
# re-running the cell is a no-op instead of raising KeyError.
df['topic'] = df['topic'].map(lambda t: topics_index_to_name_map.get(t, t))

In [32]:
# Run the classifier on every transcript; each cell of `predicted_topic`
# holds the list of (topic name, score) tuples returned by predict().
df['predicted_topic'] = df['transcript'].map(predict)

In [33]:
# Save the labelled speeches and their predictions to ./pred.csv.
# The row index is written too, because to_csv is called with its defaults.
df.to_csv('./pred.csv')

In [34]:
# Show the final frame: topic names plus the `predicted_topic` column.
df

Unnamed: 0,index,date,topic,transcript,predicted_topic
0,0,2014-01-06,Education,1. What assessment he has made of the perform...,"[(Parliament, government and politics, 0.15913..."
1,1,2014-01-06,Education,I associate myself and those on the Front Benc...,"[(Education, 0.9308774), (Others, 0.05554033),..."
2,2,2014-01-06,Education,"I wish you, Mr Speaker, and the whole House al...","[(Education, 0.9000825), (Others, 0.08470161),..."
3,3,2014-01-06,Education,My hon. Friend is absolutely right. Parents ar...,"[(Others, 0.47076026), (Education, 0.2952301),..."
4,4,2014-01-06,Education,Does not the evidence show that the most impor...,"[(Education, 0.8550697), (Others, 0.053401418)..."
...,...,...,...,...,...
53055,60274,2014-12-18,Employment and training,We are not going to get into that. It is Chris...,"[(Agriculture, animals, food and rural affairs..."
53056,60275,2014-12-18,Employment and training,"Thank you, Mr Deputy Speaker. As the hon. Gent...","[(Crime, civil law, justice and rights, 0.6356..."
53057,60276,2014-12-18,Employment and training,"On a point of order, Mr Deputy Speaker. If you...","[(Employment and training, 0.2972697), (Busine..."
53058,60277,2014-12-18,Employment and training,I do not want this debate to deteriorate any f...,"[(Parliament, government and politics, 0.31880..."
