# Import

In [1]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.phrases import Phrases, Phraser
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf

In [10]:
def build_network(input_dim=1000, num_classes=21, l2_strength=0.00001):
    """Build and compile the dense classifier applied to LDA topic vectors.

    The architecture is Input -> Dense(512) -> Dense(256) -> Dense(128) ->
    Dense(num_classes, softmax); the first two hidden layers carry an L2
    kernel penalty. The defaults reproduce the original fixed network
    (1000 inputs, 21 classes, 1e-5 penalty), so existing callers and saved
    weights are unaffected.

    Args:
        input_dim: Length of each input feature vector.
        num_classes: Number of softmax output units.
        l2_strength: L2 regularisation factor for the first two hidden layers.

    Returns:
        A compiled ``tf.keras`` Sequential model using Adam, categorical
        cross-entropy (on probabilities, no label smoothing) and accuracy.
    """
    def l2_penalty():
        # Positional on purpose: the keyword was `l` in older tf.keras
        # releases and `l2` in newer ones; the first positional slot is the
        # regularisation factor in both.
        return tf.keras.regularizers.l2(l2_strength)

    model = tf.keras.models.Sequential([
        # `shape` must be a tuple; `(1000)` without the trailing comma is a
        # plain int, which only some Keras versions silently coerce.
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(512, activation='relu', kernel_regularizer=l2_penalty()),
        tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=l2_penalty()),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(
        from_logits=False,  # the final layer already applies softmax
        label_smoothing=0,
        name='categorical_crossentropy',
    )

    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=loss_fn,
                  metrics=['accuracy'])

    return model

# Load Models

In [11]:
# Restore the saved preprocessing / topic-model artefacts. The three loads are
# independent of each other, so their order does not matter.
# NOTE(review): gensim's .load() is pickle-based as far as I know -- only load
# files from a trusted source.
dictionary = corpora.Dictionary.load('./topics_vocab_2012-13.dict')
bigram_model = Phraser.load('./bigram_2012-13.pkl')
lda_model = models.ldamulticore.LdaMulticore.load('./lda_model_2012-13.pkl')

# Rebuild the classifier architecture, then restore its trained weights.
# load_weights returns a status object, which is what the cell displays.
classifier = build_network()
classifier.load_weights('./lda_topics_classifier')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7ffa112f1990>

# Preprocess

In [12]:
# Class index -> topic label. The list position is the class index, so the
# order of the entries must not change.
topics_index_to_name_map = dict(enumerate([
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]))

# Reverse lookup: topic label -> class index.
topics_name_to_index_map = {
    name: index for index, name in topics_index_to_name_map.items()
}

def strip_short2(text):
    """Apply gensim's ``strip_short`` to ``text`` with ``minsize=4``.

    Exists so the call can be used as a single-argument filter in a
    ``preprocess_string`` pipeline.
    """
    without_short_words = strip_short(text, minsize=4)
    return without_short_words

def remove_non_nouns(text):
    """Keep only the tokens that NLTK tags exactly ``NN`` or ``VB``.

    Despite the name, tokens tagged ``VB`` are retained as well, and other
    noun tags (anything that is not exactly ``NN``) are dropped.

    Args:
        text: Input string; it is tokenised with ``nltk.word_tokenize``.

    Returns:
        The surviving tokens joined by single spaces, in their original order.
    """
    kept_tags = ("NN", "VB")
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    survivors = []
    for word, tag in tagged_tokens:
        if tag in kept_tags:
            survivors.append(word)
    return ' '.join(survivors)


def remove_custom_stopwords(s):
    """Remove gensim's stopwords plus a few extra generic words from ``s``.

    The string is split on whitespace and compared word by word (exact,
    case-sensitive match); the remaining words are rejoined with single
    spaces.
    """
    extra_words = {
        'time', 'year', 'number', 'today', 'week', 'month', 'night',
        'world', 'home', 'place', 'yesterday', 'life', 'wife',
    }
    blocked = STOPWORDS | extra_words
    kept = [word for word in s.split() if word not in blocked]
    return " ".join(kept)


def preprocess_text_for_lda(text):
    """Run ``text`` through the cleaning pipeline used before LDA inference.

    Filters are applied by gensim's ``preprocess_string`` in the order listed
    below; the order is significant (for example, lower-casing happens before
    stopword removal, and POS filtering runs last).

    Returns:
        Whatever ``preprocess_string`` returns for the filtered text
        (gensim documents this as a list of tokens).
    """
    def to_lower(value):
        return value.lower()

    pipeline = [
        to_lower,
        strip_multiple_whitespaces,
        strip_tags,
        strip_punctuation,
        remove_custom_stopwords,
        strip_short2,
        strip_non_alphanum,
        strip_numeric,
        remove_non_nouns,
    ]
    return preprocess_string(text, pipeline)


def filter_multiple_topics(topic):
    """Reduce a ``'|'``-separated topic string to its first label.

    When ``topic`` contains a ``'|'``, the text before the first one is
    returned with surrounding whitespace stripped. A string without ``'|'``
    is returned unchanged (it is *not* stripped).
    """
    first_label, separator, _rest = topic.partition('|')
    if not separator:
        return topic
    return first_label.strip()

def preprocess(topic):
    """Normalise a raw topic label.

    Multi-label values are reduced to their first label via
    ``filter_multiple_topics``; if that label is ``'admin'`` it is mapped to
    ``'Others'``. Every other label is returned as-is.
    """
    primary = filter_multiple_topics(topic)
    return 'Others' if primary == 'admin' else primary

# Load Data

In [13]:
df = pd.read_csv('./data/2014_speech.csv')

# Discard rows whose topic is exactly 'admin'. Boolean masks replace the
# original drop-by-index-label form; on the fresh, unique index produced by
# read_csv the two select the same rows.
df = df[df['topic'] != 'admin']

# Keep only transcripts with at least 10 whitespace-separated words.
# reset_index() (without drop=True) is kept deliberately: it preserves the
# pre-filter row labels in an 'index' column, as before.
word_counts = df['transcript'].str.split().map(len)
df = df[word_counts >= 10].reset_index()

# Normalise labels column-wise instead of with a row-wise df.apply(axis=1):
# same values, without building a row Series per record. Because exact
# 'admin' rows are already gone, the 'admin' -> 'Others' rule inside
# preprocess() only affects multi-label values whose first label is 'admin'.
df['topic'] = df['topic'].map(preprocess)



In [14]:
# Show the filtered frame (the rendered repr is truncated to the first and last rows).
df

Unnamed: 0,index,date,topic,transcript
0,0,2014-01-06,Education,1. What assessment he has made of the perform...
1,1,2014-01-06,Education,I associate myself and those on the Front Benc...
2,2,2014-01-06,Education,"I wish you, Mr Speaker, and the whole House al..."
3,3,2014-01-06,Education,My hon. Friend is absolutely right. Parents ar...
4,4,2014-01-06,Education,Does not the evidence show that the most impor...
...,...,...,...,...
53055,60274,2014-12-18,Employment and training,We are not going to get into that. It is Chris...
53056,60275,2014-12-18,Employment and training,"Thank you, Mr Deputy Speaker. As the hon. Gent..."
53057,60276,2014-12-18,Employment and training,"On a point of order, Mr Deputy Speaker. If you..."
53058,60277,2014-12-18,Employment and training,I do not want this debate to deteriorate any f...


# Predict

In [15]:
def predict(text, topn=3):
    """Return the ``topn`` most likely topic labels for a piece of text.

    Pipeline: clean the text, apply the bigram model, convert to bag-of-words,
    infer the LDA topic distribution, and feed that distribution to the
    classifier.

    Args:
        text: Raw speech text.
        topn: How many (label, score) pairs to return.

    Returns:
        A list of ``(topic_name, score)`` tuples sorted by descending score,
        truncated to ``topn`` entries.

    NOTE(review): the recorded outputs for the same row differ between the
    single call and the batch run (0.4627 vs 0.3463 for 'Education'), so
    scores are not reproducible run to run -- presumably because LDA
    inference is randomly initialised; confirm and seed if determinism
    matters.
    """
    tokens = preprocess_text_for_lda(text)
    bow = dictionary.doc2bow(bigram_model[tokens])
    topic_probs = lda_model.get_document_topics(bow, minimum_probability=0.0)

    # Scatter the (topic_id, probability) pairs into a dense row by topic id
    # rather than assuming every topic is returned, in order. If any topic is
    # omitted from the result, the old positional list + reshape(1, 1000)
    # would fail or misalign features. Missing topics stay at 0.
    features = np.zeros((1, lda_model.num_topics), dtype=np.float32)
    for topic_id, probability in topic_probs:
        features[0, topic_id] = probability

    scores = classifier.predict(features)[0]
    labelled = [(topics_index_to_name_map[i], score) for i, score in enumerate(scores)]
    # sorted() is stable, so ties keep class-index order.
    ranked = sorted(labelled, key=lambda pair: pair[1], reverse=True)
    return ranked[:topn]
    

In [16]:
# Take the transcript at position 4 as a worked example and print it in full.
text = df['transcript'].iloc[4]
print(text)

Does not the evidence show that the most important factor is the quality of teaching in our schools? Thousands of schools around the country have chosen not to go down the academy route. Will the Secretary of State join me in congratulating Ranworth Square primary school in my constituency, where the majority of children are on free school meals but where last summer 93% achieved at least a level 4 in English, maths and writing? 


In [17]:
# Top-3 (topic, score) predictions for the example transcript.
predict(text)

[('Education', 0.46267793), ('Others', 0.15650004), ('Transport', 0.05563398)]

In [18]:
# Score every transcript. Series.map passes each transcript straight to
# predict() and always yields one list per row, avoiding both the per-row
# Series construction of df.apply(axis=1) and its special handling of
# list-like return values.
# NOTE(review): this calls classifier.predict once per row and is slow on the
# full frame; consider batching if this cell is re-run often.
df['predicted_topic'] = df['transcript'].map(predict)

In [19]:
# Persist the frame, including the predicted_topic column, to CSV.
# NOTE(review): to_csv writes the row index by default, which yields an extra
# unnamed column on reload (the input file already shows an 'Unnamed: 0'
# column of this kind) -- consider index=False if no consumer relies on it.
# NOTE(review): predicted_topic holds lists of tuples, which CSV stores as
# their string repr; they will need parsing when read back.
df.to_csv('./lda_pred.csv')

In [20]:
# Show the frame again, now with the predicted_topic column added.
df

Unnamed: 0,index,date,topic,transcript,predicted_topic
0,0,2014-01-06,Education,1. What assessment he has made of the perform...,"[(Parliament, government and politics, 0.14581..."
1,1,2014-01-06,Education,I associate myself and those on the Front Benc...,"[(Communities and families, 0.3535391), (Parli..."
2,2,2014-01-06,Education,"I wish you, Mr Speaker, and the whole House al...","[(Education, 0.5249328), (Employment and train..."
3,3,2014-01-06,Education,My hon. Friend is absolutely right. Parents ar...,"[(Education, 0.683318), (Health services and m..."
4,4,2014-01-06,Education,Does not the evidence show that the most impor...,"[(Education, 0.34625402), (Others, 0.177424), ..."
...,...,...,...,...,...
53055,60274,2014-12-18,Employment and training,We are not going to get into that. It is Chris...,"[(Energy and environment, 0.1504166), (Parliam..."
53056,60275,2014-12-18,Employment and training,"Thank you, Mr Deputy Speaker. As the hon. Gent...","[(Others, 0.13572422), (Parliament, government..."
53057,60276,2014-12-18,Employment and training,"On a point of order, Mr Deputy Speaker. If you...","[(European Union, 0.24168938), (Economy and fi..."
53058,60277,2014-12-18,Employment and training,I do not want this debate to deteriorate any f...,"[(Parliament, government and politics, 0.14678..."
