# Import

In [1]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.phrases import Phrases, Phraser
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def build_network(input_dim=1000, hidden_units=64, dropout_rate=0.1, num_classes=21):
    """Build and compile the feed-forward topic classifier.

    Architecture: input vector -> Dense(hidden_units, ReLU) -> Dropout(dropout_rate)
    -> Dense(num_classes). The final layer has no activation, so the model emits raw
    logits; the loss is created with ``from_logits=True`` to match.

    Args:
        input_dim: Length of each input vector. Defaults to 1000.
        hidden_units: Width of the hidden dense layer. Defaults to 64.
        dropout_rate: Dropout rate applied after the hidden layer. Defaults to 0.1.
        num_classes: Number of output logits. Defaults to 21.

    Returns:
        The compiled ``tf.keras`` Sequential model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).

    Calling ``build_network()`` with no arguments yields the same layer sizes as
    before, so previously saved weights still fit.
    """
    model = tf.keras.models.Sequential([
        # `shape` has to be a tuple: `(1000)` is just the int 1000, `(1000,)` is a 1-tuple.
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(hidden_units, activation='relu'),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(num_classes),
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(
        from_logits=True,
        label_smoothing=0,
        name='categorical_crossentropy',
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=loss_fn,
        metrics=['accuracy'],
    )

    return model

# Load Models

In [None]:
# Restore the saved artefacts from the working directory: the LDA topic model,
# the bigram phraser and the vocabulary dictionary.
# NOTE(review): gensim's `load` unpickles these files -- only load artefacts that
# come from a trusted source.
lda_model = models.ldamulticore.LdaMulticore.load('./lda_model_2015.pkl')
bigram_model = Phraser.load('./bigram_2015.pkl')
dictionary = corpora.Dictionary.load('./topics_vocab_2015.dict')
# Rebuild the classifier architecture first, then restore its weights into it;
# the layer sizes from build_network() must match the saved weights.
classifier = build_network()
classifier.load_weights('./topics_classifier')

# Preprocess

In [4]:
# Index -> topic name. The position of a name in this list is its index, which is
# what `predict` looks up with the arg-max of the classifier output.
topics_index_to_name_map = dict(enumerate([
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]))
# Reverse lookup: topic name -> index.
topics_name_to_index_map = {
    name: index for index, name in topics_index_to_name_map.items()
}

def strip_short2(text):
    """Run gensim's `strip_short` on `text` with `minsize=4` rather than its default."""
    min_token_length = 4
    return strip_short(text, minsize=min_token_length)

def remove_non_nouns(text):
    """Keep only the tokens that NLTK tags as `NN` or `VB`, joined by single spaces.

    Despite the name, tokens tagged `VB` are kept alongside `NN`; every other tag
    (including other noun/verb tags such as `NNS`) is dropped.
    """
    kept_tags = ("NN", "VB")
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return ' '.join(word for word, tag in tagged_tokens if tag in kept_tags)


# gensim's stop words plus a few extra corpus-specific words. Built once here rather
# than inside the function, which is called once per document.
_CUSTOM_STOP_WORDS = STOPWORDS.union({
    'time', 'year', 'number', 'today', 'week', 'month', 'night', 'world', 'home',
    'place', 'yesterday', 'life', 'wife',
})


def remove_custom_stopwords(s):
    """Drop stop words from a whitespace-separated string.

    Args:
        s: Input text; it is split on whitespace and matched token by token
            (case-sensitively) against the stop-word set.

    Returns:
        The remaining tokens joined by single spaces.
    """
    return " ".join(w for w in s.split() if w not in _CUSTOM_STOP_WORDS)


def preprocess_text_for_lda(text):
    """Run `text` through the LDA filter chain and return gensim's `preprocess_string` result.

    The filters are applied in exactly this order; the order is significant, so do
    not rearrange it without regenerating whatever was built on top of it.
    """
    filter_chain = (
        str.lower,
        strip_multiple_whitespaces,
        strip_tags,
        strip_punctuation,
        remove_custom_stopwords,
        strip_short2,
        strip_non_alphanum,
        strip_numeric,
        remove_non_nouns,
    )
    return preprocess_string(text, filter_chain)


def filter_multiple_topics(topic):
    """Return the first topic of a '|'-separated topic string, without surrounding whitespace.

    Previously only values containing '|' were stripped, so a single topic with
    stray whitespace came back unchanged; both cases are now normalised the same way.

    Args:
        topic: Topic label, optionally several labels separated by '|'.

    Returns:
        The text before the first '|' (or the whole string if there is none), stripped.
    """
    return topic.split('|', 1)[0].strip()

def preprocess(topic):
    """Normalise a raw topic label.

    Reduces the label to its first topic via `filter_multiple_topics`, then maps
    the label 'admin' to 'Others'. Any other label is returned as is.
    """
    primary_topic = filter_multiple_topics(topic)
    return 'Others' if primary_topic == 'admin' else primary_topic

# Load Data

In [18]:
# Load the 2017 speeches and normalise the topic labels.
df = pd.read_csv('./data/2017_speech.csv')
# Only the 'topic' column is needed, so map over that Series instead of applying
# a lambda to every full row (same result, far less per-row overhead).
df['topic'] = df['topic'].map(preprocess)

In [19]:
# Show the loaded frame; pandas abbreviates the rendering to the first and last rows.
df

Unnamed: 0,date,topic,transcript
0,2017-01-09,Employment and training,2. What recent assessment he has made of trend...
1,2017-01-09,Employment and training,The Government support those who aspire to be ...
2,2017-01-09,Employment and training,A Citizens Advice report in August 2015 said t...
3,2017-01-09,Employment and training,The hon. Gentleman is right that there should ...
4,2017-01-09,Employment and training,"Happy new year, Mr Speaker. Does my right hon..."
...,...,...,...
61087,2017-12-21,Transport,I congratulate my hon. Friend on initiating th...
61088,2017-12-21,Transport,I shall deal with the issue of careless versus...
61089,2017-12-21,Transport,"I am a member of the Justice Committee, and on..."
61090,2017-12-21,Transport,"Again, I thank my hon. Friend for her helpful ..."


# Predict

In [20]:
def predict(text):
    """Predict the topic name for one piece of text.

    Pipeline: LDA preprocessing -> bigram phraser -> bag-of-words -> LDA topic
    distribution -> classifier -> arg-max index -> topic name.

    Args:
        text: Raw text to classify.

    Returns:
        The name from `topics_index_to_name_map` for the highest-scoring class.

    NOTE(review): in the saved outputs the same transcript (row position 4) gets a
    different label from the single call than from the full-frame run, so some
    stage appears to be non-deterministic -- presumably LDA inference; confirm.
    """
    tokens = preprocess_text_for_lda(text)
    bigrams = bigram_model[tokens]
    bow = dictionary.doc2bow(bigrams)
    topic_probs = lda_model.get_document_topics(bow, minimum_probability=0.0)

    # Fill the dense vector by topic id rather than by list position: gensim clamps
    # `minimum_probability` to a small positive floor, so a topic can be missing
    # from the returned list, which would otherwise shift every later feature.
    lda_vector = np.zeros(lda_model.num_topics)
    for topic_id, probability in topic_probs:
        lda_vector[topic_id] = probability

    # The classifier expects a batch dimension; this is a batch of one.
    scores = classifier.predict(lda_vector.reshape(1, -1))
    return topics_index_to_name_map[int(np.argmax(scores))]
    

In [21]:
# Take the transcript at row position 4 as a sample input and print it in full
# (the frame display above abbreviates it).
text = df['transcript'].iloc[4]
print(text)

Happy new year, Mr Speaker.  Does my right hon. Friend agree that rather than denigrating people who become self-employed, we ought to be celebrating the fact that they are prepared to take a risk that many others are not? Will he make it as easy as possible for them to take on new employees and become employers themselves? 


In [22]:
# Classify the sample transcript; compare the result with that row's 'topic' value.
predict(text)

'Energy and environment'

In [23]:
# Label every transcript. `predict` takes one text at a time, so this performs one
# classifier call per row; only the 'transcript' column is needed, so apply over
# that Series rather than over full rows.
df['predicted_topic'] = df['transcript'].apply(predict)

In [24]:
# Count rows per (actual topic, predicted topic) pair and write the table to CSV.
# The helper column of ones gives `count()` something to tally.
df['count'] = 1
(
    df[['topic', 'predicted_topic', 'count']]
    .groupby(['topic', 'predicted_topic'])
    .count()
    .to_csv('./predictions.csv')
)

In [25]:
# Full transcript text at row position 99.
df['transcript'].iloc[99]

'My hon. Friend is entirely correct about the OBR’s projection at that time, but he will have noticed that that came in the context of considerably larger projected employment growth. '

In [26]:
# Show the frame again, now with the 'predicted_topic' and 'count' columns added above.
df

Unnamed: 0,date,topic,transcript,predicted_topic,count
0,2017-01-09,Employment and training,2. What recent assessment he has made of trend...,Economy and finance,1
1,2017-01-09,Employment and training,The Government support those who aspire to be ...,"Crime, civil law, justice and rights",1
2,2017-01-09,Employment and training,A Citizens Advice report in August 2015 said t...,Transport,1
3,2017-01-09,Employment and training,The hon. Gentleman is right that there should ...,European Union,1
4,2017-01-09,Employment and training,"Happy new year, Mr Speaker. Does my right hon...",International affairs,1
...,...,...,...,...,...
61087,2017-12-21,Transport,I congratulate my hon. Friend on initiating th...,Economy and finance,1
61088,2017-12-21,Transport,I shall deal with the issue of careless versus...,Others,1
61089,2017-12-21,Transport,"I am a member of the Justice Committee, and on...","Crime, civil law, justice and rights",1
61090,2017-12-21,Transport,"Again, I thank my hon. Friend for her helpful ...","Crime, civil law, justice and rights",1


In [17]:
# NOTE(review): this cell carries execution count 17, i.e. it ran before the
# In [18] cell that loads the 2017 CSV, and the frame displayed above ends at
# index 61090. Position 64639 therefore looks out of range for the current `df`
# and would raise IndexError on Restart & Run All -- confirm and update or drop.
df.iloc[64639]['transcript']

'I accept everything that the Minister has said; that is logical. What I am concerned about is this: where is a unit such as the Royal Horse Artillery, which needs to be close to central London, going to go? We have had all these facilities built in Woolwich specifically for the Royal Horse Artillery, and now, a few years after producing them, we are going to throw them all away. It does not seem to make sense to me. '