# Import Libraries

In [59]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.phrases import Phrases, Phraser
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import parmap

# Preprocessing Functions

In [60]:
topics_index_to_name_map = {
    0: 'Agriculture, animals, food and rural affairs',
    1: 'Asylum, immigration and nationality',
    2: 'Business, industry and consumers',
    3: 'Communities and families',
    4: 'Crime, civil law, justice and rights',
    5: 'Culture, media and sport',
    6: 'Defence',
    7: 'Economy and finance',
    8: 'Education',
    9: 'Employment and training',
    10: 'Energy and environment',
    11: 'European Union',
    12: 'Health services and medicine',
    13: 'Housing and planning',
    14: 'International affairs',
    15: 'Parliament, government and politics',
    16: 'Science and technology',
    17: 'Social security and pensions',
    18: 'Social services',
    19: 'Transport',
    20: 'Others'
}
topics_name_to_index_map = {y:x for x,y in topics_index_to_name_map.items()}

def strip_short2(text):
    return strip_short(text, minsize=4)

def remove_non_nouns(text):
    tokens = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokens)
    filter_tokens = [t[0] for t in tags if t[1] == "NN" or t[1] == "VB"]
    return ' '.join(filter_tokens)


def remove_custom_stopwords(s):
    my_stop_words = STOPWORDS.union(set(['time', 'year', 'number', 'today', 'week', 'month', 'night', 'world', 'home',
                                         'place', 'yesterday', 'life', 'wife']))
    return " ".join(w for w in s.split() if w not in my_stop_words)


def preprocess_text_for_lda(text):
    LDA_FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces, strip_tags, strip_punctuation,
                   remove_custom_stopwords, strip_short2, strip_non_alphanum, strip_numeric, remove_non_nouns]
    return preprocess_string(text, LDA_FILTERS)

def filter_multiple_topics(topic):
    if '|' in topic:
        return topic.split('|')[0].strip()
    return topic

def preprocess(topic):
    t = filter_multiple_topics(topic)
    if t == 'admin':
        t = 'Others'
        
    return topics_name_to_index_map[t]

# Read Data

In [61]:
df1 = pd.read_csv('./data/2012_speech.csv')
df2 = pd.read_csv('./data/2013_speech.csv')
df = pd.concat([df1, df2])
df = df.drop(['date'], axis=1)
df = df.drop(df[df.topic == 'admin'].index)
df = df.drop(df[df.transcript.str.split().map(len) < 10].index)
df['topic'] = df.apply(lambda row: preprocess(row['topic']), axis=1)

In [62]:
df['topic'].value_counts()

7     11690
15    11319
20    10631
4      8794
14     6868
9      5914
12     5729
3      5577
2      4576
19     4438
11     3868
6      3762
10     3753
8      3425
5      3392
0      2183
13     1334
1      1204
17     1191
16      549
18      548
Name: topic, dtype: int64

In [63]:
X = df['transcript'].values
Y = df['topic'].values

# Preprocess Data

## filter

In [None]:
preprocessed_X = parmap.map(preprocess_text_for_lda, X, pm_pbar=True)
#list(map(preprocess_text_for_lda, X))

  0%|          | 0/100745 [00:00<?, ?it/s]

## create bigram and dictionary

In [None]:
bigram_model = Phrases(preprocessed_X, min_count=1, threshold=0.5)

In [None]:
dictionary = corpora.Dictionary(bigram_model[preprocessed_X])
dictionary.filter_extremes(no_above=0.40, no_below=3)

In [58]:
import os
dictionary.save(os.path.join('.', 'topics_vocab_{}.dict'.format("2012-13")))
bigram = Phraser(bigram_model)
bigram.save(os.path.join('.', "bigram_{}.pkl".format("2012-13")))

## convert to bag of words

In [24]:
#bow_X = list(map(dictionary.doc2bow, bigram_model[preprocessed_X]))
preprocessed_X = bigram_model[preprocessed_X]
bow_X = parmap.map(dictionary.doc2bow, preprocessed_X, pm_pbar=True)

100864it [00:05, 18042.48it/s]                           


# Train LDA Model

In [25]:
from gensim.models.coherencemodel import CoherenceModel
n_topics = 1000
mallet_path = "~/Mallet/bin/mallet"

model = models.wrappers.LdaMallet(mallet_path, corpus=bow_X, num_topics=n_topics, id2word=dictionary)
coherencemodel = CoherenceModel(model=model, texts=bigram_model[preprocessed_X], dictionary=dictionary, coherence='c_v')
lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(model)
lda_model.save('./lda_model_2012-13.pkl')

# prepare inputs for NN Classifier

In [35]:
def transform_to_lda_vector(bowx):
    topics = lda_model.get_document_topics(bowx, minimum_probability=0.0)
    topic_vec = [topics[i][1] for i in range(n_topics)]
    return topic_vec

inputs_X = parmap.map(transform_to_lda_vector, bow_X, pm_pbar=True)

# for bowx in bow_X:
#     topics = lda_model.get_document_topics(bowx, minimum_probability=0.0)
#     topic_vec = [topics[i][1] for i in range(n_topics)]
#     inputs.append(topic_vec)

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
inputs_Y = Y.reshape(-1,1)
enc.fit(inputs_Y)
inputs_Y = enc.transform(inputs_Y).toarray()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs_X, inputs_Y, test_size=0.2, stratify=inputs_Y, random_state=42)
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
print('X_train: ', X_train.shape)
print('y_train: ', y_train.shape)
print('X_test: ', X_test.shape)
print('y_test: ', y_test.shape)

100864it [00:50, 1985.46it/s]                           


X_train:  (80596, 1000)
y_train:  (80596, 21)
X_test:  (20149, 1000)
y_test:  (20149, 21)


In [37]:
inputs[0]

# Neural Network Architecture

In [53]:
def build_network():
    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(n_topics)),
        tf.keras.layers.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.00001)),
        tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.00001)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(21, activation='softmax')
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0, name='categorical_crossentropy')

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=['accuracy'])
    
    return model

# Train Classifier

In [54]:
model = build_network()
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
model.fit(X_train, y_train, batch_size=32, epochs=200, callbacks=[callback], validation_data=(X_test, y_test))

Train on 80596 samples, validate on 20149 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200


<tensorflow.python.keras.callbacks.History at 0x7fab1ca0f410>

# Save Model

In [56]:
model.save_weights('./lda_topics_classifier')
