In [9]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing as mp
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
topics_index_to_name_map = {
    0: 'Agriculture, animals, food and rural affairs',
    1: 'Asylum, immigration and nationality',
    2: 'Business, industry and consumers',
    3: 'Communities and families',
    4: 'Crime, civil law, justice and rights',
    5: 'Culture, media and sport',
    6: 'Defence',
    7: 'Economy and finance',
    8: 'Education',
    9: 'Employment and training',
    10: 'Energy and environment',
    11: 'European Union',
    12: 'Health services and medicine',
    13: 'Housing and planning',
    14: 'International affairs',
    15: 'Parliament, government and politics',
    16: 'Science and technology',
    17: 'Social security and pensions',
    18: 'Social services',
    19: 'Transport',
    20: 'Others'
}
topics_name_to_index_map = {y:x for x,y in topics_index_to_name_map.items()}

def strip_short2(text):
    return strip_short(text, minsize=4)

def remove_non_nouns(text):
    tokens = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokens)
    filter_tokens = [t[0] for t in tags if t[1] == "NN" or t[1] == "VB"]
    return ' '.join(filter_tokens)


def preprocess_text_for_lda(text):
    LDA_FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces, strip_tags, strip_punctuation,
                   strip_non_alphanum, strip_numeric, strip_short2]
    return preprocess_string(text, LDA_FILTERS)

def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        t = topics[0]
        t = t.strip()
        return topics_name_to_index_map[t]
        
    return topics_name_to_index_map[topic]

In [3]:
print('preparing data!!')
df1 = pd.read_csv('./data/2011_speech.csv')
df2 = pd.read_csv('./data/2012_speech.csv')
df3 = pd.read_csv('./data/2013_speech.csv')
df = pd.concat([df2, df3])
df = df.drop(['date'], axis=1)
df = df.drop(df[df.topic == 'admin'].index)
df = df.drop(df[df.transcript.str.split().map(len) < 10].index)
df['topic'] = df.apply(lambda row: preprocess(row['topic']), axis=1)

preparing data!!


In [4]:
df['topic'].value_counts()

7     11690
15    11319
20    10631
4      8794
14     6868
9      5914
12     5729
3      5577
2      4576
19     4438
11     3868
6      3762
10     3753
8      3425
5      3392
0      2183
13     1334
1      1204
17     1191
16      549
18      548
Name: topic, dtype: int64

In [5]:
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,topic,transcript
0,19,I welcome my hon. Friend to his new position. ...
1,11,I hear Opposition Members shouting out about w...
2,7,Would the Minister not rather be understanding...
3,9,"Exactly. Amazingly, the questionnaire process ..."
4,7,I very much doubt it; we can but hope. There a...
...,...,...
100740,14,The hon. Gentleman makes an excellent suggesti...
100741,14,"I can only ask why, then, did we not give Hans..."
100742,19,Businesses in Slough tell me that they have in...
100743,3,It is a pleasure to follow such a wise speech ...


In [6]:
X = df['transcript'].values
Y = df['topic'].values

In [7]:
print('preprocessing data!!')
preprocessed_X = list(map(preprocess_text_for_lda, X))

preprocessing data!!


In [8]:
tagged_X = [models.doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(preprocessed_X)]

In [10]:
print('training doc2vec')
doc2vec_model = Doc2Vec(vector_size=300, min_count=2, window=3, workers=mp.cpu_count(), epochs=40)
doc2vec_model.build_vocab(tagged_X)
doc2vec_model.train(tagged_X, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

training doc2vec


In [11]:
doc2vec_model.save('doc2vec')

In [12]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
Y = Y.reshape(-1,1)
enc.fit(Y)
Y = enc.transform(Y).toarray()

In [13]:
# from sklearn.preprocessing import MultiLabelBinarizer
# one_hot = MultiLabelBinarizer()
# one_hot.fit_transform(Y)
# Y = one_hot.transform(Y)

In [14]:
print('preparing inputs')
inputs = []
for x in tagged_X:
    topic_vec = doc2vec_model.infer_vector(x.words)
    inputs.append(topic_vec)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs, Y, stratify=Y, test_size=0.2, random_state=42)
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
print('X_train: ', X_train.shape)
print('y_train: ', y_train.shape)
print('X_test: ', X_test.shape)
print('y_test: ', y_test.shape)

preparing inputs
X_train:  (80596, 300)
y_train:  (80596, 21)
X_test:  (20149, 300)
y_test:  (20149, 21)


In [60]:
def build_network():
    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(300)),
        tf.keras.layers.Dense(1024, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.001)),
        tf.keras.layers.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.001)),
        tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.001)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(21, activation='softmax')
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0, name='categorical_crossentropy')

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=['accuracy'])
    
    return model

In [62]:
model = build_network()
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)
model.fit(X_train, y_train, batch_size=32, epochs=200, callbacks=[callback], validation_data=(X_test, y_test))

Train on 80596 samples, validate on 20149 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200


<tensorflow.python.keras.callbacks.History at 0x7f5bf0700cd0>

In [51]:
model.save_weights('./topics_classifier')