In [1]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.phrases import Phrases, Phraser
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Topic labels listed in class-index order: the position of a name in this
# list is the integer class id used for that topic.
topics_index_to_name_map = dict(enumerate([
    'Agriculture, animals, food and rural affairs',
    'Asylum, immigration and nationality',
    'Business, industry and consumers',
    'Communities and families',
    'Crime, civil law, justice and rights',
    'Culture, media and sport',
    'Defence',
    'Economy and finance',
    'Education',
    'Employment and training',
    'Energy and environment',
    'European Union',
    'Health services and medicine',
    'Housing and planning',
    'International affairs',
    'Parliament, government and politics',
    'Science and technology',
    'Social security and pensions',
    'Social services',
    'Transport',
    'Others',
]))

# Reverse lookup: topic name -> integer class id.
topics_name_to_index_map = {
    name: index for index, name in topics_index_to_name_map.items()
}

def strip_short2(text):
    """Apply gensim's ``strip_short`` to ``text`` with ``minsize=4``.

    Thin single-argument wrapper so the call can be used as one entry in a
    ``preprocess_string`` filter list.
    """
    min_token_length = 4
    return strip_short(text, minsize=min_token_length)

def remove_non_nouns(text):
    """Keep only the tokens that NLTK tags exactly ``NN`` or ``VB``.

    Despite the name, tokens tagged ``VB`` are retained alongside ``NN``;
    every other tag (including ``NNS``/``NNP`` variants) is dropped.

    Returns the surviving tokens joined by single spaces.
    """
    kept_tags = ("NN", "VB")
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    return ' '.join(word for word, tag in tagged_tokens if tag in kept_tags)


# Stop-word set used by remove_custom_stopwords: gensim's STOPWORDS plus a few
# corpus-specific filler words. Built once here instead of on every call,
# because the function runs once per document.
_CUSTOM_STOPWORDS = STOPWORDS.union({
    'time', 'year', 'number', 'today', 'week', 'month', 'night', 'world',
    'home', 'place', 'yesterday', 'life', 'wife',
})


def remove_custom_stopwords(s):
    """Drop stop words from a whitespace-separated string.

    Args:
        s: Input text; it is split on whitespace and compared token by token
            (case-sensitively) against ``_CUSTOM_STOPWORDS``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    return " ".join(w for w in s.split() if w not in _CUSTOM_STOPWORDS)


def preprocess_text_for_lda(text):
    """Run ``text`` through the LDA cleaning pipeline and return its tokens.

    The filters are applied in the listed order by gensim's
    ``preprocess_string``; the result is whatever that call returns for the
    final filtered string.
    """
    pipeline = [
        lambda raw: raw.lower(),      # case-fold first so stop-word matching is lowercase
        strip_multiple_whitespaces,
        strip_tags,
        strip_punctuation,
        remove_custom_stopwords,
        strip_short2,                 # minsize=4 variant of strip_short
        strip_non_alphanum,
        strip_numeric,
        remove_non_nouns,             # keeps NN / VB tagged tokens only
    ]
    return preprocess_string(text, pipeline)

def filter_multiple_topics(topic):
    """Reduce a ``'|'``-separated topic label to its first entry.

    When ``topic`` contains a pipe, the text before the first pipe is returned
    with surrounding whitespace stripped. A label without a pipe is returned
    unchanged (it is *not* stripped).
    """
    first, separator, _rest = topic.partition('|')
    return first.strip() if separator else topic

def preprocess(topic):
    """Convert a raw topic label into its integer class id.

    The label is first reduced to its leading entry by
    ``filter_multiple_topics``; a resulting ``'admin'`` is folded into
    ``'Others'``. The name is then looked up in ``topics_name_to_index_map``,
    so a label that is not a key of that map raises ``KeyError``.
    """
    primary = filter_multiple_topics(topic)
    label = 'Others' if primary == 'admin' else primary
    return topics_name_to_index_map[label]

In [3]:
# Load the speeches and discard rows whose topic is exactly 'admin'.
df = pd.read_csv('./data/2013_speech.csv')
df = df.drop(df[df.topic == 'admin'].index)
# Map each topic label to its integer class id. Series.map calls preprocess on
# the label itself, avoiding the per-row Series construction of
# DataFrame.apply(axis=1), and it also behaves correctly on an empty frame.
df['topic'] = df['topic'].map(preprocess)

In [4]:
# Oversample class 18 by stacking ten extra copies of its rows under the
# original frame. pd.concat replaces DataFrame.append, which is deprecated and
# was removed in pandas 2.0; the row order and index labels are the same as
# append produced.
# NOTE(review): the following cells build X / Y from `df`, not `df2`, so this
# oversampled frame is currently unused — confirm whether that is intended.
df2 = pd.concat([df] + [df[df.topic == 18]] * 10)

In [5]:
# Show how many rows of `df` fall under each integer topic id.
df['topic'].value_counts()

15    7457
7     6713
20    6640
4     5532
14    4585
12    3531
9     3142
3     3030
2     3019
11    2942
19    2843
6     2442
10    2304
8     2233
5     1845
0     1619
13    1387
17    1043
1     1006
16     364
18     188
Name: topic, dtype: int64

In [6]:
# Pull the raw transcripts (model input) and the integer topic ids (labels)
# out of the frame as NumPy arrays.
X, Y = df['transcript'].values, df['topic'].values

In [7]:
# Tokenise and clean every transcript; one token list per document.
preprocessed_X = [preprocess_text_for_lda(transcript) for transcript in X]

In [8]:
# Fit gensim's phrase (bigram) detector on the tokenised documents.
bigram_model = Phrases(
    preprocessed_X,
    min_count=1,
    threshold=0.5,
)

In [9]:
# Build the vocabulary from the bigram-merged documents, then prune it with
# filter_extremes using the document-frequency bounds below.
bigrammed_docs = bigram_model[preprocessed_X]
dictionary = corpora.Dictionary(bigrammed_docs)
dictionary.filter_extremes(no_below=3, no_above=0.40)

In [10]:
import os

# NOTE(review): the artifacts are tagged 2015 although the data loaded earlier
# comes from './data/2013_speech.csv' — confirm the tag is intentional.
year_tag = 2015
vocab_path = os.path.join('.', 'topics_vocab_{}.dict'.format(year_tag))
bigram_path = os.path.join('.', "bigram_{}.pkl".format(year_tag))

# Persist the pruned vocabulary, then wrap the phrase model in a Phraser and
# persist that as well.
dictionary.save(vocab_path)
bigram = Phraser(bigram_model)
bigram.save(bigram_path)

In [11]:
# Convert each bigram-merged document into its bag-of-words representation.
bow_X = [dictionary.doc2bow(doc) for doc in bigram_model[preprocessed_X]]

In [12]:
import os
from gensim.models.coherencemodel import CoherenceModel

# Number of LDA topics; also the length of the feature vector fed to the
# classifier further down.
n_topics = 400

# Location of the MALLET launcher script. Set the MALLET_PATH environment
# variable to run this notebook on another machine; the original hard-coded
# path is kept as the fallback so existing setups keep working.
mallet_path = os.environ.get("MALLET_PATH", "/home/rohit/Mallet/bin/mallet")

# Train LDA through the MALLET wrapper on the bag-of-words corpus.
model = models.wrappers.LdaMallet(mallet_path, corpus=bow_X, num_topics=n_topics, id2word=dictionary)
# Prepare a c_v coherence evaluation of the trained model against the
# bigram-merged token lists.
coherencemodel = CoherenceModel(model=model, texts=bigram_model[preprocessed_X], dictionary=dictionary, coherence='c_v')

In [13]:
# Report the c_v coherence of the MALLET model.
coherence_score = coherencemodel.get_coherence()
print(coherence_score)

# Convert the MALLET-backed model into a gensim LDA model object, which the
# later cells save and query for per-document topic distributions.
lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(model)

0.42051873442483173


In [14]:
# Persist the converted LDA model next to the notebook.
lda_model.save('./lda_model_2015.pkl')

In [15]:
# Turn every document's bag-of-words into a dense topic-probability vector of
# length n_topics.
inputs = []
for bowx in bow_X:
    topics = lda_model.get_document_topics(bowx, minimum_probability=0.0)
    # get_document_topics returns sparse (topic_id, probability) pairs. Look
    # each probability up by topic id rather than by list position, so that a
    # topic missing from the list yields 0.0 instead of shifting the
    # remaining entries or raising IndexError.
    topic_probs = dict(topics)
    topic_vec = [topic_probs.get(i, 0.0) for i in range(n_topics)]
    inputs.append(topic_vec)

# One-hot encode the integer topic ids.
# NOTE: this rebinds Y, so re-running this cell without first re-running the
# cell that defines Y would encode the already-encoded labels.
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
Y = Y.reshape(-1,1)
enc.fit(Y)
Y = enc.transform(Y).toarray()

# Stratified 80/20 split with a fixed seed, then convert everything to arrays.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs, Y, test_size=0.2, stratify=Y, random_state=42)
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
print('X_train: ', X_train.shape)
print('y_train: ', y_train.shape)
print('X_test: ', X_test.shape)
print('y_test: ', y_test.shape)

X_train:  (51092, 400)
y_train:  (51092, 21)
X_test:  (12773, 400)
y_test:  (12773, 21)


In [16]:
def build_network(n_inputs=None, n_classes=21):
    """Build and compile the topic classifier.

    The network is a single 32-unit ReLU hidden layer with 30% dropout,
    followed by a linear layer that emits one logit per class. It is compiled
    with Adam and categorical cross-entropy on logits, so targets are expected
    to be one-hot encoded.

    Args:
        n_inputs: Length of each input feature vector. Defaults to the
            module-level ``n_topics`` when omitted.
        n_classes: Number of output classes (logits). Defaults to 21.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    if n_inputs is None:
        n_inputs = n_topics

    model = tf.keras.models.Sequential([
        # The shape must be a tuple; the trailing comma matters, because
        # ``(n)`` without it is just the integer ``n``.
        tf.keras.Input(shape=(n_inputs,)),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        # No softmax here: the loss below is configured with from_logits=True.
        tf.keras.layers.Dense(n_classes)
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0, name='categorical_crossentropy')

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=['accuracy'])

    return model

In [None]:
# Build a fresh classifier. This rebinds `model`, which an earlier cell bound
# to the LdaMallet topic model.
model = build_network()
# Stop training once the monitored quantity has not improved for 5 epochs.
# NOTE(review): this monitors the training 'loss', not 'val_loss' — confirm
# that is the intended stopping criterion.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
# NOTE(review): the held-out test split is passed as validation data, so there
# is no separate untouched test set after this call.
model.fit(X_train, y_train, batch_size=32, epochs=400, callbacks=[callback], validation_data=(X_test, y_test))

In [65]:
# Save the trained classifier's weights under the './topics_classifier' prefix.
model.save_weights('./topics_classifier')
