# Import Libraries

In [1]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric, stem_text
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from joblib import dump, load
from sklearn.utils import class_weight
import multiprocessing as mp
from imblearn.over_sampling import SMOTE
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import parmap
import os

# Preprocessing Functions

In [2]:
topics_index_to_name_map = {
    0: 'Agriculture, animals, food and rural affairs',
    1: 'Asylum, immigration and nationality',
    2: 'Business, industry and consumers',
    3: 'Communities and families',
    4: 'Crime, civil law, justice and rights',
    5: 'Culture, media and sport',
    6: 'Defence',
    7: 'Economy and finance',
    8: 'Education',
    9: 'Employment and training',
    10: 'Energy and environment',
    11: 'European Union',
    12: 'Health services and medicine',
    13: 'Housing and planning',
    14: 'International affairs',
    15: 'Parliament, government and politics',
    16: 'Science and technology',
    17: 'Social security and pensions',
    18: 'Social services',
    19: 'Transport',
    20: 'Others'
}
topics_name_to_index_map = {y:x for x,y in topics_index_to_name_map.items()}

def strip_short2(text):
    return strip_short(text, minsize=4)


def preprocess_text(text):
    FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces, strip_tags, strip_punctuation,
                   strip_non_alphanum, strip_numeric, strip_short2]
    return preprocess_string(text, FILTERS)

def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        if 'Parliament' in topics[0]:
            t = topics[1]
        else:
            t = topics[0]
        t = t.strip()
        return topics_name_to_index_map[t]
        
    return topics_name_to_index_map[topic]

In [4]:
year_start = 2009
year_end = 2014
exclude = ''

# Read Data

In [5]:
print('preparing data!!')
dataframes = []
for year in range(year_start, year_end+1):
    dataframes.append(pd.read_csv('../data/{}_debate.csv'.format(year)))

df = pd.concat(dataframes)

df = df.drop(df[df.topic == 'admin'].index)
df = df.drop(df[df.transcript.str.split().map(len) < 10].index)
df['topic'] = df.apply(lambda row: preprocess(row['topic']), axis=1)

preparing data!!


In [6]:
if 'Parliament' in exclude:
    df = df.loc[df.topic != 15]
elif 'Others' in exclude:
    df = df.loc[df.topic != 20]

In [7]:
counts = df['topic'].value_counts()
topic_counts = {topics_index_to_name_map[key]: counts[key] for key in counts.keys()}
counts_df = pd.DataFrame.from_dict(topic_counts, orient='index', columns=['count'])
total = counts_df['count'].sum()
counts_df['fraction'] = counts_df.apply(lambda x: round(x['count']/total*100, 2), axis=1)
counts_df

Unnamed: 0,count,fraction
"Parliament, government and politics",1415,14.83
"Crime, civil law, justice and rights",1164,12.2
International affairs,725,7.6
Health services and medicine,633,6.63
Economy and finance,621,6.51
Others,601,6.3
Communities and families,595,6.23
Energy and environment,570,5.97
"Business, industry and consumers",503,5.27
Defence,470,4.93


In [8]:
df = df.sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,date,topic,transcript,labour,conservative,others
0,2011-02-15,4,23. What recent progress his Department has m...,0,162,50
1,2010-09-08,15,6. What recent discussions she has had with m...,38,534,34
2,2013-12-02,10,"With permission Mr Speaker, I would like to ma...",2617,856,5636
3,2010-11-08,7,"I beg to move, That the clause be read a Secon...",14693,8758,5711
4,2012-09-18,4,1. What steps he plans to take to ensure the ...,274,381,86
...,...,...,...,...,...,...
9538,2009-11-25,15,"With permission, Mr. Speaker, I would like to ...",4484,717,1045
9539,2012-04-17,14,3. Whether he has had discussions with the Et...,141,305,0
9540,2012-09-06,2,7. What recent assessment he has made of the ...,73,247,0
9541,2010-09-14,14,16. What recent discussions he has had with t...,164,512,0


In [9]:
X = df['transcript'].values
Y = df['topic'].values

# Preprocess Data

In [10]:
print('preprocessing data!!')
preprocessed_X = parmap.map(preprocess_text, X, pm_pbar=True)
tagged_X = [models.doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(preprocessed_X)]

preprocessing data!!


9600it [00:05, 1647.51it/s]                          


# Train Doc2Vec

In [11]:
if os.path.isfile('./models/doc2vec/doc2vec_{}_{}{}'.format(year_start, year_end, exclude)):
    print('loading doc2vec model')
    doc2vec_model = Doc2Vec.load('./models/doc2vec/doc2vec_{}_{}{}'.format(year_start, year_end, exclude))
else:
    print('training doc2vec')
    doc2vec_model = Doc2Vec(vector_size=100, window=3, workers=mp.cpu_count(), epochs=40)
    doc2vec_model.build_vocab(tagged_X)
    doc2vec_model.train(tagged_X, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)
    doc2vec_model.save('./models/doc2vec/doc2vec_{}_{}{}'.format(year_start, year_end, exclude))

training doc2vec


FileNotFoundError: [Errno 2] No such file or directory: './models/doc2vec/doc2vec_2009_2014'

# Prepare Input Data for Neural Net Classifier

In [None]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
input_Y = Y.reshape(-1,1)
enc.fit(input_Y)
input_Y = enc.transform(input_Y).toarray()

In [None]:
print('preparing inputs')

def get_doc_vec(doc):
    return doc2vec_model.infer_vector(doc.words)

inputs = parmap.map(get_doc_vec, tagged_X, pm_pbar=True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs, input_Y, stratify=Y, test_size=0.2, random_state=42)
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
print('X_train: ', X_train.shape)
print('y_train: ', y_train.shape)
print('X_test: ', X_test.shape)
print('y_test: ', y_test.shape)

# Over-Sampling

In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE('minority')

x_sm, y_sm = smote.fit_sample(X_train, y_train)
print(x_sm.shape, y_sm.shape)

# Classifier Architecture

In [3]:
def build_network():
    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(100)),
        tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.1)),
        tf.keras.layers.Dense(y_train.shape[1], activation='softmax')
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0, name='categorical_crossentropy')

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=['accuracy'])
    
    return model

# Train Neural Network

In [2]:
model = build_network()
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
# class_weights = class_weight.compute_class_weight('balanced', np.unique(Y), Y)
model.fit(x_sm, y_sm, batch_size=32, epochs=200, callbacks=[callback],
          validation_data=(X_test, y_test))

NameError: name 'tf' is not defined

# Save Model

In [None]:
model.save_weights('../models/doc2vec/classifier_{}_{}{}'.format(year_start, year_end, exclude))
dump(enc, '../models/doc2vec/encoder_{}_{}{}.joblib'.format(year_start, year_end, exclude))
doc2vec_model.save('../models/doc2vec/doc2vec_{}_{}{}'.format(year_start, year_end, exclude))

In [None]:
'./models/doc2vec/doc2vec_{}_{}{}'.format(year_start, year_end, exclude)