In [1]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import parmap
import swifter
from tqdm import tqdm

  import pandas.util.testing as tm


In [2]:
topics_index_to_name_map = {
    0: 'Agriculture, animals, food and rural affairs',
    1: 'Asylum, immigration and nationality',
    2: 'Business, industry and consumers',
    3: 'Communities and families',
    4: 'Crime, civil law, justice and rights',
    5: 'Culture, media and sport',
    6: 'Defence',
    7: 'Economy and finance',
    8: 'Education',
    9: 'Employment and training',
    10: 'Energy and environment',
    11: 'European Union',
    12: 'Health services and medicine',
    13: 'Housing and planning',
    14: 'International affairs',
    15: 'Parliament, government and politics',
    16: 'Science and technology',
    17: 'Social security and pensions',
    18: 'Social services',
    19: 'Transport',
    20: 'Others'
}
topics_name_to_index_map = {y:x for x,y in topics_index_to_name_map.items()}

def strip_short2(text):
    return strip_short(text, minsize=4)


def preprocess_text(text):
    FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces, strip_tags, strip_punctuation,
                   strip_non_alphanum, strip_numeric, strip_short2]
    return preprocess_string(text, FILTERS)

def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        t = topics[0]
        t = t.strip()
        return topics_name_to_index_map[t]
        
    return topics_name_to_index_map[topic]

In [3]:
misclassifications = np.zeros((21,21))

In [9]:
from joblib import dump, load

enc = load('./models/doc2vec/encoder_2014_2015.joblib')

def build_network():
    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(100)),
        tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.1)),
        tf.keras.layers.Dense(len(enc.categories_[0]), activation='softmax')
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0, name='categorical_crossentropy')

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=['accuracy'])
    
    return model
    
def predict(X, topn=3):
    preprocessed_X = preprocess_text(X)
    vec_X = doc2vec_model.infer_vector(preprocessed_X)
    vec_X = vec_X.reshape(1, vec_X.shape[0])
    pred = classifier.predict(vec_X)
    pred_i = []
    for i, p in enumerate(pred[0]):
        one_hot = np.zeros(len(pred[0]))
        one_hot[i] = 1
        pred_i += [(enc.inverse_transform([one_hot])[0][0], p)]
    pred_sorted = sorted(pred_i, key=lambda x: x[1], reverse=True)
    return pred_sorted[0]

In [10]:
doc2vec_model = Doc2Vec.load('models/doc2vec/doc2vec_2014_2015')
classifier = build_network()
classifier.load_weights('./models/doc2vec/classifier_2014_2015')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa8068d45d0>

In [52]:
df1 = pd.read_csv('./data/2016_debate.csv')
df2 = pd.read_csv('./data/2017_debate.csv')
df = pd.concat([df1, df2])
df = df.drop(df[df.topic == 'admin'].index)
df = df.drop(df[df.transcript.str.split().map(len) < 10].index).reset_index()
df['topic'] = df.apply(lambda row: preprocess(row['topic']), axis=1)

In [53]:
# df['topic'] = df.apply(lambda row: topics_index_to_name_map[row['topic']], axis=1)

In [54]:
df

Unnamed: 0,index,topic,transcript
0,0,14,(Urgent Question): To ask the Secretary of Sta...
1,1,10,"To ask the Secretary of State for Environment,..."
2,2,19,"With permission, Mr Speaker, I would like to m..."
3,3,1,(Urgent Question): To ask the Secretary of Sta...
4,4,20,"I beg to move, That this House has considered..."
...,...,...,...
3566,1777,9,3. What assessment his Department has made of ...
3567,1778,9,7. What recent assessment he has made of trend...
3568,1779,9,3. What progress has been made on implementing...
3569,1780,9,"I beg to move, That leave be given to bring i..."


In [55]:
transcripts = df.transcript.values
preds = []
for transcript in tqdm(transcripts):
    preds.append(predict(transcript))

df['predicted_topic'] = preds

100%|██████████| 3571/3571 [13:33<00:00,  4.39it/s]


In [56]:
df

Unnamed: 0,index,topic,transcript,predicted_topic
0,0,14,(Urgent Question): To ask the Secretary of Sta...,"(14, 0.83502865)"
1,1,10,"To ask the Secretary of State for Environment,...","(19, 0.48734906)"
2,2,19,"With permission, Mr Speaker, I would like to m...","(19, 0.9341402)"
3,3,1,(Urgent Question): To ask the Secretary of Sta...,"(14, 0.5255614)"
4,4,20,"I beg to move, That this House has considered...","(14, 0.5438213)"
...,...,...,...,...
3566,1777,9,3. What assessment his Department has made of ...,"(9, 0.7339471)"
3567,1778,9,7. What recent assessment he has made of trend...,"(9, 0.25465876)"
3568,1779,9,3. What progress has been made on implementing...,"(3, 0.22552373)"
3569,1780,9,"I beg to move, That leave be given to bring i...","(2, 0.50082535)"


In [57]:
df['predicted_topic'] = df['predicted_topic'].apply(lambda x: x[0])

In [58]:
df

Unnamed: 0,index,topic,transcript,predicted_topic
0,0,14,(Urgent Question): To ask the Secretary of Sta...,14
1,1,10,"To ask the Secretary of State for Environment,...",19
2,2,19,"With permission, Mr Speaker, I would like to m...",19
3,3,1,(Urgent Question): To ask the Secretary of Sta...,14
4,4,20,"I beg to move, That this House has considered...",14
...,...,...,...,...
3566,1777,9,3. What assessment his Department has made of ...,9
3567,1778,9,7. What recent assessment he has made of trend...,9
3568,1779,9,3. What progress has been made on implementing...,3
3569,1780,9,"I beg to move, That leave be given to bring i...",2


In [59]:
for index, row in df.iterrows():
    actual = row['topic']
    pred = row['predicted_topic']
    misclassifications[actual][pred] += 1

In [60]:
misclassifications

array([[ 69.,   0.,   8.,   0.,   7.,   3.,   0.,   0.,   0.,   0.,   3.,
          1.,   2.,   0.,   3.,   7.,   0.,   0.,   0.,   2.,   0.],
       [  0.,  20.,   4.,   4.,  10.,   2.,   0.,   0.,   2.,  11.,   0.,
          0.,  10.,   1.,  14.,   3.,   0.,   0.,   0.,   0.,   0.],
       [  1.,   0., 141.,   2.,  15.,  14.,   2.,  14.,   2.,   4.,   4.,
          0.,   4.,   0.,  29.,  18.,   0.,   0.,   0.,   5.,   2.],
       [  0.,  22.,  11., 143.,  35.,  15.,   2.,  16.,  10.,   8.,   2.,
          0.,   8.,   5.,   5.,  26.,   0.,   0.,   0.,  12.,   4.],
       [  0.,   0.,  12.,  16., 536.,   8.,  10.,   9.,  16.,  12.,   0.,
          2.,  17.,   0.,  24.,  39.,   0.,   0.,   0.,   2.,   5.],
       [  0.,   0.,  17.,   4.,   8., 166.,   7.,   0.,   0.,   1.,   0.,
          0.,   1.,   0.,   1.,  19.,   0.,   0.,   0.,   5.,   4.],
       [  0.,   0.,   4.,   2.,  13.,   0., 182.,   1.,   0.,   0.,   0.,
          0.,   2.,   0.,  27.,   3.,   0.,   0.,   0.,   0.,   1.],

In [61]:
row_sums = misclassifications.sum(axis=1)

In [62]:
m_norm =  misclassifications / row_sums[:, np.newaxis]

In [63]:
m_norm

array([[0.65714286, 0.        , 0.07619048, 0.        , 0.06666667,
        0.02857143, 0.        , 0.        , 0.        , 0.        ,
        0.02857143, 0.00952381, 0.01904762, 0.        , 0.02857143,
        0.06666667, 0.        , 0.        , 0.        , 0.01904762,
        0.        ],
       [0.        , 0.24691358, 0.04938272, 0.04938272, 0.12345679,
        0.02469136, 0.        , 0.        , 0.02469136, 0.13580247,
        0.        , 0.        , 0.12345679, 0.01234568, 0.17283951,
        0.03703704, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.00389105, 0.        , 0.54863813, 0.0077821 , 0.05836576,
        0.05447471, 0.0077821 , 0.05447471, 0.0077821 , 0.0155642 ,
        0.0155642 , 0.        , 0.0155642 , 0.        , 0.11284047,
        0.07003891, 0.        , 0.        , 0.        , 0.01945525,
        0.0077821 ],
       [0.        , 0.06790123, 0.03395062, 0.44135802, 0.10802469,
        0.0462963 , 0.00617284, 0.04938272, 0.0308642

In [64]:
m_overlap = np.zeros((21,21))

for i in range(21):
    for j in range(21):
        if i == j:
            m_overlap[i][j] = 1
        else:
            m_overlap[i][j] = (m_norm[i][j] + m_norm[j][i]) / 2

In [65]:
m_overlap

array([[1.        , 0.        , 0.04004076, 0.        , 0.03333333,
        0.01428571, 0.        , 0.00162338, 0.        , 0.00416667,
        0.03710729, 0.04614122, 0.0119688 , 0.        , 0.01562261,
        0.03698297, 0.02857143, 0.        , 0.        , 0.00952381,
        0.03232759],
       [0.        , 1.        , 0.02469136, 0.05864198, 0.0617284 ,
        0.01234568, 0.        , 0.00649351, 0.01755401, 0.06790123,
        0.        , 0.00689655, 0.0617284 , 0.00617284, 0.08775665,
        0.01851852, 0.        , 0.01428571, 0.        , 0.        ,
        0.01293103],
       [0.04004076, 0.02469136, 1.        , 0.02086636, 0.03765746,
        0.06371804, 0.01240169, 0.11327632, 0.05944661, 0.0390321 ,
        0.03060368, 0.09655172, 0.0090046 , 0.02272727, 0.05642023,
        0.07224573, 0.25714286, 0.        , 0.        , 0.01710226,
        0.03837381],
       [0.        , 0.05864198, 0.02086636, 1.        , 0.06531178,
        0.03173184, 0.00734174, 0.02631473, 0.0258487

In [66]:
m_sim = 1 - m_overlap

In [67]:
m_sim

array([[0.        , 1.        , 0.95995924, 1.        , 0.96666667,
        0.98571429, 1.        , 0.99837662, 1.        , 0.99583333,
        0.96289271, 0.95385878, 0.9880312 , 1.        , 0.98437739,
        0.96301703, 0.97142857, 1.        , 1.        , 0.99047619,
        0.96767241],
       [1.        , 0.        , 0.97530864, 0.94135802, 0.9382716 ,
        0.98765432, 1.        , 0.99350649, 0.98244599, 0.93209877,
        1.        , 0.99310345, 0.9382716 , 0.99382716, 0.91224335,
        0.98148148, 1.        , 0.98571429, 1.        , 1.        ,
        0.98706897],
       [0.95995924, 0.97530864, 0.        , 0.97913364, 0.96234254,
        0.93628196, 0.98759831, 0.88672368, 0.94055339, 0.9609679 ,
        0.96939632, 0.90344828, 0.9909954 , 0.97727273, 0.94357977,
        0.92775427, 0.74285714, 1.        , 1.        , 0.98289774,
        0.96162619],
       [1.        , 0.94135802, 0.97913364, 0.        , 0.93468822,
        0.96826816, 0.99265826, 0.97368527, 0.9741512

In [68]:
from scipy.cluster.hierarchy import dendrogram, linkage

In [69]:
from sklearn.cluster import AgglomerativeClustering

In [88]:
cluster = AgglomerativeClustering(n_clusters=7, affinity='euclidean', linkage='ward')
cluster_map = cluster.fit_predict(m_sim)

  return linkage(y, method='ward', metric='euclidean')


In [89]:
cluster_map

array([2, 0, 4, 5, 0, 0, 0, 0, 0, 3, 2, 6, 1, 5, 6, 0, 4, 3, 1, 2, 0])

In [90]:
for i in range(21):
    print(topics_index_to_name_map[i], cluster_map[i])

Agriculture, animals, food and rural affairs 2
Asylum, immigration and nationality 0
Business, industry and consumers 4
Communities and families 5
Crime, civil law, justice and rights 0
Culture, media and sport 0
Defence 0
Economy and finance 0
Education 0
Employment and training 3
Energy and environment 2
European Union 6
Health services and medicine 1
Housing and planning 5
International affairs 6
Parliament, government and politics 0
Science and technology 4
Social security and pensions 3
Social services 1
Transport 2
Others 0
