# Import Libraries

In [1]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import parmap
import swifter

  import pandas.util.testing as tm


# Preprocessing Function

In [2]:
topics_index_to_name_map = {
    0: 'Agriculture, animals, food and rural affairs',
    1: 'Asylum, immigration and nationality',
    2: 'Business, industry and consumers',
    3: 'Communities and families',
    4: 'Crime, civil law, justice and rights',
    5: 'Culture, media and sport',
    6: 'Defence',
    7: 'Economy and finance',
    8: 'Education',
    9: 'Employment and training',
    10: 'Energy and environment',
    11: 'European Union',
    12: 'Health services and medicine',
    13: 'Housing and planning',
    14: 'International affairs',
    15: 'Parliament, government and politics',
    16: 'Science and technology',
    17: 'Social security and pensions',
    18: 'Social services',
    19: 'Transport',
    20: 'Others'
}
topics_name_to_index_map = {y:x for x,y in topics_index_to_name_map.items()}

def strip_short2(text):
    return strip_short(text, minsize=4)

def remove_non_nouns(text):
    tokens = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokens)
    filter_tokens = [t[0] for t in tags if t[1] == "NN" or t[1] == "VB"]
    return ' '.join(filter_tokens)


def remove_custom_stopwords(s):
    my_stop_words = STOPWORDS.union(set(['time', 'year', 'number', 'today', 'week', 'month', 'night', 'world', 'home',
                                         'place', 'yesterday', 'life', 'wife']))
    return " ".join(w for w in s.split() if w not in my_stop_words)


def preprocess_text_for_lda(text):
    LDA_FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces, strip_tags, strip_punctuation,
                   remove_custom_stopwords, strip_non_alphanum, strip_numeric, strip_short2]
    return preprocess_string(text, LDA_FILTERS)

def filter_multiple_topics(topic):
    if '|' in topic:
        return topic.split('|')[0].strip()
    return topic

def preprocess(topic):
    t = filter_multiple_topics(topic)
    if t == 'admin':
        t = 'Others'
        
    return topics_name_to_index_map[t]

# Predict Function

In [11]:
def predict(X, topn=3):
    preprocessed_X = preprocess_text_for_lda(X)
    vec_X = doc2vec_model.infer_vector(preprocessed_X)
    print(vec_X.shape)
    vec_X = vec_X.reshape(1, vec_X.shape[0])
    pred = classifier.predict(vec_X)
    pred_i = [(topics_index_to_name_map[i], p) for i,p in enumerate(pred[0])]
    pred_sorted = sorted(pred_i, key=lambda x: x[1], reverse=True)
    return pred_sorted[:topn]

# Network Architecture

In [12]:
def build_network():
    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(100)),
        tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.1)),
        tf.keras.layers.Dense(21, activation='softmax')
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0, name='categorical_crossentropy')

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=['accuracy'])
    
    return model

# Load Saved Models

In [18]:
doc2vec_model = Doc2Vec.load('models/doc2vec/doc2vec_15_16')
classifier = build_network()
classifier.load_weights('./models/doc2vec/classifier_15_16')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f7a6da91990>

# Test for parliament debates

In [19]:
df = pd.read_csv('./data/2014_debate.csv')
df = df.drop(df[df.topic == 'admin'].index)
df = df.drop(df[df.transcript.str.split().map(len) < 10].index).reset_index()
df['topic'] = df.apply(lambda row: preprocess(row['topic']), axis=1)

In [20]:
df

Unnamed: 0,index,topic,transcript
0,0,8,1. What assessment he has made of the perform...
1,1,8,2. How many applications his Department has r...
2,2,8,3. What steps his Department is taking to imp...
3,3,8,4. What systems his Department has in place f...
4,4,8,5. What progress he has made on encouraging t...
...,...,...,...
1839,1892,10,11. What recent assessment he has made of tre...
1840,1893,10,16. What estimate he has made of how much sub...
1841,1894,4,(Urgent Question): To ask the Secretary of Sta...
1842,1895,13,(Urgent Question): To ask the Minister to make...


In [21]:
X = df['transcript'].values
Y = df['topic'].values
i = 45
print(predict(X[i]))
print('true: ', Y[i], topics_index_to_name_map[Y[i]])

(100,)
[('Agriculture, animals, food and rural affairs', 0.68747365), ('Energy and environment', 0.13315), ('Health services and medicine', 0.03201327)]
true:  0 Agriculture, animals, food and rural affairs


In [22]:
X[i]

'2.  What assessment he has made of the scope for cutting red tape in the farming industry. [901855]  We are committed to freeing farmers from red tape to help them to seize economic opportunities. We are reducing paperwork burdens and making guidance clearer and simpler. Farmers who play by the rules now receive fewer inspections. For example, 740 members of the Environment Agency’s pig and poultry scheme are inspected once every three years, rather than annually. I expect to make an announcement shortly on further opportunities for cutting red tape as a result of the agriculture red tape challenge.  I thank the Minister for that answer, but for many farmers in my constituency overly complex livestock identification and movement controls remain a burden on their businesses. What plans does the Minister have to simplify this regime?  My hon. Friend makes a good point. Considerable progress has already been made on livestock identification and the complex rules governing animal movement

In [23]:
df['topic'] = df.apply(lambda row: topics_index_to_name_map[row['topic']], axis=1)

In [24]:
df['predicted_topic'] = df.swifter.apply(lambda row: predict(row['transcript']), axis=1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1844.0, style=ProgressStyle(descriptio…

(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)

(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)

In [123]:
df.to_csv('./doc2vec_debate_2014_pred.csv')

In [124]:
df

Unnamed: 0,index,topic,transcript,predicted_topic
0,0,Education,1. What assessment he has made of the perform...,"[(Education, 0.9740481), (Business, industry a..."
1,1,Education,2. How many applications his Department has r...,"[(Education, 0.98344123), (Business, industry ..."
2,2,Education,3. What steps his Department is taking to imp...,"[(Education, 0.44933805), (Business, industry ..."
3,3,Education,4. What systems his Department has in place f...,"[(Education, 0.7928537), (Crime, civil law, ju..."
4,4,Education,5. What progress he has made on encouraging t...,"[(Education, 0.8934695), (Business, industry a..."
...,...,...,...,...
1839,1892,Energy and environment,11. What recent assessment he has made of tre...,"[(Energy and environment, 0.9285043), (Economy..."
1840,1893,Energy and environment,16. What estimate he has made of how much sub...,"[(Energy and environment, 0.6575643), (Economy..."
1841,1894,"Crime, civil law, justice and rights",(Urgent Question): To ask the Secretary of Sta...,"[(Health services and medicine, 0.8022428), (T..."
1842,1895,Housing and planning,(Urgent Question): To ask the Minister to make...,"[(International affairs, 0.37547308), (Economy..."


# Test for Nexis news data

In [12]:
df = pd.read_csv('./data/news.csv', encoding='utf8')
df = df.dropna(subset=['transcript'])
df = df.drop(['Unnamed: 0'], axis=1)

In [13]:
df

147284

In [14]:
df['predicted_topic'] = df.swifter.apply(lambda row: predict(row['transcript']), axis=1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=147284.0, style=ProgressStyle(descript…




In [17]:
df = df.drop(['Unnamed: 0', 'topic'], axis=1)
df

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,predicted_topic
0,163795,Belfast Telegraph,1,1,2014,Hunt begins for team to secure cultural legacy,"As the clock struck midnight last night, Londo...","[(Parliament, government and politics, 0.72444..."
1,163795,Belfast Telegraph,1,1,2014,Contractor to pay for power cut,Northern Ireland Electricity was flooded with ...,"[(Energy and environment, 0.38718176), (Busine..."
2,163795,Belfast Telegraph,1,1,2014,Well-known city hotelier celebrates MBE,Almost a quarter-ofa-century spent transformin...,"[(Parliament, government and politics, 0.18250..."
3,163795,Belfast Telegraph,1,1,2014,"Derry treated as poor relation, say rail campa...",Campaigners from Into The West argued that Der...,"[(Transport, 0.6714804), (Parliament, governme..."
4,163795,Belfast Telegraph,1,1,2014,Ulster are not good enough;\nTONY WARD'S DAMNI...,The Dublin-based columnist blasted Mark Anscom...,"[(Others, 0.25496763), (Parliament, government..."
...,...,...,...,...,...,...,...,...
147285,412338,Wales,31,1,2014,Wales weather: Fresh storms threaten more floo...,"A combination of driving rain, high tides, a p...","[(Parliament, government and politics, 0.44981..."
147286,412338,Wales,31,1,2014,Wales weather: Weekend high tides expected to ...,"A combination of driving rain, high tides, a p...","[(Parliament, government and politics, 0.55013..."
147287,412338,Wales,31,1,2014,Live transfer deadline day: All the breaking C...,deadline day blog,"[(Business, industry and consumers, 0.09912991..."
147288,412338,Wales,31,1,2014,Steve Tucker: February's Premier League fixtur...,Indeed it is hard to think of a month more piv...,"[(Culture, media and sport, 0.282427), (Others..."


In [18]:
df.to_csv('./doc2vec_news_pred3.csv')

In [58]:
i=2501
df.iloc[i]['predicted_topic']

[('Culture, media and sport', 0.3676517),
 ('Transport', 0.29729465),
 ('Others', 0.12573819)]

In [59]:
df.iloc[i]['program_name']

"WATCH: Giant New Year's Eve duck explodes just hours before celebrations"

In [60]:
df.iloc[i]['transcript']

'The duck had been on display for 11 days in the port of Keelung, Taiwan. But it randomly deflated in front of crowds of people, putting a dampener on the New Year\'s event. There is speculation that the duck was attacked by eagles, causing it to burst, reported the Mail Online. The oversized duck was designed by Dutch artist Florentijn Hofman and has travelled the world. Related articles The top new TV shows I\'d love to see in 2014 After the party comes the clean up! Work begins to tidy up after New Year extravaganza Huang Jing-tai, one of the organiser\'s for the port\'s New Years Even event, said: "We want to apologise to the fans of the yellow rubber duck. "The weather is fine today and we haven\'t found the cause of the problem. "We will carefully examine the duck to determine the cause."'

# Test for BBC transcripts

In [None]:
df = pd.read_csv('./data/News jan-2014 175.csv')