# Import Libraries

In [27]:
import pandas as pd
from gensim import models, corpora
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation
from gensim.parsing.preprocessing import remove_stopwords, stem_text, strip_non_alphanum, strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_short, strip_numeric
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing as mp
from sklearn.utils import class_weight
import nltk
import pandas as pd
import numpy as np
import tensorflow as tf
import parmap
import swifter

# Some constants and Preprocessing methods

In [28]:
topics_index_to_name_map = {
    0: 'Agriculture, animals, food and rural affairs',
    1: 'Asylum, immigration and nationality',
    2: 'Business, industry and consumers',
    3: 'Communities and families',
    4: 'Crime, civil law, justice and rights',
    5: 'Culture, media and sport',
    6: 'Defence',
    7: 'Economy and finance',
    8: 'Education',
    9: 'Employment and training',
    10: 'Energy and environment',
    11: 'European Union',
    12: 'Health services and medicine',
    13: 'Housing and planning',
    14: 'International affairs',
    15: 'Parliament, government and politics',
    16: 'Science and technology',
    17: 'Social security and pensions',
    18: 'Social services',
    19: 'Transport',
    20: 'Others'
}
topics_name_to_index_map = {y:x for x,y in topics_index_to_name_map.items()}

def strip_short2(text):
    return strip_short(text, minsize=4)


def preprocess_text(text):
    FILTERS = [lambda x: x.lower(), strip_multiple_whitespaces, strip_tags, strip_punctuation,
                   strip_non_alphanum, strip_numeric, strip_short2]
    return preprocess_string(text, FILTERS)

def preprocess(topic):
    ret = []
    topic = topic.strip()
    
    if '|' in topic:
        topics = topic.split('|')
        t = topics[0]
        t = t.strip()
        return topics_name_to_index_map[t]
        
    return topics_name_to_index_map[topic]

In [29]:
year = 2017
drop = '_no_Others'

# Read news predictions

In [30]:
df = pd.read_csv('../data/news_predictions/news_{}_predictions.csv'.format(year))

# Select one month to train our classifier

In [31]:
df = df.loc[df.month.isin([1,3,4,5,6])]
df = df.drop(['Unnamed: 0'], axis=1)
df = df.reset_index(drop=True)
df

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,parliament,top1_topic,top1_acc,top2_topic,top2_acc,top3_topic,top3_acc
0,400553,Belfast Telegraph,1,1,2017,Yaya Toure reveals Pep Guardiola's anger at Ma...,A lacklustre City lost 1-0 at second-placed Li...,1,"Culture, media and sport",96.34,"Crime, civil law, justice and rights",0.79,"Business, industry and consumers",0.75
1,400553,Belfast Telegraph,1,1,2017,Everton boss Ronald Koeman issues warning to S...,Koeman quit Southampton to take charge at Good...,1,Others,43.70,Communities and families,17.49,"Culture, media and sport",16.94
2,400553,Belfast Telegraph,1,1,2017,Stoke manager Mark Hughes hails Peter Crouch's...,Crouch netted in Stoke's 4-2 loss at Chelsea a...,1,"Crime, civil law, justice and rights",22.13,"Parliament, government and politics",21.14,"Culture, media and sport",20.00
3,400553,Belfast Telegraph,1,1,2017,"David Moyes braced for ""difficult"" January win...",For the second straight season the Black Cats ...,1,"Business, industry and consumers",29.02,Others,21.56,"Culture, media and sport",11.25
4,400553,Belfast Telegraph,1,1,2017,Georginio Wijnaldum unfazed by Chelsea's winni...,"A 13th successive victory, tying Arsenal's eff...",1,"Culture, media and sport",97.40,Others,0.69,Education,0.58
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692700,412338,Wales,30,6,2017,Idriss Saadi to be given the chance to impress...,But a summer exit for the Algeria internationa...,1,"Culture, media and sport",32.85,Others,17.47,Education,7.46
692701,412338,Wales,30,6,2017,Children in Vietnam gave this amazing renditio...,And the words were displayed phonetically on a...,1,"Parliament, government and politics",24.29,"Culture, media and sport",21.36,Others,19.74
692702,412338,Wales,30,6,2017,The reasons for Warren Gatland's major gamble ...,"The call has divided opinion, based on the rel...",1,Others,24.83,"Culture, media and sport",22.66,"Parliament, government and politics",14.68
692703,412338,Wales,30,6,2017,999 turns 80 years old - the most ridiculous e...,The last eight decades have seen the service e...,1,"Culture, media and sport",49.40,Others,12.63,Communities and families,12.10


drop rows where transcript is None or empty

In [32]:
df  = df.dropna(subset=['transcript'])

In [33]:
total_articles = len(df)
print('total number of news articles: {}'.format(total_articles))

total number of news articles: 692673


In [34]:
df['top1_topic'].value_counts()

Culture, media and sport                        208426
Others                                          198430
Parliament, government and politics              47783
Crime, civil law, justice and rights             40425
Health services and medicine                     32678
Economy and finance                              22594
International affairs                            21138
Business, industry and consumers                 20536
Energy and environment                           19171
Defence                                          15218
Transport                                        14840
Agriculture, animals, food and rural affairs     14621
European Union                                   12195
Communities and families                          9358
Education                                         8578
Asylum, immigration and nationality               1926
Science and technology                            1768
Employment and training                           1513
Housing an

Drop those where top topic accuracy is less than 40% or is Others

In [35]:
df = df[df.top1_acc >= 40]

if drop == '_no_Others':
    df = df[df.top1_topic != 'Others']
elif drop == '_no_Others_Parliament':
    df = df[df.top1_topic != 'Others']
    df = df[df.top1_topic != 'Parliament, government and politics']

In [36]:
df['top1_topic'].value_counts()

Culture, media and sport                        163093
Parliament, government and politics              25447
Crime, civil law, justice and rights             23488
Health services and medicine                     23226
Economy and finance                              15217
International affairs                            13133
Energy and environment                           10974
Defence                                          10054
Transport                                         9741
Business, industry and consumers                  8965
Agriculture, animals, food and rural affairs      7741
European Union                                    7629
Education                                         4940
Communities and families                          2508
Housing and planning                               626
Employment and training                            473
Asylum, immigration and nationality                433
Science and technology                             323
Social sec

In [37]:
print('articles after drop: {}'.format(len(df)))

articles after drop: 328013


In [38]:
print('percent of articles remaining {}'.format(len(df)/total_articles))

percent of articles remaining 0.47354668075700945


In [39]:
X = df['transcript'].values
Y = df['top1_topic'].values

In [40]:
print('preprocessing data!!')
preprocessed_X = parmap.map(preprocess_text, X, pm_pbar=True)
tagged_X = [models.doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(preprocessed_X)]

preprocessing data!!


328064it [00:35, 9193.24it/s]                            


In [41]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
Y = df.top1_topic.values
input_Y = Y.reshape(-1,1)
enc.fit(input_Y)
input_Y = enc.transform(input_Y).toarray()

In [42]:
from joblib import dump, load
dump(enc, '../models/doc2vec/encoder_{}{}.joblib'.format(year, drop))

['../models/doc2vec/encoder_2017_no_Others.joblib']

In [None]:
print('training doc2vec')
doc2vec_model = Doc2Vec(vector_size=100, window=3, workers=mp.cpu_count(), epochs=40)
doc2vec_model.build_vocab(tagged_X)
doc2vec_model.train(tagged_X, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)
doc2vec_model.save('../models/doc2vec/doc2vec_news_{}{}'.format(year, drop))

training doc2vec


In [None]:
print('preparing inputs')

def get_doc_vec(doc):
    return doc2vec_model.infer_vector(doc.words)

inputs = parmap.map(get_doc_vec, tagged_X, pm_pbar=True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs, input_Y, test_size=0.2, random_state=42)
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
print('X_train: ', X_train.shape)
print('y_train: ', y_train.shape)
print('X_test: ', X_test.shape)
print('y_test: ', y_test.shape)

In [None]:
def build_network():
    model = tf.keras.models.Sequential([
        tf.keras.Input(shape=(100)),
        tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l=0.1)),
        tf.keras.layers.Dense(len(enc.categories_[0]), activation='softmax')
    ])
    loss_fn = tf.keras.losses.CategoricalCrossentropy(from_logits=False, label_smoothing=0, name='categorical_crossentropy')

    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=loss_fn,
              metrics=['accuracy'])
    
    return model

In [None]:
model = build_network()
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
class_weights = class_weight.compute_class_weight('balanced', np.unique(Y), Y)
model.fit(X_train, y_train, batch_size=32, epochs=200, callbacks=[callback], class_weight=class_weights,
          validation_data=(X_test, y_test))

In [None]:
model.save_weights('../models/doc2vec/news_classifier_{}{}'.format(year, drop))
doc2vec_model.save('../models/doc2vec/doc2vec_news_{}{}'.format(year, drop))

In [None]:
def predict(X, topn=3):
    preprocessed_X = preprocess_text(X)
    vec_X = doc2vec_model.infer_vector(preprocessed_X)
    vec_X = vec_X.reshape(1, vec_X.shape[0])
    pred = model.predict(vec_X)
    pred_i = []
    for i, p in enumerate(pred[0]):
        one_hot = np.zeros(len(pred[0]))
        one_hot[i] = 1
        pred_i += [(enc.inverse_transform([one_hot])[0][0], p)]
    pred_sorted = sorted(pred_i, key=lambda x: x[1], reverse=True)
    return pred_sorted[:topn]

In [None]:
predict(X[8])

In [None]:
X[8]

In [None]:
df['predicted_topic'] = df.swifter.apply(lambda row: predict(row['transcript']), axis=1)

In [94]:
df

Unnamed: 0,source_id,source,day,month,year,program_name,transcript,parliament,top1_topic,top1_acc,top2_topic,top2_acc,top3_topic,top3_acc,predicted_topic
24,163795,Belfast Telegraph,1,3,2017,Parties keep positions despite seat cut... if ...,"Arlene Foster's party would lose six seats, bu...",,Energy and environment,54.92,Communities and families,9.99,Health services and medicine,6.75,"[(International affairs, 0.361709), (Defence, ..."
42,163795,Belfast Telegraph,1,3,2017,Struck off: GP who falsified clinical trials,A CROOKED doctor convicted of falsifying clini...,,Transport,53.75,"Crime, civil law, justice and rights",7.65,"Parliament, government and politics",6.39,"[(Transport, 0.90817), (International affairs,..."
62,163795,Belfast Telegraph,1,3,2017,Trophies or I get the boot: Pep;\nEMIRATES FA ...,City took advantage of a free weekend to trave...,,Communities and families,43.17,Energy and environment,22.58,International affairs,11.63,"[(Employment and training, 0.36442462), (Crime..."
109,163795,Belfast Telegraph,1,3,2017,Murray is back in the groove on happy return;\...,"The top seed, now fully fit after a bout of sh...",,Employment and training,42.90,Economy and finance,7.96,"Crime, civil law, justice and rights",6.59,"[(Employment and training, 0.9796784), (Crime,..."
126,400553,Belfast Telegraph,1,3,2017,"Murder probe after body of woman, 80, found in...","At around 2:05am on Tuesday, officers went to ...",,Energy and environment,45.26,Communities and families,8.18,Others,7.82,"[(Energy and environment, 0.83248276), (Transp..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271490,244365,Wales,30,4,2017,WELSH League,Drew Fahiya scored a hat-trick for Gavin Chest...,,Energy and environment,47.92,Employment and training,9.53,"Business, industry and consumers",7.51,"[(Employment and training, 0.5066459), (Energy..."
271497,244365,Wales,30,4,2017,FINISHING LINE NOW IN SIGHT FOR THE OSPREYS,Griffiths was the first British athlete home i...,,Employment and training,44.77,Communities and families,12.17,"Crime, civil law, justice and rights",10.10,"[(Employment and training, 0.9628461), (Commun..."
271519,244365,Wales,30,4,2017,BEHIND BARS;\nHere are the faces of some of th...,"Evans, from Ebbw Vale, committed the assaults ...",,Energy and environment,63.39,Health services and medicine,7.90,Others,6.21,"[(Health services and medicine, 0.58105093), (..."
271544,412338,Wales,30,4,2017,Kumar Sangakkara helps Surrey cruise to victor...,After Dominic Sibley was out in the fourth ove...,,Employment and training,71.24,Transport,6.08,"Business, industry and consumers",4.57,"[(Employment and training, 0.87342006), (Crime..."


In [121]:
df = df.drop(['top_class', 'topic'], axis=1)

In [122]:
df.to_csv('./doc2vec_on_news_transcripts_preds.csv')

In [124]:
df['predicted_topic'] = df.swifter.apply(lambda row: predict(row['Transcript']), axis=1)

  "This pandas object has duplicate indices, and swifter may not be able to improve performance. Consider resetting the indices with `df.reset_index(drop=True)`."


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1243.0, style=ProgressStyle(descriptio…

(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)

(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)
(100,)

