# Document Classification with LDA and MLPs

In [518]:
from source_scraping import load_all, load_province
from sklearn.utils import shuffle
from topic_modelling import *
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.linear_model import SGDClassifier, SGDRegressor, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.svm import LinearSVC, SVC, SVR, LinearSVR
from xgboost import XGBClassifier, XGBRegressor
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import SGD, Nadam, Adam, Adamax
from tensorflow.keras.metrics import Precision, Recall, BinaryAccuracy, AUC

import os

In [509]:
df = pd.read_csv('SCHEMA UPDATE - Collecting NPIs Effects - FULL.csv')
pos_df = df.dropna(subset=['oxford_government_response_category', 'source_full_text'])
categories = pd.unique([string[0] for string in resp_df['oxford_government_response_category']])

### Convert Document into Topic Vector

In [487]:
def lda_preprocess(texts, lda_model, lda_dict, stop_words=stopwords.words('english'), allowed_postags=['NOUN', 'ADJ', 'VERB']):
    partially_processed = custom_preprocess(texts, stop_words=stop_words, allowed_postags=allowed_postags)
    corpus = form_corpus(partially_processed, lda_dict)
    texts_by_topic = [lda_model.get_document_topics(doc) for doc in corpus]
    processed_texts = []
    for topic_list in texts_by_topic:
        feature_list = np.zeros(len(lda_model.get_topics()))
        for index, value in topic_list:
            feature_list[index] = value
        processed_texts.append(feature_list)
    return np.array(processed_texts)

### Geographical Words to Consider Removing

Could be improved: Ontarian might not be removed along with Ontario.

In [488]:
def geo_stop_words(df):
    flatten = lambda l: [item for sublist in l for item in sublist] # flatten code from stackoverflow...
    region_stop_words = set(flatten([reg.lower().split() for reg in df['region'].dropna()]))
    sub_region_stop_words = set(flatten([reg.lower().split() for reg in df['subregion'].dropna()]))
    geo_stop_words = region_stop_words.union(sub_region_stop_words)
    return geo_stop_words

In [489]:
def text_to_cats(df):
    text_to_cats = { }

    for index, row in df.dropna(subset=['oxford_government_response_category', 'source_full_text']).iterrows():
        ox_cat = row['oxford_government_response_category']
        if ox_cat:
            text = row['source_full_text']
            if text in text_to_cats:
                text_to_cats[text].append(ox_cat)
            else:
                text_to_cats[text] = [ox_cat]
                
    return text_to_cats

def vector_to_cats(v):
    return categories[np.nonzero(v)]

def process_cats(cats):
    one_hot_cat = lambda x : (categories == x[0]).astype(np.float32)
    one_hot_arr_cat = lambda a : sum([one_hot_cat(cat) for cat in a])
    vectorized = lambda y : [one_hot_arr_cat(arr) for arr in y]
    cap = lambda arr : [el if el < 1 else 1 for el in arr]
    
    return np.array([cap(el) for el in vectorized(cats)])

### Model accuracy on instances that fall into multiple categories

In [490]:
def multi_class_accuracy(y_true, y_pred):
    num_correct = 0
    num_present = 0
    for i in range(0, y_true.shape[0]):
        v = y_true[i]
        if np.count_nonzero(v) > 1:
            num_present += 1
            if (y_pred[i] == y_true[i]).all():
                num_correct += 1
                
    return num_correct / num_present

### Split DataFrame into Train and Test Sets

In [533]:
# Splits data with interventions (i.e. for multilabel classification)

def split_npi_data(df, oot=True, multilabel=True):
    df = df[df['region'] != 'Quebec']
    subsets = ['oxford_government_response_category', 'source_full_text'] if multilabel else ['source_full_text']
    df = df.dropna(subset=subsets)
    df['start_date'] = pd.to_datetime(df['start_date'])
    if oot:
        df = df.sort_values(['start_date'])
    else:
        df = df.sample(frac=1)
    
    if multilabel:
        x_raw = list(text_to_cats(df).keys())
        y = list(text_to_cats(df).values())
        y = process_cats(y)
    else:
        x_raw = df['source_full_text']
        y = np.array(df['oxford_government_response_category'].notna().astype(np.float))
    
    return train_test_split(x_raw, y, shuffle=(not oot))

### Keywords

In [492]:
h_keywords = ['hospital', 'healthcare', 'vaccine', 'trial', 'clinic']
e_keywords = ['econom', 'reopen', '$', 'financ', 'financial crisis', 'economic crisis']
c_keywords = ['social distanc', 'mask', 'isolation', 'quarantine']

def overlap(arr, string):
    for word in arr:
        if word in string.lower(): return True
    return False

e = lambda arr : np.array([[overlap(e_keywords, string)] for string in arr])
h = lambda arr : np.array([[overlap(h_keywords, string)] for string in arr])
c = lambda arr : np.array([[overlap(c_keywords, string)] for string in arr])

# Target vectors look like ['H', 'C', 'E']
keywords = [h, c, e]

### Convert Texts to Input Matrix

In [493]:
def text_to_topics(raw_texts, lda_info=None, n_topics=200, stopword_ext=[], keyword_lambdas=[]):
    if lda_info is None:
        x_train_raw = raw_texts[0]
        lda_info = lda_from_list(x_train_raw, n_topic_range=range(n_topics, n_topics + 1), stopword_extensions=stopword_ext, use_coherence=False, plot=False)
    
    lda_model = lda_info['best_model']
    id2word = lda_info['id2word']
    
    topic_features = []
    keyword_features = []
    total_features = []
        
    # raw_texts is a list of lists of documents (list of list of strings)
    for text in raw_texts:
        topic_vals = lda_preprocess(text, lda_model, id2word)
        topic_features.append(topic_vals)
        
        if keyword_lambdas:
            keyword_vals = np.concatenate([kwl(text) for kwl in keyword_lambdas], axis=1).astype(np.float)
            keyword_features.append(keyword_vals)

            total_vals = np.concatenate((topic_vals, keyword_vals), axis=1)
            total_features.append(total_vals)
   
    if not keyword_lambdas:
        total_features = topic_features
    total_features.append(lda_info) # For later use
    
    return total_features

### Make Prediction on Unseen Texts

In [494]:
def predict(model, lda_info, new_texts, keywords_lambdas=[], stopword_ext=[]):
    new_texts_processed, lda_info = text_to_topics([new_texts], lda_info, stopword_ext=stopword_ext, keyword_lambdas=keywords_lambdas)
    return model.predict(new_texts_processed)

### Model Fitting

Essentially a wrapper for Keras' <code>model.fit()</code>, this function streamlines some of the setup, like TensorBoard setup and an early stopping callback (piece of code called at the end of each epoch).

In [495]:
def fit_model(model, x_train, y_train, epochs=200, validation_split=0.2, validation_data=None, log=True, name=''):
    import time

    root_logdir = os.path.join(os.curdir, "tb_logs")
    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    run_logdir = os.path.join(root_logdir, run_id + name)
    
    tensobroad_cb = keras.callbacks.TensorBoard(run_logdir)
    early_stop = keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)
    callbacks = [early_stop, tensobroad_cb] if log else [early_stop]
    
    return model.fit(x_train, y_train, epochs=epochs, validation_split=validation_split, callbacks=callbacks, validation_data=validation_data)

In [396]:
mlp_model = Sequential([
        Dense(200, name='dense_200'),
        Dropout(0.5, name='first_dropout_0.5'),
        Dense(80, name='dense_80'),
        Dropout(0.5, name='second_dropout_0.5'),
#         Dense(80),
#         Dropout(0.5), # Seems unnecessary
        Dense(10, name='dense_10'),
        Dense(3, name='output_3', activation='sigmoid')
    ])

mlp_model.compile(optimizer=Nadam(), loss='binary_crossentropy', metrics=[AUC(curve='roc'), BinaryAccuracy(), Precision(), Recall()])

In [397]:
x_train_raw, x_test_raw, y_train, y_test = split_npi_data(df, oot=True)
x_train, x_test, lda_info = text_to_topics((x_train_raw, x_test_raw), n_topics=200, keyword_lambdas=keywords)
x_train, y_train = shuffle(x_train, y_train)

In [399]:
fit_model(mlp_model, x_train, y_train, epochs=200, validation_split=0.2, log=False, name='...')

Epoch 1/200


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200


Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200


<tensorflow.python.keras.callbacks.History at 0x14dd5adc0>

In [400]:
mlp_model.evaluate(x_test, y_test)



[0.33094245195388794,
 0.9323994517326355,
 0.8755020499229431,
 0.8237704634666443,
 0.8007968068122864]

In [401]:
threshold = 0.5
y_pred = (mlp_model.predict(x_test) > threshold).astype(np.float32)
x, y = np.concatenate((x_train, x_test), axis=0), np.concatenate((y_train, y_test), axis=0)

accuracy_score(y_test, y_pred), multi_class_accuracy(y, (mlp_model.predict(x) > 0.5).astype(np.float32)) # :(

0.7349397590361446

In [499]:
# df2 = load_all(verbose=True)
# df2_en = df2[df2['region'] != 'Quebec']
# df2_en = df2_en.sample(frac=1)
# df2_en.to_csv('full_release_set_en.csv')
new = df2_en['source_full_text'][:31]

In [None]:
%%time
df_2019 = load_all(start_date=datetime(2019, 10, 1), end_date=datetime(2019, 11, 1), verbose=True)

In [534]:
df_2019 = df_2019[df_2019['start_date'] <= datetime(2019, 11, 1)]
full_df = df_2019.append(pos_df).sample(frac=1)
x_train_raw, x_test_raw, y_train, y_test = split_npi_data(full_df, multilabel=False, oot=False)
x_train, x_test, lda_info = text_to_topics((x_train_raw, x_test_raw), lda_info=lda_info, n_topics=200, keyword_lambdas=keywords)
x_train, y_train = shuffle(x_train, y_train)

In [535]:
y_test

array([0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1.,
       0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.,
       0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 1.,
       1., 1., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 1.,
       1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 0.,
       1., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 0.,
       0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 0., 1.,
       1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1.,
       1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 1., 0., 0., 1., 1., 1., 1.,
       0., 0., 0., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 0., 0.,
       1., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0.,
       0., 0., 1., 1., 1., 1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 1.,
       0., 1., 0., 1., 0.

In [543]:
models = {
    'rnd_clf' : RandomForestClassifier(n_estimators=400, max_depth=80),
    'lin_svc_clf' : LinearSVC(),
    'rbf_svc_clf' : SVC(kernel='rbf'),
    'knn_clf' : KNeighborsClassifier(),
    'log_clf' : LogisticRegression()
}

metrics = {
    'Accuracy' : accuracy_score,
    'Precision' : precision_score,
    'Recall' : recall_score
#     'ROC AUC' : roc_auc_score
}

for name, model in models.items():
    model.fit(x_train, y_train)
    print(name.upper())
    for metric, metric_func in metrics.items():
        print(metric, metric_func(y_test, model.predict(x_test)))
    print('\n')

RND_CLF
Accuracy 0.8896551724137931
Precision 0.9237804878048781
Recall 0.8859649122807017


LIN_SVC_CLF
Accuracy 0.8362068965517241
Precision 0.8823529411764706
Recall 0.8333333333333334


RBF_SVC_CLF
Accuracy 0.8275862068965517
Precision 0.8829113924050633
Recall 0.8157894736842105


KNN_CLF
Accuracy 0.8362068965517241
Precision 0.8776758409785933
Recall 0.8391812865497076


LOG_CLF
Accuracy 0.7982758620689655
Precision 0.8440366972477065
Recall 0.8070175438596491




In [None]:
mlp_clf = Sequential([
        Dense(200, name='dense_200', activation='elu'),
        Dropout(0.5, name='first_dropout_0.5'),
        Dense(80, name='dense_80', activation='elu'),
        Dropout(0.5, name='second_dropout_0.5'),
        Dense(10, name='dense_10', activation='elu'),
        Dense(1, name='output_3', activation='sigmoid')
])

mlp_clf.compile(optimizer=Nadam(), loss='binary_crossentropy', metrics=[AUC(curve='roc'), BinaryAccuracy(), Precision(), Recall()])
fit_model(mlp_clf, x_train, y_train, epochs=200, validation_split=0.2, log=False, name='binary')

In [542]:
mlp_clf.evaluate(x_test, y_test)



[0.3489212095737457,
 0.9198486804962158,
 0.8379310369491577,
 0.8583815097808838,
 0.8684210777282715]

In [544]:
best_model = models['rnd_clf']