In [None]:
import warnings
# Suppress every warning of category UserWarning from this point on.
warnings.simplefilter("ignore", UserWarning)

In [None]:
import pandas as pd
import numpy as np
import json
import spacy
import matplotlib.pyplot as plt
%matplotlib inline

# For regular expressions
import re

# For handling strings
import string

# For performing mathematical operations
import math

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, auc, roc_auc_score, roc_curve, f1_score, confusion_matrix
from keras import backend as K

# This module will be for saving the trained model for later use
import joblib

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from keras.models import Model
from keras.models import Sequential

from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPooling1D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.layers import SpatialDropout1D, concatenate
from keras.layers import LSTM, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D

from keras.callbacks import Callback
from keras.optimizers import Adam

from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import load_model
from keras.utils.vis_utils import plot_model

In [None]:
from tqdm import tqdm_notebook

In [None]:
# The CSV was saved together with its row index, which otherwise comes back as a
# meaningless 'Unnamed: 0' data column. Read that first column as the index instead.
df = pd.read_csv('cleaned_lemmatized_text.csv', index_col=0)
df.head()

Unnamed: 0,is_adverse_media,lemmatized_articles
0,0,zimbabweans wake news agriculture minister per...
1,1,singapore founder singapore oil trade company ...
2,1,fraudster offer green tax efficient investment...
3,1,buenos aire reuter judicial probe possible cor...
4,0,ukraines constitutional court appear strike bl...


In [None]:
# Hold out 10% of the articles for validation. The split is stratified on the
# label so both parts keep the same class balance, and seeded for repeatability.
x_train, x_val, y_train, y_val = train_test_split(
    df['lemmatized_articles'],
    df['is_adverse_media'],
    stratify=df['is_adverse_media'],
    test_size=0.1,
    random_state=42,
)

print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

(656,) (73,) (656,) (73,)


In [None]:
# --- Tokenisation / padding settings (reused by later cells) ---
MAX_NB_WORDS = 80000   # passed to Tokenizer(num_words=...) and used as the embedding input size
maxlen = 600           # fixed, hand-picked length every sequence is padded/truncated to
oov_token = '<UNK>'    # token substituted for words missing from the fitted vocabulary
pad_type = 'post'      # pad at the end of a sequence
trunc_type = 'post'    # truncate at the end of a sequence

# Build the vocabulary from the training split only.
tokenizer = Tokenizer(oov_token=oov_token, num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(x_train)
word_index = tokenizer.word_index

# Turn each training text into a list of word indices, then into a
# fixed-width integer matrix.
train_sequences = tokenizer.texts_to_sequences(x_train)
train_padded = pad_sequences(
    train_sequences,
    maxlen=maxlen,
    padding=pad_type,
    truncating=trunc_type,
)

# Quick sanity check of the encoded training data.
print("\nPadded training sequences:\n", train_padded)
print("\nPadded training shape:", train_padded.shape)
print("Training sequences data type:", type(train_sequences))
print("Padded Training sequences data type:", type(train_padded))


Padded training sequences:
 [[ 197  342 3000 ... 1629  190 5034]
 [2733  164  141 ...    0    0    0]
 [9229 2865  281 ...    0    0    0]
 ...
 [ 701   42  475 ...    0    0    0]
 [5330 3876  556 ...    0    0    0]
 [ 197 1244 2584 ...    0    0    0]]

Padded training shape: (656, 600)
Training sequences data type: <class 'list'>
Padded Training sequences data type: <class 'numpy.ndarray'>


In [None]:
# Encode the validation texts with the tokenizer fitted on the training split,
# then pad/truncate them with the same settings as the training sequences.
val_sequences = tokenizer.texts_to_sequences(x_val)
val_padded = pad_sequences(
    val_sequences,
    maxlen=maxlen,
    padding=pad_type,
    truncating=trunc_type,
)

print("\nPadded val shape:", val_padded.shape)


Padded val shape: (73, 600)


In [None]:
def recall_m(y_true, y_pred):
    """Recall computed with Keras backend ops, usable as a Keras metric.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are
    clipped to [0, 1] and rounded, so a prediction counts as positive once it
    rounds to 1.

    Args:
        y_true: ground-truth labels (assumed 0/1 — TODO confirm against caller).
        y_pred: predicted scores (assumed to lie in [0, 1], e.g. sigmoid output).

    Returns:
        true positives / (actual positives + K.epsilon()); the epsilon keeps
        the division defined when there are no positive labels.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed with Keras backend ops, usable as a Keras metric.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are
    clipped to [0, 1] and rounded, so a prediction counts as positive once it
    rounds to 1.

    Args:
        y_true: ground-truth labels (assumed 0/1 — TODO confirm against caller).
        y_pred: predicted scores (assumed to lie in [0, 1], e.g. sigmoid output).

    Returns:
        true positives / (predicted positives + K.epsilon()); the epsilon
        keeps the division defined when nothing is predicted positive.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score built from ``precision_m`` and ``recall_m``.

    Args:
        y_true: ground-truth labels, forwarded unchanged to both helpers.
        y_pred: predicted scores, forwarded unchanged to both helpers.

    Returns:
        2 * (precision * recall) / (precision + recall + K.epsilon()); the
        epsilon keeps the division defined when both terms are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

In [None]:
def get_simple_rnn_model():
    """Build and compile the bidirectional-LSTM binary classifier.

    Layer stack, in order:
      1. trainable 300-dimensional embedding, started from a random matrix;
      2. spatial dropout with rate 0.3;
      3. bidirectional LSTM (100 units) returning the full sequence;
      4. global average pooling and global max pooling over that sequence,
         concatenated;
      5. a single sigmoid unit.

    Reads the module-level ``MAX_NB_WORDS`` (embedding input size) and
    ``maxlen`` (input sequence length).

    Returns:
        The compiled model: binary cross-entropy loss, Adam optimizer, and the
        metrics accuracy and ``f1_m``.
    """
    embedding_dim = 300
    initial_embeddings = np.random.random((MAX_NB_WORDS, embedding_dim))

    token_ids = Input(shape=(maxlen, ))
    embedding_layer = Embedding(
        input_dim=MAX_NB_WORDS,
        output_dim=embedding_dim,
        input_length=maxlen,
        weights=[initial_embeddings],
        trainable=True,
    )
    features = embedding_layer(token_ids)
    features = SpatialDropout1D(0.3)(features)
    features = Bidirectional(LSTM(100, return_sequences=True))(features)

    # Summarise the sequence two ways and let the classifier see both.
    pooled = concatenate([
        GlobalAveragePooling1D()(features),
        GlobalMaxPooling1D()(features),
    ])
    probability = Dense(1, activation="sigmoid")(pooled)

    model = Model(inputs=token_ids, outputs=probability)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy', f1_m],
    )
    return model


rnn_simple_model = get_simple_rnn_model()

In [None]:
# Save a checkpoint each time the validation F1 improves; the epoch number and
# the score are embedded in the file name.
filepath="weights-improvement-{epoch:02d}-{val_f1_m:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_f1_m',
    mode='max',
    save_best_only=True,
    verbose=1,
)

# Training hyper-parameters.
batch_size = 128
epochs = 40

# Train on the padded training split and validate on the held-out split
# after every epoch.
history = rnn_simple_model.fit(
    x=train_padded,
    y=y_train,
    validation_data=(val_padded, y_val),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[checkpoint],
    verbose=1,
)

Epoch 1/40
Epoch 00001: val_f1_m improved from -inf to 0.00000, saving model to weights-improvement-01-0.0000.hdf5
Epoch 2/40
Epoch 00002: val_f1_m improved from 0.00000 to 0.71930, saving model to weights-improvement-02-0.7193.hdf5
Epoch 3/40
Epoch 00003: val_f1_m improved from 0.71930 to 0.74074, saving model to weights-improvement-03-0.7407.hdf5
Epoch 4/40
Epoch 00004: val_f1_m did not improve from 0.74074
Epoch 5/40
Epoch 00005: val_f1_m did not improve from 0.74074
Epoch 6/40
Epoch 00006: val_f1_m did not improve from 0.74074
Epoch 7/40
Epoch 00007: val_f1_m did not improve from 0.74074
Epoch 8/40
Epoch 00008: val_f1_m did not improve from 0.74074
Epoch 9/40
Epoch 00009: val_f1_m improved from 0.74074 to 0.76471, saving model to weights-improvement-09-0.7647.hdf5
Epoch 10/40
Epoch 00010: val_f1_m improved from 0.76471 to 0.79612, saving model to weights-improvement-10-0.7961.hdf5
Epoch 11/40
Epoch 00011: val_f1_m did not improve from 0.79612
Epoch 12/40
Epoch 00012: val_f1_m did n

In [None]:
# Reuse the tokenizer that was fitted on the training split rather than fitting
# a fresh one on the full dataset. The checkpoint that is fine-tuned next learned
# its embedding rows under *that* tokenizer's word -> index mapping; a newly
# fitted tokenizer would assign different indices to the same words and leave
# the loaded embeddings misaligned with the inputs. Words that only occur in the
# validation split are mapped to the OOV token.
tokenizer_final = tokenizer

# Encode every labelled article and pad/truncate to the model's input length.
X_train_sequences = tokenizer_final.texts_to_sequences(df['lemmatized_articles'].values)

X_train_padded = pad_sequences(X_train_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)
y_train_final = df['is_adverse_media'].values

In [None]:
# The saved model was compiled with the custom metric f1_m, so load_model has
# to be told how to resolve that name.
dependencies = {'f1_m': f1_m}
# NOTE(review): this file name is specific to one particular training run
# (epoch 31, val_f1_m 0.8916). A re-run will produce differently named
# checkpoints, so update it to the best file from the current run.
best_rnn_simple_model = load_model('weights-improvement-31-0.8916.hdf5', custom_objects=dependencies)

# Continue training on the full labelled set. `epochs` is passed explicitly:
# without it fit() runs its default of a single epoch, whereas the recorded
# output of this cell shows 40.
# NOTE(review): batch_size is left at the fit() default here, unlike the first
# training run, which passed batch_size explicitly. Confirm this is intended.
best_rnn_simple_model.fit(x=X_train_padded,
                          y=y_train_final,
                          epochs=epochs)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x7f7712cf6550>

In [None]:
# Load the test set; later cells read its 'article' and 'label' columns.
df_test = pd.read_csv('public_test.csv')

In [None]:
# Encode the test articles exactly like the training data.
# NOTE(review): training used the 'lemmatized_articles' column; confirm that
# 'article' in public_test.csv has gone through the same cleaning/lemmatisation.
X_test_sequences = tokenizer_final.texts_to_sequences(df_test['article'].values)
X_test_padded = pad_sequences(
    X_test_sequences,
    maxlen=maxlen,
    padding=pad_type,
    truncating=trunc_type,
)
y_test = df_test['label'].values

# Predict, flatten the per-row outputs to a 1-D vector, then threshold at 0.5
# to obtain hard 0/1 class labels.
y_test_pred = best_rnn_simple_model.predict(X_test_padded, verbose=1)
y_test_pred = y_test_pred.reshape(y_test_pred.shape[0],)
y_test_pred = (y_test_pred >= 0.5).astype(int)



In [None]:
# ROC AUC measures how well the model *ranks* positives above negatives, so it
# must be computed from the continuous sigmoid outputs. Feeding it the
# thresholded 0/1 labels collapses the ROC curve to a single operating point.
# y_test_pred already holds hard labels at this point, so the probabilities are
# recomputed here.
y_test_scores = best_rnn_simple_model.predict(X_test_padded, verbose=0).reshape(-1)
auc_without_glove = roc_auc_score(y_test, y_test_scores)

# Accuracy and F1 are defined on class labels, so they use the thresholded
# predictions.
accuracy_without_glove = accuracy_score(y_test, y_test_pred)
f1_without_glove  = f1_score(y_test, y_test_pred)

In [None]:
# Test-set results, printed in the order: ROC AUC, accuracy, F1.
print(auc_without_glove, accuracy_without_glove, f1_without_glove)

0.8014632524110409 0.7861635220125787 0.8068181818181818
