In [1]:
import numpy as np
import pandas as pd
import nltk
import string
import re
import matplotlib.pyplot as plt
from scipy import stats
from keras.preprocessing import sequence, text
from keras.layers import  Input, Dense, Flatten, Add, LSTM, GlobalAveragePooling1D,SpatialDropout1D, Bidirectional,\
    BatchNormalization, Concatenate, Dropout, Activation, Input, Embedding, Conv1D, MaxPooling1D, GRU,\
    GlobalMaxPooling1D
from keras.optimizers import Adam
from keras.models import Model, load_model
import tensorflow as tf
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard, Callback
import keras.backend as K
from sklearn.model_selection import train_test_split
from collections import defaultdict

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
from sklearn.metrics import roc_auc_score
class ROCCallBack(Callback):
    """Keras callback that prints the validation ROC-AUC after every epoch.

    Parameters
    ----------
    validation_data : sequence
        Two-element sequence ``(x_val, y_val)``: the inputs fed to
        ``model.predict`` and the ground-truth labels passed to
        ``roc_auc_score``.
    """

    def __init__(self,validation_data):
        super().__init__()
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out inputs and print the ROC-AUC rounded to 4 places.

        ``logs`` defaults to ``None`` instead of a shared mutable ``{}``;
        it is not read or modified here.
        """
        y_pred_val = self.model.predict(self.x_val)
        print('\nroc-auc_val: %s' % (str(round(roc_auc_score(self.y_val, y_pred_val),4))))

In [3]:
def flatten(x):
    """Recursively flatten nested containers into a single flat list.

    ndarray, list, tuple and pandas Series values are treated as
    containers and walked depth-first; any other value (including strings)
    is treated as a leaf and returned wrapped in a one-element list.
    """
    # Leaf case first: anything that is not one of the container types.
    if not isinstance(x, (np.ndarray, list, tuple, pd.Series)):
        return [x]

    flat = []
    for item in x:
        flat.extend(flatten(item))
    return flat

#### Load Data

In [4]:
# Read the labelled training set and the unlabelled test set from the
# local data/ directory (paths are relative to the notebook's working dir).
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [5]:
# Names of the six label columns; the same order is used for the model's
# output units and for the columns written to the prediction CSV.
output_names = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [6]:
# Lower-casing tokenizer; the filter set strips punctuation, including the
# apostrophe, as well as tabs and newlines.
tok=text.Tokenizer(filters = '!"#$%&()*+,-./:;<=>?@[\\]^_\'`{|}~\t\n', lower=True)
# The vocabulary is fitted on train AND test comments together, so words
# that appear only in the test set also receive an index.
tok.fit_on_texts(np.concatenate((train.comment_text.values, test.comment_text.values)))

#### Load GloVe

In [7]:
# Open the GloVe vectors file as UTF-8 text; the handle is iterated line by
# line when the `embeddings` dict is built.
f = open('data/glove.42B.300d.txt', 'r', encoding = 'utf-8')

In [8]:
# Keys view over the tokenizer vocabulary; used for membership tests while
# filtering GloVe lines. Being a view, it reflects later changes to word_index.
all_unique_tokens = tok.word_index.keys()

In [9]:
# Map word -> float32 vector for every GloVe entry whose word occurs in the
# tokenizer vocabulary.
embeddings = {}
# `with f:` closes the GloVe handle once it has been read; previously the
# file was left open for the lifetime of the kernel. Re-running this cell
# without re-opening `f` now raises ValueError instead of silently producing
# an empty dict from an exhausted handle.
with f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. trailing newline at end of file): nothing to parse.
            continue
        word = values[0]
        # The whole GloVe table doesn't fit in GPU memory, so only keep words that
        # appear in the data for now. The embedding layer's weights can always be
        # swapped after model training.
        if word in all_unique_tokens:
            embeddings[word] = np.array(values[1:], dtype = 'float32')

In [10]:
# Remove every vocabulary word that has no pretrained vector. Iterate over a
# snapshot of the keys because the dict is modified inside the loop.
for word in list(tok.word_index):
    if word not in embeddings:
        del tok.word_index[word]
# Renumber the surviving words contiguously starting at 1 (only values are
# reassigned, so iterating the live dict is safe).
for new_index, word in enumerate(tok.word_index, start=1):
    tok.word_index[word] = new_index

In [11]:
# Reverse lookup: integer index -> word.
idx2word = {b:a for a,b in tok.word_index.items()}
# Real words are numbered from 1, so index 0 is free to stand for
# unknown/padding positions.
idx2word[0] = '<UNK>'
# Forward lookup with a fallback for unseen words. default_factory is called
# with NO arguments, so the previous one-argument lambda raised TypeError on
# the first missing key; it also returned a string where an index is
# expected. Unknown words now map to 0, the '<UNK>' index.
word2idx = defaultdict(lambda: 0, tok.word_index)
# All-zero vector for the '<UNK>' placeholder (300 = GloVe vector width).
embeddings['<UNK>'] = np.zeros((300,))

#### Data Processing

In [12]:
# Convert each comment into a list of integer word indices using the
# tokenizer's current word_index, stored in a new 'toks' column.
train['toks'] = tok.texts_to_sequences(train.comment_text.values)
test['toks'] = tok.texts_to_sequences(test.comment_text.values)

In [13]:
# Number of entries in `embeddings`, which includes the '<UNK>' placeholder.
vocab_size = len(embeddings)
# Fixed sequence length used for padding and for the model's input shape.
max_len = 300
# Width of each embedding vector (the GloVe file loaded is the 300d variant).
n_factors = 300

In [14]:
def create_emb():
    """Assemble the pretrained embedding matrix used to initialise the model.

    Reads the module-level ``vocab_size``, ``n_factors``, ``idx2word`` and
    ``embeddings``. Returns a float32 array of shape
    ``(vocab_size + 1, n_factors)`` in which row ``i`` holds the vector of
    the word whose index is ``i``; the final row is left as zeros.
    """
    matrix = np.zeros((vocab_size + 1, n_factors), dtype='float32')
    for row in range(vocab_size):
        # One row per word index.
        matrix[row, :] = embeddings[idx2word[row]]
    return matrix

In [15]:
# Materialise the embedding weight matrix once; makeModel passes it to the
# Embedding layer as its initial weights.
emb = create_emb()

In [16]:
# Sanity check: expected shape is (vocab_size + 1, n_factors).
emb.shape

(190324, 300)

In [17]:
# train val  split
np.random.seed(10)
indexTrain = np.random.choice(range(train.shape[0]), size = int(0.9*train.shape[0]), replace = False)
indexVal = list(set(range(train.shape[0])) - set(indexTrain))
traindf = train.loc[indexTrain]
valdf = train.loc[indexVal]

In [18]:
# Pad or truncate every token sequence to exactly max_len entries so each
# split becomes a rectangular integer array the model can consume.
dataInputTrain=sequence.pad_sequences(traindf.toks,maxlen=max_len)
dataInputVal=sequence.pad_sequences(valdf.toks,maxlen=max_len)
dataInputTest=sequence.pad_sequences(test.toks,maxlen=max_len)

In [19]:
# Spot check: decode one padded training row back into words (index 0 is
# rendered as '<UNK>').
sample_row = dataInputTrain[10]
' '.join(idx2word[token_id] for token_id in sample_row)

'<UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UN

In [26]:
def makeModel(counter, denseNodes, convFilters, dropOut):
    """Build, train and apply one BiGRU + Conv1D classifier.

    Parameters
    ----------
    counter : int
        Run identifier; appended to the checkpoint path and to the
        prediction CSV file name so each run writes its own files.
    denseNodes : int
        Units in the hidden Dense layer.
    convFilters : int
        Number of filters in the Conv1D layer.
    dropOut : float
        Dropout rate applied after batch normalisation.

    Returns
    -------
    The trained Keras ``Model`` (with the weights from the last epoch run).

    Side effects
    ------------
    Saves the best checkpoint under ``weights/``, overwrites the label
    columns of the module-level ``test`` frame with this run's predictions,
    and writes them to ``data/answers/lstm<counter>.csv``.
    """
    # Frozen pretrained embeddings -> BiGRU -> Conv1D -> avg+max pooling -> Dense head.
    sequence_input = Input(shape=(max_len, ))
    x = Embedding(vocab_size+1, n_factors, input_length=max_len, weights=[emb],trainable = False)(sequence_input)
    x = Bidirectional(GRU(128, return_sequences=True,dropout=0.15,recurrent_dropout=0.15))(x)
    x = Conv1D(convFilters, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = Concatenate()([avg_pool, max_pool])
    x = Dense(denseNodes, activation = 'relu')(x)
    x = BatchNormalization(axis = -1)(x)
    x = Dropout(dropOut)(x)
    # One independent sigmoid per label.
    preds = Dense(6, activation="sigmoid")(x)
    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',optimizer=Adam(lr=1e-3))

    # All callbacks watch validation loss; epochs=200 below is only an upper
    # bound because early stopping ends training after 5 stagnant epochs.
    earlyStopping = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='min')
    mcp_save = ModelCheckpoint('weights/lstm_mdl' + str(counter), save_best_only=True, monitor='val_loss', mode='min')
    reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, epsilon=1e-4, mode='min')
    roc_callback = ROCCallBack(validation_data = [dataInputVal, valdf[output_names].values])

    model.fit(x = dataInputTrain,
         y = traindf[output_names].values,
         batch_size = 64, epochs = 200,
         validation_data = [dataInputVal, valdf[output_names].values],
         callbacks=[earlyStopping, mcp_save, reduce_lr_loss, roc_callback])

    # NOTE(review): predictions come from the model as it stands after the
    # final epoch, not from the best checkpoint saved by ModelCheckpoint.
    pred = model.predict(dataInputTest, verbose = 1)
    # Loop variables must not reuse the name `counter`: doing so clobbered the
    # run identifier, so every run wrote to 'lstm5.csv' and overwrote the
    # previous run's predictions.
    for label_idx, label in enumerate(output_names):
        test[label] = pred[:,label_idx]
    test[['id'] + output_names].to_csv('data/answers/lstm' + str(counter) + '.csv', index = False)
    return model

In [27]:
# Hyper-parameter sets to train; each dict is unpacked into makeModel as
# keyword arguments, so the keys must match its parameter names.
params = [
    dict(denseNodes=128, convFilters=128, dropOut=0.4),
    dict(denseNodes=256, convFilters=256, dropOut=0.5),
    dict(denseNodes=256, convFilters=128, dropOut=0.5),
]
    

In [28]:
# Train one model per hyper-parameter set; the position in `params` is used
# as the run identifier.
models = [makeModel(run_id, **hyper) for run_id, hyper in enumerate(params)]

Train on 143613 samples, validate on 15958 samples
Epoch 1/200
roc-auc_val: 0.9832
Epoch 2/200
roc-auc_val: 0.9879
Epoch 3/200
roc-auc_val: 0.9889
Epoch 4/200
roc-auc_val: 0.989
Epoch 5/200
roc-auc_val: 0.989
Epoch 6/200
roc-auc_val: 0.9887
Epoch 7/200
Epoch 00007: reducing learning rate to 0.00010000000474974513.

roc-auc_val: 0.9887
Epoch 8/200
roc-auc_val: 0.9879
Epoch 9/200
roc-auc_val: 0.9872
Epoch 10/200
Epoch 00010: reducing learning rate to 1.0000000474974514e-05.

roc-auc_val: 0.9867
Train on 143613 samples, validate on 15958 samples
Epoch 1/200
roc-auc_val: 0.9848
Epoch 2/200
roc-auc_val: 0.987
Epoch 3/200
roc-auc_val: 0.9879
Epoch 4/200
roc-auc_val: 0.9888
Epoch 5/200
roc-auc_val: 0.9893
Epoch 6/200
roc-auc_val: 0.9888
Epoch 7/200
roc-auc_val: 0.9888
Epoch 8/200
Epoch 00008: reducing learning rate to 0.00010000000474974513.

roc-auc_val: 0.9886
Epoch 9/200
roc-auc_val: 0.9886
Train on 143613 samples, validate on 15958 samples
Epoch 1/200
roc-auc_val: 0.985
Epoch 2/200
roc-au