In [1]:
import pandas as pd
import numpy as np
import progressbar
import time
import codecs
import functools
import os
import tempfile
import zipfile
import urllib
import re
from nltk.corpus import stopwords
from gensim.models import word2vec
import pickle
import nltk.data
import os
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# Turn each comment into a list of word indexes of equal length (with truncation or padding as needed).
def comments2Matrix(comment_list,model,maxlen):
    """Convert tokenized comments into a fixed-width matrix of vocabulary indexes.

    Parameters
    ----------
    comment_list : sequence of iterables of str
        One iterable of tokens per comment. Must support ``len()`` (used to size the progress bar).
    model : object exposing ``wv.vocab``
        Embedding model; ``model.wv.vocab[word].index`` supplies the integer index of each
        in-vocabulary word.
    maxlen : int
        Target length of every row, forwarded to ``pad_sequences``.

    Returns
    -------
    The result of ``pad_sequences(indexed_comments, maxlen=maxlen)``: one row per comment,
    padded or truncated to ``maxlen`` using the ``pad_sequences`` defaults.

    Notes
    -----
    Words that are not in the model's vocabulary are dropped, so a comment may yield an empty
    index list before padding.
    NOTE(review): ``pad_sequences`` is called with its default pad value; if that default is 0
    it coincides with a real vocabulary index — confirm this is acceptable for the model.
    """
    # Resolve the vocabulary mapping once. It is loop-invariant, and it already answers the
    # membership question, so no separate set of all vocabulary words is needed.
    vocab = model.wv.vocab

    def commentToIndex(comment):
        # Keep only in-vocabulary words, replacing each with its vocabulary index
        return [vocab[word].index for word in comment if word in vocab]

    totalComments = len(comment_list)

    bar = progressbar.ProgressBar(maxval=totalComments, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()

    # Index every comment, advancing the progress bar once per comment
    indexed_comments = []
    for done, comment in enumerate(comment_list, start=1):
        indexed_comments.append(commentToIndex(comment))
        bar.update(done)
    bar.finish()
    return pad_sequences(indexed_comments,maxlen = maxlen)

In [3]:
# Print the devices TensorFlow reports as locally available, so the hardware
# used for this run is recorded in the cell output.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15012109662511428730
]


In [4]:
# Label columns to predict; the same list is reused when writing submissions
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Load the labelled training data (first CSV row is the header)
train = pd.read_csv("data/train.csv", header=0)

# Target matrix: one row per training example, one column per entry of list_classes
y = train.loc[:, list_classes].values

In [5]:
# Load the pre-tokenized comments: one comment per line, tokens separated by whitespace.
# The encoding is passed explicitly so decoding does not depend on the platform's default
# locale encoding.
# NOTE(review): assumes the tokenized files were written as UTF-8 — confirm against the
# notebook that produced them.
with open('data/tokenized_comments/remove_stops=False.lemmatize=False.spellcheck=True.train_comments.csv', encoding='utf-8') as f:
    train_comments = [line.split() for line in f]
with open('data/tokenized_comments/remove_stops=False.lemmatize=False.spellcheck=True.test_comments.csv', encoding='utf-8') as f:
    test_comments = [line.split() for line in f]
print("Loaded %s training comments, and %s testing comments" % (len(train_comments),len(test_comments)))

Loaded 54521 training comments, and 226998 testing comments


##### <b>Load Pretrained GloVe Embeddings</b>

In [None]:
# Load the pretrained word vectors with gensim's word2vec-format reader.
# The file name indicates GloVe 42B 300-d vectors; presumably they were converted to
# word2vec text format and filtered to this corpus beforehand — verify against the
# notebook that produced the file.
from gensim.models import Word2Vec
import gensim
print("Loading Gensim Model...")
gensim_file= 'w2v_models/gensim_filtered_models/remove_stops=False.lemmatize=False.spellcheck=True.glove.42B.300d.txt'
# word_vectors is what every later cell uses for vocabulary lookups and vectors
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(gensim_file)
print("Gensim Model Loaded")

##### <b>Convert Comments to a Matrix of Indices</b>

In [None]:
# Initialize parameters shared by the matrix conversion, the embedding matrix and the models
embed_size = 300 # Width of each embedding row; must match the dimensionality of the loaded vectors
maxlen = 150 # Max number of words to use for a specific comment (comments are padded/truncated to this)
max_features = len(word_vectors.wv.vocab) # Vocabulary size, i.e. number of rows in the embedding matrix

In [None]:
# Convert both comment sets to index matrices of width maxlen, using the
# vocabulary of the loaded word vectors.
print('Converting %s comments for training set to matrices' % len(train_comments))
xtrain = comments2Matrix(train_comments,word_vectors,maxlen)
print('Converting %s comments for testing set to matrices' % len(test_comments))
xtest = comments2Matrix(test_comments,word_vectors,maxlen)

In [None]:
# Build the weight matrix for the Keras Embedding layer from the loaded word vectors:
# row k holds the vector of the k-th word in index2word.
keyed_vectors = word_vectors.wv
vocab_size = len(keyed_vectors.vocab)
embedding_matrix = np.zeros((vocab_size, embed_size))
for row in range(vocab_size):
    row_vector = keyed_vectors[keyed_vectors.index2word[row]]
    if row_vector is None:
        # No vector available: leave this row as zeros
        continue
    embedding_matrix[row] = row_vector

In [None]:
from keras.layers import (
    Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling2D, Reshape, MaxPooling1D,
    Bidirectional, GlobalMaxPool1D, Conv2D, SpatialDropout1D, BatchNormalization,
    GlobalMaxPooling2D, Conv1D,
)
from keras.initializers import glorot_normal
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import initializers, regularizers, constraints, optimizers, layers, Sequential

# Callbacks: keep only the weights with the lowest validation loss on disk, and stop
# training after 3 epochs without improvement in validation loss.
# NOTE(review): the checkpoint name mentions "maxlen100"; confirm it matches the maxlen
# actually used to build xtrain.
file_path = "BD-LSTM-noatt-maxlen100.hdf5"
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3)
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# Frozen pretrained embeddings -> dropout -> bidirectional LSTM -> global max pool
# -> dense -> 6 sigmoid outputs (one per label column).
model = Sequential([
    Embedding(
        max_features, embed_size,
        weights=[embedding_matrix], trainable=False,
        name='Word-Embedding-Layer',
    ),
    Dropout(0.4, name='Dropout-Regularization-1'),  # Best = 0.3
    Bidirectional(
        LSTM(
            300, return_sequences=True,
            dropout=0.25, recurrent_dropout=0.25,
            kernel_initializer=glorot_normal(seed=None),
        ),
        name='BDLSTM',
    ),  # Best = 300,0.25,0.25
    GlobalMaxPool1D(name='Global-Max-Pool-1d'),
    Dense(256, activation="relu", name='FC-256'),  # Best = 256
    Dense(6, activation="sigmoid", name='FC-Output-Layer'),
])
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
model.summary()

# Train with 10% of the training data held out for validation
history = model.fit(
    xtrain, y,
    batch_size=50, epochs=100, validation_split=0.1,
    callbacks=[checkpoint, early_stop], verbose=1,
)

In [None]:
from keras.layers import (
    Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling2D, Reshape, MaxPooling1D,
    Bidirectional, GlobalMaxPool1D, Conv2D, SpatialDropout1D, BatchNormalization,
    GlobalMaxPooling2D, Conv1D,
)
from keras.initializers import glorot_normal
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import initializers, regularizers, constraints, optimizers, layers, Sequential

# Callbacks: keep only the weights with the lowest validation loss on disk, and stop
# training after 3 epochs without improvement in validation loss.
file_path = "BD-LSTM-noatt.hdf5"
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3)
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# Frozen pretrained embeddings -> dropout -> bidirectional LSTM -> global max pool
# -> dense -> 6 sigmoid outputs (one per label column).
model = Sequential([
    Embedding(
        max_features, embed_size,
        weights=[embedding_matrix], trainable=False,
        name='Word-Embedding-Layer',
    ),
    Dropout(0.3, name='Dropout-Regularization-1'),
    Bidirectional(
        LSTM(
            300, return_sequences=True,
            dropout=0.25, recurrent_dropout=0.25,
            kernel_initializer=glorot_normal(seed=None),
        ),
        name='BDLSTM',
    ),
    GlobalMaxPool1D(name='Global-Max-Pool-1d'),
    Dense(256, activation="relu", name='FC-256'),
    Dense(6, activation="sigmoid", name='FC-Output-Layer'),
])
model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
model.summary()
# Training is disabled in this cell; the model is only built and compiled here.
#history = model.fit(xtrain, y, batch_size=256, epochs=100,validation_split=0.1, callbacks=[checkpoint,early_stop],verbose=1)


### <B> MOVE TO ANOTHER NOTEBOOK TO AVOID CONFUSION AND LOST DATA </B>

In [None]:
# Notes
# Good success with a lower maxlen - shorter sequences probably remove a lot of the padding that confuses the network

In [None]:
# Restore the checkpointed weights into the current `model` and score the test set.
# NOTE(review): `model` is whichever model object was built most recently in this kernel;
# confirm its architecture matches this checkpoint.
file_path = "BD-LSTM-noatt-maxlen100.hdf5"
model.load_weights(file_path)
# Despite the name, y_test holds the model's predictions for xtest, not ground-truth labels
y_test = model.predict([xtest], batch_size=1024, verbose=1)
# Start from the sample submission file and overwrite its label columns with the predictions
sample_submission = pd.read_csv('submissions/sample_submission.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv('submissions/no_stops_test_scores.csv', index=False)

In [None]:
# Restore the checkpointed weights into the current `model` and score the test set.
# NOTE(review): the output file name mentions "len50"; confirm it matches the maxlen
# this checkpoint and xtest were produced with.
file_path = "BD-LSTM-noatt.hdf5"
model.load_weights(file_path)
# Despite the name, y_test holds the model's predictions for xtest, not ground-truth labels
y_test = model.predict([xtest], batch_size=1024, verbose=1)
# Start from the sample submission file and overwrite its label columns with the predictions
sample_submission = pd.read_csv('submissions/sample_submission.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv('submissions/glove_vectors_unlemmatized_len50_LSTM.csv', index=False)

##### <b> Ensembling Models </b>

In [None]:
# Submission files to ensemble.
# NOTE(review): lstm_100 is not written by any cell visible in this notebook — confirm
# the file exists before running the cells below.
lstm_100 = 'submissions/glove_vectors_unlemmatized_len100_LSTM0043.csv'
lstm_50 = 'submissions/glove_vectors_unlemmatized_len50_LSTM.csv'

In [None]:
# Load the two sets of per-class predictions to be averaged
p_lstm100 = pd.read_csv(lstm_100)
p_lstm50 = pd.read_csv(lstm_50)

In [None]:
# Ensemble by taking the unweighted mean of the two models' scores for every label column.
# Non-label columns are carried over unchanged from the first submission.
# NOTE(review): the sum aligns rows by DataFrame index — assumes both files list the
# same rows in the same order; confirm.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
p_res = p_lstm100.copy()
p_res[label_cols] = p_lstm100[label_cols].add(p_lstm50[label_cols]).div(2)

In [None]:
# Write the averaged predictions as the ensemble submission (without the DataFrame index)
p_res.to_csv('submissions/ensemble_100_50_submission.csv', index=False)