Using a default CNN single model to test various cleaning steps and their impact on score.

Controls:
- CNN single model
- maxlen: 65
- min occurrence vocab: 5
- glove.6B.100D
- epochs: 2
- cv: 3
- max features 20000

In [59]:
# Tag inserted into the paths of the files this notebook writes
# (processed CSVs, saved model, submission CSV).
model_name = 'raw_LSTM'

## Import data

In [60]:
import os
import numpy as np
import pandas as pd

In [61]:
# Absolute path of the parent of the working directory; the data/ and
# models/ paths used below are joined onto it.
dir_path = os.path.realpath(os.pardir)

In [62]:
# Read the training CSV; row 0 is the header and the first column becomes the index.
path = 'data/raw/train.csv'
full_path = os.path.join(dir_path, path)
df_train = pd.read_csv(full_path, header=0, index_col=0)

n_rows, n_cols = df_train.shape
print("Dataset has {} rows, {} columns.".format(n_rows, n_cols))

Dataset has 95851 rows, 7 columns.


In [63]:
# Read the test CSV the same way as the training CSV (header row, first column as index).
path = 'data/raw/test.csv'
full_path = os.path.join(dir_path, path)
df_test = pd.read_csv(full_path, header=0, index_col=0)

n_rows, n_cols = df_test.shape
print("Dataset has {} rows, {} columns.".format(n_rows, n_cols))

Dataset has 226998 rows, 1 columns.


## Text cleaning

In [64]:
import string
import nltk
# Extra directory NLTK searches for corpora. The fallback is the original
# machine-specific location; set the NLTK_DATA environment variable to point
# somewhere else without editing the notebook.
nltk.data.path.append(os.environ.get("NLTK_DATA", "/Users/joaeechew/dev/nltk_data"))

from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora.dictionary import Dictionary

from os import listdir
from collections import Counter

In [65]:
def process_text(corpus, vocab, regex=r'[\w]+', digits=False, english_only=False, stop=False, lemmatize=False):
    """Tokenize every document in ``corpus`` and apply the optional cleaning steps.

    Args:
        corpus: iterable of strings, one document per item.
        vocab: counter-like object; its ``update`` method is called with the
            tokens kept for each document (mutated in place and also returned).
        regex: pattern handed to ``RegexpTokenizer`` to extract the tokens.
        digits: if True, drop tokens for which ``str.isdigit()`` is true.
        english_only: if True, keep only tokens found in the NLTK ``words`` corpus.
        stop: if True, drop tokens found in the NLTK English stop-word list.
        lemmatize: if True, lemmatize each remaining token with ``WordNetLemmatizer``.

    Returns:
        ``(processed_corpus, vocab)``. ``processed_corpus`` is a list with one
        space-joined token string per document; a document left with no tokens
        becomes the placeholder string ``'cleaned'``.
    """
    # Load NLTK resources only for the steps that are switched on, so a plain
    # tokenization run does not need the words / stopwords / wordnet corpora.
    english_words = set(nltk.corpus.words.words()) if english_only else None
    english_stopwords = set(stopwords.words('english')) if stop else None
    wordnet_lemmatizer = WordNetLemmatizer() if lemmatize else None
    tokenizer = RegexpTokenizer(regex)

    processed_corpus = []
    for row in corpus:
        tokens = tokenizer.tokenize(row)
        if digits:
            tokens = [t for t in tokens if not t.isdigit()]
        if english_only:
            tokens = [t for t in tokens if t in english_words]
        # Must test the `stop` flag: the previous `if stopwords:` tested the
        # imported corpus module, which is always truthy, so stop words were
        # removed even when the caller passed stop=False.
        if stop:
            tokens = [t for t in tokens if t not in english_stopwords]
        if lemmatize:
            tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
        vocab.update(tokens)
        tokens = ' '.join(tokens)
        # Never emit an empty document; use a placeholder token instead.
        if tokens == '':
            tokens = 'cleaned'
        processed_corpus.append(tokens)
    return processed_corpus, vocab

In [66]:
# Replace every missing value in both frames with the placeholder string "unknown"
for frame in (df_train, df_test):
    frame.fillna('unknown', inplace=True)

In [67]:
# Token pattern: runs of word characters and exclamation marks.
# Inside a character class `|` is a literal pipe, not alternation, so the
# previous r'[\w|!]+' also swallowed pipe characters into tokens.
regex = r'[\w!]+'

In [68]:
# %%time
# vocab = Counter()
# df_train.comment_text, vocab = process_text(df_train.comment_text, vocab,
#                                             digits=False, english_only=False, stop=False, lemmatize=False)
# df_test.comment_text, vocab = process_text(df_test.comment_text, vocab,
#                                           digits=False, english_only=False, stop=False, lemmatize=False)

In [69]:
# print(vocab.most_common(100))
# # print(len(vocab))

In [70]:
# # keep tokens with a min occurrence
# min_occurance = 5
# vocab = [k for k,c in vocab.items() if c >= min_occurance]
# print(len(vocab))

In [71]:
# Write the training frame to data/processed/, with the model name in the file name.
dir_path = os.path.realpath('..')
path = 'data/processed/train_' + model_name + '.csv'
full_path = os.path.join(dir_path, path)
df_train.to_csv(full_path, header=True, index=True)

In [72]:
# Write the test frame to data/processed/, with the model name in the file name.
# The "_" separator was missing ("testraw_LSTM.csv"); it now matches the
# "train_<model_name>.csv" naming used for the training frame.
path = 'data/processed/test_' + model_name + '.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

df_test.to_csv(full_path, header=True, index=True)

## Train test split

In [73]:
from sklearn.model_selection import train_test_split

In [74]:
# Hold-out split settings
seed = 42
test_size = 0.2

# Name of the text column, and the six label columns to predict
corpus = 'comment_text'
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Labels are the target columns; features are everything that remains
y = df_train[target]
X = df_train.drop(target, axis=1)

In [75]:
# Set aside `test_size` of the labelled rows for evaluation; the seed is passed as random_state
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

## Pre-processing

In [76]:
import pickle
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [77]:
%%time
# Fit the Keras tokenizer (num_words=20000) on the comment column of the full training frame
t = Tokenizer(num_words=20000)
t.fit_on_texts(df_train[corpus])

# Padded sequence length, and vocabulary size taken from the complete word index
# (one more than the number of distinct words the tokenizer indexed)
max_length = 65
vocab_size = len(t.word_index) + 1

print('Vocabulary size: %d' % vocab_size)
print('Maximum length: %d' % max_length)

Vocabulary size: 153189
Maximum length: 65
CPU times: user 7.86 s, sys: 0 ns, total: 7.86 s
Wall time: 7.86 s


In [78]:
%%time
# Turn each comment of the train / hold-out split into its sequence of tokenizer word indices
train_texts = Xtrain[corpus].astype(str)
holdout_texts = Xtest[corpus].astype(str)
encoded_Xtrain = t.texts_to_sequences(train_texts)
encoded_Xtest = t.texts_to_sequences(holdout_texts)

CPU times: user 6.38 s, sys: 0 ns, total: 6.38 s
Wall time: 6.38 s


In [79]:
# Bring every encoded comment to max_length, using the same 'post' padding for both splits
pad_options = dict(maxlen=max_length, padding='post')
padded_train = pad_sequences(encoded_Xtrain, **pad_options)
padded_test = pad_sequences(encoded_Xtest, **pad_options)

In [80]:
%%time
# Load the whole GloVe file into memory as {word: float32 vector}.
embeddings_index = dict()
# `with` guarantees the file is closed even if a line fails to parse;
# the previous open()/close() pair leaked the handle in that case.
with open('/home/ec2-user/glove.6B.100d.txt', mode='rt', encoding='utf-8') as f:
    for line in f:
        # Each line is the word followed by its coefficients, whitespace-separated
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.
CPU times: user 14.3 s, sys: 0 ns, total: 14.3 s
Wall time: 14.2 s


In [81]:
# Weight matrix with one 100-d row per tokenizer index: row i receives the GloVe
# vector of the word indexed i; words missing from GloVe keep an all-zero row.
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
    if word in embeddings_index:
        embedding_matrix[i] = embeddings_index[word]

In [82]:
# Pickle the fitted tokenizer to tokenizer.pickle using the highest pickle protocol
with open('tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(t, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

## Model fit

In [83]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import Pipeline

In [84]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [85]:
# # Function to create model, required for KerasClassifier
# def create_model(optimizer='adam', vocab_size=vocab_size, max_length=max_length):
#     model = Sequential()
#     model.add(Embedding(vocab_size, 100, input_length=max_length))
#     model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
#     model.add(MaxPooling1D(pool_size=2))
#     model.add(Flatten())
#     model.add(Dense(10, activation='relu'))
# #     model.add(Dense(1, activation='sigmoid'))
#     model.add(Dense(6, activation='sigmoid'))  #multi-label (k-hot encoding)
#     # compile network
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#     # summarize defined model
#     model.summary()
# #     plot_model(model, to_file='model.png', show_shapes=True)
#     return model

In [117]:
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation

# Function to create model, required for KerasClassifier
def create_model(optimizer='adam', vocab_size=vocab_size, max_length=max_length, embedding_weights=None):
    """Build and compile the bidirectional-LSTM multi-label classifier.

    Args:
        optimizer: optimizer forwarded to ``model.compile``.
        vocab_size: number of rows (input dimension) of the embedding layer.
        max_length: length of the padded input sequences.
        embedding_weights: optional array of shape ``(vocab_size, 100)`` used to
            initialise the embedding layer. ``None`` (the default) keeps the
            layer's default initialisation, i.e. the previous behaviour.
            NOTE(review): the notebook builds a GloVe ``embedding_matrix`` but
            does not pass it to the model; supply it here if pre-trained
            embeddings are intended.

    Returns:
        The compiled ``Sequential`` model (its summary is printed as a side effect).
    """
    embedding_kwargs = {}
    if embedding_weights is not None:
        embedding_kwargs['weights'] = [embedding_weights]

    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length, **embedding_kwargs))
    # return_sequences=True keeps one output per time step so the pooling layer can reduce over them
    model.add(Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)))
    model.add(GlobalMaxPool1D())
    model.add(Dense(50, activation="relu"))
    model.add(Dropout(0.1))
    model.add(Dense(6, activation='sigmoid'))  # multi-label (k-hot encoding): one sigmoid per label
    # compile network; honour the `optimizer` argument (it was previously
    # ignored because 'adam' was hard-coded here)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    # summarize defined model
    model.summary()
    return model

In [118]:
def save_model(model, model_path):
    """Write ``model`` to disk as two files sharing the ``model_path`` prefix.

    The architecture returned by ``model.to_json()`` goes to ``<model_path>.json``
    and the weights are stored through ``model.save_weights`` in ``<model_path>.h5``.
    """
    # architecture -> JSON
    architecture = model.to_json()
    with open(model_path + ".json", "w") as architecture_file:
        architecture_file.write(architecture)

    # weights -> HDF5
    weights_file = model_path + ".h5"
    model.save_weights(weights_file)
    print("Saved model to disk")

In [119]:
# Seed NumPy's global random generator with the same seed used for the split.
# NOTE(review): only NumPy is seeded here; the Keras backend may keep its own
# random state that needs separate seeding for repeatable training — confirm.
np.random.seed(seed)

In [120]:
# Wrap the model builder for the scikit-learn API, configured with epochs=2 and verbose=1
model = KerasClassifier(
    build_fn=create_model,
    epochs=2,
    verbose=1,
)

In [None]:
%%time
# Label columns
target = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

# Train on the padded training sequences with validation_split=0.1
model.fit(
    padded_train,
    ytrain,
    validation_split=0.1,
)
# Keras model held by the scikit-learn wrapper after fitting
trained_model = model.model

# Store architecture and weights under <dir_path>/models/<model_name>
model_path = os.path.join(dir_path, 'models', model_name)
save_model(trained_model, model_path)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 65, 100)           15318900  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 65, 100)           60400     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 100)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 50)                5050      
_________________________________________________________________
dropout_3 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 6)                 306       
Total params: 15,384,656
Trainable params: 15,384,656
Non-trainable params: 0
________________________________________________________________

## Evaluation

In [128]:
from sklearn.metrics import log_loss

In [137]:
# Evaluate on the hold-out split; the model was compiled with a loss and an accuracy metric
holdout_scores = trained_model.evaluate(padded_test, ytest, verbose=1)
print(holdout_scores)

[0.052028207666735951, 0.98183019338692057]


In [142]:
%%time
# Log loss is computed per label column on the hold-out split, then averaged across labels.
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Predicted probabilities, aligned with the hold-out index and label names
y_pred = trained_model.predict(padded_test, verbose=1)
hold_out_preds = pd.DataFrame(y_pred, index=ytest.index, columns=target)

losses = []
for label in target:
    losses.append(log_loss(ytest[label], hold_out_preds[label]))
    print("{} log loss is {} .".format(label, losses[-1]))

combined_loss = np.mean(losses)
print("Combined log loss is {} .".format(combined_loss))

toxic log loss is 0.1142052628730768 .
severe_toxic log loss is 0.023754155955640736 .
obscene log loss is 0.05827334177400192 .
threat log loss is 0.014348717349511727 .
insult log loss is 0.07234214832882768 .
identity_hate log loss is 0.02924561981465328 .
Combined log loss is 0.0520282076826187 .
CPU times: user 1min 6s, sys: 8.32 s, total: 1min 14s
Wall time: 12.1 s


## Submission

In [143]:
%%time
# Encode and pad the test comments with the same tokenizer and max_length as the training data
submission_texts = df_test[corpus].astype(str)
encoded_submission = t.texts_to_sequences(submission_texts)
padded_submission = pad_sequences(encoded_submission, maxlen=max_length, padding='post')

# Predict one probability per label for every test comment
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y_pred_proba = trained_model.predict(padded_submission, verbose=1)
submission = pd.DataFrame(y_pred_proba, index=df_test.index, columns=target)

# Write the submission CSV to data/submissions/, named after the model
dir_path = os.path.realpath('..')
path = 'data/submissions/' + model_name + '.csv'
full_path = os.path.join(dir_path, path)
submission.to_csv(full_path, header=True, index=True)

CPU times: user 13min 30s, sys: 1min 43s, total: 15min 13s
Wall time: 2min 48s
