Using a default CNN single model to text various cleaning steps and impact on score.

Controls:
- CNN single model
- maxlen: 65
- min occurance vocab: 5
- glove.6B.100D
- epochs: 2
- cv: 3

In [203]:
model_name = 'raw'

## Import data

In [204]:
import os
import numpy as np
import pandas as pd

In [205]:
dir_path = os.path.realpath('..')

In [206]:
path = 'data/raw/train.csv'

full_path = os.path.join(dir_path, path)
df_train = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_train.shape))

Dataset has 95851 rows, 7 columns.


In [207]:
path = 'data/raw/test.csv'

full_path = os.path.join(dir_path, path)
df_test = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df_test.shape))

Dataset has 226998 rows, 1 columns.


## Text cleaning

In [208]:
import string
import nltk
nltk.data.path.append("/Users/joaeechew/dev/nltk_data")

from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora.dictionary import Dictionary

from os import listdir
from collections import Counter

In [209]:
def process_text(corpus, vocab, regex=r'[\w]+', digits=False, english_only=False, stop=False, lemmatize=False):
    """Takes a corpus in list format and applies basic preprocessing steps of word tokenization,
     removing of english stop words, and lemmatization. Returns processed corpus and vocab."""
    processed_corpus = []
    english_words = set(nltk.corpus.words.words())
    english_stopwords = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(regex)
    for row in corpus:
        tokens = tokenizer.tokenize(row)
        if digits:
            tokens = [t for t in tokens if not t.isdigit()]
        if english_only:
            tokens = [t for t in tokens if t in english_words]
        if stopwords:
            tokens = [t for t in tokens if not t in english_stopwords]
        if lemmatize:
            tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
        vocab.update(tokens)
        tokens = ' '.join(tokens)
        if tokens == '':
            tokens = 'cleaned'
        processed_corpus.append(tokens)
    return processed_corpus, vocab

In [210]:
# fill NaN with string "unknown"
df_train.fillna('unknown',inplace=True)
df_test.fillna('unknown',inplace=True)

In [211]:
regex = r'[\w|!]+'

In [212]:
%%time
vocab = Counter()
df_train.comment_text, vocab = process_text(df_train.comment_text, vocab,
                                            digits=False, english_only=False, stop=False, lemmatize=False)
df_test.comment_text, vocab = process_text(df_test.comment_text, vocab,
                                          digits=False, english_only=False, stop=False, lemmatize=False)

CPU times: user 19.3 s, sys: 251 ms, total: 19.5 s
Wall time: 19.6 s


In [213]:
print(vocab.most_common(100))
print(len(vocab))

[('I', 380100), ('WIKI_LINK', 313510), ('Wikipedia', 241555), ('article', 125110), ('page', 119359), ('The', 101785), ('If', 85013), ('use', 70146), ('would', 63716), ('one', 60096), ('edit', 58999), ('like', 55648), ('talk', 51321), ('please', 49671), ('You', 47928), ('Please', 46481), ('may', 45442), ('deletion', 44264), ('WP', 43485), ('It', 42983), ('see', 41919), ('image', 41726), ('articles', 40624), ('think', 39389), ('This', 38673), ('also', 35316), ('make', 34685), ('know', 33175), ('information', 30491), ('time', 29978), ('people', 29955), ('used', 29384), ('edits', 28860), ('Image', 28460), ('copyright', 28066), ('pages', 27930), ('hi', 27761), ('deleted', 27634), ('Thanks', 27356), ('free', 27112), ('EXTERNAL_LINK', 26877), ('made', 26844), ('Thank', 26246), ('editing', 26227), ('policy', 25299), ('name', 23915), ('A', 23480), ('sources', 23425), ('questions', 23405), ('speedy', 22822), ('could', 22358), ('source', 22321), ('need', 21838), ('add', 21657), ('In', 21367), ('w

In [214]:
# keep tokens with a min occurrence
min_occurance = 5
vocab = [k for k,c in vocab.items() if c >= min_occurance]
print(len(vocab))

81266


In [215]:
path = 'data/processed/train_' + model_name + '.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

df_train.to_csv(full_path, header=True, index=True)

In [216]:
path = 'data/processed/test' + model_name + '.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

df_test.to_csv(full_path, header=True, index=True)

## Train test split

In [217]:
from sklearn.model_selection import train_test_split

In [218]:
seed = 42
test_size = 0.2
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
X = df_train.drop(target, axis=1)
y = df_train[target]
corpus = 'comment_text'

In [219]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=test_size, random_state=seed)

## Pre-processing

In [220]:
import pickle
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [221]:
%%time
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(vocab)

#define vocab size and max len
vocab_size = len(t.word_index) + 1
max_length = 65

print('Vocabulary size: %d' % vocab_size)
print('Maximum length: %d' % max_length)

Vocabulary size: 62208
Maximum length: 65
CPU times: user 820 ms, sys: 37.2 ms, total: 857 ms
Wall time: 859 ms


In [222]:
%%time
# integer encode the documents
encoded_Xtrain = t.texts_to_sequences(Xtrain[corpus].astype(str))
encoded_Xtest = t.texts_to_sequences(Xtest[corpus].astype(str))

CPU times: user 5.07 s, sys: 93.2 ms, total: 5.16 s
Wall time: 5.2 s


In [223]:
# pad documents

padded_train = pad_sequences(encoded_Xtrain, maxlen=max_length, padding='post')
padded_test = pad_sequences(encoded_Xtest, maxlen=max_length, padding='post')

In [84]:
%%time
# load the whole embedding into memory
embeddings_index = dict()
f = open('/Users/joaeechew/dev/glove.6B/glove.6B.100d.txt', mode='rt', encoding='utf-8')
for line in f:
	values = line.split()
	word = values[0]
	coefs = asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.
CPU times: user 11 s, sys: 413 ms, total: 11.4 s
Wall time: 11.5 s


In [85]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

In [86]:
# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(t, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Model fit

In [87]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.pipeline import Pipeline

In [105]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [123]:
# Function to create model, required for KerasClassifier
def create_model(optimizer='adam', vocab_size=vocab_size, max_length=max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(10, activation='relu'))
#     model.add(Dense(1, activation='sigmoid'))
    model.add(Dense(6, activation='sigmoid'))  #multi-label (k-hot encoding)
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summarize defined model
    model.summary()
#     plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [124]:
def save_model(model, model_path):
    # serialize model to JSON
    model_json = model.to_json()
    with open(model_path + ".json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights(model_path + ".h5")
    print("Saved model to disk")

In [125]:
np.random.seed(seed)

In [138]:
model = KerasClassifier(build_fn=create_model, epochs=2, verbose=1)

In [139]:
# Tuning the model
param_grid = {}

In [140]:
# Define pipeline
pipeline = Pipeline([
    ('clf', model)
])

In [141]:
%%time
# fit the model
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# train the model
grid = GridSearchCV(pipeline, param_grid=param_grid, verbose=1, cv=3)
grid_result = grid.fit(padded_train, ytrain)

# summarize results
print("Best score {} using {}".format(grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# save the model
trained_model = grid_result.best_estimator_.named_steps['clf'].model
model_path = os.path.join(dir_path, 'models', model_name)
save_model(trained_model, model_path)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 65, 100)           4394900   
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 58, 32)            25632     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 29, 32)            0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 928)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 10)                9290      
_________________________________________________________________
dense_20 (Dense)             (None, 6)                 66        
Total params: 4,429,888
Trainable params: 4,429,888
Non-trainable params: 0
_______

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 17.2min finished


Epoch 1/2
Epoch 2/2


NameError: name 'label' is not defined

## Evaluation

In [143]:
from sklearn.metrics import log_loss

In [154]:
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

0.978506 (0.001198) with: {}


In [243]:
%%time

# evaluate model on test dataset
y_pred = trained_model.predict(padded_test, verbose=1)
loss = log_loss(ytest, y_pred)

print("Combined log loss is {} .".format(loss))

InvalidArgumentError: indices[1,5] = 61203 is not in [0, 43949)
	 [[Node: embedding_13/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding_13/embeddings/read, embedding_13/Cast)]]

Caused by op 'embedding_13/Gather', defined at:
  File "//anaconda/envs/toxic/lib/python3.5/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "//anaconda/envs/toxic/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 478, in start
    self.io_loop.start()
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 281, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 232, in dispatch_shell
    handler(stream, idents, msg)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 397, in execute_request
    user_expressions, allow_stdin)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2856, in run_ast_nodes
    if self.run_code(code, result):
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-141-8c9e1082ea42>", line 1, in <module>
    get_ipython().run_cell_magic('time', '', '# fit the model\ntarget = [\'toxic\', \'severe_toxic\', \'obscene\', \'threat\', \'insult\', \'identity_hate\']\n\n# train the model\ngrid = GridSearchCV(pipeline, param_grid=param_grid, verbose=1, cv=3)\ngrid_result = grid.fit(padded_train, ytrain)\n\n# summarize results\nprint("Best {} : {} using {}".format(label, grid_result.best_score_, grid_result.best_params_))\nmeans = grid_result.cv_results_[\'mean_test_score\']\nstds = grid_result.cv_results_[\'std_test_score\']\nparams = grid_result.cv_results_[\'params\']\nfor mean, stdev, param in zip(means, stds, params):\n    print("%f (%f) with: %r" % (mean, stdev, param))\n\n# save the model\ntrained_model = grid_result.best_estimator_.named_steps[\'clf\'].model\nmodel_path = os.path.join(dir_path, \'models\', model_name)\nsave_model(trained_model, model_path)')
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2131, in run_cell_magic
    result = fn(magic_arg_s, cell)
  File "<decorator-gen-62>", line 2, in time
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/IPython/core/magic.py", line 187, in <lambda>
    call = lambda f, *a, **k: f(*a, **k)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/IPython/core/magics/execution.py", line 1238, in time
    exec(code, glob, local_ns)
  File "<timed exec>", line 6, in <module>
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/sklearn/model_selection/_search.py", line 739, in fit
    self.best_estimator_.fit(X, y, **fit_params)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/sklearn/pipeline.py", line 250, in fit
    self._final_estimator.fit(Xt, y, **fit_params)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/keras/wrappers/scikit_learn.py", line 203, in fit
    return super(KerasClassifier, self).fit(x, y, **kwargs)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/keras/wrappers/scikit_learn.py", line 136, in fit
    self.model = self.build_fn(**self.filter_sk_params(self.build_fn))
  File "<ipython-input-123-573835b78977>", line 4, in create_model
    model.add(Embedding(vocab_size, 100, input_length=max_length))
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/keras/models.py", line 464, in add
    layer(x)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/keras/engine/topology.py", line 603, in __call__
    output = self.call(inputs, **kwargs)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/keras/layers/embeddings.py", line 134, in call
    out = K.gather(self.embeddings, inputs)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/keras/backend/tensorflow_backend.py", line 1193, in gather
    return tf.gather(reference, indices)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1207, in gather
    validate_indices=validate_indices, name=name)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "//anaconda/envs/toxic/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): indices[1,5] = 61203 is not in [0, 43949)
	 [[Node: embedding_13/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding_13/embeddings/read, embedding_13/Cast)]]


## Submission

In [152]:
%%time
# integer encode and pad test df
encoded_submission = t.texts_to_sequences(df_test[corpus].astype(str))
padded_submission = pad_sequences(encoded_submission, maxlen=max_length, padding='post')

# Predict
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y_pred_proba = trained_model.predict(padded_submission, verbose=1)
submission = pd.DataFrame(y_pred_proba, index=df_test.index, columns=target)

## Output submissions
path = 'data/submissions/' + model_name + '.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

submission.to_csv(full_path, header=True, index=True)



NameError: name 'df' is not defined

In [153]:
submission = pd.DataFrame(y_pred_proba, index=df_test.index, columns=target)

## Output submissions
path = 'data/submissions/' + model_name + '.csv'

dir_path = os.path.realpath('..')
full_path = os.path.join(dir_path, path)

submission.to_csv(full_path, header=True, index=True)