Notes:
- Important parameters: kernel size, no. of feature maps
- 1-max pooling generally outperforms otehr types of pooling
- Dropout has little effect
- Gridsearch across kernel size in the range 1-10
- Search no. of filters from 100-600 and dropout of 0.0-0.5
- Explore tanh, relu, linear activation functions

## Import data

In [4]:
import os
import numpy as np
import pandas as pd

In [5]:
dir_path = os.path.realpath('..')

In [6]:
path = 'data/processed/train.csv'

full_path = os.path.join(dir_path, path)
df = pd.read_csv(full_path, header=0, index_col=0)
print("Dataset has {} rows, {} columns.".format(*df.shape))

Dataset has 95851 rows, 7 columns.


## Train test split

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
seed = 42
test_size = 0.2
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
X = df.drop(target, axis=1)
y = df[target]

In [9]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=test_size, random_state=seed)

## Pre-processing

In [10]:
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding

Using TensorFlow backend.


In [11]:
corpus = 'comment_text'

In [12]:
%%time
# prepare tokenizer
t = Tokenizer()
t.fit_on_texts(Xtrain[corpus].astype(str))
vocab_size = len(t.word_index) + 1

CPU times: user 3.37 s, sys: 34.2 ms, total: 3.4 s
Wall time: 3.43 s


In [13]:
%%time
# integer encode the documents
encoded_Xtrain = t.texts_to_sequences(Xtrain[corpus].astype(str))
encoded_Xtest = t.texts_to_sequences(Xtest[corpus].astype(str))

CPU times: user 3 s, sys: 35.4 ms, total: 3.04 s
Wall time: 3.05 s


In [14]:
# pad documents to a max length of 4 words
max_length = 4
padded_train = pad_sequences(encoded_Xtrain, maxlen=max_length, padding='post')
padded_test = pad_sequences(encoded_Xtest, maxlen=max_length, padding='post')

In [15]:
%%time
# load the whole embedding into memory
embeddings_index = dict()
f = open('/Users/joaeechew/dev/glove.6B/glove.6B.100d.txt', mode='rt', encoding='utf-8')
for line in f:
	values = line.split()
	word = values[0]
	coefs = asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.
CPU times: user 11.2 s, sys: 443 ms, total: 11.6 s
Wall time: 11.8 s


In [16]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

## Model fit

In [24]:
# define model
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=4, trainable=False)
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [25]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            3660500   
_________________________________________________________________
flatten_1 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 401       
Total params: 3,660,901
Trainable params: 401
Non-trainable params: 3,660,500
_________________________________________________________________


In [27]:
def save_model(model, model_path):
    # serialize model to JSON
    model_json = model.to_json()
    with open(model_path + ".json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights(model_path + ".h5")
    print("Saved model to disk")

In [31]:
%%time
# fit the model
target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for label in target:
    print('... Processing {}'.format(label))
    y = ytrain[label]
    model_name = 'glove_model_' + label
    
    # train the model
    model.fit(padded_train, y, epochs=1, verbose=1)
    
    # save the model
    model_name = 'glove_model_' + label
    model_path = os.path.join(dir_path, 'models', model_name)
    save_model(model, model_path)

... Processing toxic
Epoch 1/1
Saved model to disk
... Processing severe_toxic
Epoch 1/1
Saved model to disk
... Processing obscene
Epoch 1/1
Saved model to disk
... Processing threat
Epoch 1/1
Saved model to disk
... Processing insult
Epoch 1/1
Saved model to disk
... Processing identity_hate
Epoch 1/1
Saved model to disk
CPU times: user 35.1 s, sys: 15.9 s, total: 51 s
Wall time: 32.3 s


## Evaluation

In [32]:
from keras.models import model_from_json

In [36]:
from sklearn.metrics import log_loss

In [33]:
def load_model(model_path):
    # load json and create model
    json_file = open(model_path+'.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(model_path+".h5")
    print("Loaded model from disk")
    return loaded_model

In [61]:
%%time

target = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y_pred = pd.DataFrame(index=ytest.index, columns=target)
scores =[]

for label in target:
    print('... Processing {}'.format(label))
    
    model_name = 'glove_model_' + label
    model_path = os.path.join(dir_path, 'models', model_name)

    # load the model
    loaded_model = load_model(model_path)
    
    # evaluate model on test dataset
    y_pred[label] = loaded_model.predict(padded_test, verbose=1, batch_size=1)
    loss = log_loss(ytest[label], y_pred[label])
    scores.append(loss)
    
    print("Log loss for {} is {} .".format(label, loss))

print("Combined log loss is {} .".format(np.mean(scores)))

... Processing toxic
Loaded model from disk
Log loss for toxic is 0.2454449793669313 .
... Processing severe_toxic
Loaded model from disk
Log loss for severe_toxic is 0.04806497953532837 .
... Processing obscene
Loaded model from disk
Log loss for obscene is 0.16479228807322147 .
... Processing threat
Loaded model from disk
Log loss for threat is 0.01971722442305306 .
... Processing insult
Loaded model from disk
Log loss for insult is 0.15244657976541817 .
... Processing identity_hate
Loaded model from disk
Log loss for identity_hate is 0.03865194528939449 .
Combined log loss is 0.11151966607555781 .
CPU times: user 1min 11s, sys: 10.5 s, total: 1min 22s
Wall time: 59.4 s
