In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import utils
import time

from keras.models import Model, Input
from keras.layers import Dense, Conv1D, BatchNormalization, GlobalMaxPooling1D, Dropout, \
    Embedding, Concatenate, SpatialDropout1D, MaxPooling1D

from utils.preprocessing_utils import tokenize_sentences, convert_tokens_to_padded_sequence
from utils.dataset_utils import load_data_from_csv
from utils.embedding_utils import load_word2vec_embeddings, create_initial_embedding_matrix
from utils.training_utils import train_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [23]:
# Development helper: re-import the project's utility modules so that edits made
# to their source files take effect without restarting the kernel.
import importlib
import utils
# NOTE(review): the modules are reloaded in the order listed. If one of them
# imports names from another (e.g. training_utils from keras_utils), the
# dependency has to be reloaded first for the change to propagate — confirm
# that this order matches the actual import graph.
importlib.reload(utils.embedding_utils)
importlib.reload(utils.dataset_utils)
importlib.reload(utils.preprocessing_utils)
importlib.reload(utils.training_utils)
# utils.keras_utils is not imported by name in the import cell of this notebook;
# this line presumably relies on another utils module having imported it — verify.
# Being the last expression, its return value (the module) is what the cell displays.
importlib.reload(utils.keras_utils)

<module 'utils.keras_utils' from '/home/philipp/work/gitprojects/toxic-comment-experiments/utils/keras_utils.py'>

Global parameters which hold for all models

In [2]:
# --- Reproducibility ---------------------------------------------------------
random_seed = 2018
np.random.seed(random_seed)  # seeds NumPy's global RNG

# --- Dataset schema ----------------------------------------------------------
# Name of the text column and the label columns read from the CSV files.
features = 'comment_text'
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# --- File locations (relative to the working directory) ----------------------
path_train_data = 'data/kaggle/train.csv'
path_test_data = 'data/kaggle/test_complete.csv'
path_tokenizer = 'data/models/word_tokenizer.pickle'
path_embeddings = 'data/embeddings/GoogleNews-vectors-negative300.bin.gz'

# Size of one word vector; used as the output dimension of every Embedding layer.
embedding_length = 300

Load train and test data and pretrained word2vec embeddings

In [3]:
# Read the train and test CSV files with the project helper, passing the text
# column name and the list of label columns; each call yields an (X, Y) pair
# (presumably comment texts and label matrix — verify against load_data_from_csv).
X_train, Y_train = load_data_from_csv(path_train_data, features, classes)
X_test, Y_test = load_data_from_csv(path_test_data, features, classes)

# Load the pretrained word2vec vectors. The helper returns the embedding index
# plus two values named mean and std, all of which are later handed to
# create_initial_embedding_matrix (presumably to initialise words without a
# pretrained vector — TODO confirm).
emb_idx, emb_mean, emb_std = load_word2vec_embeddings(path_embeddings)

Preprocessing and tokenization of train and test data

In [4]:
# Tokenize the raw comments of both splits with the project helper.
# Each raw input is deleted immediately after it has been tokenized so that it
# does not stay in kernel memory. Consequence: this cell is not idempotent —
# re-running it without re-running the loading cell raises a NameError.
X_train_tok = tokenize_sentences(X_train)
del X_train  # raw training text is not used again below
X_test_tok = tokenize_sentences(X_test)
del X_test  # raw test text is not used again below

Create initial embedding matrix for neural network and word -> idx mapping

In [5]:
# Build the initial weight matrix for the Embedding layers together with the
# word -> index mapping, using the tokens of both splits and the pretrained
# vectors. debug=True makes the helper print the token statistics shown below.
embedding_matrix, word_embedding_mapping = create_initial_embedding_matrix(
    X_train_tok,
    X_test_tok,
    emb_idx,
    emb_mean,
    emb_std,
    embedding_length,
    debug=True,
)
# The pretrained embedding index is not needed once the matrix exists.
del emb_idx

Number of unique tokens: 326175
Number of tokens found in pretrained embeddings: 74211


Transform comments in train and test data to padded matrices

In [6]:
# Longest tokenized comment in each split (kept for inspection only; the
# padding length below is fixed and does not depend on these values).
max_len_train = X_train_tok.apply(len).max()
max_len_test = X_test_tok.apply(len).max()

# Fixed sequence length fed to the networks. It is capped at 2000 tokens
# because larger padded matrices ran into a MemoryError.
max_comment_length = 2000

# Convert token lists to padded index matrices; the token series are deleted as
# soon as they have been converted to keep memory usage down.
X_train_input = convert_tokens_to_padded_sequence(
    X_train_tok, word_embedding_mapping, max_comment_length
)
del X_train_tok

X_test_input = convert_tokens_to_padded_sequence(
    X_test_tok, word_embedding_mapping, max_comment_length
)
del X_test_tok

### Singlelayer CNN with a single window size

This simple CNN consists of an embedding layer, a single convolution layer with a fixed window size and a fully connected hidden layer.

In [10]:
# Convolution: one window size and the number of filters learned for it.
m1_kernel_size, m1_num_filters = 3, 150

# Classifier head: width of the hidden Dense layer and its dropout rate.
m1_hidden_dim, m1_dropout = 100, 0.4

# Optimisation schedule.
m1_batch_size, m1_epochs = 64, 5

# Checkpoint file template; the '{}' placeholder is filled in by the training cell.
m1_weights_path = 'data/models/cnn_simple/model{}.hdf5'

The network architecture

In [11]:
# Input: one padded sequence of word indices per comment.
m1_input = Input(shape=(max_comment_length,))

# Word indices -> word vectors, initialised from the prepared embedding matrix.
m1_embedding_layer = Embedding(
    input_dim=len(embedding_matrix),
    output_dim=embedding_length,
    input_length=max_comment_length,
    weights=[embedding_matrix],
)
m1_word_emb = m1_embedding_layer(m1_input)

# Single convolution over the sequence, then max over time per filter.
# NOTE(review): no `activation` is passed to Conv1D, so the Keras default
# applies — confirm that a convolution without non-linearity is intended.
m1_conv_features = Conv1D(
    filters=m1_num_filters,
    kernel_size=m1_kernel_size,
    padding='same',
)(m1_word_emb)
m1_pooled = GlobalMaxPooling1D()(m1_conv_features)

# Fully connected head with dropout; one sigmoid unit per class.
m1_hidden = Dense(m1_hidden_dim, activation='relu')(m1_pooled)
m1_hidden = Dropout(m1_dropout)(m1_hidden)
m1_output = Dense(len(classes), activation='sigmoid')(m1_hidden)

m1_model = Model(inputs=[m1_input], outputs=[m1_output])
m1_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 2000)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 2000, 300)         97852800  
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 2000, 150)         135150    
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 150)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               15100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 606       
Total para

Train model

In [12]:
# Unique checkpoint file for this run: the current timestamp fills the '{}' slot.
m1_checkpoint_path = m1_weights_path.format(time.time())

# Train with the project helper. The test split is passed as the validation
# tuple (see "validate on ..." in the log below).
# NOTE(review): the log shows checkpoints being selected on val_loss, i.e. on
# the test split — confirm that this is acceptable for the reported scores.
m1_model, predictions = train_model(
    m1_model, X_train_input, Y_train, (X_test_input, Y_test),
    m1_epochs, m1_batch_size, 'adam', 'binary_crossentropy', ['accuracy'],
    m1_checkpoint_path, random_seed,
)

Train on 159571 samples, validate on 63978 samples
Epoch 1/5
Epoch 00001: val_loss improved from inf to 0.08070, saving model to data/models/cnn_simple/model1541939176.8092248

 train: ROC-AUC - epoch: 1 - score: 0.992765
 Tox: 0.9913826599695793 - STox: 0.9920993534512523 - Obs: 0.9953910449917467 - Thr: 0.9931456633073253 - Ins: 0.9920350283093564 - IdH: 0.9925364710971035

 val: ROC-AUC - epoch: 1 - score: 0.978407
 Tox: 0.9664115678676495 - STox: 0.9880273008151512 - Obs: 0.9742093918974234 - Thr: 0.9872715291905804 - Ins: 0.9729170176667217 - IdH: 0.9816030794004413
Epoch 2/5
Epoch 00002: val_loss improved from 0.08070 to 0.06677, saving model to data/models/cnn_simple/model1541939176.8092248

 train: ROC-AUC - epoch: 2 - score: 0.996276
 Tox: 0.9966820194189336 - STox: 0.9939639257929422 - Obs: 0.9976117826725686 - Thr: 0.997463846243245 - Ins: 0.995484808581766 - IdH: 0.9964485058560258

 val: ROC-AUC - epoch: 2 - score: 0.978750
 Tox: 0.9657501524942339 - STox: 0.98746530180867

### Singlelayer CNN with multiple window sizes
This CNN consists of an embedding layer followed by parallel convolution layers with different window sizes, whose max-pooled outputs are concatenated. On top of that there is a fully connected hidden layer.

In [17]:
# Convolution branches: window size and filter count per branch (paired by position).
m2_kernel_sizes = [3, 4, 5]
m2_num_filters = [100, 100, 100]

# Regularisation: dropout on the embedded sequence and on the hidden layer.
m2_spatial_dropout, m2_dropout = 0.2, 0.4

# Width of the fully connected hidden layer.
m2_hidden_dim = 100

# Optimisation schedule.
m2_batch_size, m2_epochs = 64, 5

# Checkpoint file template; the '{}' placeholder is filled in by the training cell.
m2_weights_path = 'data/models/cnn_multiwindowsizes/model{}.hdf5'

The network architecture

In [18]:
# The branch configuration is consumed pairwise, so both lists must be
# non-empty and describe the same number of branches.
if not m2_kernel_sizes or len(m2_kernel_sizes) != len(m2_num_filters):
    raise ValueError('m2_kernel_sizes and m2_num_filters must be non-empty and of equal length')

# Input: one padded sequence of word indices per comment.
m2_input = Input((max_comment_length,))

# Word indices -> word vectors, initialised from the prepared embedding matrix,
# followed by spatial dropout on the embedded sequence.
m2_word_emb = Embedding(
    input_dim=len(embedding_matrix),
    output_dim=embedding_length,
    input_length=max_comment_length,
    weights=[embedding_matrix],
)(m2_input)
m2_word_emb = SpatialDropout1D(m2_spatial_dropout)(m2_word_emb)

# One convolution + max-over-time branch per configured window size. Building
# the branches in a loop means every entry of the config lists is honoured
# (previously exactly three branches were hard-coded).
# NOTE(review): no `activation` is passed to Conv1D, so the Keras default
# applies — confirm that a convolution without non-linearity is intended.
m2_branches = []
for m2_kernel_size, m2_filters in zip(m2_kernel_sizes, m2_num_filters):
    m2_branch = Conv1D(kernel_size=m2_kernel_size, filters=m2_filters, padding='same')(m2_word_emb)
    m2_branches.append(GlobalMaxPooling1D()(m2_branch))

# Merge the pooled branch outputs. Concatenate needs at least two tensors in
# Keras 2.x, so a single branch is passed through unchanged.
if len(m2_branches) > 1:
    m2_concat4 = Concatenate()(m2_branches)
else:
    m2_concat4 = m2_branches[0]

# Fully connected head with dropout; one sigmoid unit per class.
m2_fc5 = Dense(m2_hidden_dim, activation='relu')(m2_concat4)
m2_fc5 = Dropout(m2_dropout)(m2_fc5)
m2_output = Dense(len(classes), activation='sigmoid')(m2_fc5)

m2_model = Model(inputs=[m2_input], outputs=[m2_output])
m2_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 2000)         0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 2000, 300)    97852800    input_4[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_3 (SpatialDro (None, 2000, 300)    0           embedding_4[0][0]                
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 2000, 100)    90100       spatial_dropout1d_3[0][0]        
__________________________________________________________________________________________________
conv1d_8 (

Train model

In [19]:
# Unique checkpoint file for this run: the current timestamp fills the '{}' slot.
m2_checkpoint_path = m2_weights_path.format(time.time())

# Train with the project helper. The test split is passed as the validation
# tuple (see "validate on ..." in the log below).
# NOTE(review): the log shows checkpoints being selected on val_loss, i.e. on
# the test split — confirm that this is acceptable for the reported scores.
m2_model, predictions = train_model(
    m2_model, X_train_input, Y_train, (X_test_input, Y_test),
    m2_epochs, m2_batch_size, 'adam', 'binary_crossentropy', ['accuracy'],
    m2_checkpoint_path, random_seed,
)

Train on 159571 samples, validate on 63978 samples
Epoch 1/5
Epoch 00001: val_loss improved from inf to 0.08369, saving model to data/models/cnn_multiwindowsizes/model1541932408.2893183

 train: ROC-AUC - epoch: 0 - score: 0.992644
 Tox: 0.9916101666633795 - STox: 0.9920910052921812 - Obs: 0.995355572761399 - Thr: 0.9926842150983135 - Ins: 0.9914407411046907 - IdH: 0.9926831569318832

 val: ROC-AUC - epoch: 0 - score: 0.978808
 Tox: 0.9668416790454769 - STox: 0.9884707745738456 - Obs: 0.9747124942865073 - Thr: 0.9866755725097227 - Ins: 0.9731549503444628 - IdH: 0.9829946756818101
Epoch 2/5
Epoch 00002: val_loss improved from 0.08369 to 0.08026, saving model to data/models/cnn_multiwindowsizes/model1541932408.2893183

 train: ROC-AUC - epoch: 1 - score: 0.995974
 Tox: 0.9964034740653277 - STox: 0.9938659862305181 - Obs: 0.9973711965699472 - Thr: 0.9971531019710662 - Ins: 0.9944256670316292 - IdH: 0.9966244235582392

 val: ROC-AUC - epoch: 1 - score: 0.979081
 Tox: 0.9654433854945306 - S

### Multilayer CNN
This architecture consists of multiple stacked convolutional layers with a fully connected hidden layer on top of them.

In [8]:
# Stacked convolution layers: window size and filter count per layer (paired by position).
m3_kernel_sizes = [3, 3]
m3_num_filters = [150, 150]

# Regularisation: dropout on the embedded sequence and on the hidden layer.
m3_spatial_dropout, m3_dropout = 0.2, 0.4

# Width of the fully connected hidden layer.
m3_hidden_dim = 100

# Optimisation schedule.
m3_batch_size, m3_epochs = 64, 5

# Checkpoint file template; the '{}' placeholder is filled in by the training cell.
m3_weights_path = 'data/models/cnn_multilayer/model{}.hdf5'

The model architecture

In [12]:
# The layer configuration is consumed pairwise, so both lists must be
# non-empty and describe the same number of layers.
if not m3_kernel_sizes or len(m3_kernel_sizes) != len(m3_num_filters):
    raise ValueError('m3_kernel_sizes and m3_num_filters must be non-empty and of equal length')

# Input: one padded sequence of word indices per comment.
m3_input = Input((max_comment_length,))

# Word indices -> word vectors, initialised from the prepared embedding matrix,
# followed by spatial dropout on the embedded sequence.
m3_word_emb = Embedding(
    input_dim=len(embedding_matrix),
    output_dim=embedding_length,
    input_length=max_comment_length,
    weights=[embedding_matrix],
)(m3_input)
m3_word_emb = SpatialDropout1D(m3_spatial_dropout)(m3_word_emb)

# Stack one convolution per configured layer. Every layer except the last is
# followed by a stride-2 max pooling that halves the sequence length; the last
# one is followed by max-over-time pooling. Building the stack in a loop means
# the depth follows the config lists (previously two layers were hard-coded).
# NOTE(review): no `activation` is passed to Conv1D, so the Keras default
# applies — confirm that convolutions without non-linearity are intended.
m3_features = m3_word_emb
m3_last_layer_idx = len(m3_kernel_sizes) - 1
for m3_layer_idx, (m3_kernel_size, m3_filters) in enumerate(zip(m3_kernel_sizes, m3_num_filters)):
    m3_features = Conv1D(kernel_size=m3_kernel_size, filters=m3_filters, padding='same')(m3_features)
    if m3_layer_idx < m3_last_layer_idx:
        m3_features = MaxPooling1D(2, strides=2)(m3_features)
    else:
        m3_features = GlobalMaxPooling1D()(m3_features)

# Fully connected head with dropout; one sigmoid unit per class.
m3_fc3 = Dense(m3_hidden_dim, activation='relu')(m3_features)
m3_fc3 = Dropout(m3_dropout)(m3_fc3)
m3_output = Dense(len(classes), activation='sigmoid')(m3_fc3)

m3_model = Model(inputs=[m3_input], outputs=[m3_output])
m3_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 2000)              0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 2000, 300)         97852800  
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 2000, 300)         0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 2000, 150)         135150    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 1000, 150)         0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 1000, 150)         67650     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 150)               0         
__________

Train model

In [13]:
# Unique checkpoint file for this run: the current timestamp fills the '{}' slot.
m3_checkpoint_path = m3_weights_path.format(time.time())

# Train with the project helper. The test split is passed as the validation
# tuple (see "validate on ..." in the log below).
# NOTE(review): the log shows checkpoints being selected on val_loss, i.e. on
# the test split — confirm that this is acceptable for the reported scores.
m3_model, predictions = train_model(
    m3_model, X_train_input, Y_train, (X_test_input, Y_test),
    m3_epochs, m3_batch_size, 'adam', 'binary_crossentropy', ['accuracy'],
    m3_checkpoint_path, random_seed,
)

Train on 159571 samples, validate on 63978 samples
Epoch 1/5
Epoch 00001: val_loss improved from inf to 0.08096, saving model to data/models/cnn_multilayer/model1541936318.5183825

 train: ROC-AUC - epoch: 1 - score: 0.989335
 Tox: 0.9896669404061503 - STox: 0.9917272759816061 - Obs: 0.9946295644743336 - Thr: 0.9845237557033232 - Ins: 0.9894297039516081 - IdH: 0.9860304096020925

 val: ROC-AUC - epoch: 1 - score: 0.974785
 Tox: 0.965427848158859 - STox: 0.9879879994364589 - Obs: 0.9758918435831798 - Thr: 0.9786274631197687 - Ins: 0.9696245200860109 - IdH: 0.9711480810290207
Epoch 2/5
Epoch 00002: val_loss improved from 0.08096 to 0.06994, saving model to data/models/cnn_multilayer/model1541936318.5183825

 train: ROC-AUC - epoch: 2 - score: 0.994595
 Tox: 0.9949774168710069 - STox: 0.9930609038188889 - Obs: 0.9969585800966493 - Thr: 0.9944555665935456 - Ins: 0.993353808031573 - IdH: 0.9947626537513652

 val: ROC-AUC - epoch: 2 - score: 0.977750
 Tox: 0.9658541824380198 - STox: 0.987521