In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import utils
import time

from keras.models import Model, Input
from keras.layers import Dense, Conv1D, BatchNormalization, GlobalMaxPooling1D, Dropout, \
    Embedding, Concatenate, SpatialDropout1D, MaxPooling1D

from utils.preprocessing_utils import tokenize_sentences, convert_tokens_to_padded_sequence
from utils.dataset_utils import load_data_from_csv
from utils.embedding_utils import load_word2vec_embeddings, create_initial_embedding_matrix
from utils.training_utils import train_model, train_and_evaluate_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [20]:
import importlib
import utils
importlib.reload(utils.embedding_utils)
importlib.reload(utils.dataset_utils)
importlib.reload(utils.preprocessing_utils)
importlib.reload(utils.training_utils)
importlib.reload(utils.keras_utils)

<module 'utils.keras_utils' from '/home/philipp/work/gitprojects/toxic-comment-experiments/utils/keras_utils.py'>

Global parameters which hold for all models

In [2]:
random_seed = 2018
classes = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
features = 'comment_text'
np.random.seed(random_seed)
path_train_data = 'data/kaggle/train.csv'
path_test_data = 'data/kaggle/test_complete.csv'
path_tokenizer = 'data/models/word_tokenizer.pickle'

embedding_length = 300
path_embeddings = 'data/embeddings/GoogleNews-vectors-negative300.bin.gz'

Load train and test data and pretrained word2vec embeddings

In [3]:
X_train, Y_train = load_data_from_csv(path_train_data, features, classes)
X_test, Y_test = load_data_from_csv(path_test_data, features, classes)

emb_idx, emb_mean, emb_std = load_word2vec_embeddings(path_embeddings)

Preprocessing and tokenizatin of train and test data

In [4]:
X_train_tok = tokenize_sentences(X_train)
del X_train
X_test_tok = tokenize_sentences(X_test)
del X_test

Create initial embedding matrix for neural network and word -> idx mapping

In [5]:
embedding_matrix, word_embedding_mapping = create_initial_embedding_matrix(X_train_tok, X_test_tok, emb_idx, emb_mean, emb_std, embedding_length, debug=True)
del emb_idx

Number of unique tokens: 326175
Number of tokens found in pretrained embeddings: 74211


Transform comments in train and test data to padded matrices

In [6]:
max_len_train = X_train_tok.apply(lambda x: len(x)).max()
max_len_test = X_test_tok.apply(lambda x: len(x)).max()
# limit length to 2000, otherwise we get a MemoryError
max_comment_length = 2000
X_train_input = convert_tokens_to_padded_sequence(X_train_tok, word_embedding_mapping, max_comment_length)
del X_train_tok
X_test_input = convert_tokens_to_padded_sequence(X_test_tok, word_embedding_mapping, max_comment_length)
del X_test_tok

### Singlelayer CNN with a single window size

This simple CNN consists of an embedding layer, a single convolution layer with a fixed window size and a fully connected hidden layer.

In [7]:
m1_kernel_size = 3
m1_hidden_dim = 100
m1_num_filters = 150
m1_dropout = 0.4
m1_spatial_dropout = 0.2
m1_batch_size = 64
m1_epochs = 5

m1_weights_path = 'data/models/cnn_simple/model{}.hdf5'
m1_scores_path = 'data/scores/cnn_simple/scores_{}'

The network architecture

In [8]:
m1_input = Input((max_comment_length,))
m1_word_emb = Embedding(input_dim=len(embedding_matrix), output_dim=embedding_length, input_length=max_comment_length, weights=[embedding_matrix])(m1_input)
m1_word_emb = SpatialDropout1D(m1_spatial_dropout)(m1_word_emb)

m1_conv1 = Conv1D(kernel_size=m1_kernel_size, filters=m1_num_filters, padding='same')(m1_word_emb)
m1_conv1 = GlobalMaxPooling1D()(m1_conv1)

m1_fc2 = Dense(m1_hidden_dim, activation='relu')(m1_conv1)
m1_dropout2 = Dropout(m1_dropout)(m1_fc2)
m1_output = Dense(len(classes), activation='sigmoid')(m1_dropout2)

m1_model = Model(inputs=[m1_input], outputs=[m1_output])
m1_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 2000, 300)         97852800  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 2000, 300)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2000, 150)         135150    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 150)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               15100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
__________

Train model

In [9]:
m1_scores = train_and_evaluate_model(m1_model, X_train_input, Y_train, (X_test_input, Y_test), \
                                    m1_epochs, m1_batch_size, 'adam', 'binary_crossentropy', ['accuracy'], \
                                    random_seed, runs=5)
np.save(m1_scores_path.format(time.time()), m1_scores)

RUN 1/5
Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99328
 Tox: 0.99078 - STox: 0.99115 - Obs: 0.99486 - Thr: 0.98640 - Ins: 0.99088 - IdH: 0.98296
 train: F1 Score - epoch: 1 - score: 0.78430
 Tox: 0.84888 - STox: 0.22837 - Obs: 0.84299 - Thr: 0.00000 - Ins: 0.75970 - IdH: 0.08625

 val: ROC-AUC - epoch: 1 - score: 0.97821
 Tox: 0.96405 - STox: 0.98478 - Obs: 0.97667 - Thr: 0.97369 - Ins: 0.97125 - IdH: 0.96928
 val: F1 Score - epoch: 1 - score: 0.64034
 Tox: 0.66049 - STox: 0.26038 - Obs: 0.68823 - Thr: 0.00000 - Ins: 0.64561 - IdH: 0.11068
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99671
 Tox: 0.99578 - STox: 0.99362 - Obs: 0.99700 - Thr: 0.99628 - Ins: 0.99451 - IdH: 0.99456
 train: F1 Score - epoch: 2 - score: 0.80602
 Tox: 0.87107 - STox: 0.03075 - Obs: 0.85524 - Thr: 0.09109 - Ins: 0.78690 - IdH: 0.40217

 val: ROC-AUC - epoch: 2 - score: 0.97932
 Tox: 0.96315 - STox: 0.98674 - Obs: 0.97454 - Thr: 0.98653 - Ins: 0.97109 - IdH: 0.97744
 val: F1 Score - epoch: 2 - score: 0.64637
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99651
 Tox: 0.99586 - STox: 0.99360 - Obs: 0.99710 - Thr: 0.99400 - Ins: 0.99402 - IdH: 0.99304
 train: F1 Score - epoch: 2 - score: 0.82295
 Tox: 0.87953 - STox: 0.26795 - Obs: 0.88185 - Thr: 0.00000 - Ins: 0.80129 - IdH: 0.41236

 val: ROC-AUC - epoch: 2 - score: 0.97878
 Tox: 0.96097 - STox: 0.98564 - Obs: 0.97598 - Thr: 0.98296 - Ins: 0.96953 - IdH: 0.97527
 val: F1 Score - epoch: 2 - score: 0.63269
 Tox: 0.64135 - STox: 0.15656 - Obs: 0.68198 - Thr: 0.00000 - Ins: 0.64233 - IdH: 0.41020
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99812
 Tox: 0.99801 - STox: 0.99593 - Obs: 0.99840 - Thr: 0.99717 - Ins: 0.99664 - IdH: 0.99741
 train: F1 Score - epoch: 3 - score: 0.88145
 Tox: 0.93316 - STox: 0.60984 - Obs: 0.90794 - Thr: 0.42643 - Ins: 0.86412 - IdH: 0.54081

 val: ROC-AUC - epoch: 3 - score: 0.97815
 Tox: 0.95803 - STox: 0.98562 - Obs: 0.97481 - Thr: 0.98490 - Ins: 0.97088 - IdH: 0.97388
 val: F1 Score - epoch: 3 - score: 0.61095


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99817
 Tox: 0.99813 - STox: 0.99575 - Obs: 0.99825 - Thr: 0.99831 - Ins: 0.99644 - IdH: 0.99763
 train: F1 Score - epoch: 3 - score: 0.87156
 Tox: 0.93459 - STox: 0.42344 - Obs: 0.90537 - Thr: 0.52705 - Ins: 0.84549 - IdH: 0.39220

 val: ROC-AUC - epoch: 3 - score: 0.97795
 Tox: 0.96117 - STox: 0.98534 - Obs: 0.97475 - Thr: 0.98850 - Ins: 0.97026 - IdH: 0.97484
 val: F1 Score - epoch: 3 - score: 0.61327
 Tox: 0.60004 - STox: 0.26230 - Obs: 0.67140 - Thr: 0.44444 - Ins: 0.65475 - IdH: 0.36522
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99884
 Tox: 0.99918 - STox: 0.99617 - Obs: 0.99905 - Thr: 0.99872 - Ins: 0.99791 - IdH: 0.99824
 train: F1 Score - epoch: 4 - score: 0.90354
 Tox: 0.95317 - STox: 0.02225 - Obs: 0.93319 - Thr: 0.68293 - Ins: 0.89171 - IdH: 0.79609

 val: ROC-AUC - epoch: 4 - score: 0.97594
 Tox: 0.95678 - STox: 0.98007 - Obs: 0.97278 - Thr: 0.98190 - Ins: 0.96794 - IdH: 0.97398
 val: F1 Score - epoch: 4 - score: 0.59196


### Singlelayer CNN with multiple window sizes
This CNN consists of an embedding layer, a convolution layer with multiple window sizes which get concatenated afterwards. On top of that there is a fully connected hidden layer.

In [17]:
m2_kernel_sizes = [3, 4, 5]
m2_hidden_dim = 100
m2_num_filters = [100, 100, 100]
m2_dropout = 0.4
m2_spatial_dropout = 0.2
m2_batch_size = 64
m2_epochs = 5

m2_weights_path = 'data/models/cnn_multiwindowsizes/model{}.hdf5'

The network architecture

In [18]:
m2_input = Input((max_comment_length,))
m2_word_emb = Embedding(input_dim=len(embedding_matrix), output_dim=embedding_length, input_length=max_comment_length, weights=[embedding_matrix])(m2_input)
m2_word_emb = SpatialDropout1D(m2_spatial_dropout)(m2_word_emb)

m2_conv1 = Conv1D(kernel_size=m2_kernel_sizes[0], filters=m2_num_filters[0], padding='same')(m2_word_emb)
m2_conv1 = GlobalMaxPooling1D()(m2_conv1)

m2_conv2 = Conv1D(kernel_size=m2_kernel_sizes[1], filters=m2_num_filters[1], padding='same')(m2_word_emb)
m2_conv2 = GlobalMaxPooling1D()(m2_conv2)

m2_conv3 = Conv1D(kernel_size=m2_kernel_sizes[2], filters=m2_num_filters[2], padding='same')(m2_word_emb)
m2_conv3 = GlobalMaxPooling1D()(m2_conv3)

m2_concat4 = Concatenate()([m2_conv1, m2_conv2, m2_conv3])

m2_fc5 = Dense(m2_hidden_dim, activation='relu')(m2_concat4)
m2_fc5 = Dropout(m2_dropout)(m2_fc5)
m2_output = Dense(len(classes), activation='sigmoid')(m2_fc5)

m2_model = Model(inputs=[m2_input], outputs=[m2_output])
m2_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 2000)         0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 2000, 300)    97852800    input_4[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_3 (SpatialDro (None, 2000, 300)    0           embedding_4[0][0]                
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 2000, 100)    90100       spatial_dropout1d_3[0][0]        
__________________________________________________________________________________________________
conv1d_8 (

Train model

In [19]:
m2_model, predictions = train_model(m2_model, X_train_input, Y_train, (X_test_input, Y_test), \
                                    m2_epochs, m2_batch_size, 'adam', 'binary_crossentropy', ['accuracy'], \
                                    m2_weights_path.format(time.time()), random_seed)

Train on 159571 samples, validate on 63978 samples
Epoch 1/5
Epoch 00001: val_loss improved from inf to 0.08369, saving model to data/models/cnn_multiwindowsizes/model1541932408.2893183

 train: ROC-AUC - epoch: 0 - score: 0.992644
 Tox: 0.9916101666633795 - STox: 0.9920910052921812 - Obs: 0.995355572761399 - Thr: 0.9926842150983135 - Ins: 0.9914407411046907 - IdH: 0.9926831569318832

 val: ROC-AUC - epoch: 0 - score: 0.978808
 Tox: 0.9668416790454769 - STox: 0.9884707745738456 - Obs: 0.9747124942865073 - Thr: 0.9866755725097227 - Ins: 0.9731549503444628 - IdH: 0.9829946756818101
Epoch 2/5
Epoch 00002: val_loss improved from 0.08369 to 0.08026, saving model to data/models/cnn_multiwindowsizes/model1541932408.2893183

 train: ROC-AUC - epoch: 1 - score: 0.995974
 Tox: 0.9964034740653277 - STox: 0.9938659862305181 - Obs: 0.9973711965699472 - Thr: 0.9971531019710662 - Ins: 0.9944256670316292 - IdH: 0.9966244235582392

 val: ROC-AUC - epoch: 1 - score: 0.979081
 Tox: 0.9654433854945306 - S

### Multilayer CNN
This architecture consists of multiple convolutional layers with a fully connected hidden layer on top of it.

In [7]:
m3_kernel_sizes = [3, 5]
m3_hidden_dim = 100
m3_num_filters = [150, 150]
m3_dropout = 0.4
m3_spatial_dropout = 0.2
m3_batch_size = 64
m3_epochs = 5

m3_weights_path = 'data/models/cnn_multilayer/model{}.hdf5'

The model architecture

In [10]:
m3_input = Input((max_comment_length,))
m3_word_emb = Embedding(input_dim=len(embedding_matrix), output_dim=embedding_length, input_length=max_comment_length, weights=[embedding_matrix])(m3_input)
m3_word_emb = SpatialDropout1D(m3_spatial_dropout)(m3_word_emb)

m3_conv1 = Conv1D(kernel_size=m3_kernel_sizes[0], filters=m3_num_filters[0], padding='same')(m3_word_emb)
#m3_conv1 = MaxPooling1D(2, strides=2)(m3_conv1)

m3_conv2 = Conv1D(kernel_size=m3_kernel_sizes[1], filters=m3_num_filters[1], padding='same')(m3_conv1)
m3_conv2 = GlobalMaxPooling1D()(m3_conv2)

m3_fc3 = Dense(m3_hidden_dim, activation='relu')(m3_conv2)
m3_fc3 = Dropout(m3_dropout)(m3_fc3)
m3_output = Dense(len(classes), activation='sigmoid')(m3_fc3)

m3_model = Model(inputs=[m3_input], outputs=[m3_output])
m3_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 2000)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 2000, 300)         97852800  
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 2000, 300)         0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 2000, 150)         135150    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 2000, 150)         67650     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 150)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               15100     
__________

Train model

In [11]:
m3_model, predictions = train_model(m3_model, X_train_input, Y_train, (X_test_input, Y_test), \
                                    m3_epochs, m3_batch_size, 'adam', 'binary_crossentropy', ['accuracy'], \
                                    m3_weights_path.format(time.time()), random_seed)

Train on 159571 samples, validate on 63978 samples
Epoch 1/5
Epoch 00001: val_loss improved from inf to 0.07310, saving model to data/models/cnn_multilayer/model1542877635.380672.hdf5

 train: ROC-AUC - epoch: 1 - score: 0.990770
 Tox: 0.9891278966931427 - STox: 0.9910711924338175 - Obs: 0.9949118178382583 - Thr: 0.9897739807828515 - Ins: 0.9904676369194964 - IdH: 0.9892700169104734

 val: ROC-AUC - epoch: 1 - score: 0.976654
 Tox: 0.9647248585343671 - STox: 0.9867400360938721 - Obs: 0.9747659188617769 - Thr: 0.9851343052316428 - Ins: 0.971281625069828 - IdH: 0.9772788857071107
Epoch 2/5
Epoch 00002: val_loss improved from 0.07310 to 0.07120, saving model to data/models/cnn_multilayer/model1542877635.380672.hdf5

 train: ROC-AUC - epoch: 2 - score: 0.994574
 Tox: 0.9946285060504322 - STox: 0.9925260362551797 - Obs: 0.9964674296623429 - Thr: 0.9962919704316522 - Ins: 0.9928886130700111 - IdH: 0.9946384790644974

 val: ROC-AUC - epoch: 2 - score: 0.977149
 Tox: 0.9642103621079968 - STox:

### Dilated CNN
This architecture consists of multiple convolutional layers with a fully connected hidden layer on top of it.
The first convolutional is a non-dilated layer (dilation rate = 1) whereas layer 2 and 3 specify a gradually growing dilation rate.

In [9]:
m4_kernel_sizes = [3, 7, 14]
m4_dilation_rates = [1, 2, 4]
m4_hidden_dim = 100
m4_num_filters = [150, 150, 150]
m4_dropout = 0.4
m4_spatial_dropout = 0.2
m4_batch_size = 64
m4_epochs = 5

m4_weights_path = 'data/models/cnn_dilated/model{}.hdf5'

The model architecture

In [10]:
m4_input = Input((max_comment_length,))
m4_word_emb = Embedding(input_dim=len(embedding_matrix), output_dim=embedding_length, input_length=max_comment_length, weights=[embedding_matrix])(m4_input)
m4_word_emb = SpatialDropout1D(m4_spatial_dropout)(m4_word_emb)

m4_conv1 = Conv1D(kernel_size=m4_kernel_sizes[0], dilation_rate= m4_dilation_rates[0], filters=m4_num_filters[0], padding='same')(m4_word_emb)
m4_conv2 = Conv1D(kernel_size=m4_kernel_sizes[1], dilation_rate= m4_dilation_rates[1], filters=m4_num_filters[1], padding='same')(m4_conv1)
m4_conv3 = Conv1D(kernel_size=m4_kernel_sizes[2], dilation_rate= m4_dilation_rates[2], filters=m4_num_filters[2], padding='same')(m4_conv2)
m4_conv3 = GlobalMaxPooling1D()(m4_conv3)

m4_fc4 = Dense(m4_hidden_dim, activation='relu')(m4_conv3)
m4_fc4 = Dropout(m4_dropout)(m4_fc4)
m4_output = Dense(len(classes), activation='sigmoid')(m4_fc4)

m4_model = Model(inputs=[m4_input], outputs=[m4_output])
m4_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 2000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 2000, 300)         97852800  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 2000, 300)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2000, 150)         135150    
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 2000, 150)         157650    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 2000, 150)         315150    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 150)               0         
__________

Train model

In [12]:
m4_model, predictions = train_model(m4_model, X_train_input, Y_train, (X_test_input, Y_test), \
                                    m4_epochs, m4_batch_size, 'adam', 'binary_crossentropy', ['accuracy'], \
                                    m4_weights_path.format(time.time()), random_seed)

Train on 159571 samples, validate on 63978 samples
Epoch 1/5
Epoch 00001: val_loss improved from inf to 0.07305, saving model to data/models/cnn_dilated/model1542264015.405812.hdf5

 train: ROC-AUC - epoch: 1 - score: 0.985956
 Tox: 0.9888663510080515 - STox: 0.9905932459404572 - Obs: 0.9930193682403294 - Thr: 0.9776521795480431 - Ins: 0.9880621906031104 - IdH: 0.9775403566044827

 val: ROC-AUC - epoch: 1 - score: 0.970267
 Tox: 0.9650888761129582 - STox: 0.9877613793340372 - Obs: 0.9755766844278064 - Thr: 0.9689464837069375 - Ins: 0.9664771805704888 - IdH: 0.9577527930048871
Epoch 2/5
Epoch 00002: val_loss did not improve

 train: ROC-AUC - epoch: 2 - score: 0.989872
 Tox: 0.9931744434306217 - STox: 0.9896209423819466 - Obs: 0.9958710009033012 - Thr: 0.9857293871979882 - Ins: 0.9897306483106407 - IdH: 0.9851046670503348

 val: ROC-AUC - epoch: 2 - score: 0.969027
 Tox: 0.9656227661977468 - STox: 0.9837255453864101 - Obs: 0.977951752386513 - Thr: 0.9640414818849161 - Ins: 0.96659038328

In [19]:
print(scores)

[[[[[0.74460029 0.        ]
    [0.77822825 0.        ]
    [0.69525732 0.        ]
    [0.74791305 0.        ]
    [0.7748494  0.        ]
    [0.76914494 0.        ]
    [0.70220877 0.        ]]]


  [[[0.7288697  0.        ]
    [0.72232498 0.        ]
    [0.82461435 0.        ]
    [0.76629272 0.        ]
    [0.49983283 0.        ]
    [0.74034114 0.        ]
    [0.81981217 0.        ]]]]



 [[[[0.74851886 0.        ]
    [0.71936153 0.        ]
    [0.71252383 0.        ]
    [0.73859954 0.        ]
    [0.85993976 0.        ]
    [0.75937671 0.        ]
    [0.70131181 0.        ]]]


  [[[0.74117915 0.        ]
    [0.72410208 0.        ]
    [0.85999329 0.        ]
    [0.76443229 0.        ]
    [0.54663992 0.        ]
    [0.73763937 0.        ]
    [0.81426793 0.        ]]]]



 [[[[0.74464543 0.        ]
    [0.70973131 0.        ]
    [0.72429645 0.        ]
    [0.74585292 0.        ]
    [0.81777108 0.        ]
    [0.75242737 0.        ]
    [0.71779347 0.        ]]