In [1]:
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Model, Input
from keras.layers import Dense, Conv1D, BatchNormalization, GlobalMaxPooling1D, Dropout, \
    Embedding, Concatenate, SpatialDropout1D, MaxPooling1D

import utils
from utils.preprocessing_utils import tokenize_sentences, convert_tokens_to_padded_sequence
from utils.dataset_utils import load_data_from_csv
from utils.embedding_utils import create_embeddings_mapping
from utils.training_utils import train_and_evaluate_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# Development helper: pick up on-disk edits to the local `utils` package
# without restarting the kernel.
import importlib

# Import every submodule explicitly instead of relying on some other import
# having attached it as an attribute of the `utils` package.
import utils.dataset_utils
import utils.embedding_utils
import utils.keras_utils
import utils.preprocessing_utils
import utils.training_utils

# keras_utils is reloaded before training_utils in case the latter imports
# from it (assumption -- verify against utils/training_utils.py).
for _module in (
    utils.embedding_utils,
    utils.dataset_utils,
    utils.preprocessing_utils,
    utils.keras_utils,
    utils.training_utils,
):
    importlib.reload(_module)

# reload() re-executes the module, but names that were bound earlier with
# `from module import name` keep pointing at the old objects. Re-bind the
# names this notebook uses so the reloaded code is what actually runs.
from utils.preprocessing_utils import tokenize_sentences, convert_tokens_to_padded_sequence
from utils.dataset_utils import load_data_from_csv
from utils.embedding_utils import create_embeddings_mapping
from utils.training_utils import train_and_evaluate_model

<module 'utils.keras_utils' from '/home/philipp/work/gitprojects/toxic-comment-experiments/utils/keras_utils.py'>

Global parameters which hold for all models

In [2]:
# --- Reproducibility -------------------------------------------------------
# Seed NumPy's global RNG; the same seed is also handed to
# train_and_evaluate_model further down.
random_seed = 2018
np.random.seed(random_seed)

# --- Data ------------------------------------------------------------------
path_train_data = 'data/kaggle/train.csv'
path_test_data = 'data/kaggle/test_complete.csv'
# Both values are passed to load_data_from_csv; `classes` also fixes the
# width of the model's output layer.
features = 'comment_text'
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# --- Embeddings ------------------------------------------------------------
# embedding_length is the output dimension of the Embedding layer.
# NOTE(review): path_embeddings is not referenced by any cell visible here.
path_embeddings = 'data/embeddings/GoogleNews-vectors-negative300.bin.gz'
embedding_length = 300

# --- Training --------------------------------------------------------------
epochs = 5
batch_size = 64

In [3]:
def convert_tokens_to_padded_indices(X_train_tok, X_test_tok, word_embedding_mapping,
                                     max_comment_length=2000):
    """Convert tokenized train and test comments into fixed-length index sequences.

    Both splits are passed through ``convert_tokens_to_padded_sequence`` with the
    same mapping and the same target length, so they can be fed to one model.

    Args:
        X_train_tok: tokenized training comments.
        X_test_tok: tokenized test comments.
        word_embedding_mapping: token-to-index mapping forwarded unchanged to
            ``convert_tokens_to_padded_sequence``.
        max_comment_length: target sequence length for both splits. Defaults to
            2000, the limit the notebook settled on because longer sequences
            caused a MemoryError.

    Returns:
        A ``(X_train_input, X_test_input, max_comment_length)`` tuple.

    Raises:
        ValueError: if ``max_comment_length`` is not positive.
    """
    if max_comment_length <= 0:
        raise ValueError(
            'max_comment_length must be positive, got {}'.format(max_comment_length))

    # The observed maximum comment lengths used to be computed here but were
    # never used; that extra pass over both corpora has been dropped.
    X_train_input = convert_tokens_to_padded_sequence(
        X_train_tok, word_embedding_mapping, max_comment_length)
    X_test_input = convert_tokens_to_padded_sequence(
        X_test_tok, word_embedding_mapping, max_comment_length)
    return X_train_input, X_test_input, max_comment_length
    
def generate_model(num_tokens, max_comment_length):
    """Build the multi-kernel-size 1D-CNN used for multi-label comment classification.

    Architecture: embedding -> spatial dropout -> one Conv1D + global max
    pooling branch per kernel size -> concatenation -> dense (ReLU) -> dropout
    -> one sigmoid unit per entry in the module-level ``classes`` list.

    The model is returned uncompiled; no optimizer or loss is set here.

    Args:
        num_tokens: vocabulary size, used as ``input_dim`` of the embedding layer.
        max_comment_length: length of each input index sequence.

    Returns:
        A ``keras.models.Model`` mapping a ``(max_comment_length,)`` input to
        ``len(classes)`` sigmoid outputs.
    """
    # hyperparameters
    kernel_sizes = [3, 4, 5]
    num_filters = [100, 100, 100]
    hidden_dim = 100
    dropout = 0.4
    spatial_dropout = 0.2

    # model
    # Named `token_ids` rather than `input` so the builtin is not shadowed.
    token_ids = Input((max_comment_length,))
    word_emb = Embedding(input_dim=num_tokens, output_dim=embedding_length,
                         input_length=max_comment_length)(token_ids)
    word_emb = SpatialDropout1D(spatial_dropout)(word_emb)

    # One convolution + global-max-pool branch per kernel size, all reading the
    # same embedded sequence. Layers are created in the same order as before.
    # NOTE(review): the Conv1D layers specify no activation -- confirm intended.
    pooled_branches = []
    for kernel_size, filters in zip(kernel_sizes, num_filters):
        branch = Conv1D(kernel_size=kernel_size, filters=filters, padding='same')(word_emb)
        pooled_branches.append(GlobalMaxPooling1D()(branch))

    merged = Concatenate()(pooled_branches)

    hidden = Dense(hidden_dim, activation='relu')(merged)
    hidden = Dropout(dropout)(hidden)
    output = Dense(len(classes), activation='sigmoid')(hidden)

    return Model(inputs=[token_ids], outputs=[output])

Load train and test data and pretrained word2vec embeddings

In [4]:
# Read both labelled splits with the same feature column and label set:
# first the training CSV, then the test CSV.
(X_train, Y_train), (X_test, Y_test) = (
    load_data_from_csv(csv_path, features, classes)
    for csv_path in (path_train_data, path_test_data)
)

### Baseline preprocessing
The following preprocessing techniques are applied for the baseline:
* transformation of all characters to lowercase
* tokenization using the NLTK TweetTokenizer

In [5]:
# Tokenize the training split first, then the test split, with the same tokenizer.
e1_X_train_tok, e1_X_test_tok = map(tokenize_sentences, (X_train, X_test))

Create input matrix

In [6]:
# Build the mapping from the tokens of both splits, then turn each split into
# fixed-length index sequences.
e1_embeddings_mapping = create_embeddings_mapping(e1_X_train_tok, e1_X_test_tok)
(e1_X_train_input,
 e1_X_test_input,
 e1_max_comment_length) = convert_tokens_to_padded_indices(
    e1_X_train_tok, e1_X_test_tok, e1_embeddings_mapping)

# The token lists are no longer needed once the index matrices exist.
# Re-running this cell therefore requires re-running the tokenization cell first.
del e1_X_train_tok, e1_X_test_tok

In [7]:
# Vocabulary size is the mapping size plus one
# (presumably index 0 is reserved for padding -- TODO confirm).
e1_model = generate_model(len(e1_embeddings_mapping) + 1, e1_max_comment_length)

# Create the output directory *before* the long training run, so a missing
# folder cannot make np.save fail after all runs have finished.
e1_scores_dir = 'data/scores/preprocessing'
os.makedirs(e1_scores_dir, exist_ok=True)

e1_scores = train_and_evaluate_model(
    e1_model, e1_X_train_input, Y_train, (e1_X_test_input, Y_test),
    epochs, batch_size, 'adam', 'binary_crossentropy', ['accuracy'],
    random_seed, runs=5)

# The timestamp suffix keeps repeated executions from overwriting earlier
# results; np.save appends the '.npy' extension itself.
e1_scores_path = '{}/e1_scores_{}'.format(e1_scores_dir, time.time())
np.save(e1_scores_path, e1_scores)

RUN 1/5
Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99381
 Tox: 0.99149 - STox: 0.99233 - Obs: 0.99547 - Thr: 0.98915 - Ins: 0.99129 - IdH: 0.98754
 train: F1 Score - epoch: 1 - score: 0.78471
 Tox: 0.85237 - STox: 0.02585 - Obs: 0.84175 - Thr: 0.00000 - Ins: 0.76205 - IdH: 0.18444

 val: ROC-AUC - epoch: 1 - score: 0.97733
 Tox: 0.96147 - STox: 0.98600 - Obs: 0.97441 - Thr: 0.97437 - Ins: 0.97003 - IdH: 0.97144
 val: F1 Score - epoch: 1 - score: 0.63049
 Tox: 0.65222 - STox: 0.03675 - Obs: 0.67616 - Thr: 0.00000 - Ins: 0.63590 - IdH: 0.14948
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99709
 Tox: 0.99655 - STox: 0.99446 - Obs: 0.99756 - Thr: 0.99738 - Ins: 0.99510 - IdH: 0.99557
 train: F1 Score - epoch: 2 - score: 0.85290
 Tox: 0.90255 - STox: 0.44962 - Obs: 0.89142 - Thr: 0.22262 - Ins: 0.83725 - IdH: 0.59657

 val: ROC-AUC - epoch: 2 - score: 0.97998
 Tox: 0.96325 - STox: 0.98726 - Obs: 0.97498 - Thr: 0.98496 - Ins: 0.97221 - IdH: 0.97863
 val: F1 Score - epoch: 2 - score: 0.63462
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99710
 Tox: 0.99649 - STox: 0.99440 - Obs: 0.99744 - Thr: 0.99601 - Ins: 0.99485 - IdH: 0.99570
 train: F1 Score - epoch: 2 - score: 0.82098
 Tox: 0.87566 - STox: 0.14326 - Obs: 0.87976 - Thr: 0.05285 - Ins: 0.80681 - IdH: 0.43095

 val: ROC-AUC - epoch: 2 - score: 0.97896
 Tox: 0.96121 - STox: 0.98507 - Obs: 0.97502 - Thr: 0.98280 - Ins: 0.97052 - IdH: 0.97537
 val: F1 Score - epoch: 2 - score: 0.64368
 Tox: 0.66093 - STox: 0.10959 - Obs: 0.69049 - Thr: 0.12389 - Ins: 0.64052 - IdH: 0.38696
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99838
 Tox: 0.99813 - STox: 0.99590 - Obs: 0.99872 - Thr: 0.99792 - Ins: 0.99703 - IdH: 0.99806
 train: F1 Score - epoch: 3 - score: 0.88964
 Tox: 0.93707 - STox: 0.32040 - Obs: 0.92106 - Thr: 0.44136 - Ins: 0.87610 - IdH: 0.73875

 val: ROC-AUC - epoch: 3 - score: 0.97800
 Tox: 0.96049 - STox: 0.98608 - Obs: 0.97438 - Thr: 0.98205 - Ins: 0.96886 - IdH: 0.97468
 val: F1 Score - epoch: 3 - score: 0.61628


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99833
 Tox: 0.99819 - STox: 0.99613 - Obs: 0.99864 - Thr: 0.99837 - Ins: 0.99672 - IdH: 0.99771
 train: F1 Score - epoch: 3 - score: 0.88881
 Tox: 0.93671 - STox: 0.56479 - Obs: 0.91968 - Thr: 0.50952 - Ins: 0.86613 - IdH: 0.66526

 val: ROC-AUC - epoch: 3 - score: 0.97764
 Tox: 0.95977 - STox: 0.98589 - Obs: 0.97287 - Thr: 0.98211 - Ins: 0.96946 - IdH: 0.97676
 val: F1 Score - epoch: 3 - score: 0.61541
 Tox: 0.60541 - STox: 0.32800 - Obs: 0.66140 - Thr: 0.32530 - Ins: 0.64043 - IdH: 0.54123
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99904
 Tox: 0.99905 - STox: 0.99739 - Obs: 0.99928 - Thr: 0.99912 - Ins: 0.99833 - IdH: 0.99865
 train: F1 Score - epoch: 4 - score: 0.92086
 Tox: 0.95309 - STox: 0.73159 - Obs: 0.94139 - Thr: 0.68159 - Ins: 0.90883 - IdH: 0.81039

 val: ROC-AUC - epoch: 4 - score: 0.97595
 Tox: 0.95599 - STox: 0.98462 - Obs: 0.97148 - Thr: 0.98145 - Ins: 0.96652 - IdH: 0.97263
 val: F1 Score - epoch: 4 - score: 0.61624
