In [1]:
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Model, Input
from keras.layers import Dense, Conv1D, BatchNormalization, GlobalMaxPooling1D, Dropout, \
    Embedding, Concatenate, SpatialDropout1D, MaxPooling1D

import utils
from utils.preprocessing_utils import tokenize_sentences, convert_tokens_to_padded_sequence, \
    remove_punctuation, remove_punctuation_weak, perform_stemming, perform_lemmatization, replace_unknown_tokens
from utils.dataset_utils import load_data_from_csv
from utils.embedding_utils import create_embeddings_mapping, load_word2vec_embeddings
from utils.training_utils import train_and_evaluate_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [7]:
import importlib
import utils

# Development helper: re-execute the local helper modules so that edits made
# on disk are picked up without restarting the kernel.
#
# Each module is resolved through importlib.import_module first, so the cell
# also works when a submodule (e.g. utils.keras_utils, which this notebook
# never imports directly) has not been bound as an attribute of `utils` yet.
# Using a loop instead of a trailing bare expression also avoids echoing the
# module repr, which contains an absolute local path.
#
# NOTE(review): names bound earlier via `from utils.x import y` keep pointing
# at the pre-reload objects; re-run the import cell after this one.
for _module_name in (
        'utils.embedding_utils',
        'utils.dataset_utils',
        'utils.preprocessing_utils',
        'utils.training_utils',
        'utils.keras_utils',
):
    importlib.reload(importlib.import_module(_module_name))

<module 'utils.keras_utils' from '/home/philipp/work/gitprojects/toxic-comment-experiments/utils/keras_utils.py'>

Global parameters which hold for all models

In [2]:
# --- reproducibility -------------------------------------------------------
random_seed = 2018
np.random.seed(random_seed)  # seeds NumPy's global generator only

# --- dataset ---------------------------------------------------------------
path_train_data = 'data/kaggle/train.csv'
path_test_data = 'data/kaggle/test_complete.csv'
features = 'comment_text'
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# --- word embeddings -------------------------------------------------------
path_embeddings = 'data/embeddings/GoogleNews-vectors-negative300.bin.gz'
embedding_length = 300  # used as the Embedding layer's output_dim

# --- training --------------------------------------------------------------
epochs = 5
batch_size = 64

In [3]:
def convert_tokens_to_padded_indices(X_train_tok, X_test_tok, word_embedding_mapping, max_length_limit=2000):
    """Convert tokenized train/test comments into padded sequences of one common length.

    The common length is the longest comment found in either split, capped at
    ``max_length_limit``.

    Args:
        X_train_tok: tokenized training comments; must support
            ``.apply(len).max()`` (presumably a pandas Series of token lists).
        X_test_tok: tokenized test comments, same kind of object as ``X_train_tok``.
        word_embedding_mapping: mapping handed through unchanged to
            ``convert_tokens_to_padded_sequence``.
        max_length_limit: upper bound for the sequence length. Defaults to
            2000, the value that was previously hard-coded to avoid a
            MemoryError.

    Returns:
        Tuple ``(X_train_input, X_test_input, max_comment_length)`` where the
        first two items are whatever ``convert_tokens_to_padded_sequence``
        returns for the respective split.
    """
    longest_comment = max(X_train_tok.apply(len).max(), X_test_tok.apply(len).max())
    # Report the uncapped length so the effect of the limit stays visible.
    print("maximum comment length: {}".format(longest_comment))
    # limit the length, otherwise we get a MemoryError
    max_comment_length = min(max_length_limit, longest_comment)
    X_train_input = convert_tokens_to_padded_sequence(X_train_tok, word_embedding_mapping, max_comment_length)
    X_test_input = convert_tokens_to_padded_sequence(X_test_tok, word_embedding_mapping, max_comment_length)
    return X_train_input, X_test_input, max_comment_length
    
def generate_model(num_tokens, max_comment_length):
    """Build the (uncompiled) multi-branch 1D-CNN classifier.

    Architecture: embedding -> spatial dropout -> three parallel
    Conv1D + global-max-pooling branches (kernel sizes 3, 4 and 5) ->
    concatenation -> dense ReLU layer with dropout -> one sigmoid unit per
    entry of the module-level ``classes`` list.

    No pretrained weights are passed to the Embedding layer here; its output
    dimension comes from the module-level ``embedding_length``.

    Args:
        num_tokens: ``input_dim`` of the Embedding layer.
        max_comment_length: length of the input sequences.

    Returns:
        The Keras ``Model`` mapping the index sequence to the class outputs.
    """
    # hyperparameters
    hidden_units = 100
    dropout_rate = 0.4
    spatial_dropout_rate = 0.2
    # (kernel size, number of filters) of each parallel convolution branch
    branch_specs = [(3, 100), (4, 100), (5, 100)]

    # model
    token_ids = Input((max_comment_length,))
    embedded = Embedding(input_dim=num_tokens, output_dim=embedding_length,
                         input_length=max_comment_length)(token_ids)
    embedded = SpatialDropout1D(spatial_dropout_rate)(embedded)

    # Layers are created in the same order as before: conv, pool, conv, pool, ...
    pooled_branches = []
    for kernel_size, filter_count in branch_specs:
        feature_map = Conv1D(kernel_size=kernel_size, filters=filter_count, padding='same')(embedded)
        pooled_branches.append(GlobalMaxPooling1D()(feature_map))

    merged = Concatenate()(pooled_branches)

    hidden = Dense(hidden_units, activation='relu')(merged)
    hidden = Dropout(dropout_rate)(hidden)
    predictions = Dense(len(classes), activation='sigmoid')(hidden)

    return Model(inputs=[token_ids], outputs=[predictions])

Load train and test data (the pretrained word2vec embeddings are only loaded later, in the "Replace unknown tokens" experiment)

In [4]:
# Read both CSV splits with the same text column and label columns.
(X_train, Y_train), (X_test, Y_test) = (
    load_data_from_csv(csv_path, features, classes)
    for csv_path in (path_train_data, path_test_data)
)

### Baseline preprocessing
The following preprocessing techniques are applied for the baseline:
* transformation of all characters to lowercase
* tokenization using the NLTK TweetTokenizer

In [5]:
# Experiment 1 (baseline): tokenize both splits with the default arguments.
e1_X_train_tok, e1_X_test_tok = (
    tokenize_sentences(comments) for comments in (X_train, X_test)
)

Create input matrix

In [6]:
# Build the mapping over the tokens of both splits, then convert the token
# lists into padded inputs of one common length.
e1_embeddings_mapping = create_embeddings_mapping(
    e1_X_train_tok, e1_X_test_tok, debug=True)
(e1_X_train_input,
 e1_X_test_input,
 e1_max_comment_length) = convert_tokens_to_padded_indices(
    e1_X_train_tok, e1_X_test_tok, e1_embeddings_mapping)
# The token lists are no longer used below; drop them (presumably to free memory).
del e1_X_train_tok, e1_X_test_tok

Number of unique tokens: 326175
maximum comment length: 3801


In [7]:
# Create the score directory up front so that a missing folder cannot make
# np.save fail only after all training runs have already finished.
os.makedirs('data/scores/preprocessing', exist_ok=True)

# The model's vocabulary size is the mapping size plus one
# (presumably index 0 is reserved for padding -- TODO confirm in utils).
e1_model = generate_model(len(e1_embeddings_mapping) + 1, e1_max_comment_length)
# NOTE(review): the (X_test, Y_test) tuple appears to serve as validation data
# (see "validate on ..." in the log below).
e1_scores = train_and_evaluate_model(
    e1_model, e1_X_train_input, Y_train, (e1_X_test_input, Y_test),
    epochs, batch_size, 'adam', 'binary_crossentropy', ['accuracy'],
    random_seed, runs=5)
# The timestamp suffix keeps repeated executions from overwriting each other;
# np.save appends ".npy" to the file name.
e1_scores_path = 'data/scores/preprocessing/e1_scores_{}'.format(time.time())
np.save(e1_scores_path, e1_scores)

RUN 1/5
Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99381
 Tox: 0.99149 - STox: 0.99233 - Obs: 0.99547 - Thr: 0.98915 - Ins: 0.99129 - IdH: 0.98754
 train: F1 Score - epoch: 1 - score: 0.78471
 Tox: 0.85237 - STox: 0.02585 - Obs: 0.84175 - Thr: 0.00000 - Ins: 0.76205 - IdH: 0.18444

 val: ROC-AUC - epoch: 1 - score: 0.97733
 Tox: 0.96147 - STox: 0.98600 - Obs: 0.97441 - Thr: 0.97437 - Ins: 0.97003 - IdH: 0.97144
 val: F1 Score - epoch: 1 - score: 0.63049
 Tox: 0.65222 - STox: 0.03675 - Obs: 0.67616 - Thr: 0.00000 - Ins: 0.63590 - IdH: 0.14948
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99709
 Tox: 0.99655 - STox: 0.99446 - Obs: 0.99756 - Thr: 0.99738 - Ins: 0.99510 - IdH: 0.99557
 train: F1 Score - epoch: 2 - score: 0.85290
 Tox: 0.90255 - STox: 0.44962 - Obs: 0.89142 - Thr: 0.22262 - Ins: 0.83725 - IdH: 0.59657

 val: ROC-AUC - epoch: 2 - score: 0.97998
 Tox: 0.96325 - STox: 0.98726 - Obs: 0.97498 - Thr: 0.98496 - Ins: 0.97221 - IdH: 0.97863
 val: F1 Score - epoch: 2 - score: 0.63462
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99710
 Tox: 0.99649 - STox: 0.99440 - Obs: 0.99744 - Thr: 0.99601 - Ins: 0.99485 - IdH: 0.99570
 train: F1 Score - epoch: 2 - score: 0.82098
 Tox: 0.87566 - STox: 0.14326 - Obs: 0.87976 - Thr: 0.05285 - Ins: 0.80681 - IdH: 0.43095

 val: ROC-AUC - epoch: 2 - score: 0.97896
 Tox: 0.96121 - STox: 0.98507 - Obs: 0.97502 - Thr: 0.98280 - Ins: 0.97052 - IdH: 0.97537
 val: F1 Score - epoch: 2 - score: 0.64368
 Tox: 0.66093 - STox: 0.10959 - Obs: 0.69049 - Thr: 0.12389 - Ins: 0.64052 - IdH: 0.38696
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99838
 Tox: 0.99813 - STox: 0.99590 - Obs: 0.99872 - Thr: 0.99792 - Ins: 0.99703 - IdH: 0.99806
 train: F1 Score - epoch: 3 - score: 0.88964
 Tox: 0.93707 - STox: 0.32040 - Obs: 0.92106 - Thr: 0.44136 - Ins: 0.87610 - IdH: 0.73875

 val: ROC-AUC - epoch: 3 - score: 0.97800
 Tox: 0.96049 - STox: 0.98608 - Obs: 0.97438 - Thr: 0.98205 - Ins: 0.96886 - IdH: 0.97468
 val: F1 Score - epoch: 3 - score: 0.61628


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99833
 Tox: 0.99819 - STox: 0.99613 - Obs: 0.99864 - Thr: 0.99837 - Ins: 0.99672 - IdH: 0.99771
 train: F1 Score - epoch: 3 - score: 0.88881
 Tox: 0.93671 - STox: 0.56479 - Obs: 0.91968 - Thr: 0.50952 - Ins: 0.86613 - IdH: 0.66526

 val: ROC-AUC - epoch: 3 - score: 0.97764
 Tox: 0.95977 - STox: 0.98589 - Obs: 0.97287 - Thr: 0.98211 - Ins: 0.96946 - IdH: 0.97676
 val: F1 Score - epoch: 3 - score: 0.61541
 Tox: 0.60541 - STox: 0.32800 - Obs: 0.66140 - Thr: 0.32530 - Ins: 0.64043 - IdH: 0.54123
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99904
 Tox: 0.99905 - STox: 0.99739 - Obs: 0.99928 - Thr: 0.99912 - Ins: 0.99833 - IdH: 0.99865
 train: F1 Score - epoch: 4 - score: 0.92086
 Tox: 0.95309 - STox: 0.73159 - Obs: 0.94139 - Thr: 0.68159 - Ins: 0.90883 - IdH: 0.81039

 val: ROC-AUC - epoch: 4 - score: 0.97595
 Tox: 0.95599 - STox: 0.98462 - Obs: 0.97148 - Thr: 0.98145 - Ins: 0.96652 - IdH: 0.97263
 val: F1 Score - epoch: 4 - score: 0.61624


### Strip more than 3 of the same characters in a row
In addition to the baseline techniques, any character that is repeated more than 3 times in a row is shortened to exactly 3 occurrences.

In [7]:
# Experiment 2: same tokenization as the baseline, but with reduce_len=True.
e2_X_train_tok, e2_X_test_tok = (
    tokenize_sentences(comments, reduce_len=True)
    for comments in (X_train, X_test)
)

Create input matrix

In [8]:
# Build the mapping over the tokens of both splits, then convert the token
# lists into padded inputs of one common length.
e2_embeddings_mapping = create_embeddings_mapping(
    e2_X_train_tok, e2_X_test_tok, debug=True)
(e2_X_train_input,
 e2_X_test_input,
 e2_max_comment_length) = convert_tokens_to_padded_indices(
    e2_X_train_tok, e2_X_test_tok, e2_embeddings_mapping)
# The token lists are no longer used below; drop them (presumably to free memory).
del e2_X_train_tok, e2_X_test_tok

Number of unique tokens: 325464
maximum comment length: 3801


In [8]:
# Create the score directory up front so that a missing folder cannot make
# np.save fail only after all training runs have already finished.
os.makedirs('data/scores/preprocessing', exist_ok=True)

# The model's vocabulary size is the mapping size plus one
# (presumably index 0 is reserved for padding -- TODO confirm in utils).
e2_model = generate_model(len(e2_embeddings_mapping) + 1, e2_max_comment_length)
e2_scores = train_and_evaluate_model(
    e2_model, e2_X_train_input, Y_train, (e2_X_test_input, Y_test),
    epochs, batch_size, 'adam', 'binary_crossentropy', ['accuracy'],
    random_seed, runs=5)
# The timestamp suffix keeps repeated executions from overwriting each other;
# np.save appends ".npy" to the file name.
e2_scores_path = 'data/scores/preprocessing/e2_scores_{}'.format(time.time())
np.save(e2_scores_path, e2_scores)

RUN 1/5
Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99365
 Tox: 0.99139 - STox: 0.99226 - Obs: 0.99538 - Thr: 0.98887 - Ins: 0.99139 - IdH: 0.98502
 train: F1 Score - epoch: 1 - score: 0.78571
 Tox: 0.85259 - STox: 0.08894 - Obs: 0.84118 - Thr: 0.00000 - Ins: 0.77251 - IdH: 0.01549

 val: ROC-AUC - epoch: 1 - score: 0.97748
 Tox: 0.96131 - STox: 0.98625 - Obs: 0.97495 - Thr: 0.97707 - Ins: 0.97112 - IdH: 0.96805
 val: F1 Score - epoch: 1 - score: 0.62819
 Tox: 0.64749 - STox: 0.11111 - Obs: 0.67641 - Thr: 0.00000 - Ins: 0.64308 - IdH: 0.02216
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99708
 Tox: 0.99665 - STox: 0.99436 - Obs: 0.99765 - Thr: 0.99726 - Ins: 0.99499 - IdH: 0.99561
 train: F1 Score - epoch: 2 - score: 0.85625
 Tox: 0.91358 - STox: 0.42182 - Obs: 0.88444 - Thr: 0.28227 - Ins: 0.83337 - IdH: 0.62198

 val: ROC-AUC - epoch: 2 - score: 0.97889
 Tox: 0.96294 - STox: 0.98554 - Obs: 0.97393 - Thr: 0.98297 - Ins: 0.97057 - IdH: 0.97786
 val: F1 Score - epoch: 2 - score: 0.60929
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99696
 Tox: 0.99653 - STox: 0.99427 - Obs: 0.99714 - Thr: 0.99595 - Ins: 0.99449 - IdH: 0.99474
 train: F1 Score - epoch: 2 - score: 0.82349
 Tox: 0.88468 - STox: 0.14389 - Obs: 0.87916 - Thr: 0.01247 - Ins: 0.80748 - IdH: 0.33754

 val: ROC-AUC - epoch: 2 - score: 0.97837
 Tox: 0.96082 - STox: 0.98525 - Obs: 0.97445 - Thr: 0.98292 - Ins: 0.96997 - IdH: 0.97430
 val: F1 Score - epoch: 2 - score: 0.63208
 Tox: 0.64052 - STox: 0.11404 - Obs: 0.68128 - Thr: 0.00935 - Ins: 0.65048 - IdH: 0.34038
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99836
 Tox: 0.99837 - STox: 0.99554 - Obs: 0.99864 - Thr: 0.99736 - Ins: 0.99678 - IdH: 0.99764
 train: F1 Score - epoch: 3 - score: 0.88921
 Tox: 0.94228 - STox: 0.33317 - Obs: 0.92068 - Thr: 0.39303 - Ins: 0.86982 - IdH: 0.72066

 val: ROC-AUC - epoch: 3 - score: 0.97786
 Tox: 0.95956 - STox: 0.98655 - Obs: 0.97353 - Thr: 0.98416 - Ins: 0.96922 - IdH: 0.97587
 val: F1 Score - epoch: 3 - score: 0.61808


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99834
 Tox: 0.99831 - STox: 0.99567 - Obs: 0.99863 - Thr: 0.99815 - Ins: 0.99686 - IdH: 0.99759
 train: F1 Score - epoch: 3 - score: 0.88867
 Tox: 0.94145 - STox: 0.48540 - Obs: 0.91872 - Thr: 0.49619 - Ins: 0.87400 - IdH: 0.55014

 val: ROC-AUC - epoch: 3 - score: 0.97747
 Tox: 0.95926 - STox: 0.98649 - Obs: 0.97185 - Thr: 0.98537 - Ins: 0.96961 - IdH: 0.97639
 val: F1 Score - epoch: 3 - score: 0.61084
 Tox: 0.60182 - STox: 0.31201 - Obs: 0.64567 - Thr: 0.31469 - Ins: 0.64955 - IdH: 0.48231
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99901
 Tox: 0.99911 - STox: 0.99713 - Obs: 0.99935 - Thr: 0.99876 - Ins: 0.99822 - IdH: 0.99856
 train: F1 Score - epoch: 4 - score: 0.91460
 Tox: 0.95027 - STox: 0.60680 - Obs: 0.94306 - Thr: 0.60548 - Ins: 0.90038 - IdH: 0.79358

 val: ROC-AUC - epoch: 4 - score: 0.97679
 Tox: 0.95773 - STox: 0.98466 - Obs: 0.97272 - Thr: 0.98309 - Ins: 0.96724 - IdH: 0.97433
 val: F1 Score - epoch: 4 - score: 0.62137


### Remove all punctuation

In [5]:
# Experiment 3: strip punctuation from the raw comments, then tokenize.
e3_X_train_tok, e3_X_test_tok = (
    tokenize_sentences(remove_punctuation(comments))
    for comments in (X_train, X_test)
)

Create input matrix

In [6]:
# Build the mapping over the tokens of both splits, then convert the token
# lists into padded inputs of one common length.
e3_embeddings_mapping = create_embeddings_mapping(
    e3_X_train_tok, e3_X_test_tok, debug=True)
(e3_X_train_input,
 e3_X_test_input,
 e3_max_comment_length) = convert_tokens_to_padded_indices(
    e3_X_train_tok, e3_X_test_tok, e3_embeddings_mapping)
# The token lists are no longer used below; drop them (presumably to free memory).
del e3_X_train_tok, e3_X_test_tok

Number of unique tokens: 268928
maximum comment length: 2321


In [7]:
# Create the score directory up front so that a missing folder cannot make
# np.save fail only after all training runs have already finished.
os.makedirs('data/scores/preprocessing', exist_ok=True)

# The model's vocabulary size is the mapping size plus one
# (presumably index 0 is reserved for padding -- TODO confirm in utils).
e3_model = generate_model(len(e3_embeddings_mapping) + 1, e3_max_comment_length)
e3_scores = train_and_evaluate_model(
    e3_model, e3_X_train_input, Y_train, (e3_X_test_input, Y_test),
    epochs, batch_size, 'adam', 'binary_crossentropy', ['accuracy'],
    random_seed, runs=5)
# The timestamp suffix keeps repeated executions from overwriting each other;
# np.save appends ".npy" to the file name.
e3_scores_path = 'data/scores/preprocessing/e3_scores_{}'.format(time.time())
np.save(e3_scores_path, e3_scores)

RUN 1/5
Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99333
 Tox: 0.99160 - STox: 0.99210 - Obs: 0.99555 - Thr: 0.98698 - Ins: 0.99139 - IdH: 0.98236
 train: F1 Score - epoch: 1 - score: 0.78475
 Tox: 0.85675 - STox: 0.04255 - Obs: 0.83737 - Thr: 0.00000 - Ins: 0.76687 - IdH: 0.00000

 val: ROC-AUC - epoch: 1 - score: 0.97729
 Tox: 0.96303 - STox: 0.98676 - Obs: 0.97548 - Thr: 0.97457 - Ins: 0.97205 - IdH: 0.96336
 val: F1 Score - epoch: 1 - score: 0.63036
 Tox: 0.65346 - STox: 0.07125 - Obs: 0.67011 - Thr: 0.00000 - Ins: 0.64940 - IdH: 0.00000
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99683
 Tox: 0.99611 - STox: 0.99431 - Obs: 0.99755 - Thr: 0.99668 - Ins: 0.99462 - IdH: 0.99524
 train: F1 Score - epoch: 2 - score: 0.84463
 Tox: 0.90718 - STox: 0.27103 - Obs: 0.88679 - Thr: 0.29816 - Ins: 0.82430 - IdH: 0.37644

 val: ROC-AUC - epoch: 2 - score: 0.97956
 Tox: 0.96338 - STox: 0.98673 - Obs: 0.97493 - Thr: 0.98366 - Ins: 0.97231 - IdH: 0.97782
 val: F1 Score - epoch: 2 - score: 0.61563
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99700
 Tox: 0.99647 - STox: 0.99432 - Obs: 0.99737 - Thr: 0.99606 - Ins: 0.99449 - IdH: 0.99559
 train: F1 Score - epoch: 2 - score: 0.83362
 Tox: 0.89862 - STox: 0.00000 - Obs: 0.88003 - Thr: 0.22744 - Ins: 0.80982 - IdH: 0.48918

 val: ROC-AUC - epoch: 2 - score: 0.97961
 Tox: 0.96318 - STox: 0.98581 - Obs: 0.97571 - Thr: 0.98424 - Ins: 0.97198 - IdH: 0.97484
 val: F1 Score - epoch: 2 - score: 0.64265
 Tox: 0.65475 - STox: 0.00543 - Obs: 0.68390 - Thr: 0.17255 - Ins: 0.64834 - IdH: 0.45124
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99834
 Tox: 0.99809 - STox: 0.99579 - Obs: 0.99867 - Thr: 0.99803 - Ins: 0.99680 - IdH: 0.99790
 train: F1 Score - epoch: 3 - score: 0.87798
 Tox: 0.93287 - STox: 0.09133 - Obs: 0.91959 - Thr: 0.46547 - Ins: 0.86595 - IdH: 0.60544

 val: ROC-AUC - epoch: 3 - score: 0.97881
 Tox: 0.96107 - STox: 0.98671 - Obs: 0.97507 - Thr: 0.98333 - Ins: 0.97156 - IdH: 0.97365
 val: F1 Score - epoch: 3 - score: 0.63346


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99834
 Tox: 0.99822 - STox: 0.99566 - Obs: 0.99863 - Thr: 0.99836 - Ins: 0.99691 - IdH: 0.99787
 train: F1 Score - epoch: 3 - score: 0.88988
 Tox: 0.93530 - STox: 0.53852 - Obs: 0.91950 - Thr: 0.51462 - Ins: 0.87313 - IdH: 0.67160

 val: ROC-AUC - epoch: 3 - score: 0.97856
 Tox: 0.96105 - STox: 0.98580 - Obs: 0.97450 - Thr: 0.98417 - Ins: 0.97147 - IdH: 0.97258
 val: F1 Score - epoch: 3 - score: 0.62194
 Tox: 0.62396 - STox: 0.33943 - Obs: 0.64999 - Thr: 0.29139 - Ins: 0.65005 - IdH: 0.47005
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99911
 Tox: 0.99922 - STox: 0.99666 - Obs: 0.99931 - Thr: 0.99907 - Ins: 0.99837 - IdH: 0.99866
 train: F1 Score - epoch: 4 - score: 0.92158
 Tox: 0.95920 - STox: 0.64182 - Obs: 0.94144 - Thr: 0.70419 - Ins: 0.90713 - IdH: 0.81934

 val: ROC-AUC - epoch: 4 - score: 0.97729
 Tox: 0.95741 - STox: 0.98496 - Obs: 0.97372 - Thr: 0.98294 - Ins: 0.96969 - IdH: 0.97268
 val: F1 Score - epoch: 4 - score: 0.62652


### Remove all punctuation except for .,!?

In [5]:
# Experiment 4: apply the weaker punctuation removal, then tokenize.
e4_X_train_tok, e4_X_test_tok = (
    tokenize_sentences(remove_punctuation_weak(comments))
    for comments in (X_train, X_test)
)

Create input matrix

In [6]:
# Build the mapping over the tokens of both splits, then convert the token
# lists into padded inputs of one common length.
e4_embeddings_mapping = create_embeddings_mapping(
    e4_X_train_tok, e4_X_test_tok, debug=True)
(e4_X_train_input,
 e4_X_test_input,
 e4_max_comment_length) = convert_tokens_to_padded_indices(
    e4_X_train_tok, e4_X_test_tok, e4_embeddings_mapping)
# The token lists are no longer used below; drop them (presumably to free memory).
del e4_X_train_tok, e4_X_test_tok

Number of unique tokens: 290831
maximum comment length: 2321


In [7]:
# Create the score directory up front so that a missing folder cannot make
# np.save fail only after all training runs have already finished.
os.makedirs('data/scores/preprocessing', exist_ok=True)

# The model's vocabulary size is the mapping size plus one
# (presumably index 0 is reserved for padding -- TODO confirm in utils).
e4_model = generate_model(len(e4_embeddings_mapping) + 1, e4_max_comment_length)
e4_scores = train_and_evaluate_model(
    e4_model, e4_X_train_input, Y_train, (e4_X_test_input, Y_test),
    epochs, batch_size, 'adam', 'binary_crossentropy', ['accuracy'],
    random_seed, runs=5)
# The timestamp suffix keeps repeated executions from overwriting each other;
# np.save appends ".npy" to the file name.
e4_scores_path = 'data/scores/preprocessing/e4_scores_{}'.format(time.time())
np.save(e4_scores_path, e4_scores)

RUN 1/5
Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99329
 Tox: 0.99136 - STox: 0.99187 - Obs: 0.99530 - Thr: 0.98518 - Ins: 0.99105 - IdH: 0.98312
 train: F1 Score - epoch: 1 - score: 0.78028
 Tox: 0.84805 - STox: 0.08206 - Obs: 0.83765 - Thr: 0.00000 - Ins: 0.76497 - IdH: 0.00000

 val: ROC-AUC - epoch: 1 - score: 0.97752
 Tox: 0.96323 - STox: 0.98622 - Obs: 0.97510 - Thr: 0.97273 - Ins: 0.97100 - IdH: 0.96397
 val: F1 Score - epoch: 1 - score: 0.63057
 Tox: 0.65768 - STox: 0.09069 - Obs: 0.67263 - Thr: 0.00000 - Ins: 0.63938 - IdH: 0.00000
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99699
 Tox: 0.99639 - STox: 0.99457 - Obs: 0.99760 - Thr: 0.99715 - Ins: 0.99490 - IdH: 0.99502
 train: F1 Score - epoch: 2 - score: 0.85115
 Tox: 0.90829 - STox: 0.38803 - Obs: 0.89056 - Thr: 0.33225 - Ins: 0.83508 - IdH: 0.40663

 val: ROC-AUC - epoch: 2 - score: 0.97992
 Tox: 0.96443 - STox: 0.98664 - Obs: 0.97479 - Thr: 0.98567 - Ins: 0.97188 - IdH: 0.97840
 val: F1 Score - epoch: 2 - score: 0.62402
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99713
 Tox: 0.99655 - STox: 0.99446 - Obs: 0.99745 - Thr: 0.99610 - Ins: 0.99475 - IdH: 0.99539
 train: F1 Score - epoch: 2 - score: 0.81738
 Tox: 0.87573 - STox: 0.12693 - Obs: 0.87684 - Thr: 0.00000 - Ins: 0.79729 - IdH: 0.42574

 val: ROC-AUC - epoch: 2 - score: 0.97981
 Tox: 0.96223 - STox: 0.98582 - Obs: 0.97604 - Thr: 0.98378 - Ins: 0.97177 - IdH: 0.97619
 val: F1 Score - epoch: 2 - score: 0.64527
 Tox: 0.66470 - STox: 0.10884 - Obs: 0.68994 - Thr: 0.00000 - Ins: 0.64480 - IdH: 0.39413
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99839
 Tox: 0.99821 - STox: 0.99557 - Obs: 0.99877 - Thr: 0.99827 - Ins: 0.99682 - IdH: 0.99787
 train: F1 Score - epoch: 3 - score: 0.88702
 Tox: 0.93655 - STox: 0.27707 - Obs: 0.92279 - Thr: 0.48980 - Ins: 0.87183 - IdH: 0.70545

 val: ROC-AUC - epoch: 3 - score: 0.97876
 Tox: 0.96145 - STox: 0.98630 - Obs: 0.97560 - Thr: 0.98130 - Ins: 0.97064 - IdH: 0.97499
 val: F1 Score - epoch: 3 - score: 0.63202


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99841
 Tox: 0.99844 - STox: 0.99613 - Obs: 0.99874 - Thr: 0.99856 - Ins: 0.99695 - IdH: 0.99763
 train: F1 Score - epoch: 3 - score: 0.89032
 Tox: 0.94287 - STox: 0.42492 - Obs: 0.92040 - Thr: 0.45399 - Ins: 0.87778 - IdH: 0.60148

 val: ROC-AUC - epoch: 3 - score: 0.97856
 Tox: 0.96130 - STox: 0.98486 - Obs: 0.97420 - Thr: 0.98379 - Ins: 0.97139 - IdH: 0.97723
 val: F1 Score - epoch: 3 - score: 0.62260
 Tox: 0.62120 - STox: 0.21333 - Obs: 0.65678 - Thr: 0.28571 - Ins: 0.65542 - IdH: 0.48329
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99916
 Tox: 0.99926 - STox: 0.99730 - Obs: 0.99940 - Thr: 0.99906 - Ins: 0.99829 - IdH: 0.99869
 train: F1 Score - epoch: 4 - score: 0.92322
 Tox: 0.95764 - STox: 0.67815 - Obs: 0.94789 - Thr: 0.68881 - Ins: 0.90553 - IdH: 0.81346

 val: ROC-AUC - epoch: 4 - score: 0.97797
 Tox: 0.95922 - STox: 0.98449 - Obs: 0.97414 - Thr: 0.98337 - Ins: 0.97022 - IdH: 0.97438
 val: F1 Score - epoch: 4 - score: 0.63126


### Stemming

In [5]:
# Experiment 5: tokenize as in the baseline, then stem the tokens.
e5_X_train_tok, e5_X_test_tok = (
    perform_stemming(tokenize_sentences(comments))
    for comments in (X_train, X_test)
)

Create input matrix

In [6]:
# Build the mapping over the tokens of both splits, then convert the token
# lists into padded inputs of one common length.
e5_embeddings_mapping = create_embeddings_mapping(
    e5_X_train_tok, e5_X_test_tok, debug=True)
(e5_X_train_input,
 e5_X_test_input,
 e5_max_comment_length) = convert_tokens_to_padded_indices(
    e5_X_train_tok, e5_X_test_tok, e5_embeddings_mapping)
# The token lists are no longer used below; drop them (presumably to free memory).
del e5_X_train_tok, e5_X_test_tok

Number of unique tokens: 270849
maximum comment length: 3801


In [7]:
# Create the score directory up front so that a missing folder cannot make
# np.save fail only after all training runs have already finished.
os.makedirs('data/scores/preprocessing', exist_ok=True)

# The model's vocabulary size is the mapping size plus one
# (presumably index 0 is reserved for padding -- TODO confirm in utils).
e5_model = generate_model(len(e5_embeddings_mapping) + 1, e5_max_comment_length)
e5_scores = train_and_evaluate_model(
    e5_model, e5_X_train_input, Y_train, (e5_X_test_input, Y_test),
    epochs, batch_size, 'adam', 'binary_crossentropy', ['accuracy'],
    random_seed, runs=5)
# The timestamp suffix keeps repeated executions from overwriting each other;
# np.save appends ".npy" to the file name.
e5_scores_path = 'data/scores/preprocessing/e5_scores_{}'.format(time.time())
np.save(e5_scores_path, e5_scores)

RUN 1/5
Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99356
 Tox: 0.99080 - STox: 0.99182 - Obs: 0.99552 - Thr: 0.98581 - Ins: 0.99118 - IdH: 0.98775
 train: F1 Score - epoch: 1 - score: 0.77589
 Tox: 0.83915 - STox: 0.02099 - Obs: 0.83376 - Thr: 0.00000 - Ins: 0.76680 - IdH: 0.12336

 val: ROC-AUC - epoch: 1 - score: 0.97857
 Tox: 0.96319 - STox: 0.98750 - Obs: 0.97499 - Thr: 0.97388 - Ins: 0.97196 - IdH: 0.97518
 val: F1 Score - epoch: 1 - score: 0.64041
 Tox: 0.66440 - STox: 0.05699 - Obs: 0.67771 - Thr: 0.00000 - Ins: 0.65493 - IdH: 0.14249
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99676
 Tox: 0.99594 - STox: 0.99442 - Obs: 0.99758 - Thr: 0.99725 - Ins: 0.99452 - IdH: 0.99521
 train: F1 Score - epoch: 2 - score: 0.84456
 Tox: 0.89348 - STox: 0.49161 - Obs: 0.88753 - Thr: 0.22263 - Ins: 0.82358 - IdH: 0.55636

 val: ROC-AUC - epoch: 2 - score: 0.98032
 Tox: 0.96340 - STox: 0.98756 - Obs: 0.97621 - Thr: 0.98807 - Ins: 0.97280 - IdH: 0.98096
 val: F1 Score - epoch: 2 - score: 0.63132
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99677
 Tox: 0.99598 - STox: 0.99400 - Obs: 0.99730 - Thr: 0.99592 - Ins: 0.99428 - IdH: 0.99525
 train: F1 Score - epoch: 2 - score: 0.81911
 Tox: 0.86954 - STox: 0.02343 - Obs: 0.87735 - Thr: 0.04090 - Ins: 0.80988 - IdH: 0.53527

 val: ROC-AUC - epoch: 2 - score: 0.97949
 Tox: 0.96091 - STox: 0.98628 - Obs: 0.97473 - Thr: 0.98797 - Ins: 0.97042 - IdH: 0.98101
 val: F1 Score - epoch: 2 - score: 0.64750
 Tox: 0.67012 - STox: 0.04700 - Obs: 0.67791 - Thr: 0.01843 - Ins: 0.64053 - IdH: 0.50669
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99810
 Tox: 0.99787 - STox: 0.99542 - Obs: 0.99844 - Thr: 0.99802 - Ins: 0.99608 - IdH: 0.99779
 train: F1 Score - epoch: 3 - score: 0.86907
 Tox: 0.92239 - STox: 0.06867 - Obs: 0.91159 - Thr: 0.43586 - Ins: 0.85212 - IdH: 0.66782

 val: ROC-AUC - epoch: 3 - score: 0.97880
 Tox: 0.96067 - STox: 0.98747 - Obs: 0.97395 - Thr: 0.98604 - Ins: 0.97004 - IdH: 0.97995
 val: F1 Score - epoch: 3 - score: 0.63084


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99828
 Tox: 0.99809 - STox: 0.99555 - Obs: 0.99847 - Thr: 0.99864 - Ins: 0.99659 - IdH: 0.99769
 train: F1 Score - epoch: 3 - score: 0.88329
 Tox: 0.93338 - STox: 0.53437 - Obs: 0.90889 - Thr: 0.55887 - Ins: 0.85928 - IdH: 0.68417

 val: ROC-AUC - epoch: 3 - score: 0.97841
 Tox: 0.96149 - STox: 0.98674 - Obs: 0.97229 - Thr: 0.98840 - Ins: 0.97093 - IdH: 0.97788
 val: F1 Score - epoch: 3 - score: 0.62294
 Tox: 0.61806 - STox: 0.34003 - Obs: 0.65393 - Thr: 0.33333 - Ins: 0.65535 - IdH: 0.55433
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99896
 Tox: 0.99883 - STox: 0.99716 - Obs: 0.99921 - Thr: 0.99912 - Ins: 0.99797 - IdH: 0.99875
 train: F1 Score - epoch: 4 - score: 0.91453
 Tox: 0.94685 - STox: 0.69748 - Obs: 0.93802 - Thr: 0.70999 - Ins: 0.89452 - IdH: 0.82487

 val: ROC-AUC - epoch: 4 - score: 0.97712
 Tox: 0.95649 - STox: 0.98530 - Obs: 0.97349 - Thr: 0.98616 - Ins: 0.96885 - IdH: 0.97516
 val: F1 Score - epoch: 4 - score: 0.62329


### Lemmatization

In [14]:
# Experiment 6: tokenize as in the baseline, then lemmatize the tokens.
e6_X_train_tok, e6_X_test_tok = (
    perform_lemmatization(tokenize_sentences(comments))
    for comments in (X_train, X_test)
)

Create input matrix

In [16]:
# Build the mapping over the tokens of both splits, then convert the token
# lists into padded inputs of one common length.
e6_embeddings_mapping = create_embeddings_mapping(
    e6_X_train_tok, e6_X_test_tok, debug=True)
(e6_X_train_input,
 e6_X_test_input,
 e6_max_comment_length) = convert_tokens_to_padded_indices(
    e6_X_train_tok, e6_X_test_tok, e6_embeddings_mapping)
# The token lists are no longer used below; drop them (presumably to free memory).
del e6_X_train_tok, e6_X_test_tok

Number of unique tokens: 314708
maximum comment length: 3801


In [17]:
# Create the score directory up front so that a missing folder cannot make
# np.save fail only after all training runs have already finished.
os.makedirs('data/scores/preprocessing', exist_ok=True)

# The model's vocabulary size is the mapping size plus one
# (presumably index 0 is reserved for padding -- TODO confirm in utils).
e6_model = generate_model(len(e6_embeddings_mapping) + 1, e6_max_comment_length)
e6_scores = train_and_evaluate_model(
    e6_model, e6_X_train_input, Y_train, (e6_X_test_input, Y_test),
    epochs, batch_size, 'adam', 'binary_crossentropy', ['accuracy'],
    random_seed, runs=5)
# The timestamp suffix keeps repeated executions from overwriting each other;
# np.save appends ".npy" to the file name.
e6_scores_path = 'data/scores/preprocessing/e6_scores_{}'.format(time.time())
np.save(e6_scores_path, e6_scores)

RUN 1/5
Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99301
 Tox: 0.99127 - STox: 0.99163 - Obs: 0.99497 - Thr: 0.98754 - Ins: 0.99093 - IdH: 0.98866
 train: F1 Score - epoch: 1 - score: 0.77740
 Tox: 0.85407 - STox: 0.00000 - Obs: 0.80848 - Thr: 0.00000 - Ins: 0.74904 - IdH: 0.03352

 val: ROC-AUC - epoch: 1 - score: 0.97542
 Tox: 0.96182 - STox: 0.98472 - Obs: 0.97617 - Thr: 0.97525 - Ins: 0.96937 - IdH: 0.97136
 val: F1 Score - epoch: 1 - score: 0.58668
 Tox: 0.59857 - STox: 0.00543 - Obs: 0.60843 - Thr: 0.00000 - Ins: 0.63027 - IdH: 0.03562
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99687
 Tox: 0.99642 - STox: 0.99406 - Obs: 0.99745 - Thr: 0.99706 - Ins: 0.99471 - IdH: 0.99610
 train: F1 Score - epoch: 2 - score: 0.83701
 Tox: 0.88382 - STox: 0.25499 - Obs: 0.88526 - Thr: 0.29066 - Ins: 0.83069 - IdH: 0.51275

 val: ROC-AUC - epoch: 2 - score: 0.98009
 Tox: 0.96186 - STox: 0.98617 - Obs: 0.97724 - Thr: 0.98168 - Ins: 0.97311 - IdH: 0.97666
 val: F1 Score - epoch: 2 - score: 0.64764
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99690
 Tox: 0.99604 - STox: 0.99406 - Obs: 0.99752 - Thr: 0.99624 - Ins: 0.99424 - IdH: 0.99549
 train: F1 Score - epoch: 2 - score: 0.83233
 Tox: 0.87977 - STox: 0.39786 - Obs: 0.88839 - Thr: 0.44910 - Ins: 0.80246 - IdH: 0.51823

 val: ROC-AUC - epoch: 2 - score: 0.97990
 Tox: 0.96242 - STox: 0.98588 - Obs: 0.97668 - Thr: 0.98503 - Ins: 0.97221 - IdH: 0.97717
 val: F1 Score - epoch: 2 - score: 0.64829
 Tox: 0.65996 - STox: 0.25455 - Obs: 0.67976 - Thr: 0.34437 - Ins: 0.65369 - IdH: 0.51550
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99832
 Tox: 0.99822 - STox: 0.99564 - Obs: 0.99855 - Thr: 0.99834 - Ins: 0.99663 - IdH: 0.99762
 train: F1 Score - epoch: 3 - score: 0.89009
 Tox: 0.93742 - STox: 0.52325 - Obs: 0.91651 - Thr: 0.58840 - Ins: 0.86905 - IdH: 0.70317

 val: ROC-AUC - epoch: 3 - score: 0.97760
 Tox: 0.95912 - STox: 0.98633 - Obs: 0.97536 - Thr: 0.97998 - Ins: 0.96966 - IdH: 0.97204
 val: F1 Score - epoch: 3 - score: 0.62585


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99830
 Tox: 0.99818 - STox: 0.99546 - Obs: 0.99868 - Thr: 0.99823 - Ins: 0.99653 - IdH: 0.99780
 train: F1 Score - epoch: 3 - score: 0.88407
 Tox: 0.93737 - STox: 0.54153 - Obs: 0.91770 - Thr: 0.43750 - Ins: 0.85068 - IdH: 0.67695

 val: ROC-AUC - epoch: 3 - score: 0.97861
 Tox: 0.96055 - STox: 0.98503 - Obs: 0.97532 - Thr: 0.98534 - Ins: 0.96963 - IdH: 0.97787
 val: F1 Score - epoch: 3 - score: 0.62477
 Tox: 0.61375 - STox: 0.29082 - Obs: 0.68738 - Thr: 0.25705 - Ins: 0.65069 - IdH: 0.54181
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99898
 Tox: 0.99918 - STox: 0.99699 - Obs: 0.99929 - Thr: 0.99857 - Ins: 0.99792 - IdH: 0.99884
 train: F1 Score - epoch: 4 - score: 0.91480
 Tox: 0.94940 - STox: 0.67288 - Obs: 0.93971 - Thr: 0.53525 - Ins: 0.89568 - IdH: 0.83039

 val: ROC-AUC - epoch: 4 - score: 0.97742
 Tox: 0.95892 - STox: 0.98619 - Obs: 0.97337 - Thr: 0.98055 - Ins: 0.97014 - IdH: 0.97574
 val: F1 Score - epoch: 4 - score: 0.59950


### Replace unknown tokens

In [5]:
# Experiment 7: tokenize as in the baseline, then run replace_unknown_tokens
# against the loaded word2vec index.
# NOTE(review): this path ends in ".bin", whereas the configuration cell's
# path_embeddings ends in ".bin.gz" -- confirm which file is intended.
emb_idx, emb_mean, emb_std = load_word2vec_embeddings(
    'data/embeddings/GoogleNews-vectors-negative300.bin'
)
e7_X_train_tok, e7_X_test_tok = (
    replace_unknown_tokens(tokenize_sentences(comments), emb_idx)
    for comments in (X_train, X_test)
)
# The index is not used after this point; drop it (presumably to free memory).
del emb_idx

Create input matrix

In [6]:
# Build the mapping over the tokens of both splits, then convert the token
# lists into padded inputs of one common length.
e7_embeddings_mapping = create_embeddings_mapping(
    e7_X_train_tok, e7_X_test_tok, debug=True)
(e7_X_train_input,
 e7_X_test_input,
 e7_max_comment_length) = convert_tokens_to_padded_indices(
    e7_X_train_tok, e7_X_test_tok, e7_embeddings_mapping)
# The token lists are no longer used below; drop them (presumably to free memory).
del e7_X_train_tok, e7_X_test_tok

Number of unique tokens: 74212
maximum comment length: 3801


In [7]:
# Create the score directory up front so that a missing folder cannot make
# np.save fail only after all training runs have already finished.
os.makedirs('data/scores/preprocessing', exist_ok=True)

# The model's vocabulary size is the mapping size plus one
# (presumably index 0 is reserved for padding -- TODO confirm in utils).
e7_model = generate_model(len(e7_embeddings_mapping) + 1, e7_max_comment_length)
e7_scores = train_and_evaluate_model(
    e7_model, e7_X_train_input, Y_train, (e7_X_test_input, Y_test),
    epochs, batch_size, 'adam', 'binary_crossentropy', ['accuracy'],
    random_seed, runs=5)
# The timestamp suffix keeps repeated executions from overwriting each other;
# np.save appends ".npy" to the file name.
e7_scores_path = 'data/scores/preprocessing/e7_scores_{}'.format(time.time())
np.save(e7_scores_path, e7_scores)

RUN 1/5
Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99093
 Tox: 0.98757 - STox: 0.99093 - Obs: 0.99302 - Thr: 0.98440 - Ins: 0.98902 - IdH: 0.97853
 train: F1 Score - epoch: 1 - score: 0.75169
 Tox: 0.82016 - STox: 0.04485 - Obs: 0.80819 - Thr: 0.00000 - Ins: 0.73617 - IdH: 0.00709

 val: ROC-AUC - epoch: 1 - score: 0.97519
 Tox: 0.96038 - STox: 0.98468 - Obs: 0.97225 - Thr: 0.97784 - Ins: 0.96891 - IdH: 0.95431
 val: F1 Score - epoch: 1 - score: 0.62847
 Tox: 0.66246 - STox: 0.08101 - Obs: 0.67247 - Thr: 0.00000 - Ins: 0.62510 - IdH: 0.00839
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99542
 Tox: 0.99408 - STox: 0.99286 - Obs: 0.99594 - Thr: 0.99603 - Ins: 0.99275 - IdH: 0.99216
 train: F1 Score - epoch: 2 - score: 0.81312
 Tox: 0.86483 - STox: 0.35872 - Obs: 0.86449 - Thr: 0.13258 - Ins: 0.80373 - IdH: 0.33762

 val: ROC-AUC - epoch: 2 - score: 0.97853
 Tox: 0.96380 - STox: 0.98605 - Obs: 0.97287 - Thr: 0.98449 - Ins: 0.97021 - IdH: 0.96519
 val: F1 Score - epoch: 2 - score: 0.64433
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99542
 Tox: 0.99366 - STox: 0.99278 - Obs: 0.99592 - Thr: 0.99442 - Ins: 0.99279 - IdH: 0.99115
 train: F1 Score - epoch: 2 - score: 0.78939
 Tox: 0.84709 - STox: 0.00250 - Obs: 0.85106 - Thr: 0.22302 - Ins: 0.76623 - IdH: 0.40698

 val: ROC-AUC - epoch: 2 - score: 0.97713
 Tox: 0.96050 - STox: 0.98613 - Obs: 0.97110 - Thr: 0.98316 - Ins: 0.96844 - IdH: 0.96151
 val: F1 Score - epoch: 2 - score: 0.64742
 Tox: 0.67504 - STox: 0.00541 - Obs: 0.68407 - Thr: 0.20161 - Ins: 0.63340 - IdH: 0.38787
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99735
 Tox: 0.99698 - STox: 0.99386 - Obs: 0.99742 - Thr: 0.99826 - Ins: 0.99516 - IdH: 0.99593
 train: F1 Score - epoch: 3 - score: 0.85019
 Tox: 0.90640 - STox: 0.07574 - Obs: 0.88950 - Thr: 0.61975 - Ins: 0.83045 - IdH: 0.59806

 val: ROC-AUC - epoch: 3 - score: 0.97752
 Tox: 0.96115 - STox: 0.98587 - Obs: 0.97286 - Thr: 0.98527 - Ins: 0.96884 - IdH: 0.96419
 val: F1 Score - epoch: 3 - score: 0.64476


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99755
 Tox: 0.99746 - STox: 0.99418 - Obs: 0.99775 - Thr: 0.99804 - Ins: 0.99560 - IdH: 0.99612
 train: F1 Score - epoch: 3 - score: 0.86491
 Tox: 0.92267 - STox: 0.43475 - Obs: 0.89466 - Thr: 0.44411 - Ins: 0.84320 - IdH: 0.57102

 val: ROC-AUC - epoch: 3 - score: 0.97630
 Tox: 0.95886 - STox: 0.98571 - Obs: 0.96976 - Thr: 0.98426 - Ins: 0.96780 - IdH: 0.96123
 val: F1 Score - epoch: 3 - score: 0.62715
 Tox: 0.63577 - STox: 0.35474 - Obs: 0.65594 - Thr: 0.39241 - Ins: 0.63269 - IdH: 0.45964
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99833
 Tox: 0.99829 - STox: 0.99578 - Obs: 0.99849 - Thr: 0.99868 - Ins: 0.99697 - IdH: 0.99760
 train: F1 Score - epoch: 4 - score: 0.88594
 Tox: 0.92854 - STox: 0.54015 - Obs: 0.91251 - Thr: 0.49779 - Ins: 0.87281 - IdH: 0.74518

 val: ROC-AUC - epoch: 4 - score: 0.97575
 Tox: 0.95754 - STox: 0.98553 - Obs: 0.97032 - Thr: 0.98403 - Ins: 0.96799 - IdH: 0.95903
 val: F1 Score - epoch: 4 - score: 0.63932
