In [12]:
import pandas as pd
import numpy as np
import tensorflow as tf
import utils
import time

from keras.models import Model, Input
from keras.layers import Dense, Conv1D, BatchNormalization, GlobalMaxPooling1D, Dropout, \
    Embedding, Concatenate, SpatialDropout1D, MaxPooling1D

from utils.preprocessing_utils import tokenize_sentences, convert_tokens_to_padded_sequence, \
    remove_punctuation, remove_punctuation_weak, perform_stemming, perform_lemmatization
from utils.dataset_utils import load_data_from_csv
from utils.embedding_utils import create_embeddings_mapping
from utils.training_utils import train_and_evaluate_model

In [11]:
# Development helper: re-import the project's utils submodules so that edits made
# to them on disk take effect without restarting the kernel.
import importlib
import utils
importlib.reload(utils.embedding_utils)
importlib.reload(utils.dataset_utils)
importlib.reload(utils.preprocessing_utils)
importlib.reload(utils.training_utils)
# NOTE(review): utils.keras_utils is never imported explicitly in this notebook;
# presumably it is loaded as a side effect of another utils import -- confirm.
# As the last expression of the cell, this call's return value (the module) is displayed.
importlib.reload(utils.keras_utils)

<module 'utils.keras_utils' from '/home/philipp/work/gitprojects/toxic-comment-experiments/utils/keras_utils.py'>

Global parameters which hold for all models

In [2]:
# Seed used for NumPy right below and passed to train_and_evaluate_model by every experiment.
random_seed = 2018
# Label columns; their count sets the width of the sigmoid output layer in generate_model.
classes = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
# Column name handed to load_data_from_csv as the model input (presumably the raw comment text -- confirm).
features = 'comment_text'
np.random.seed(random_seed)
# CSV files for the two splits (relative paths).
path_train_data = 'data/kaggle/train.csv'
path_test_data = 'data/kaggle/test_complete.csv'

# Output dimension of the Embedding layer built in generate_model.
embedding_length = 300
# NOTE(review): not referenced by any cell visible in this notebook; the Embedding layer in
# generate_model is created without pretrained weights -- confirm whether this is intended.
path_embeddings = 'data/embeddings/GoogleNews-vectors-negative300.bin.gz'

# Training settings passed unchanged to train_and_evaluate_model by every experiment.
batch_size = 64
epochs = 5

In [3]:
def convert_tokens_to_padded_indices(X_train_tok, X_test_tok, word_embedding_mapping, max_length_limit=2000):
    """Turn tokenized train/test comments into padded sequences of one shared length.

    The shared length is the longest tokenized comment found in either split,
    capped at ``max_length_limit``. The uncapped maximum is printed for reference.

    Args:
        X_train_tok: Tokenized training comments; must support ``.apply(len)``
            followed by ``.max()`` (presumably a pandas Series of token lists -- confirm).
        X_test_tok: Tokenized test comments, same structure as ``X_train_tok``.
        word_embedding_mapping: Passed through unchanged to
            ``convert_tokens_to_padded_sequence``.
        max_length_limit: Upper bound on the padded length. Defaults to 2000, the
            value that used to be hard-coded here to avoid a MemoryError.
            ``None`` disables the cap.

    Returns:
        Tuple ``(X_train_input, X_test_input, max_comment_length)`` where the first
        two items are whatever ``convert_tokens_to_padded_sequence`` returns for
        each split and the last is the sequence length that was used.
    """
    # Longest tokenized comment across both splits (computed once, reused below).
    longest_comment = max(X_train_tok.apply(len).max(), X_test_tok.apply(len).max())
    print("maximum comment length: {}".format(longest_comment))

    # limit the length (2000 by default), otherwise we get a MemoryError
    if max_length_limit is None:
        max_comment_length = longest_comment
    else:
        max_comment_length = min(max_length_limit, longest_comment)

    X_train_input = convert_tokens_to_padded_sequence(X_train_tok, word_embedding_mapping, max_comment_length)
    X_test_input = convert_tokens_to_padded_sequence(X_test_tok, word_embedding_mapping, max_comment_length)
    return X_train_input, X_test_input, max_comment_length
    
def generate_model(num_tokens, max_comment_length):
    """Build the (uncompiled) multi-kernel 1D-CNN classifier shared by all experiments.

    Architecture: Embedding -> SpatialDropout1D -> three parallel Conv1D branches
    (kernel sizes 3, 4 and 5, 100 filters each, 'same' padding, no activation
    specified), each followed by global max pooling -> concatenation ->
    Dense(100, relu) -> Dropout -> Dense(len(classes), sigmoid).

    Args:
        num_tokens: ``input_dim`` of the Embedding layer.
        max_comment_length: Length of every input sequence.

    Returns:
        A Keras ``Model``; it is not compiled here.

    Reads the module-level ``embedding_length`` and ``classes``. The Embedding
    layer is created without a ``weights`` argument.
    """
    # Hyperparameters: one (kernel_size, filters) pair per convolutional branch.
    branch_specs = [(3, 100), (4, 100), (5, 100)]
    dense_units = 100
    dense_dropout_rate = 0.4
    embedding_dropout_rate = 0.2

    # Shared front end: token indices -> embedded sequence with spatial dropout.
    token_ids = Input((max_comment_length,))
    embedded = Embedding(input_dim=num_tokens, output_dim=embedding_length, input_length=max_comment_length)(token_ids)
    embedded = SpatialDropout1D(embedding_dropout_rate)(embedded)

    # Parallel convolutions over the same embedded sequence, each max-pooled over time.
    pooled_branches = []
    for kernel_size, filters in branch_specs:
        branch = Conv1D(kernel_size=kernel_size, filters=filters, padding='same')(embedded)
        branch = GlobalMaxPooling1D()(branch)
        pooled_branches.append(branch)

    merged = Concatenate()(pooled_branches)

    # Classification head: one independent sigmoid per class.
    hidden = Dense(dense_units, activation='relu')(merged)
    hidden = Dropout(dense_dropout_rate)(hidden)
    predictions = Dense(len(classes), activation='sigmoid')(hidden)

    return Model(inputs=[token_ids], outputs=[predictions])

Load train and test data and pretrained word2vec embeddings

In [4]:
# Load each split via the project's load_data_from_csv helper, using the paths and
# column names from the global-parameters cell (presumably X_* holds the `features`
# column and Y_* the `classes` columns -- confirm against the helper).
X_train, Y_train = load_data_from_csv(path_train_data, features, classes)
# The test split's labels are later passed to train_and_evaluate_model together with the test inputs.
X_test, Y_test = load_data_from_csv(path_test_data, features, classes)

### Baseline preprocessing
The following preprocessing techniques are applied for the baseline:
* transformation of all characters to lowercase
* tokenization using the NLTK TweetTokenizer

In [5]:
# Experiment 1 (baseline): tokenize both splits with tokenize_sentences using its default arguments.
e1_X_train_tok = tokenize_sentences(X_train)
e1_X_test_tok = tokenize_sentences(X_test)

Create input matrix

In [6]:
# Build the mapping from the tokens of both splits (presumably token -> index -- confirm),
# then convert every comment into a fixed-length padded sequence.
e1_embeddings_mapping = create_embeddings_mapping(e1_X_train_tok, e1_X_test_tok, debug=True)
(e1_X_train_input,
 e1_X_test_input,
 e1_max_comment_length) = convert_tokens_to_padded_indices(
    e1_X_train_tok, e1_X_test_tok, e1_embeddings_mapping)
# The token lists are not used after this point; remove the names (presumably to free memory).
del e1_X_train_tok, e1_X_test_tok

In [7]:
# Embedding input size is the mapping size plus one (presumably one index is reserved,
# e.g. for padding -- confirm against create_embeddings_mapping).
e1_model = generate_model(len(e1_embeddings_mapping) + 1, e1_max_comment_length)
e1_scores = train_and_evaluate_model(
    e1_model,
    e1_X_train_input,
    Y_train,
    (e1_X_test_input, Y_test),
    epochs,
    batch_size,
    'adam',
    'binary_crossentropy',
    ['accuracy'],
    random_seed,
    runs=5)
# The current timestamp in the file name keeps results of repeated runs apart.
e1_scores_path = 'data/scores/preprocessing/e1_scores_{}'.format(time.time())
np.save(e1_scores_path, e1_scores)

RUN 1/5
Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99381
 Tox: 0.99149 - STox: 0.99233 - Obs: 0.99547 - Thr: 0.98915 - Ins: 0.99129 - IdH: 0.98754
 train: F1 Score - epoch: 1 - score: 0.78471
 Tox: 0.85237 - STox: 0.02585 - Obs: 0.84175 - Thr: 0.00000 - Ins: 0.76205 - IdH: 0.18444

 val: ROC-AUC - epoch: 1 - score: 0.97733
 Tox: 0.96147 - STox: 0.98600 - Obs: 0.97441 - Thr: 0.97437 - Ins: 0.97003 - IdH: 0.97144
 val: F1 Score - epoch: 1 - score: 0.63049
 Tox: 0.65222 - STox: 0.03675 - Obs: 0.67616 - Thr: 0.00000 - Ins: 0.63590 - IdH: 0.14948
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99709
 Tox: 0.99655 - STox: 0.99446 - Obs: 0.99756 - Thr: 0.99738 - Ins: 0.99510 - IdH: 0.99557
 train: F1 Score - epoch: 2 - score: 0.85290
 Tox: 0.90255 - STox: 0.44962 - Obs: 0.89142 - Thr: 0.22262 - Ins: 0.83725 - IdH: 0.59657

 val: ROC-AUC - epoch: 2 - score: 0.97998
 Tox: 0.96325 - STox: 0.98726 - Obs: 0.97498 - Thr: 0.98496 - Ins: 0.97221 - IdH: 0.97863
 val: F1 Score - epoch: 2 - score: 0.63462
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99710
 Tox: 0.99649 - STox: 0.99440 - Obs: 0.99744 - Thr: 0.99601 - Ins: 0.99485 - IdH: 0.99570
 train: F1 Score - epoch: 2 - score: 0.82098
 Tox: 0.87566 - STox: 0.14326 - Obs: 0.87976 - Thr: 0.05285 - Ins: 0.80681 - IdH: 0.43095

 val: ROC-AUC - epoch: 2 - score: 0.97896
 Tox: 0.96121 - STox: 0.98507 - Obs: 0.97502 - Thr: 0.98280 - Ins: 0.97052 - IdH: 0.97537
 val: F1 Score - epoch: 2 - score: 0.64368
 Tox: 0.66093 - STox: 0.10959 - Obs: 0.69049 - Thr: 0.12389 - Ins: 0.64052 - IdH: 0.38696
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99838
 Tox: 0.99813 - STox: 0.99590 - Obs: 0.99872 - Thr: 0.99792 - Ins: 0.99703 - IdH: 0.99806
 train: F1 Score - epoch: 3 - score: 0.88964
 Tox: 0.93707 - STox: 0.32040 - Obs: 0.92106 - Thr: 0.44136 - Ins: 0.87610 - IdH: 0.73875

 val: ROC-AUC - epoch: 3 - score: 0.97800
 Tox: 0.96049 - STox: 0.98608 - Obs: 0.97438 - Thr: 0.98205 - Ins: 0.96886 - IdH: 0.97468
 val: F1 Score - epoch: 3 - score: 0.61628


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99833
 Tox: 0.99819 - STox: 0.99613 - Obs: 0.99864 - Thr: 0.99837 - Ins: 0.99672 - IdH: 0.99771
 train: F1 Score - epoch: 3 - score: 0.88881
 Tox: 0.93671 - STox: 0.56479 - Obs: 0.91968 - Thr: 0.50952 - Ins: 0.86613 - IdH: 0.66526

 val: ROC-AUC - epoch: 3 - score: 0.97764
 Tox: 0.95977 - STox: 0.98589 - Obs: 0.97287 - Thr: 0.98211 - Ins: 0.96946 - IdH: 0.97676
 val: F1 Score - epoch: 3 - score: 0.61541
 Tox: 0.60541 - STox: 0.32800 - Obs: 0.66140 - Thr: 0.32530 - Ins: 0.64043 - IdH: 0.54123
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99904
 Tox: 0.99905 - STox: 0.99739 - Obs: 0.99928 - Thr: 0.99912 - Ins: 0.99833 - IdH: 0.99865
 train: F1 Score - epoch: 4 - score: 0.92086
 Tox: 0.95309 - STox: 0.73159 - Obs: 0.94139 - Thr: 0.68159 - Ins: 0.90883 - IdH: 0.81039

 val: ROC-AUC - epoch: 4 - score: 0.97595
 Tox: 0.95599 - STox: 0.98462 - Obs: 0.97148 - Thr: 0.98145 - Ins: 0.96652 - IdH: 0.97263
 val: F1 Score - epoch: 4 - score: 0.61624


### Strip more than 3 of the same characters in a row
In addition to the baseline techniques, any character that occurs more than 3 times in a row is reduced to a run of exactly 3 (e.g. `soooooo` becomes `sooo`).

In [10]:
# Experiment 2: baseline tokenization with reduce_len=True; per this section's description,
# runs of more than 3 identical characters are cut down to 3.
e2_X_train_tok = tokenize_sentences(X_train, reduce_len=True)
e2_X_test_tok = tokenize_sentences(X_test, reduce_len=True)

Create input matrix

In [11]:
# Build the mapping from the tokens of both splits (presumably token -> index -- confirm),
# then convert every comment into a fixed-length padded sequence.
e2_embeddings_mapping = create_embeddings_mapping(e2_X_train_tok, e2_X_test_tok, debug=True)
(e2_X_train_input,
 e2_X_test_input,
 e2_max_comment_length) = convert_tokens_to_padded_indices(
    e2_X_train_tok, e2_X_test_tok, e2_embeddings_mapping)
# The token lists are not used after this point; remove the names (presumably to free memory).
del e2_X_train_tok, e2_X_test_tok

maximum comment length: 3801


In [8]:
# Embedding input size is the mapping size plus one (presumably one index is reserved,
# e.g. for padding -- confirm against create_embeddings_mapping).
e2_model = generate_model(len(e2_embeddings_mapping) + 1, e2_max_comment_length)
e2_scores = train_and_evaluate_model(
    e2_model,
    e2_X_train_input,
    Y_train,
    (e2_X_test_input, Y_test),
    epochs,
    batch_size,
    'adam',
    'binary_crossentropy',
    ['accuracy'],
    random_seed,
    runs=5)
# The current timestamp in the file name keeps results of repeated runs apart.
e2_scores_path = 'data/scores/preprocessing/e2_scores_{}'.format(time.time())
np.save(e2_scores_path, e2_scores)

RUN 1/5
Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99365
 Tox: 0.99139 - STox: 0.99226 - Obs: 0.99538 - Thr: 0.98887 - Ins: 0.99139 - IdH: 0.98502
 train: F1 Score - epoch: 1 - score: 0.78571
 Tox: 0.85259 - STox: 0.08894 - Obs: 0.84118 - Thr: 0.00000 - Ins: 0.77251 - IdH: 0.01549

 val: ROC-AUC - epoch: 1 - score: 0.97748
 Tox: 0.96131 - STox: 0.98625 - Obs: 0.97495 - Thr: 0.97707 - Ins: 0.97112 - IdH: 0.96805
 val: F1 Score - epoch: 1 - score: 0.62819
 Tox: 0.64749 - STox: 0.11111 - Obs: 0.67641 - Thr: 0.00000 - Ins: 0.64308 - IdH: 0.02216
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99708
 Tox: 0.99665 - STox: 0.99436 - Obs: 0.99765 - Thr: 0.99726 - Ins: 0.99499 - IdH: 0.99561
 train: F1 Score - epoch: 2 - score: 0.85625
 Tox: 0.91358 - STox: 0.42182 - Obs: 0.88444 - Thr: 0.28227 - Ins: 0.83337 - IdH: 0.62198

 val: ROC-AUC - epoch: 2 - score: 0.97889
 Tox: 0.96294 - STox: 0.98554 - Obs: 0.97393 - Thr: 0.98297 - Ins: 0.97057 - IdH: 0.97786
 val: F1 Score - epoch: 2 - score: 0.60929
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99696
 Tox: 0.99653 - STox: 0.99427 - Obs: 0.99714 - Thr: 0.99595 - Ins: 0.99449 - IdH: 0.99474
 train: F1 Score - epoch: 2 - score: 0.82349
 Tox: 0.88468 - STox: 0.14389 - Obs: 0.87916 - Thr: 0.01247 - Ins: 0.80748 - IdH: 0.33754

 val: ROC-AUC - epoch: 2 - score: 0.97837
 Tox: 0.96082 - STox: 0.98525 - Obs: 0.97445 - Thr: 0.98292 - Ins: 0.96997 - IdH: 0.97430
 val: F1 Score - epoch: 2 - score: 0.63208
 Tox: 0.64052 - STox: 0.11404 - Obs: 0.68128 - Thr: 0.00935 - Ins: 0.65048 - IdH: 0.34038
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99836
 Tox: 0.99837 - STox: 0.99554 - Obs: 0.99864 - Thr: 0.99736 - Ins: 0.99678 - IdH: 0.99764
 train: F1 Score - epoch: 3 - score: 0.88921
 Tox: 0.94228 - STox: 0.33317 - Obs: 0.92068 - Thr: 0.39303 - Ins: 0.86982 - IdH: 0.72066

 val: ROC-AUC - epoch: 3 - score: 0.97786
 Tox: 0.95956 - STox: 0.98655 - Obs: 0.97353 - Thr: 0.98416 - Ins: 0.96922 - IdH: 0.97587
 val: F1 Score - epoch: 3 - score: 0.61808


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99834
 Tox: 0.99831 - STox: 0.99567 - Obs: 0.99863 - Thr: 0.99815 - Ins: 0.99686 - IdH: 0.99759
 train: F1 Score - epoch: 3 - score: 0.88867
 Tox: 0.94145 - STox: 0.48540 - Obs: 0.91872 - Thr: 0.49619 - Ins: 0.87400 - IdH: 0.55014

 val: ROC-AUC - epoch: 3 - score: 0.97747
 Tox: 0.95926 - STox: 0.98649 - Obs: 0.97185 - Thr: 0.98537 - Ins: 0.96961 - IdH: 0.97639
 val: F1 Score - epoch: 3 - score: 0.61084
 Tox: 0.60182 - STox: 0.31201 - Obs: 0.64567 - Thr: 0.31469 - Ins: 0.64955 - IdH: 0.48231
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99901
 Tox: 0.99911 - STox: 0.99713 - Obs: 0.99935 - Thr: 0.99876 - Ins: 0.99822 - IdH: 0.99856
 train: F1 Score - epoch: 4 - score: 0.91460
 Tox: 0.95027 - STox: 0.60680 - Obs: 0.94306 - Thr: 0.60548 - Ins: 0.90038 - IdH: 0.79358

 val: ROC-AUC - epoch: 4 - score: 0.97679
 Tox: 0.95773 - STox: 0.98466 - Obs: 0.97272 - Thr: 0.98309 - Ins: 0.96724 - IdH: 0.97433
 val: F1 Score - epoch: 4 - score: 0.62137


### Remove all punctuation

In [5]:
# Experiment 3: run remove_punctuation over the raw comments first, then tokenize as in the baseline.
e3_X_train_tok = tokenize_sentences(remove_punctuation(X_train))
e3_X_test_tok = tokenize_sentences(remove_punctuation(X_test))

Create input matrix

In [6]:
# Build the mapping from the tokens of both splits (presumably token -> index -- confirm),
# then convert every comment into a fixed-length padded sequence.
e3_embeddings_mapping = create_embeddings_mapping(e3_X_train_tok, e3_X_test_tok, debug=True)
(e3_X_train_input,
 e3_X_test_input,
 e3_max_comment_length) = convert_tokens_to_padded_indices(
    e3_X_train_tok, e3_X_test_tok, e3_embeddings_mapping)
# The token lists are not used after this point; remove the names (presumably to free memory).
del e3_X_train_tok, e3_X_test_tok

Number of unique tokens: 334853
maximum comment length: 2321


In [7]:
# Embedding input size is the mapping size plus one (presumably one index is reserved,
# e.g. for padding -- confirm against create_embeddings_mapping).
e3_model = generate_model(len(e3_embeddings_mapping) + 1, e3_max_comment_length)
e3_scores = train_and_evaluate_model(
    e3_model,
    e3_X_train_input,
    Y_train,
    (e3_X_test_input, Y_test),
    epochs,
    batch_size,
    'adam',
    'binary_crossentropy',
    ['accuracy'],
    random_seed,
    runs=5)
# The current timestamp in the file name keeps results of repeated runs apart.
e3_scores_path = 'data/scores/preprocessing/e3_scores_{}'.format(time.time())
np.save(e3_scores_path, e3_scores)

RUN 1/5


  "This may consume a large amount of memory." % num_elements)


Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99330
 Tox: 0.99115 - STox: 0.99201 - Obs: 0.99535 - Thr: 0.98797 - Ins: 0.99123 - IdH: 0.98452
 train: F1 Score - epoch: 1 - score: 0.78401
 Tox: 0.85155 - STox: 0.07160 - Obs: 0.84369 - Thr: 0.00000 - Ins: 0.76292 - IdH: 0.00142

 val: ROC-AUC - epoch: 1 - score: 0.97712
 Tox: 0.96193 - STox: 0.98595 - Obs: 0.97566 - Thr: 0.97505 - Ins: 0.97192 - IdH: 0.96573
 val: F1 Score - epoch: 1 - score: 0.63171
 Tox: 0.65375 - STox: 0.07353 - Obs: 0.67777 - Thr: 0.00000 - Ins: 0.64585 - IdH: 0.00000
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99704
 Tox: 0.99636 - STox: 0.99441 - Obs: 0.99771 - Thr: 0.99730 - Ins: 0.99474 - IdH: 0.99517
 train: F1 Score - epoch: 2 - score: 0.85206
 Tox: 0.90934 - STox: 0.43763 - Obs: 0.89238 - Thr: 0.24028 - Ins: 0.83174 - IdH: 0.43182

 val: ROC-AUC - epoch: 2 - score: 0.97916
 Tox: 0.96150 - STox: 0.98700 - Obs: 0.97486 - Thr: 0.98430 - Ins: 0.97096 - IdH: 0.97774
 val: F1 Score - epoch: 2 - score: 0.62349
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99692
 Tox: 0.99658 - STox: 0.99455 - Obs: 0.99740 - Thr: 0.99507 - Ins: 0.99434 - IdH: 0.99473
 train: F1 Score - epoch: 2 - score: 0.82556
 Tox: 0.89381 - STox: 0.18352 - Obs: 0.88063 - Thr: 0.19521 - Ins: 0.80480 - IdH: 0.08548

 val: ROC-AUC - epoch: 2 - score: 0.97857
 Tox: 0.96092 - STox: 0.98546 - Obs: 0.97580 - Thr: 0.98052 - Ins: 0.97019 - IdH: 0.97195
 val: F1 Score - epoch: 2 - score: 0.63917
 Tox: 0.66114 - STox: 0.14925 - Obs: 0.69688 - Thr: 0.19841 - Ins: 0.63557 - IdH: 0.09788
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99829
 Tox: 0.99810 - STox: 0.99573 - Obs: 0.99871 - Thr: 0.99790 - Ins: 0.99667 - IdH: 0.99763
 train: F1 Score - epoch: 3 - score: 0.88530
 Tox: 0.93837 - STox: 0.26128 - Obs: 0.91562 - Thr: 0.50808 - Ins: 0.86657 - IdH: 0.70833

 val: ROC-AUC - epoch: 3 - score: 0.97783
 Tox: 0.95926 - STox: 0.98533 - Obs: 0.97507 - Thr: 0.98097 - Ins: 0.97031 - IdH: 0.97281
 val: F1 Score - epoch: 3 - score: 0.62290


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99836
 Tox: 0.99836 - STox: 0.99603 - Obs: 0.99860 - Thr: 0.99839 - Ins: 0.99700 - IdH: 0.99815
 train: F1 Score - epoch: 3 - score: 0.88667
 Tox: 0.93014 - STox: 0.47412 - Obs: 0.92016 - Thr: 0.56540 - Ins: 0.87604 - IdH: 0.66899

 val: ROC-AUC - epoch: 3 - score: 0.97740
 Tox: 0.95897 - STox: 0.98447 - Obs: 0.97288 - Thr: 0.98072 - Ins: 0.96926 - IdH: 0.97300
 val: F1 Score - epoch: 3 - score: 0.63419
 Tox: 0.64657 - STox: 0.29607 - Obs: 0.65643 - Thr: 0.40729 - Ins: 0.64541 - IdH: 0.48707
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99923
 Tox: 0.99931 - STox: 0.99723 - Obs: 0.99944 - Thr: 0.99931 - Ins: 0.99852 - IdH: 0.99874
 train: F1 Score - epoch: 4 - score: 0.92725
 Tox: 0.96207 - STox: 0.68974 - Obs: 0.94931 - Thr: 0.69975 - Ins: 0.91327 - IdH: 0.81125

 val: ROC-AUC - epoch: 4 - score: 0.97608
 Tox: 0.95527 - STox: 0.98465 - Obs: 0.97178 - Thr: 0.97818 - Ins: 0.96739 - IdH: 0.97033
 val: F1 Score - epoch: 4 - score: 0.62216


### Remove all punctuation except for .,!?

In [8]:
# Experiment 4: run remove_punctuation_weak over the raw comments first (per this section's
# heading it keeps . , ! ?), then tokenize as in the baseline.
e4_X_train_tok = tokenize_sentences(remove_punctuation_weak(X_train))
e4_X_test_tok = tokenize_sentences(remove_punctuation_weak(X_test))

Create input matrix

In [9]:
# Build the mapping from the tokens of both splits (presumably token -> index -- confirm),
# then convert every comment into a fixed-length padded sequence.
e4_embeddings_mapping = create_embeddings_mapping(e4_X_train_tok, e4_X_test_tok, debug=True)
(e4_X_train_input,
 e4_X_test_input,
 e4_max_comment_length) = convert_tokens_to_padded_indices(
    e4_X_train_tok, e4_X_test_tok, e4_embeddings_mapping)
# The token lists are not used after this point; remove the names (presumably to free memory).
del e4_X_train_tok, e4_X_test_tok

Number of unique tokens: 330219
maximum comment length: 2321


In [10]:
# Embedding input size is the mapping size plus one (presumably one index is reserved,
# e.g. for padding -- confirm against create_embeddings_mapping).
e4_model = generate_model(len(e4_embeddings_mapping) + 1, e4_max_comment_length)
e4_scores = train_and_evaluate_model(
    e4_model,
    e4_X_train_input,
    Y_train,
    (e4_X_test_input, Y_test),
    epochs,
    batch_size,
    'adam',
    'binary_crossentropy',
    ['accuracy'],
    random_seed,
    runs=5)
# The current timestamp in the file name keeps results of repeated runs apart.
e4_scores_path = 'data/scores/preprocessing/e4_scores_{}'.format(time.time())
np.save(e4_scores_path, e4_scores)

RUN 1/5
Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99369
 Tox: 0.99138 - STox: 0.99135 - Obs: 0.99508 - Thr: 0.98850 - Ins: 0.99103 - IdH: 0.98522
 train: F1 Score - epoch: 1 - score: 0.79733
 Tox: 0.86069 - STox: 0.34879 - Obs: 0.84042 - Thr: 0.00000 - Ins: 0.78165 - IdH: 0.16367

 val: ROC-AUC - epoch: 1 - score: 0.97749
 Tox: 0.96204 - STox: 0.98719 - Obs: 0.97535 - Thr: 0.97516 - Ins: 0.96956 - IdH: 0.96443
 val: F1 Score - epoch: 1 - score: 0.61813
 Tox: 0.63366 - STox: 0.37823 - Obs: 0.64572 - Thr: 0.00000 - Ins: 0.63720 - IdH: 0.16337
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99694
 Tox: 0.99671 - STox: 0.99397 - Obs: 0.99748 - Thr: 0.99609 - Ins: 0.99481 - IdH: 0.99549
 train: F1 Score - epoch: 2 - score: 0.83858
 Tox: 0.89299 - STox: 0.23770 - Obs: 0.88560 - Thr: 0.21818 - Ins: 0.83382 - IdH: 0.37082

 val: ROC-AUC - epoch: 2 - score: 0.97944
 Tox: 0.96155 - STox: 0.98585 - Obs: 0.97621 - Thr: 0.98109 - Ins: 0.97157 - IdH: 0.97482
 val: F1 Score - epoch: 2 - score: 0.63944
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99673
 Tox: 0.99611 - STox: 0.99421 - Obs: 0.99750 - Thr: 0.99593 - Ins: 0.99437 - IdH: 0.99422
 train: F1 Score - epoch: 2 - score: 0.83320
 Tox: 0.89767 - STox: 0.06344 - Obs: 0.88462 - Thr: 0.11024 - Ins: 0.82213 - IdH: 0.17636

 val: ROC-AUC - epoch: 2 - score: 0.97927
 Tox: 0.96316 - STox: 0.98787 - Obs: 0.97661 - Thr: 0.98084 - Ins: 0.97161 - IdH: 0.97337
 val: F1 Score - epoch: 2 - score: 0.63009
 Tox: 0.64554 - STox: 0.05432 - Obs: 0.66689 - Thr: 0.08929 - Ins: 0.64633 - IdH: 0.20219
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99839
 Tox: 0.99818 - STox: 0.99595 - Obs: 0.99874 - Thr: 0.99877 - Ins: 0.99684 - IdH: 0.99744
 train: F1 Score - epoch: 3 - score: 0.89138
 Tox: 0.93763 - STox: 0.51949 - Obs: 0.92143 - Thr: 0.52353 - Ins: 0.87528 - IdH: 0.65548

 val: ROC-AUC - epoch: 3 - score: 0.97802
 Tox: 0.95925 - STox: 0.98584 - Obs: 0.97539 - Thr: 0.98132 - Ins: 0.96972 - IdH: 0.97435
 val: F1 Score - epoch: 3 - score: 0.62954


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99847
 Tox: 0.99845 - STox: 0.99607 - Obs: 0.99880 - Thr: 0.99828 - Ins: 0.99691 - IdH: 0.99783
 train: F1 Score - epoch: 3 - score: 0.89552
 Tox: 0.94396 - STox: 0.64878 - Obs: 0.92354 - Thr: 0.38387 - Ins: 0.86683 - IdH: 0.72322

 val: ROC-AUC - epoch: 3 - score: 0.97888
 Tox: 0.96117 - STox: 0.98575 - Obs: 0.97563 - Thr: 0.98509 - Ins: 0.97144 - IdH: 0.97681
 val: F1 Score - epoch: 3 - score: 0.62272
 Tox: 0.61179 - STox: 0.36214 - Obs: 0.67707 - Thr: 0.23567 - Ins: 0.65125 - IdH: 0.56340
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99922
 Tox: 0.99943 - STox: 0.99729 - Obs: 0.99942 - Thr: 0.99907 - Ins: 0.99852 - IdH: 0.99880
 train: F1 Score - epoch: 4 - score: 0.92418
 Tox: 0.96181 - STox: 0.63479 - Obs: 0.94784 - Thr: 0.59916 - Ins: 0.90974 - IdH: 0.78954

 val: ROC-AUC - epoch: 4 - score: 0.97700
 Tox: 0.95732 - STox: 0.98444 - Obs: 0.97286 - Thr: 0.98217 - Ins: 0.96989 - IdH: 0.97297
 val: F1 Score - epoch: 4 - score: 0.61461


### Stemming

In [5]:
# Experiment 5: tokenize as in the baseline, then pass the token lists through perform_stemming.
e5_X_train_tok = perform_stemming(tokenize_sentences(X_train))
e5_X_test_tok = perform_stemming(tokenize_sentences(X_test))

Create input matrix

In [6]:
# Build the mapping from the tokens of both splits (presumably token -> index -- confirm),
# then convert every comment into a fixed-length padded sequence.
e5_embeddings_mapping = create_embeddings_mapping(e5_X_train_tok, e5_X_test_tok, debug=True)
(e5_X_train_input,
 e5_X_test_input,
 e5_max_comment_length) = convert_tokens_to_padded_indices(
    e5_X_train_tok, e5_X_test_tok, e5_embeddings_mapping)
# The token lists are not used after this point; remove the names (presumably to free memory).
del e5_X_train_tok, e5_X_test_tok

Number of unique tokens: 270849
maximum comment length: 3801


In [7]:
# Embedding input size is the mapping size plus one (presumably one index is reserved,
# e.g. for padding -- confirm against create_embeddings_mapping).
e5_model = generate_model(len(e5_embeddings_mapping) + 1, e5_max_comment_length)
e5_scores = train_and_evaluate_model(
    e5_model,
    e5_X_train_input,
    Y_train,
    (e5_X_test_input, Y_test),
    epochs,
    batch_size,
    'adam',
    'binary_crossentropy',
    ['accuracy'],
    random_seed,
    runs=5)
# The current timestamp in the file name keeps results of repeated runs apart.
e5_scores_path = 'data/scores/preprocessing/e5_scores_{}'.format(time.time())
np.save(e5_scores_path, e5_scores)

RUN 1/5
Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99356
 Tox: 0.99080 - STox: 0.99182 - Obs: 0.99552 - Thr: 0.98581 - Ins: 0.99118 - IdH: 0.98775
 train: F1 Score - epoch: 1 - score: 0.77589
 Tox: 0.83915 - STox: 0.02099 - Obs: 0.83376 - Thr: 0.00000 - Ins: 0.76680 - IdH: 0.12336

 val: ROC-AUC - epoch: 1 - score: 0.97857
 Tox: 0.96319 - STox: 0.98750 - Obs: 0.97499 - Thr: 0.97388 - Ins: 0.97196 - IdH: 0.97518
 val: F1 Score - epoch: 1 - score: 0.64041
 Tox: 0.66440 - STox: 0.05699 - Obs: 0.67771 - Thr: 0.00000 - Ins: 0.65493 - IdH: 0.14249
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99676
 Tox: 0.99594 - STox: 0.99442 - Obs: 0.99758 - Thr: 0.99725 - Ins: 0.99452 - IdH: 0.99521
 train: F1 Score - epoch: 2 - score: 0.84456
 Tox: 0.89348 - STox: 0.49161 - Obs: 0.88753 - Thr: 0.22263 - Ins: 0.82358 - IdH: 0.55636

 val: ROC-AUC - epoch: 2 - score: 0.98032
 Tox: 0.96340 - STox: 0.98756 - Obs: 0.97621 - Thr: 0.98807 - Ins: 0.97280 - IdH: 0.98096
 val: F1 Score - epoch: 2 - score: 0.63132
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99677
 Tox: 0.99598 - STox: 0.99400 - Obs: 0.99730 - Thr: 0.99592 - Ins: 0.99428 - IdH: 0.99525
 train: F1 Score - epoch: 2 - score: 0.81911
 Tox: 0.86954 - STox: 0.02343 - Obs: 0.87735 - Thr: 0.04090 - Ins: 0.80988 - IdH: 0.53527

 val: ROC-AUC - epoch: 2 - score: 0.97949
 Tox: 0.96091 - STox: 0.98628 - Obs: 0.97473 - Thr: 0.98797 - Ins: 0.97042 - IdH: 0.98101
 val: F1 Score - epoch: 2 - score: 0.64750
 Tox: 0.67012 - STox: 0.04700 - Obs: 0.67791 - Thr: 0.01843 - Ins: 0.64053 - IdH: 0.50669
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99810
 Tox: 0.99787 - STox: 0.99542 - Obs: 0.99844 - Thr: 0.99802 - Ins: 0.99608 - IdH: 0.99779
 train: F1 Score - epoch: 3 - score: 0.86907
 Tox: 0.92239 - STox: 0.06867 - Obs: 0.91159 - Thr: 0.43586 - Ins: 0.85212 - IdH: 0.66782

 val: ROC-AUC - epoch: 3 - score: 0.97880
 Tox: 0.96067 - STox: 0.98747 - Obs: 0.97395 - Thr: 0.98604 - Ins: 0.97004 - IdH: 0.97995
 val: F1 Score - epoch: 3 - score: 0.63084


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99828
 Tox: 0.99809 - STox: 0.99555 - Obs: 0.99847 - Thr: 0.99864 - Ins: 0.99659 - IdH: 0.99769
 train: F1 Score - epoch: 3 - score: 0.88329
 Tox: 0.93338 - STox: 0.53437 - Obs: 0.90889 - Thr: 0.55887 - Ins: 0.85928 - IdH: 0.68417

 val: ROC-AUC - epoch: 3 - score: 0.97841
 Tox: 0.96149 - STox: 0.98674 - Obs: 0.97229 - Thr: 0.98840 - Ins: 0.97093 - IdH: 0.97788
 val: F1 Score - epoch: 3 - score: 0.62294
 Tox: 0.61806 - STox: 0.34003 - Obs: 0.65393 - Thr: 0.33333 - Ins: 0.65535 - IdH: 0.55433
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99896
 Tox: 0.99883 - STox: 0.99716 - Obs: 0.99921 - Thr: 0.99912 - Ins: 0.99797 - IdH: 0.99875
 train: F1 Score - epoch: 4 - score: 0.91453
 Tox: 0.94685 - STox: 0.69748 - Obs: 0.93802 - Thr: 0.70999 - Ins: 0.89452 - IdH: 0.82487

 val: ROC-AUC - epoch: 4 - score: 0.97712
 Tox: 0.95649 - STox: 0.98530 - Obs: 0.97349 - Thr: 0.98616 - Ins: 0.96885 - IdH: 0.97516
 val: F1 Score - epoch: 4 - score: 0.62329


### Lemmatization

In [14]:
# Experiment 6: tokenize as in the baseline, then pass the token lists through perform_lemmatization.
e6_X_train_tok = perform_lemmatization(tokenize_sentences(X_train))
e6_X_test_tok = perform_lemmatization(tokenize_sentences(X_test))

Create input matrix

In [16]:
# Build the mapping from the tokens of both splits (presumably token -> index -- confirm),
# then convert every comment into a fixed-length padded sequence.
e6_embeddings_mapping = create_embeddings_mapping(e6_X_train_tok, e6_X_test_tok, debug=True)
(e6_X_train_input,
 e6_X_test_input,
 e6_max_comment_length) = convert_tokens_to_padded_indices(
    e6_X_train_tok, e6_X_test_tok, e6_embeddings_mapping)
# The token lists are not used after this point; remove the names (presumably to free memory).
del e6_X_train_tok, e6_X_test_tok

Number of unique tokens: 314708
maximum comment length: 3801


In [17]:
# Embedding input size is the mapping size plus one (presumably one index is reserved,
# e.g. for padding -- confirm against create_embeddings_mapping).
e6_model = generate_model(len(e6_embeddings_mapping) + 1, e6_max_comment_length)
e6_scores = train_and_evaluate_model(
    e6_model,
    e6_X_train_input,
    Y_train,
    (e6_X_test_input, Y_test),
    epochs,
    batch_size,
    'adam',
    'binary_crossentropy',
    ['accuracy'],
    random_seed,
    runs=5)
# The current timestamp in the file name keeps results of repeated runs apart.
e6_scores_path = 'data/scores/preprocessing/e6_scores_{}'.format(time.time())
np.save(e6_scores_path, e6_scores)

RUN 1/5
Train on 159571 samples, validate on 63978 samples
Epoch 1/5

  'precision', 'predicted', average, warn_for)



 train: ROC-AUC - epoch: 1 - score: 0.99301
 Tox: 0.99127 - STox: 0.99163 - Obs: 0.99497 - Thr: 0.98754 - Ins: 0.99093 - IdH: 0.98866
 train: F1 Score - epoch: 1 - score: 0.77740
 Tox: 0.85407 - STox: 0.00000 - Obs: 0.80848 - Thr: 0.00000 - Ins: 0.74904 - IdH: 0.03352

 val: ROC-AUC - epoch: 1 - score: 0.97542
 Tox: 0.96182 - STox: 0.98472 - Obs: 0.97617 - Thr: 0.97525 - Ins: 0.96937 - IdH: 0.97136
 val: F1 Score - epoch: 1 - score: 0.58668
 Tox: 0.59857 - STox: 0.00543 - Obs: 0.60843 - Thr: 0.00000 - Ins: 0.63027 - IdH: 0.03562
Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99687
 Tox: 0.99642 - STox: 0.99406 - Obs: 0.99745 - Thr: 0.99706 - Ins: 0.99471 - IdH: 0.99610
 train: F1 Score - epoch: 2 - score: 0.83701
 Tox: 0.88382 - STox: 0.25499 - Obs: 0.88526 - Thr: 0.29066 - Ins: 0.83069 - IdH: 0.51275

 val: ROC-AUC - epoch: 2 - score: 0.98009
 Tox: 0.96186 - STox: 0.98617 - Obs: 0.97724 - Thr: 0.98168 - Ins: 0.97311 - IdH: 0.97666
 val: F1 Score - epoch: 2 - score: 0.64764
 Tox: 0.6

Epoch 2/5
 train: ROC-AUC - epoch: 2 - score: 0.99690
 Tox: 0.99604 - STox: 0.99406 - Obs: 0.99752 - Thr: 0.99624 - Ins: 0.99424 - IdH: 0.99549
 train: F1 Score - epoch: 2 - score: 0.83233
 Tox: 0.87977 - STox: 0.39786 - Obs: 0.88839 - Thr: 0.44910 - Ins: 0.80246 - IdH: 0.51823

 val: ROC-AUC - epoch: 2 - score: 0.97990
 Tox: 0.96242 - STox: 0.98588 - Obs: 0.97668 - Thr: 0.98503 - Ins: 0.97221 - IdH: 0.97717
 val: F1 Score - epoch: 2 - score: 0.64829
 Tox: 0.65996 - STox: 0.25455 - Obs: 0.67976 - Thr: 0.34437 - Ins: 0.65369 - IdH: 0.51550
Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99832
 Tox: 0.99822 - STox: 0.99564 - Obs: 0.99855 - Thr: 0.99834 - Ins: 0.99663 - IdH: 0.99762
 train: F1 Score - epoch: 3 - score: 0.89009
 Tox: 0.93742 - STox: 0.52325 - Obs: 0.91651 - Thr: 0.58840 - Ins: 0.86905 - IdH: 0.70317

 val: ROC-AUC - epoch: 3 - score: 0.97760
 Tox: 0.95912 - STox: 0.98633 - Obs: 0.97536 - Thr: 0.97998 - Ins: 0.96966 - IdH: 0.97204
 val: F1 Score - epoch: 3 - score: 0.62585


Epoch 3/5
 train: ROC-AUC - epoch: 3 - score: 0.99830
 Tox: 0.99818 - STox: 0.99546 - Obs: 0.99868 - Thr: 0.99823 - Ins: 0.99653 - IdH: 0.99780
 train: F1 Score - epoch: 3 - score: 0.88407
 Tox: 0.93737 - STox: 0.54153 - Obs: 0.91770 - Thr: 0.43750 - Ins: 0.85068 - IdH: 0.67695

 val: ROC-AUC - epoch: 3 - score: 0.97861
 Tox: 0.96055 - STox: 0.98503 - Obs: 0.97532 - Thr: 0.98534 - Ins: 0.96963 - IdH: 0.97787
 val: F1 Score - epoch: 3 - score: 0.62477
 Tox: 0.61375 - STox: 0.29082 - Obs: 0.68738 - Thr: 0.25705 - Ins: 0.65069 - IdH: 0.54181
Epoch 4/5
 train: ROC-AUC - epoch: 4 - score: 0.99898
 Tox: 0.99918 - STox: 0.99699 - Obs: 0.99929 - Thr: 0.99857 - Ins: 0.99792 - IdH: 0.99884
 train: F1 Score - epoch: 4 - score: 0.91480
 Tox: 0.94940 - STox: 0.67288 - Obs: 0.93971 - Thr: 0.53525 - Ins: 0.89568 - IdH: 0.83039

 val: ROC-AUC - epoch: 4 - score: 0.97742
 Tox: 0.95892 - STox: 0.98619 - Obs: 0.97337 - Thr: 0.98055 - Ins: 0.97014 - IdH: 0.97574
 val: F1 Score - epoch: 4 - score: 0.59950
