In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import modutils
import pickle
import time, datetime
import sklearn, sklearn.metrics, sklearn.decomposition
import collections
import matplotlib.pyplot as plt

train_file = '../DataSets/Toxic/dev_train.csv'
test_file = '../DataSets/Toxic/dev_valid.csv'

In [2]:
src_train = pd.read_csv(train_file)
src_test = pd.read_csv(test_file)

In [3]:
src_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
2,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
3,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
4,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0


In [4]:
lens = np.array([len(x) for x in src_train.comment_text])
np.percentile(lens, q=[50, 75, 90, 95, 99])

array([  205.,   435.,   891.,  1369.,  3474.])

In [5]:
chars = sorted(dict(collections.Counter([z for x in src_train.comment_text for z in x])).items(),
               key=lambda x:x[1], reverse=True)

In [6]:
chars_pct = np.cumsum([x[1] for x in chars]) / np.sum([x[1] for x in chars])

In [7]:
chars_res = list(zip([x[0] for x in chars], chars_pct))

In [8]:
def transform_text(text, charmap, seq_len):
    if type(text) is list:
        return np.array([transform_text(x, charmap, seq_len) for x in text])
    
    tmp = [charmap[x] for x in text if x in charmap]
    if len(tmp) >= seq_len:
        return np.array(tmp[:seq_len])
    return np.array(tmp + [0]*(seq_len - len(tmp)))

In [9]:
#v1
#charset_cutoff = 0.997 (74)
#char_seq_len = 200
#arch_description = [(20, 3, 1, 5, 5), (30, 3, 1, 5, 5)], relu
#fc_description = [20], elu
#batch_size - 2048, 1 step per batch
#epoch - 215 sec, size - 95kb
#00 - 0.304/29.3/90.5, 01 - 0.298/36.0/90.5, 02 - 0.293/41.3/90.5, 03 - 0.285/46.6/90.5, 04 - 0.268/56.1/90.8,
#05 - 0.237/68.7/91.4, 06 - 0.215/73.3/92.6, 07 - 0.203/76.2/93.1, 08 - 0.196/77.9/93.3, 09 - 0.191/79.2/93.5,
#10 - 0.186/80.1/93.7, 11 - 0.182/80.8/93.8, 12 - 0.180/81.3/93.9, 13 - 0.179/81.7/93.8, 14 - 0.176/81.9/93.9,
#15 - 0.174/82.3/94.0, 16 - 0.173/82.6/94.0

In [10]:
def prepare_char_transform(chars_stats, seq_length, charset_size):
    charmap = {x[0]:(i+1) for (i, x) in enumerate(chars_stats[:charset_size])}
    return lambda x: transform_text(list(x), charmap, seq_length)

def build_charcnn_graph(input_shape, cnn_arch, fc_arch):
    tf.reset_default_graph()

    with tf.name_scope('Input'):
        tf_in_x = tf.placeholder(tf.int32, shape=(None, input_shape[0]))
        tf_in_y = tf.placeholder(tf.int32, shape=(None,))

        tf_x1hot = tf.one_hot(tf_in_x, input_shape[1])
        tf_temp = tf_x1hot

    for (i, (conv_filters, conv_width, conv_stride, pool_width, pool_stride)) in enumerate(cnn_arch):
        with tf.name_scope('Conv-MaxPool-{:02d}'.format(i)):
            tf_temp = tf.layers.conv1d(tf_temp, conv_filters, conv_width, conv_stride, activation=tf.nn.relu)
            tf_temp = tf.layers.max_pooling1d(tf_temp, pool_width, pool_stride)

    with tf.name_scope('FC'):
        tf_temp = tf.contrib.layers.flatten(tf_temp)
        for sz in fc_arch:
            tf_temp = tf.layers.dense(tf_temp, sz, activation=tf.nn.elu)
        tf_final = tf.layers.dense(tf_temp, 2)
        tf_prob = tf.nn.softmax(tf_final)
        tf_predicted = tf.cast(tf.argmax(tf_prob, axis=1), dtype=tf.int32)

    with tf.name_scope('LOSS'):
        tf_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf_in_y, logits=tf_final))
        tf_train = tf.train.AdamOptimizer(1e-3).minimize(tf_loss)
        
        tf_rocauc, tf_upd_rocuac = tf.metrics.auc(labels=tf_in_y, predictions=tf_prob[:,1], num_thresholds=10000)
        tf_gini = tf_rocauc * 2 - 1
        tf_accuracy, tf_upd_accuracy = tf.metrics.accuracy(labels=tf_in_y, predictions=tf_predicted)
        tf_update_metrics = tf.group(tf_upd_rocuac, tf_upd_accuracy)
        
        tfsummary_logloss = tf.summary.scalar('Log-Loss', tf_loss)
        tfsummary_gini = tf.summary.scalar('1-Gini', 1-tf_gini)
        tfsummary_accuracy = tf.summary.scalar('1-Accuracy', 1-tf_accuracy)
        tfsummary = tf.summary.merge([tfsummary_logloss, tfsummary_gini, tfsummary_accuracy])

    return {'in':{'data':tf_in_x, 'label':tf_in_y},
            'out':{'logit':tf_final, 'prob':tf_prob},
            'run':{'loss': tf_loss, 'upd_metrics':tf_update_metrics,
                   'gini':tf_gini, 'accuracy':tf_accuracy,
                   'train': tf_train, 'summary':tfsummary}}

In [13]:
#transform_fun = prepare_char_transform(chars_res, 100, 50)
#graph_descr = build_charcnn_graph((100, 50), [(20, 4, 1, 20, 20)], [20])
#model_name = '24Toxic03CCNN_v0'

#transform_fun = prepare_char_transform(chars_res, 100, 50)
#graph_descr = build_charcnn_graph((100, 50), [(40, 4, 1, 20, 20)], [20])
#model_name = '24Toxic03CCNN_v1'

transform_fun = prepare_char_transform(chars_res, 200, 50)
graph_descr = build_charcnn_graph((200, 50), [(40, 4, 1, 20, 20)], [20])
model_name = '24Toxic03CCNN_v1m4'

tffw_graph = tf.summary.FileWriter('D:/Jupyter/Logs/Graph_{}'.format(model_name), tf.get_default_graph())
model_ckpt_name = '../Models/{0}/model'.format(model_name)+'-{:02d}.ckpt'

print('Graph created')

batch_steps = 1
batch_size  = 2048
calc_batch_size = 2048

train_set = (src_train.comment_text.values, src_train.toxic.values)
test_set = (src_test.comment_text.values, src_test.toxic.values)
test_y = test_set[1]
stat_set = test_set

set2dict = lambda x: {graph_descr['in']['data']: transform_fun(x[0]), graph_descr['in']['label']: x[1]}

stat_dict = set2dict(stat_set)
print('Preparation complete')

Graph created
Preparation complete


In [14]:
num_epochs = 50

dt_now = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
tffw_run = tf.summary.FileWriter('D:/Jupyter/Logs/Run_{0}-{1}'.format(model_name, dt_now), tf.get_default_graph())
tfsSaver = tf.train.Saver(max_to_keep=5)

with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())
    tfs.run(tf.local_variables_initializer())
    
    for n in range(num_epochs):
        t0 = time.perf_counter()
        
        modutils.runEpoch(tfs, train_set, batch_size, set2dict, graph_descr['run']['train'],
                     op_loss=graph_descr['run']['loss'], verbatim=True)
        
        #test_res = run_tf_calc(tfs, test_set, calc_batch_size, set2dict,
        #                       [graph_descr['run']['loss'], graph_descr['out']['prob']])
        
        #test_loss = np.sum([x[1] * x[2][0] for x in test_res]) / np.sum([x[1] for x in test_res])
        #test_p = np.concatenate([x[2][1] for x in test_res])
        #gini = sklearn.metrics.roc_auc_score(test_y, test_p[:,1])*2-1
        #accur = sklearn.metrics.accuracy_score(test_y, 1*(test_p[:,1]>0.5))
        
        tfs.run(graph_descr['run']['upd_metrics'], stat_dict)
        all_stats = tfs.run([graph_descr['run']['loss'], graph_descr['run']['gini'],
                             graph_descr['run']['accuracy'], graph_descr['run']['summary']], stat_dict)
        tffw_run.add_summary(all_stats[-1], n)
        
        t1 = time.perf_counter()
        
        p = tfsSaver.save(tfs, model_ckpt_name.format(n))
        print('Model saved at checkpoint: {0}'.format(p))        
        print('Epoch {0}: {1:.3f} in {2:.2f} sec, gini={3:.3f}, accur={4:.3f}'.format(n, all_stats[0], t1-t0,
                                                                                      all_stats[1], all_stats[2]))
print('\nDone')

Model saved at checkpoint: ../Models/24Toxic03CCNN_v1m4/model-00.ckpt
Epoch 0: 0.307 in 438.96 sec, gini=0.332, accur=0.905
Model saved at checkpoint: ../Models/24Toxic03CCNN_v1m4/model-01.ckpt
Epoch 1: 0.292 in 433.27 sec, gini=0.352, accur=0.905
Model saved at checkpoint: ../Models/24Toxic03CCNN_v1m4/model-02.ckpt
Epoch 2: 0.282 in 445.94 sec, gini=0.390, accur=0.905
Model saved at checkpoint: ../Models/24Toxic03CCNN_v1m4/model-03.ckpt
Epoch 3: 0.269 in 447.91 sec, gini=0.440, accur=0.906
Model saved at checkpoint: ../Models/24Toxic03CCNN_v1m4/model-04.ckpt
Epoch 4: 0.250 in 429.22 sec, gini=0.491, accur=0.907
Model saved at checkpoint: ../Models/24Toxic03CCNN_v1m4/model-05.ckpt
Epoch 5: 0.230 in 148990.56 sec, gini=0.534, accur=0.908
Model saved at checkpoint: ../Models/24Toxic03CCNN_v1m4/model-06.ckpt
Epoch 6: 0.216 in 433.07 sec, gini=0.569, accur=0.911
Model saved at checkpoint: ../Models/24Toxic03CCNN_v1m4/model-07.ckpt
Epoch 7: 0.206 in 908.70 sec, gini=0.597, accur=0.913
Model

KeyboardInterrupt: 

In [None]:
#v1m4
#build_charcnn_graph((200, 50), [(40, 4, 1, 20, 20)], [20])
#batch_size - 2048, 1 step per batch
#epoch - 160 sec (? on wire), size - 180kb
#stop criterion - epoch 50 or 3 epoch no decrease of loss
#comment: starts worse, overall the same

#v1m3
#build_charcnn_graph((100, 100), [(40, 4, 1, 20, 20)], [20])
#batch_size - 2048, 1 step per batch
#epoch - 160 sec (? on wire), size - 226kb
#stop criterion - epoch 50 or 3 epoch no decrease of loss
#comment: starts worse, overall the same

#v1m2
#build_charcnn_graph((100, 50), [(40, 4, 1, 20, 20)], [80])
#batch_size - 2048, 1 step per batch
#epoch - ? sec (75 on wire), size - 247kb
#stop criterion - epoch 50 or 3 epoch no decrease of loss
#comment: significantly improves quality vs v1

#v1m1
#build_charcnn_graph((100, 50), [(160, 4, 1, 20, 20)], [20])
#batch_size - 2048, 1 step per batch
#epoch - ? sec (120 on wire), size - 527kb
#stop criterion - epoch 50 or 3 epoch no decrease of loss
#comment: significantly improves quality vs v1

#v1
#build_charcnn_graph((100, 50), [(40, 4, 1, 20, 20)], [20])
#batch_size - 2048, 1 step per batch
#epoch - ? sec (70 on wire), size - 132kb
#stop criterion - epoch 50 or 3 epoch no decrease of loss

#v0
#build_charcnn_graph((100, 50), [(20, 4, 1, 20, 20)], [20])
#batch_size - 2048, 1 step per batch
#epoch - 95 sec (65 on wire), size - 66kb
#stop criterion - epoch 50 or 3 epoch no decrease of loss
#00 - 0.308/25.1/90.5, 01 - 0.299/27.0/90.5, 02 - 0.292/31.8/90.5, 03 - 0.283/36.8/90.5, 04 - 0.272/41.2/90.5
#05 - 0.257/45.2/90.6, 06 - 0.241/48.7/90.7, 07 - 0.228/51.6/90.9, 08 - 0.221/54.1/91.0, 09 - 0.215/56.1/91.2
#10 - 0.211/57.9/91.4, 11 - 0.207/59.4/91.5, 12 - 0.204/60.7/91.6, 13 - 0.201/62.0/91.7, 14 - 0.199/63.0/91.8
#15 - 0.197/64.0/91.9, 20 - 0.193/67.3/92.3, 25 - 0.192/69.5/92.5, 30 - 0.192/70.8/92.7, 35 - 0.192/71.8/92.8
#40 - 0.195/72.5/92.9, 45 - 0.195/73.1/92.9, 49 - 0.195/73.4/93.0