In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import modutils
import pickle
import time, datetime
import sklearn, sklearn.metrics, sklearn.decomposition
import collections
import matplotlib.pyplot as plt
import seaborn

train_file = '../DataSets/Toxic/dev_train.csv'
test_file = '../DataSets/Toxic/dev_valid.csv'

In [2]:
src_train = pd.read_csv(train_file)
src_test = pd.read_csv(test_file)

lens = np.array([len(x) for x in src_train.comment_text])
print(np.percentile(lens, q=[50, 75, 90, 95, 99]))

chars = sorted(dict(collections.Counter([z for x in src_train.comment_text for z in x])).items(),
               key=lambda x:x[1], reverse=True)
chars_pct = np.cumsum([x[1] for x in chars]) / np.sum([x[1] for x in chars])
chars_res = list(zip([x[0] for x in chars], chars_pct))

src_train.head()

[  205.   435.   891.  1369.  3474.]


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
2,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
3,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
4,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0


In [6]:
def transform_text(text, charmap, seq_len):
    if type(text) is list:
        return np.array([transform_text(x, charmap, seq_len) for x in text])
    
    tmp = [charmap[x] for x in text if x in charmap]
    if len(tmp) >= seq_len:
        return np.array(tmp[:seq_len])
    return np.array(tmp + [0]*(seq_len - len(tmp)))

def prepare_char_transform(chars_stats, seq_length, charset_size):
    charmap = {x[0]:(i+1) for (i, x) in enumerate(chars_stats[:charset_size])}
    return lambda x: transform_text(list(x), charmap, seq_length)

def build_charcnnl1_graph(input_shape, cnn_arch, fc_arch, reg=0.1):
    tf.reset_default_graph()

    with tf.name_scope('Input'):
        tf_in_x = tf.placeholder(tf.int32, shape=(None, input_shape[0]))
        tf_in_y = tf.placeholder(tf.int32, shape=(None,))
        tf_in_cnn_keep = tf.placeholder_with_default(0.0, shape=(), name='Dropout-CNN')
        tf_in_cnn_input_keep = tf.placeholder_with_default(0.0, shape=(), name='Dropout-Input')
        tf_in_fc_keep = tf.placeholder_with_default(0.0, shape=(), name='Dropout-FC')
        tf_in_fc_final_keep = tf.placeholder_with_default(0.0, shape=(), name='Dropout-Final')
        tf_in_training = tf.placeholder_with_default(False, shape=(), name='Training')

        tf_x1hot = tf.one_hot(tf_in_x, input_shape[1])
        tf_temp = tf_x1hot
        
    tf_conv1kernel = None

    for (i, (conv_filters, conv_width, conv_stride, pool_width, pool_stride)) in enumerate(cnn_arch):
        with tf.name_scope('Conv-MaxPool-{:02d}'.format(i)):
            if i > 0:
                tf_temp = tf.layers.dropout(tf_temp, rate=tf_in_cnn_keep, training=tf_in_training)
            else:
                tf_temp = tf.layers.dropout(tf_temp, rate=tf_in_cnn_input_keep, training=tf_in_training)
            tf_temp = tf.layers.conv1d(tf_temp, conv_filters, conv_width, conv_stride, activation=tf.nn.relu)
            if i == 0:
                tf_conv1kernel = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'conv1d')[0]
            tf_temp = tf.layers.max_pooling1d(tf_temp, pool_width, pool_stride)

    with tf.name_scope('FC'):
        tf_temp = tf.contrib.layers.flatten(tf_temp)
        for sz in fc_arch:
            tf_temp = tf.layers.dropout(tf_temp, rate=tf_in_fc_keep, training=tf_in_training)
            tf_temp = tf.layers.dense(tf_temp, sz, activation=tf.nn.elu)
            
        tf_temp = tf.layers.dropout(tf_temp, rate=tf_in_fc_final_keep, training=tf_in_training)
        tf_final = tf.layers.dense(tf_temp, 2)
        tf_prob = tf.nn.softmax(tf_final)
        tf_predicted = tf.cast(tf.argmax(tf_prob, axis=1), dtype=tf.int32)

    with tf.name_scope('LOSS'):
        tf_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf_in_y, logits=tf_final))
        tf_l1loss = tf.reduce_mean(tf.abs(tf_conv1kernel))
        tf_finloss = tf_loss + tf_l1loss * reg
        tf_train = tf.train.AdamOptimizer(1e-3).minimize(tf_finloss)
        
        tf_rocauc, tf_upd_rocuac = tf.metrics.auc(labels=tf_in_y, predictions=tf_prob[:,1], num_thresholds=10000)
        tf_gini = tf_rocauc * 2 - 1
        tf_accuracy, tf_upd_accuracy = tf.metrics.accuracy(labels=tf_in_y, predictions=tf_predicted)
        tf_update_metrics = tf.group(tf_upd_rocuac, tf_upd_accuracy)
        
        tfsummary_logloss = tf.summary.scalar('Log-Loss', tf_loss)
        tfsummary_gini = tf.summary.scalar('1-Gini', 1-tf_gini)
        tfsummary_accuracy = tf.summary.scalar('1-Accuracy', 1-tf_accuracy)
        tfsummary = tf.summary.merge([tfsummary_logloss, tfsummary_gini, tfsummary_accuracy])

    return {'in':{'data':tf_in_x, 'label':tf_in_y, 'keep-cnn':tf_in_cnn_keep, 'keep-fc':tf_in_fc_keep,
                  'keep-final': tf_in_fc_final_keep, 'dropout': tf_in_training},
            'out':{'logit':tf_final, 'prob':tf_prob},
            'run':{'loss': tf_loss, 'reg-loss': tf_l1loss, 'upd_metrics':tf_update_metrics,
                   'gini':tf_gini, 'accuracy':tf_accuracy,
                   'train': tf_train, 'summary':tfsummary}}

In [13]:
transform_fun = prepare_char_transform(chars_res, 100, 60)
graph_descr = build_charcnnl1_graph((100, 61), [(80, 5, 1, 20, 20)], [], reg=2.0)
model_name = '24Toxic05CCNNL1_v0'

tffw_graph = tf.summary.FileWriter('D:/Jupyter/Logs/Graph_{}'.format(model_name), tf.get_default_graph())
model_ckpt_name = '../Models/{0}/model'.format(model_name)+'-{:02d}.ckpt'

print('Graph created')

batch_steps = 1
batch_size  = 64
calc_batch_size = 2048

train_set = (src_train.comment_text.values, src_train.toxic.values)
valid_set = (src_test.comment_text.values, src_test.toxic.values)
valid_y = valid_set[1]

stat_set_train = (src_train.comment_text.values[:len(valid_y)], src_train.toxic.values[:len(valid_y)])
stat_set_valid = valid_set

trainset2dict = lambda x: {graph_descr['in']['data']: transform_fun(x[0]), graph_descr['in']['label']: x[1],
                           graph_descr['in']['keep-final']: 0.5, graph_descr['in']['keep-fc']: 0.5,
                           graph_descr['in']['keep-cnn']: 0.5, graph_descr['in']['dropout']: True}
validset2dict = lambda x: {graph_descr['in']['data']: transform_fun(x[0]), graph_descr['in']['label']: x[1]}

stat_train_dict = validset2dict(stat_set_train)
stat_valid_dict = validset2dict(stat_set_valid)
print('Preparation complete')

Graph created
Preparation complete


In [14]:
num_epochs = 50

dt_now = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
tffw_train = tf.summary.FileWriter('D:/Jupyter/Logs/Run_{0}-{1}-T'.format(model_name, dt_now), tf.get_default_graph())
tffw_valid = tf.summary.FileWriter('D:/Jupyter/Logs/Run_{0}-{1}-V'.format(model_name, dt_now), tf.get_default_graph())
tfsSaver = tf.train.Saver(max_to_keep=5)

with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())
    tfs.run(tf.local_variables_initializer())
    
    for n in range(num_epochs):
        t0 = time.perf_counter()
        
        modutils.runEpoch(tfs, train_set, batch_size, trainset2dict, graph_descr['run']['train'],
                     op_loss=graph_descr['run']['loss'], verbatim=True)
        train_loss, _ = tfs.run([graph_descr['run']['loss'], graph_descr['run']['upd_metrics']], stat_train_dict)
        train_stats = tfs.run([graph_descr['run']['gini'], graph_descr['run']['accuracy'], graph_descr['run']['summary']], stat_train_dict)
        tffw_train.add_summary(train_stats[-1], n)
        
        valid_loss, valid_regloss, _ = tfs.run([graph_descr['run']['loss'], graph_descr['run']['reg-loss'], graph_descr['run']['upd_metrics']], stat_valid_dict)
        valid_stats = tfs.run([graph_descr['run']['gini'], graph_descr['run']['accuracy'], graph_descr['run']['summary']], stat_valid_dict)
        tffw_valid.add_summary(valid_stats[-1], n)
        
        t1 = time.perf_counter()
        
        p = tfsSaver.save(tfs, model_ckpt_name.format(n))
        print('Model saved at checkpoint: {0}'.format(p))        
        print('Epoch {0}: {1:.3f}:{2:.2f} in {3:.2f} sec, gini={4:.3f}, accur={5:.3f}'.format(n, valid_loss, valid_regloss,
                                                                                              t1-t0,valid_stats[0], valid_stats[1]))
print('\nDone')

Model saved at checkpoint: ../Models/24Toxic05CCNNL1_v0/model-00.ckpt
Epoch 0: 0.215:0.01 in 278.05 sec, gini=0.742, accur=0.926
Model saved at checkpoint: ../Models/24Toxic05CCNNL1_v0/model-01.ckpt
Epoch 1: 0.194:0.01 in 277.89 sec, gini=0.757, accur=0.931
Model saved at checkpoint: ../Models/24Toxic05CCNNL1_v0/model-02.ckpt
Epoch 2: 0.185:0.02 in 279.74 sec, gini=0.776, accur=0.933
Model saved at checkpoint: ../Models/24Toxic05CCNNL1_v0/model-03.ckpt
Epoch 3: 0.181:0.02 in 297.17 sec, gini=0.786, accur=0.935
31488/79336:	0.216 -> 0.220	0.11 sec

KeyboardInterrupt: 