In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import modutils
import pickle
import time, datetime
import sklearn, sklearn.metrics, sklearn.decomposition
import collections
import matplotlib.pyplot as plt

# Input CSV files, given relative to the notebook's working directory.
# NOTE: despite its name, `test_file` points at the validation split (dev_valid.csv).
train_file = '../DataSets/Toxic/dev_train.csv'
test_file = '../DataSets/Toxic/dev_valid.csv'

In [2]:
# Read both splits (training first, then validation) into DataFrames.
src_train, src_test = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

In [3]:
# Length (in characters) of every training comment, then a few percentiles of that distribution.
lens = np.array(list(map(len, src_train.comment_text)))
np.percentile(lens, [50, 75, 90, 95, 99])

array([  207.,   437.,   896.,  1376.,  3508.])

In [4]:
# (character, count) pairs over all training comments, most frequent first.
chars = collections.Counter(
    symbol for comment in src_train.comment_text for symbol in comment
).most_common()

# Cumulative fraction of all characters covered by the k most frequent symbols.
char_counts = np.array([count for _, count in chars])
chars_pct = char_counts.cumsum() / char_counts.sum()

# (character, cumulative fraction) pairs, in the same order as `chars`.
chars_res = list(zip((symbol for symbol, _ in chars), chars_pct))

In [5]:
def transform_text(text, charmap, seq_len):
    """Encode text as a fixed-length array of integer character codes.

    Parameters
    ----------
    text : str, or list / tuple / numpy.ndarray of str
        A single string, or a batch of strings. A batch is encoded element by
        element (recursively) and the results are stacked into one array.
    charmap : dict
        Maps a character to its integer code. Characters that are not keys of
        the map are dropped before truncation / padding.
    seq_len : int
        Number of codes per string. Longer encodings are cut off on the right,
        shorter ones are right-padded with 0.

    Returns
    -------
    numpy.ndarray
        Shape ``(seq_len,)`` for a single string and ``(len(text), seq_len)``
        for a batch (including an empty batch).
    """
    # isinstance instead of `type(text) is list`: tuples, list subclasses and
    # numpy arrays of strings are batches too. Previously they fell through to
    # the single-string branch, where their multi-character elements were
    # silently dropped as "unknown characters".
    if isinstance(text, (list, tuple, np.ndarray)):
        if len(text) == 0:
            # Keep an empty batch 2-D and integer-typed instead of a float (0,) array.
            return np.zeros((0, seq_len), dtype=int)
        return np.array([transform_text(item, charmap, seq_len) for item in text])

    codes = [charmap[ch] for ch in text if ch in charmap][:seq_len]
    # The pad count is zero when the encoding already fills seq_len.
    return np.array(codes + [0] * (seq_len - len(codes)))

In [6]:
def prepare_char_transform(chars_stats, seq_length, charset_size):
    """Create an encoder that maps a batch of strings to integer code arrays.

    The first ``charset_size`` entries of ``chars_stats`` (sequences whose item 0
    is the character) are assigned the codes 1, 2, ... in order, so code 0 is
    never given to a character.

    Returns a one-argument callable. Its argument is converted with ``list()``
    and handed to ``transform_text`` together with the character map and
    ``seq_length``.
    NOTE(review): because of the ``list()`` call, a bare string would be split
    into single characters and encoded one character per row -- pass an
    iterable of strings.
    """
    charmap = {}
    for code, entry in enumerate(chars_stats[:charset_size], start=1):
        charmap[entry[0]] = code

    def encode(x):
        return transform_text(list(x), charmap, seq_length)

    return encode

def build_charrnn_graph(input_shape, rnn_arch, fc_arch, learning_rate=1e-3):
    """Build the character-level GRU graph (TensorFlow 1.x graph mode).

    The graph has two heads on top of a stacked GRU:
    a 2-class classifier fed by the final state of the last GRU layer, and a
    next-character predictor fed by the per-step GRU outputs.

    Side effect: ``tf.reset_default_graph()`` is called first, so everything is
    built into a fresh default graph.

    Parameters
    ----------
    input_shape : sequence
        ``input_shape[0]`` is the number of character codes per example and
        ``input_shape[1]`` the one-hot depth (number of distinct codes).
    rnn_arch : iterable of int
        Number of units of each stacked GRU layer.
    fc_arch : iterable of int
        Sizes of the ELU dense layers placed between the final GRU state and
        the 2-unit output layer.
    learning_rate : float, optional
        Learning rate of both Adam optimizers. Defaults to 1e-3, the value
        that used to be hard-coded.

    Returns
    -------
    dict
        ``'in'``  -- placeholders ``'data'`` (int32, ``(batch, input_shape[0])``)
        and ``'label'`` (int32, ``(batch,)``).
        ``'out'`` -- ``'logit'`` and ``'prob'`` of the classifier head.
        ``'run'`` -- ``'loss'`` / ``'train'`` (classifier), ``'seq-loss'`` /
        ``'seq-train'`` (next-character head), ``'upd_metrics'`` (grouped
        update op of the streaming metrics), ``'gini'``, ``'accuracy'``,
        ``'seq-accuracy'`` and the merged ``'summary'`` (Log-Loss, 1-Gini and
        1-Accuracy only).

    Notes
    -----
    * The ``tf.metrics.*`` ops are streaming metrics backed by local
      variables: run ``tf.local_variables_initializer()`` before the first
      update, and again whenever the accumulated totals should be reset.
    * No ``sequence_length`` is passed to ``dynamic_rnn``, so the final state
      is taken after all ``input_shape[0]`` steps of every example.
    """
    def make_cell(num_units):
        return tf.nn.rnn_cell.GRUCell(num_units=num_units, activation=tf.nn.elu)

    seq_len, vocab_size = input_shape[0], input_shape[1]

    tf.reset_default_graph()

    with tf.name_scope('Input'):
        tf_in_x = tf.placeholder(tf.int32, shape=(None, seq_len))
        tf_in_y = tf.placeholder(tf.int32, shape=(None,))

        tf_x1hot = tf.one_hot(tf_in_x, vocab_size)

    with tf.name_scope('RNN'):
        rnnCell = tf.nn.rnn_cell.MultiRNNCell([make_cell(s) for s in rnn_arch], state_is_tuple=True)

        tf_AllStates, tf_FinStates = tf.nn.dynamic_rnn(rnnCell, inputs=tf_x1hot, dtype=tf.float32, time_major=False)
        tf_FinState = tf_FinStates[-1]  # state of the last (top) layer of the stack

    with tf.name_scope('SEQ-FC'):
        # Per-step logits over the character codes.
        tf_NextForecast = tf.layers.dense(tf_AllStates, vocab_size)
        tf_NextProb = tf.nn.softmax(tf_NextForecast)
        tf_NextPredicted = tf.cast(tf.argmax(tf_NextProb, axis=2), dtype=tf.int32)

    with tf.name_scope('FC'):
        tf_temp = tf_FinState
        for sz in fc_arch:
            tf_temp = tf.layers.dense(tf_temp, sz, activation=tf.nn.elu)
        tf_final = tf.layers.dense(tf_temp, 2)
        tf_prob = tf.nn.softmax(tf_final)
        tf_predicted = tf.cast(tf.argmax(tf_prob, axis=1), dtype=tf.int32)

    with tf.name_scope('LOSS'):
        tf_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf_in_y, logits=tf_final))
        tf_train = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)

        # The output at step t is scored against the input code at step t+1,
        # hence the one-step shift between labels and logits.
        tf_loss_seq = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf_in_x[:,1:], logits=tf_NextForecast[:,:-1,:]))
        tf_train_seq = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss_seq)

        tf_rocauc, tf_upd_rocauc = tf.metrics.auc(labels=tf_in_y, predictions=tf_prob[:,1], num_thresholds=10000)
        tf_gini = tf_rocauc * 2 - 1
        tf_accuracy, tf_upd_accuracy = tf.metrics.accuracy(labels=tf_in_y, predictions=tf_predicted)
        tf_seq_accuracy, tf_upd_seq_accuracy = tf.metrics.accuracy(labels=tf_in_x[:,1:], predictions=tf_NextPredicted[:,:-1])
        tf_update_metrics = tf.group(tf_upd_rocauc, tf_upd_accuracy, tf_upd_seq_accuracy)

        tfsummary_logloss = tf.summary.scalar('Log-Loss', tf_loss)
        tfsummary_gini = tf.summary.scalar('1-Gini', 1-tf_gini)
        tfsummary_accuracy = tf.summary.scalar('1-Accuracy', 1-tf_accuracy)
        tfsummary = tf.summary.merge([tfsummary_logloss, tfsummary_gini, tfsummary_accuracy])

    return {'in':{'data':tf_in_x, 'label':tf_in_y},
            'out':{'logit':tf_final, 'prob':tf_prob},
            'run':{'loss': tf_loss, 'seq-loss':tf_loss_seq, 'upd_metrics':tf_update_metrics,
                   'gini':tf_gini, 'accuracy':tf_accuracy, 'seq-accuracy':tf_seq_accuracy,
                   'train': tf_train, 'seq-train':tf_train_seq, 'summary':tfsummary}}

In [11]:
# Model hyper-parameters. Character codes run from 1 to CHARSET_SIZE and code 0
# is used for padding, so the one-hot depth must be CHARSET_SIZE + 1; it is
# derived below instead of being typed in by hand.
SEQ_LENGTH = 200
CHARSET_SIZE = 50

transform_fun = prepare_char_transform(chars_res, SEQ_LENGTH, CHARSET_SIZE)
graph_descr = build_charrnn_graph((SEQ_LENGTH, CHARSET_SIZE + 1), [40], [20])
model_name = '24Toxic04CRNN_v1'

# NOTE(review): absolute, machine-specific log directory.
tffw_graph = tf.summary.FileWriter('D:/Jupyter/Logs/Graph_{}'.format(model_name), tf.get_default_graph())
# Checkpoint path template; the remaining placeholder is filled with the epoch number.
model_ckpt_name = '../Models/{0}/model'.format(model_name)+'-{:02d}.ckpt'

print('Graph created')

batch_steps = 1
batch_size = 128
calc_batch_size = 2048

# Each set is a (texts, labels) pair of numpy arrays.
train_set = (src_train.comment_text.values, src_train.toxic.values)
test_set = (src_test.comment_text.values, src_test.toxic.values)
test_y = test_set[1]
stat_set = test_set


def set2dict(x):
    """Turn a (texts, labels) pair into a feed dict for the graph's input placeholders."""
    return {graph_descr['in']['data']: transform_fun(x[0]), graph_descr['in']['label']: x[1]}


# The evaluation feed is encoded once up front and reused after every epoch.
stat_dict = set2dict(stat_set)
print('Preparation complete')

Graph created
Preparation complete


In [13]:
num_epochs = 50

dt_now = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
tffw_valid = tf.summary.FileWriter('D:/Jupyter/Logs/Run_{0}-{1}-V'.format(model_name, dt_now), tf.get_default_graph())
tfsSaver = tf.train.Saver(max_to_keep=5)
# Built once, outside the loop, so no new op is added to the graph per epoch.
tf_reset_metrics = tf.local_variables_initializer()

try:
    with tf.Session() as tfs:
        tfs.run(tf.global_variables_initializer())
        tfs.run(tf_reset_metrics)

        for n in range(num_epochs):
            t0 = time.perf_counter()

            modutils.runEpoch(tfs, train_set, batch_size, set2dict, graph_descr['run']['train'],
                              op_loss=graph_descr['run']['loss'], verbatim=True)

            # The tf.metrics ops accumulate in local variables. Without this
            # reset the logged Gini / accuracy would be running aggregates over
            # all epochs so far instead of the current model's metrics.
            tfs.run(tf_reset_metrics)
            tfs.run(graph_descr['run']['upd_metrics'], stat_dict)
            (loss, summary) = tfs.run([graph_descr['run']['loss'], graph_descr['run']['summary']], stat_dict)
            tffw_valid.add_summary(summary, n)
            t1 = time.perf_counter()

            p = tfsSaver.save(tfs, model_ckpt_name.format(n))
            print('Model saved at checkpoint: {0}'.format(p))
            print('Epoch {0}: {1:.3f} in {2:.2f} sec'.format(n, loss, t1-t0))
finally:
    # Flush and release the event file even when training is interrupted.
    tffw_valid.close()
print('\nDone')

Model saved at checkpoint: ../Models/24Toxic04CRNN_v1/model-00.ckpt
Epoch 0: 0.243 in 163.46 sec
Model saved at checkpoint: ../Models/24Toxic04CRNN_v1/model-01.ckpt
Epoch 1: 0.214 in 162.82 sec
Model saved at checkpoint: ../Models/24Toxic04CRNN_v1/model-02.ckpt
Epoch 2: 0.205 in 168.62 sec
Model saved at checkpoint: ../Models/24Toxic04CRNN_v1/model-03.ckpt
Epoch 3: 0.203 in 158.80 sec
Model saved at checkpoint: ../Models/24Toxic04CRNN_v1/model-04.ckpt
Epoch 4: 0.204 in 153.92 sec
Model saved at checkpoint: ../Models/24Toxic04CRNN_v1/model-05.ckpt
Epoch 5: 0.191 in 159.60 sec
Model saved at checkpoint: ../Models/24Toxic04CRNN_v1/model-06.ckpt
Epoch 6: 0.186 in 153.86 sec
Model saved at checkpoint: ../Models/24Toxic04CRNN_v1/model-07.ckpt
Epoch 7: 0.184 in 158.61 sec
Model saved at checkpoint: ../Models/24Toxic04CRNN_v1/model-08.ckpt
Epoch 8: 0.180 in 160.82 sec
Model saved at checkpoint: ../Models/24Toxic04CRNN_v1/model-09.ckpt
Epoch 9: 0.181 in 162.13 sec
Model saved at checkpoint: ../

KeyboardInterrupt: 