In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import datetime, time
import sklearn, sklearn.metrics

import modutils

# Root folder of the raw KKBox churn files.
# (A previous layout used '../DataSets/Churn/' instead.)
data_dir = '../DataSets/kkbox/churn/raw/'

# Identifier of the user-log file to load; looks like a YYYYMM month tag -- TODO confirm.
used_log = '201701'

# Path template for the user-log CSVs; '{0}' is filled with the log identifier.
logm_fmt = data_dir + 'user_logs/uldtm_{0}.csv'

In [2]:
# Resolve the CSV path for the selected log identifier, then load it in full.
src0_path = logm_fmt.format(used_log)
src0 = pd.read_csv(src0_path)

In [3]:
# Columns fed to the model at every time step, and the label column.
features = ['used', 'ln_tot', 'ln_sec', 'pct_low', 'pct_100', 'pct_unq', 'avg_sec_n']
target = ['nxt_used']

# Keep only rows dated before 2017-01-31. The reshape below then groups every
# 30 consecutive rows into one sequence, so it assumes the frame is ordered so
# that each block of 30 rows belongs together (presumably one user, sorted by
# date) -- TODO confirm against how the CSV was produced.
in_window = src0.date < 20170131

# devX: (n_sequences, 30, n_features); devY: (n_sequences, 30)
devX = src0.loc[in_window, features].values.reshape(-1, 30, len(features))
devY = src0.loc[in_window, 'nxt_used'].values.reshape(-1, 30)

In [4]:
# Partition the development sample into three (X, Y) pairs.
# NOTE(review): pcts is assumed to be the share of each partition in the order
# train / valid / test (i.e. 20% / 20% / 60%) -- confirm against modutils.splitSample.
split_pcts = [0.2, 0.2, 0.6]
train_part, valid_part, test_part = modutils.splitSample((devX, devY), pcts=split_pcts)

trainX, trainY = train_part
validX, validY = valid_part
testX, testY = test_part

In [21]:
# ---- Model hyper-parameters -------------------------------------------------
SEQ_LENGTH = 30                # time steps per input sequence
SEQ_FEATURES = len(features)   # input values per time step
RNN_SIZE = [64, 64]            # number of units in each stacked GRU layer
HIDDEN_LAYER = 16              # width of the dense layer applied to every RNN output
LOSS_WARMUP = 5                # leading time steps that are excluded from the loss

# Factory for one GRU layer with ELU activation and `n` units.
RCell = lambda n: tf.nn.rnn_cell.GRUCell(num_units=n, activation=tf.nn.elu)

# Start from an empty graph so that re-running this cell does not pile up ops.
tf.reset_default_graph()

rnnCell = tf.nn.rnn_cell.MultiRNNCell([RCell(s) for s in RNN_SIZE], state_is_tuple=True)

with tf.name_scope(name='INPUT'):
    # Features: (batch, SEQ_LENGTH, SEQ_FEATURES); labels: (batch, SEQ_LENGTH).
    tfi_x = tf.placeholder(shape=(None, SEQ_LENGTH, SEQ_FEATURES), dtype=tf.float32)
    tfi_y = tf.placeholder(shape=(None, SEQ_LENGTH), dtype=tf.int32)
    # Not consumed by any op built in this cell; kept so existing feeds stay valid.
    tfi_l = tf.placeholder(shape=(1,), dtype=tf.int32)

    tfX = tfi_x
    tfY = tf.one_hot(tfi_y, 2, dtype=tf.float32)   # one-hot labels, depth 2
    tfActual = tf.cast(tfi_y, dtype=tf.float32)

with tf.name_scope(name='RNN'):
    # Batch-major unroll; the final state is not needed, only the per-step outputs.
    tfRNN_Hist, _ = tf.nn.dynamic_rnn(rnnCell, inputs=tfX, dtype=tf.float32, time_major=False)

    # Per-time-step classifier head: dense(ELU) -> dense(2 logits).
    tfRNN_HistHid = tf.layers.dense(tfRNN_Hist, HIDDEN_LAYER, activation=tf.nn.elu)
    tfRNN_HistRes = tf.layers.dense(tfRNN_HistHid, 2)

with tf.name_scope(name='LOSS'):
    # Cross-entropy per (sequence, step), skipping the first LOSS_WARMUP steps
    # (presumably because the RNN has seen too little history there -- TODO confirm).
    tfLoss0 = tf.nn.softmax_cross_entropy_with_logits(
        labels=tfY[:, LOSS_WARMUP:, :],
        logits=tfRNN_HistRes[:, LOSS_WARMUP:, :])
    tfLoss = tf.reduce_mean(tfLoss0)
    tfOptimizer = tf.train.AdamOptimizer(1e-3)
    tfTrain = tfOptimizer.minimize(tfLoss)

with tf.name_scope(name='OUTPUT'):
    # Probability of class index 1 at every time step.
    tfProbability = tf.nn.softmax(tfRNN_HistRes)[:,:,1]
    # For a two-class softmax, log(p1 / (1 - p1)) is exactly logit_1 - logit_0.
    # Computing it from the logits avoids the inf/NaN produced by the
    # probability form once the softmax saturates to 0 or 1 in float32.
    tfLogOdds = tfRNN_HistRes[:,:,1] - tfRNN_HistRes[:,:,0]

tfsLoss = tf.summary.scalar('CrossEntropy', tfLoss)

# Dump the graph definition for TensorBoard. Nothing else is written through
# this writer, so close it right away instead of leaking a writer on every re-run.
tffw = tf.summary.FileWriter('D:/Jupyter/Logs/00_A', tf.get_default_graph())
tffw.close()
print('Graph creation complete')

Graph creation complete


In [24]:
# Timestamped TensorBoard run directory, plus a saver that keeps the two most
# recent checkpoints.
dt_now = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
tfsSaver = tf.train.Saver(max_to_keep=2)
tffw = tf.summary.FileWriter('D:/Jupyter/Logs/19CHURN04-RNN-{0}'.format(dt_now), tf.get_default_graph())

batch_size = 500
num_steps  = 1      # optimizer steps taken on each mini-batch
num_epochs = 1000
checkpoints = 10 #every %x% epochs save the model

# Epoch time uses fixed-point formatting; the former '{1:1.3}' spec switched to
# scientific notation (e.g. '1.25e+02 sec') for durations of 100 s or more.
fmtstr = 'Epoch {0} ({1:.1f} sec): \t\tVL:{2:1.3f}\t\tGINI:{3:1.3f} ({4:1.3f})'
valid_batch = {tfi_x: validX, tfi_y: validY}
train_eval_batch = {tfi_x: trainX, tfi_y: trainY}

# Mini-batches per epoch (ceiling division); constant across epochs.
num0 = -(-trainY.shape[0] // batch_size)

with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())
    for i in range(num_epochs):
        te0 = time.perf_counter()
        batches = modutils.shuffleBatches((trainX, trainY), batchSize=batch_size)
        for counter0, (mini_x, mini_y) in enumerate(batches, start=1):
            train_batch = {tfi_x:mini_x, tfi_y: mini_y}

            # Loss on this mini-batch before and after the update(s); these are
            # extra forward passes used only for the progress line.
            l0 = tfLoss.eval(feed_dict=train_batch)
            t0 = time.perf_counter()
            for j in range(num_steps):
                tfTrain.run(feed_dict=train_batch)
            t1 = time.perf_counter()
            l1 = tfLoss.eval(feed_dict=train_batch)
            print('Train-step {3}/{4}: {0:.3f}->{1:.3f} in {2:.2f} sec'.format(l0, l1, t1-t0, counter0, num0), end='\r')

        # End-of-epoch evaluation on the full validation and training sets.
        [valid_p, lv, valid_summary] = tfs.run([tfProbability, tfLoss, tfsLoss], feed_dict=valid_batch)
        [train_p, lt] = tfs.run([tfProbability, tfLoss], feed_dict=train_eval_batch)

        # Record the validation cross-entropy so the TensorBoard run is not
        # empty; flush so the data survives an interrupted run.
        tffw.add_summary(valid_summary, global_step=i)
        tffw.flush()

        # Gini = 2 * AUC - 1, measured on the prediction for the last time step.
        gv = sklearn.metrics.roc_auc_score(validY[:,-1], valid_p[:,-1]) * 2 -1
        gt = sklearn.metrics.roc_auc_score(trainY[:,-1], train_p[:,-1]) * 2 -1
        te1 = time.perf_counter()
        if i%checkpoints == 0 and i > 0:
            p = tfsSaver.save(tfs, 'D:/Jupyter/mltest/Models-13RNN02/model-{0:02d}.ckpt'.format(i))
            print('Model saved at checkpoint: {0}'.format(p))

        # Columns: validation loss, validation Gini, (training Gini).
        print(fmtstr.format(i,te1-te0,lv,gv,gt))

Epoch 0 (1.25e+02 sec): 		VL:0.449		GINI:0.725 (0.722)
Epoch 1 (1.25e+02 sec): 		VL:0.446		GINI:0.725 (0.722)
Epoch 2 (1.25e+02 sec): 		VL:0.445		GINI:0.731 (0.728)
Epoch 3 (1.25e+02 sec): 		VL:0.443		GINI:0.736 (0.733)
Epoch 4 (1.26e+02 sec): 		VL:0.442		GINI:0.732 (0.729)
Epoch 5 (1.28e+02 sec): 		VL:0.441		GINI:0.737 (0.735)
Epoch 6 (1.27e+02 sec): 		VL:0.441		GINI:0.737 (0.735)
Epoch 7 (1.27e+02 sec): 		VL:0.441		GINI:0.738 (0.736)
Epoch 8 (1.26e+02 sec): 		VL:0.440		GINI:0.739 (0.737)
Epoch 9 (1.28e+02 sec): 		VL:0.440		GINI:0.735 (0.733)
Model saved at checkpoint: D:/Jupyter/mltest/Models-13RNN02/model-10.ckpt
Epoch 10 (1.28e+02 sec): 		VL:0.440		GINI:0.739 (0.737)
Epoch 11 (1.27e+02 sec): 		VL:0.440		GINI:0.735 (0.733)
Epoch 12 (1.27e+02 sec): 		VL:0.439		GINI:0.733 (0.731)
Epoch 13 (1.27e+02 sec): 		VL:0.439		GINI:0.739 (0.737)
Epoch 14 (1.29e+02 sec): 		VL:0.438		GINI:0.740 (0.738)
Epoch 15 (1.28e+02 sec): 		VL:0.439		GINI:0.740 (0.737)
Epoch 16 (1.29e+02 sec): 		VL:0.438		GIN

KeyboardInterrupt: 