In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

%matplotlib inline
pd.set_option('display.max_columns', 100)

In [2]:
def one_hot(lst, num_class=4) :
    """Turn integer class ids into one-hot rows.

    Parameters
    ----------
    lst : sequence of int
        Class ids, each used as a row index into a ``num_class`` identity matrix.
    num_class : int, default 4
        Width of every one-hot row.

    Returns
    -------
    numpy.ndarray
        One row of the identity matrix per entry of ``lst``.
    """
    identity = np.eye(num_class)
    return identity[lst]

In [3]:
def making_data(csv, num_weeks=8) :
    """Load the labelled accounts and shape them for the sequence model.

    Rows whose ``label`` is ``"empty"`` are dropped. The remaining rows are
    ordered by ``(acc_id, wk)`` and the weekly feature rows are stacked so that
    row ``i`` of the result is the ``i``-th account (in ``acc_id`` order) and
    axis 1 runs over weeks ``1 .. num_weeks``.

    Parameters
    ----------
    csv : str
        Path of the activity CSV. It must contain an ``"Unnamed: 0"`` column
        (dropped), ``acc_id``, ``wk``, ``label`` and the feature columns
        listed below.
    num_weeks : int, default 8
        Number of weekly snapshots per account (weeks are numbered from 1).

    Returns
    -------
    total_lst : numpy.ndarray, shape (n_accounts, num_weeks, n_features)
    total_label : numpy.ndarray, shape (n_accounts, 4)
        One-hot labels in the order week, month, 2month, retained.

    Raises
    ------
    KeyError
        If a label outside the four known classes is present.
    ValueError
        If the weeks do not all have the same number of rows, or the number
        of distinct ``(acc_id, label)`` pairs differs from the number of
        account sequences (features and labels would be misaligned).
    """
    feature_cols = ['cnt_clear_bam', 'cnt_clear_inzone_light', 'cnt_clear_inzone_normal',
                    'cnt_clear_inzone_skilled', 'cnt_clear_inzone_solo', 'cnt_clear_raid', 'cnt_clear_raid_light',
                    'cnt_dt', 'cnt_enter_bam', 'cnt_enter_inzone_light', 'cnt_enter_inzone_normal',
                    'cnt_enter_inzone_skilled', 'cnt_enter_inzone_solo', 'cnt_enter_raid', 'cnt_enter_raid_light',
                    'cnt_use_buffitem', 'district_chat', 'duel_cnt', 'duel_win', 'faction_chat', 'game_combat_time',
                    'gathering_cnt', 'get_money', 'guild_chat', 'item_hongmun', 'making_cnt', 'normal_chat',
                    'npc_exp', 'npc_hongmun', 'party_chat', 'partybattle_cnt', 'partybattle_win', 'play_time',
                    'quest_exp', 'quest_hongmun', 'whisper_chat', 'first_week', 'payment_amount']
    label_to_id = {"week": 0, "month": 1, "2month": 2, "retained": 3}

    raw = pd.read_csv(csv).drop("Unnamed: 0", axis=1)
    labelled = raw[raw["label"] != "empty"]

    # One (acc_id, label) pair per account, ordered like the activity rows below.
    label = (labelled[["acc_id", "label"]]
             .drop_duplicates()
             .sort_values(["acc_id", "label"])
             .reset_index(drop=True))
    label_ids = [label_to_id[name] for name in label["label"]]

    # Sorting by (acc_id, wk) makes every per-week slice come out in acc_id order,
    # so the same row index refers to the same account in each week.
    activity = labelled.sort_values(["acc_id", "wk"])
    weekly = [activity.loc[activity["wk"] == wk, feature_cols].values
              for wk in range(1, num_weeks + 1)]

    rows_per_week = sorted({len(block) for block in weekly})
    if len(rows_per_week) > 1 :
        raise ValueError("every week must have one row per account, got row counts {}".format(rows_per_week))

    total_lst = np.stack(weekly, axis=1) if weekly else np.empty((0, 0, len(feature_cols)))

    if len(label_ids) != len(total_lst) :
        raise ValueError("found {} (acc_id, label) pairs for {} account sequences".format(
            len(label_ids), len(total_lst)))

    total_label = one_hot(label_ids)

    return total_lst, total_label

 <br></br><br></br><br></br>

In [4]:
# Load the labelled accounts: total_lst holds the per-account weekly feature array,
# total_label the matching one-hot labels (see making_data above).
total_lst, total_label = making_data("OnlyExpanded.csv")

In [5]:
# Positional 80/20 hold-out: the first four fifths train, the remainder validates.
idx1 = len(total_lst)//5 *4

training_lst, valid_lst = np.array(total_lst[:idx1]), np.array(total_lst[idx1:])
training_label, valid_label = np.array(total_label[:idx1]), np.array(total_label[idx1:])

In [6]:
# Sanity check: print the shape of the feature array, then of the label array.
for arr in (total_lst, total_label) :
    print(np.array(arr).shape)

(100000, 8, 38)
(100000, 4)


<br></br><br></br><br></br>

# Model

In [7]:
class RNN() :
    """Two-layer bidirectional LSTM classifier on the TF1 graph/session API.

    Usage: construct with a ``tf.Session`` and a unique scope name, call
    ``build(...)`` once to create the graph, initialise the variables in the
    session, then use ``train`` / ``evaluate`` / ``predict``.
    """

    def __init__(self, sess, name):
        """Store the session used for every ``run`` call and the variable-scope name."""
        self.sess = sess
        self.name = name

    def build(self, batch_size, length, dim, is_embedding, emb_width, num_unit, is_fc, fc_num_unit, fc_activation, cost_function, output_dim) :
        """Create placeholders, network, loss, optimizer and metrics under ``self.name``.

        Parameters
        ----------
        batch_size : int
            Mini-batch size; stored and used by ``evaluate`` as its stride.
            The placeholders leave the batch axis open, so any number of
            rows can be fed.
        length : int
            Number of time steps per sequence.
        dim : int
            Number of features per time step.
        is_embedding : bool
            If true, a learned linear projection of every time step
            (``dim -> emb_width``) is concatenated to the raw features.
        emb_width : int
            Width of that projection (ignored when ``is_embedding`` is false).
        num_unit : int
            Hidden size of each LSTM cell.
        is_fc : bool
            If true, insert Dense -> LayerNorm -> activation before the logits.
        fc_num_unit : int
            Width of that extra dense layer.
        fc_activation : callable or None
            Activation applied after the layer norm (e.g. ``tf.nn.relu``).
        cost_function : str
            ``"f1"`` selects the soft-overlap loss below; any other value
            selects softmax cross-entropy.
        output_dim : int
            Number of classes.
        """
        with tf.variable_scope(self.name) :

            ## Setting ##
            self.batch_size = batch_size
            self.length = length
            self.dim = dim
            self.is_embedding = is_embedding
            self.emb_width = emb_width
            self.num_unit = num_unit
            self.is_fc = is_fc
            self.fc_num_unit = fc_num_unit
            self.fc_activation = fc_activation
            self.output_dim = output_dim

            # The batch axis is None so that partial batches and whole-set
            # prediction can be fed, not only exactly `batch_size` rows.
            self.X = tf.placeholder(tf.float32, [None, self.length, self.dim])
            self.Y = tf.placeholder(tf.float32, [None, self.output_dim])
            self.learning_rate =  tf.placeholder(tf.float32)
            # Fed by train/predict/evaluate; no op in the graph currently reads it.
            self.training = tf.placeholder(tf.bool)
            #############


            ## Embedding ##
            # Keep self.X as the feedable placeholder; the (optionally widened)
            # tensor consumed by the LSTMs is a separate local.
            rnn_input = self.X
            if self.is_embedding :
                W_emb = tf.Variable(tf.random_normal([self.dim, self.emb_width]))
                # Apply the same [dim, emb_width] projection to every time step:
                # flatten to rank 2 for matmul, then restore the time axis.
                flat_steps = tf.reshape(self.X, [-1, self.dim])
                embedded = tf.reshape(tf.matmul(flat_steps, W_emb), [-1, self.length, self.emb_width])
                rnn_input = tf.concat([self.X, embedded], axis=2)
            ###############


            ## RNN ##
            f_cell1 = tf.nn.rnn_cell.BasicLSTMCell(self.num_unit)
            f_cell2 = tf.nn.rnn_cell.BasicLSTMCell(self.num_unit)
            f_multi_cell = tf.nn.rnn_cell.MultiRNNCell([f_cell1, f_cell2])

            b_cell1 = tf.nn.rnn_cell.BasicLSTMCell(self.num_unit)
            b_cell2 = tf.nn.rnn_cell.BasicLSTMCell(self.num_unit)
            b_multi_cell = tf.nn.rnn_cell.MultiRNNCell([b_cell1, b_cell2])

            f_output, f_state = tf.nn.dynamic_rnn(f_multi_cell, rnn_input, dtype=tf.float32,  scope="forward")
            b_output, b_state = tf.nn.dynamic_rnn(b_multi_cell, tf.reverse(rnn_input, axis=[1]), dtype=tf.float32, scope="backward")

            # b_output is indexed in reversed time (it is not flipped back), so
            # position t pairs forward step t with backward step length-1-t.
            # Everything is flattened into one vector below, so only the weight
            # layout depends on this pairing.
            hidden = [tf.concat([f_output[:, t, :], b_output[:, t, :]], axis=1)
                      for t in range(self.length)]

            # NOTE(review): the per-step Dense(32) -> LayerNorm -> ReLU
            # projections that used to be built here were never consumed by
            # any later op and have been dropped. If classifying on those
            # projections was the intent, concatenate them instead of `hidden`.
            concat = tf.concat(hidden, axis=1)
            rnn_result = tf.layers.dense(concat, 4)
            #########


            ## Classifier ##
            if self.is_fc :
                dense = tf.layers.dense(rnn_result, self.fc_num_unit)
                norm = tf.contrib.layers.layer_norm(dense)
                # Feed the activated tensor (not the pre-activation norm) to the logits.
                activated = norm if self.fc_activation is None else self.fc_activation(norm)
                self.logit = tf.layers.dense(activated, self.output_dim)
            else :
                self.logit = tf.layers.dense(rnn_result, self.output_dim)

            self.softmax = tf.nn.softmax(self.logit)
            ################


            ## Learning ##
            if cost_function == "f1" :
                # Soft-overlap loss: -2 * sum(p*y) / sum(p*y + y).
                self.numerator = tf.reduce_sum(self.softmax*self.Y)
                self.denominator = tf.reduce_sum(self.softmax*self.Y + self.Y)
                self.cost = -2 * self.numerator / self.denominator

            else :
                self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logit, labels=self.Y))

            # The trailing "/" keeps the regex prefix match from also picking up
            # scopes such as "model10" when this model is "model1".
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=self.name + "/")
            with tf.control_dependencies(update_ops):
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)

            self.prediction = tf.equal(tf.argmax(self.logit, 1), tf.argmax(self.Y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(self.prediction, tf.float32))
            ##############


    def train(self, X_input, Y_input, learning_rate, training=True):
        """Run one optimizer step on a batch.

        Returns a tuple ``(optimizer_fetch, cost)``, i.e. the two values
        fetched from ``sess.run([self.optimizer, self.cost])``.
        """
        feed_dict = {self.X: X_input, self.Y: Y_input, self.learning_rate: learning_rate, self.training: training}
        _, cost = self.sess.run([self.optimizer, self.cost], feed_dict=feed_dict)

        return _, cost

    def predict(self, X_input, training=False):
        """Return the logits for ``X_input`` as a one-element list ``[logits]``.

        The list wrapper is what ``sess.run([self.logit])`` returns and is
        kept because callers index into it.
        """
        feed_dict = {self.X: X_input, self.training: training}
        result = self.sess.run([self.logit], feed_dict=feed_dict)

        return result

    def evaluate(self, X_input, Y_input):
        """Compute the sample-weighted mean loss and accuracy over a data set.

        The data is processed in chunks of ``self.batch_size`` rows; each
        chunk's loss/accuracy is weighted by its row count so a shorter final
        chunk is averaged correctly.

        Raises
        ------
        ValueError
            If ``X_input`` has no rows.
        """
        size = X_input.shape[0]
        if size == 0 :
            raise ValueError("evaluate() needs at least one sample")

        total_loss = 0
        total_acc = 0

        # Use the model's own batch size rather than a notebook-level global.
        for idx in range(0, size, self.batch_size):
            X_batch = X_input[idx:idx + self.batch_size]
            Y_batch = Y_input[idx:idx + self.batch_size]
            feed_dict = {self.X: X_batch, self.Y: Y_batch, self.training: False}

            step_loss, step_acc = self.sess.run([self.cost, self.accuracy], feed_dict=feed_dict)

            total_loss += step_loss * X_batch.shape[0]
            total_acc += step_acc * X_batch.shape[0]

        total_loss /= size
        total_acc /= size

        return total_loss, total_acc

In [8]:
# Start from an empty default graph before the models are built
# (presumably so the model cells below can be re-run without scope-name clashes).
tf.reset_default_graph() 

<br></br><br></br><br></br> 

In [9]:
# Learning-rate steps; the training loop switches between them at fixed epochs.
learning_rate1 = 0.02
learning_rate2 = 0.01
learning_rate3 = 0.005
learning_rate4 = 0.001

total_epoch = 80
batch_size = 500
# NOTE(review): axis 1 of total_lst is the week axis (8 in the shape printed above),
# not the per-week feature width, so the name is misleading. No visible cell reads it.
input_dim = np.array(total_lst).shape[1]

In [10]:
# Build one RNN (each with its own session) per hyper-parameter combination.
idx = 0
is_pass = False
model_lst = []

for is_embedding in [False, True] :
    for emb_width in [64,128] :
        # Skip the very first (is_embedding, emb_width) combination.
        if not is_pass :
            is_pass = True
            continue

        for num_unit in [128, 256, 512] :
            for is_fc in [False, True] :
                # Any cost name other than "f1" selects softmax cross-entropy in RNN.build.
                for cost in ["accuracy"] :
                    print(idx)
                    sess = tf.Session()
                    model = RNN(sess, "model{}".format(idx))
                    # NOTE(review): is_embedding is passed as a literal False (as before),
                    # so the three remaining outer combinations build identical architectures.
                    # The batch size comes from the config cell so that training and
                    # evaluation use the same value.
                    model.build(batch_size, 8, 38, False, emb_width, num_unit, is_fc, 128, tf.nn.relu, cost, 4)
                    sess.run(tf.global_variables_initializer())

                    model_lst.append(model)
                    idx +=1

# One independent [train_loss, train_acc, valid_loss, valid_acc] history per model.
# (`[[...]] * n` would repeat the SAME inner lists, merging every model's history.)
tl_ta_vl_va_lst = [[[], [], [], []] for _ in model_lst]
print("Ready!")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Ready!


In [11]:
print('Learning Started!')
print("")

# Epoch at which each learning rate takes effect; it stays in force until the next entry.
# NOTE(review): with total_epoch = 80 the loop ends at epoch 79, so the entry for
# epoch 80 (learning_rate4) is never applied - confirm the intended switch epoch.
lr_schedule = {0: learning_rate1, 10: learning_rate2, 20: learning_rate3, 80: learning_rate4}

# Number of full mini-batches per epoch (a trailing partial batch is not trained on).
total_batch = len(training_lst) // batch_size

# train my model
for epoch in range(total_epoch):
    avg_cost = [0]*len(model_lst)
    idx = 0

    if epoch in lr_schedule :
        learning_rate = lr_schedule[epoch]

    for i in range(total_batch):
        batch_xs, batch_ys = training_lst[idx:idx+batch_size],training_label[idx:idx+batch_size]

        # Every model sees exactly the same batch.
        for model_num, model in enumerate(model_lst) :
            _, c = model.train(batch_xs, batch_ys, learning_rate)
            avg_cost[model_num] += c / total_batch

        idx += batch_size
        if i%10 == 0 :
            print("log :", i)

    #train/valid cost & acc
    print("***epoch*** : ", epoch)
    valid_acc_lst = []
    for model_num, model in enumerate(model_lst) :
        train_cost, train_acc = model.evaluate(training_lst, training_label)
        valid_cost, valid_acc = model.evaluate(valid_lst, valid_label)

        tl_ta_vl_va_lst[model_num][0].append(train_cost)
        tl_ta_vl_va_lst[model_num][1].append(train_acc)
        tl_ta_vl_va_lst[model_num][2].append(valid_cost)
        tl_ta_vl_va_lst[model_num][3].append(valid_acc)
        valid_acc_lst.append(valid_acc)

        print("-- train {:.5f}({:.1f}%), valid{:.5f}({:.1f}%)".format(train_cost, train_acc*100, valid_cost, valid_acc*100))

    # Reuse the validation accuracy computed above instead of evaluating every model again.
    for valid_acc in valid_acc_lst :
        print('Accuracy:', valid_acc)
    print(" ")

print("")
print('Learning Finished!')

Learning Started!

log : 0
log : 10
log : 20
log : 30
log : 40
log : 50
log : 60
log : 70
log : 80
log : 90
log : 100
log : 110
log : 120
log : 130
log : 140
log : 150
***epoch*** :  0
-- train 0.83059(63.3%), valid0.82600(63.6%)
-- train 0.90061(58.2%), valid0.89339(58.3%)
-- train 0.87576(62.0%), valid0.86860(62.1%)
-- train 0.93798(56.0%), valid0.93427(56.4%)
-- train 0.96522(59.2%), valid0.95700(59.4%)
-- train 0.88608(62.1%), valid0.88272(62.3%)
-- train 0.83614(63.3%), valid0.83471(63.4%)
-- train 0.86483(61.5%), valid0.86169(62.1%)
-- train 0.92983(59.4%), valid0.92369(59.9%)
-- train 0.98180(56.8%), valid0.97637(56.9%)
-- train 0.92701(58.9%), valid0.91956(58.9%)
-- train 1.11252(52.2%), valid1.10198(52.2%)
-- train 0.80339(64.6%), valid0.80099(64.8%)
-- train 0.98844(57.8%), valid0.97931(57.8%)
-- train 0.86336(62.7%), valid0.85998(63.0%)
-- train 0.84767(63.0%), valid0.84063(63.4%)
-- train 0.97263(56.0%), valid0.97379(56.0%)
-- train 0.90883(59.6%), valid0.90092(59.6%)
Accur

Accuracy: 0.663750000298
Accuracy: 0.639650000632
Accuracy: 0.646850001812
Accuracy: 0.686149996519
Accuracy: 0.682250000536
Accuracy: 0.680799995363
Accuracy: 0.676650004089
Accuracy: 0.665649998188
Accuracy: 0.657450000942
Accuracy: 0.685299998522
Accuracy: 0.673949998617
Accuracy: 0.684699997306
Accuracy: 0.681849998236
Accuracy: 0.659700003266
Accuracy: 0.663850000501
 
log : 0
log : 10
log : 20
log : 30
log : 40
log : 50
log : 60
log : 70
log : 80
log : 90
log : 100
log : 110
log : 120
log : 130
log : 140
log : 150
***epoch*** :  6
-- train 0.68867(69.3%), valid0.71316(68.6%)
-- train 0.69927(69.1%), valid0.71289(68.7%)
-- train 0.71539(68.1%), valid0.73401(67.6%)
-- train 0.73555(67.4%), valid0.75082(67.0%)
-- train 0.77537(66.2%), valid0.78946(65.9%)
-- train 0.76009(66.2%), valid0.77447(65.9%)
-- train 0.68166(69.8%), valid0.71374(68.7%)
-- train 0.69232(69.2%), valid0.71192(68.2%)
-- train 0.70003(69.5%), valid0.73098(68.6%)
-- train 0.70412(68.7%), valid0.71997(68.4%)
-- trai

-- train 0.61158(72.3%), valid0.69739(69.5%)
-- train 0.65521(70.8%), valid0.69791(69.5%)
-- train 0.61919(72.0%), valid0.70804(69.4%)
-- train 0.65649(70.4%), valid0.69085(69.7%)
-- train 0.67625(69.5%), valid0.71608(68.5%)
-- train 0.68943(69.3%), valid0.72294(68.4%)
Accuracy: 0.69810000211
Accuracy: 0.696150003374
Accuracy: 0.697299996018
Accuracy: 0.688750000298
Accuracy: 0.671250000596
Accuracy: 0.676199999452
Accuracy: 0.694399999082
Accuracy: 0.695950001478
Accuracy: 0.697199995816
Accuracy: 0.692650000751
Accuracy: 0.68504999727
Accuracy: 0.678700000048
Accuracy: 0.695349995792
Accuracy: 0.694649998844
Accuracy: 0.694449998438
Accuracy: 0.696750000119
Accuracy: 0.684949998558
Accuracy: 0.683749999106
 
log : 0
log : 10
log : 20
log : 30
log : 40
log : 50
log : 60
log : 70
log : 80
log : 90
log : 100
log : 110
log : 120
log : 130
log : 140
log : 150
***epoch*** :  12
-- train 0.61064(72.4%), valid0.70302(69.6%)
-- train 0.64185(71.2%), valid0.69345(69.8%)
-- train 0.64622(71.0%)

-- train 0.71664(67.2%), valid0.76194(66.2%)
-- train 0.67788(68.8%), valid0.74578(67.1%)
-- train 0.59104(73.3%), valid0.75339(69.1%)
-- train 0.62370(72.1%), valid0.70271(69.6%)
-- train 0.60852(72.4%), valid0.73377(69.1%)
-- train 0.62508(71.9%), valid0.70059(69.1%)
-- train 0.65238(70.4%), valid0.71643(68.7%)
-- train 0.65250(70.7%), valid0.73684(68.2%)
-- train 0.56559(74.3%), valid0.73516(69.7%)
-- train 0.62560(71.7%), valid0.69557(69.6%)
-- train 0.59446(73.2%), valid0.73618(69.4%)
-- train 0.63436(71.6%), valid0.70532(69.7%)
-- train 0.64616(70.7%), valid0.71315(68.9%)
-- train 0.65715(70.4%), valid0.71462(68.7%)
Accuracy: 0.696950000525
Accuracy: 0.696750000119
Accuracy: 0.694799993932
Accuracy: 0.692550000548
Accuracy: 0.661900000274
Accuracy: 0.671400000155
Accuracy: 0.690999998152
Accuracy: 0.696499998868
Accuracy: 0.690650004148
Accuracy: 0.691199998558
Accuracy: 0.686999996006
Accuracy: 0.682049995661
Accuracy: 0.696900002658
Accuracy: 0.6956499964
Accuracy: 0.6939999997

KeyboardInterrupt: 

 <br></br><br></br><br></br> 

In [None]:
# Training vs. validation loss per epoch, one figure per model.
for idx in range(len(model_lst)) :
    plt.plot(tl_ta_vl_va_lst[idx][0], label='training'+str(idx))
    plt.plot(tl_ta_vl_va_lst[idx][2], label='valid'+str(idx))
    plt.title("model"+str(idx))
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.grid(True)
    plt.legend()
    plt.show()

In [None]:
# Training vs. validation accuracy per epoch, one figure per model.
for idx in range(len(model_lst)) :
    plt.plot(tl_ta_vl_va_lst[idx][1], label='training'+str(idx))
    plt.plot(tl_ta_vl_va_lst[idx][3], label='valid'+str(idx))
    plt.title("model"+str(idx))
    plt.xlabel("epoch")
    plt.ylabel("accuracy")
    plt.grid(True)
    plt.legend()
    plt.show()

In [None]:
# Training loss of every model on one figure.
for idx in range(len(model_lst)) :
    plt.plot(tl_ta_vl_va_lst[idx][0], label='training'+str(idx))

plt.title("training loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Validation loss of every model on one figure.
for idx in range(len(model_lst)) :
    # Slot 2 of the history is the validation loss, so label it as such.
    plt.plot(tl_ta_vl_va_lst[idx][2], label='valid'+str(idx))

plt.title("validation loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Training accuracy of every model on one figure.
for idx in range(len(model_lst)) :
    plt.plot(tl_ta_vl_va_lst[idx][1], label='training'+str(idx))

plt.title("training accuracy")
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Validation accuracy of every model on one figure.
for idx in range(len(model_lst)) :
    # Slot 3 of the history is the validation accuracy, so label it as such.
    plt.plot(tl_ta_vl_va_lst[idx][3], label='valid'+str(idx))

plt.title("validation accuracy")
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.grid(True)
plt.legend()
plt.show()

 <br></br><br></br><br></br>  

In [None]:
#tf.reset_default_graph() 

In [364]:
# Checkpoint every model from its own session.
# Each session only initialised the variables that existed when its model was built,
# so a single Saver over ALL graph variables cannot be run against the earlier sessions.
# Save each model's own variables instead.
tf.gfile.MakeDirs('./model')  # Saver.save does not reliably create the parent directory

for idx, model in enumerate(model_lst) :
    # The trailing "/" stops the scope prefix "model1" from also matching "model10", "model11", ...
    model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=model.name + "/")
    saver = tf.train.Saver(var_list=model_vars)
    saver.save(model.sess, './model/MLP_default_model_{}'.format(idx))

print("Saved!")

'./advanced_RNN/original_user_vector/original'

<br></br><br></br><br></br>

# Test

In [365]:
def making_data(csv) :
    """Load the unlabelled accounts (``label == "empty"``) for prediction.

    NOTE: this definition reuses the name of the training-time loader defined
    earlier in the notebook and shadows it from this cell onwards.

    Rows are ordered by ``(acc_id, wk)`` and the weekly feature rows for weeks
    1-8 are stacked along a new week axis, giving the same
    ``(accounts, weeks, features)`` layout the sequence model is built for.

    Parameters
    ----------
    csv : str
        Path of the activity CSV. It must contain an ``"Unnamed: 0"`` column
        (dropped), ``acc_id``, ``wk``, ``label`` and the feature columns
        listed below.

    Returns
    -------
    total_lst : numpy.ndarray, shape (n_accounts, 8, n_features)
        Row ``i`` belongs to the ``i``-th account in ``acc_id`` order.
    label : numpy.ndarray
        ``acc_id`` of every retained row in ``(acc_id, wk)`` order, i.e. each
        account id appears once per weekly row; callers de-duplicate it.

    Raises
    ------
    ValueError
        If the weeks do not all have the same number of rows.
    """
    feature_cols = ['cnt_clear_bam', 'cnt_clear_inzone_light', 'cnt_clear_inzone_normal',
                    'cnt_clear_inzone_skilled', 'cnt_clear_inzone_solo', 'cnt_clear_raid', 'cnt_clear_raid_light',
                    'cnt_dt', 'cnt_enter_bam', 'cnt_enter_inzone_light', 'cnt_enter_inzone_normal',
                    'cnt_enter_inzone_skilled', 'cnt_enter_inzone_solo', 'cnt_enter_raid', 'cnt_enter_raid_light',
                    'cnt_use_buffitem', 'district_chat', 'duel_cnt', 'duel_win', 'faction_chat', 'game_combat_time',
                    'gathering_cnt', 'get_money', 'guild_chat', 'item_hongmun', 'making_cnt', 'normal_chat',
                    'npc_exp', 'npc_hongmun', 'party_chat', 'partybattle_cnt', 'partybattle_win', 'play_time',
                    'quest_exp', 'quest_hongmun', 'whisper_chat', 'first_week', 'payment_amount']

    activity = pd.read_csv(csv).drop("Unnamed: 0", axis=1)
    # Sorting by (acc_id, wk) makes every per-week slice come out in acc_id order,
    # so the same row index refers to the same account in each week.
    activity = activity[activity["label"] == "empty"].sort_values(["acc_id", "wk"])

    weekly = [activity.loc[activity["wk"] == wk, feature_cols].values for wk in range(1, 9)]

    rows_per_week = sorted({len(block) for block in weekly})
    if len(rows_per_week) > 1 :
        raise ValueError("every week must have one row per account, got row counts {}".format(rows_per_week))

    label = activity["acc_id"].values
    # Stack along a new week axis -> (accounts, 8, features). Joining the weeks
    # side by side would give a flat (accounts, 8*features) matrix instead.
    total_lst = np.stack(weekly, axis=1)

    return total_lst, label

In [None]:
# Load the unlabelled ("empty") accounts with the making_data defined in the Test section;
# test_acc_id holds the acc_id of every retained row.
test_data, test_acc_id = making_data("OnlyExpanded.csv")

In [None]:
# Sanity check: shape of the test feature array.
print(test_data.shape)

<br></br><br></br><br></br> 

# Predict

In [None]:
# Predicted class id (0=week, 1=month, 2=2month, 3=retained) for every test account, per model.
result = []
for model in model_lst :
    # Accept both the flat (N, length*dim) and the (N, length, dim) layout.
    test_x = np.asarray(test_data).reshape(-1, model.length, model.dim)

    pred = []
    # Feed fixed-size chunks: the model graph may have been built for exactly
    # `batch_size` rows, so the last chunk is zero-padded and the padding discarded.
    for start in range(0, len(test_x), model.batch_size) :
        chunk = test_x[start:start + model.batch_size]
        n_real = len(chunk)
        if n_real < model.batch_size :
            pad = np.zeros((model.batch_size - n_real,) + chunk.shape[1:], dtype=chunk.dtype)
            chunk = np.concatenate([chunk, pad], axis=0)

        logits = model.predict(chunk)[0]   # predict returns a one-element list [logits]
        pred.extend(np.argmax(logits[:n_real], axis=1).tolist())

    result.append(pred)

for r_lst in result :
    print("week: {}, month: {}, 2month: {}, retained: {}".format(r_lst.count(0), r_lst.count(1), r_lst.count(2), r_lst.count(3)))

In [None]:
# Sorted unique account ids, one per row.
unique_acc_ids = sorted(set(test_acc_id))
label_df = pd.DataFrame(unique_acc_ids).rename(columns = {0 : "acc_id"})

# One column per model, one row per test account.
result_df = pd.DataFrame(result).T

In [None]:
# Put the account ids next to each model's predictions (aligned by row position) and preview.
result_df2 = pd.concat([label_df, result_df], axis=1)
result_df2.head()