In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier

%matplotlib inline
# Raise pandas' display limits to 100 columns and 100 rows so that wide or long
# frames are rendered in full; set_option accepts several (name, value) pairs at once.
pd.set_option('display.max_columns', 100, 'display.max_rows', 100)

# Model

In [None]:
class CNN() :
    """Multi-branch 2-D convolutional classifier built with the TensorFlow 1.x graph API.

    The input is reshaped to an image-like tensor and passed through eight
    parallel convolution branches whose first kernel spans 1..8 rows and the
    full input width. The flattened branch outputs are concatenated and fed to
    a (optionally two-layer) dense classifier.

    Usage: construct with a session and a variable-scope name, call ``build``
    once to create the graph, then use ``train`` / ``predict`` / ``evaluate``.
    """

    # One tuple per branch, in creation order:
    # (kernel height of the 1st conv, kernel height of the 2nd conv,
    #  divisor applied to num_filter for the final 1x1 projection).
    # The creation order determines TensorFlow's auto-generated layer names
    # (conv2d, conv2d_1, ...), so it must not be reordered if checkpoints
    # produced by an earlier version of this class have to stay loadable.
    _BRANCH_SPECS = (
        (1, 4, 4), (2, 4, 4), (3, 4, 4), (4, 4, 4),
        (5, 4, 4), (6, 3, 4), (7, 2, 4), (8, 1, 8),
    )

    def __init__(self, sess, name):
        """Store the session used to run the graph and the variable-scope name."""
        self.sess = sess
        self.name = name

    def convolution(self, input_X, kernel_size, width, num_filter, activation=True) :
        """Apply one stride-1 2-D convolution, optionally followed by layer norm + ReLU.

        Args:
            input_X: 4-D input tensor.
            kernel_size: kernel height.
            width: kernel width.
            num_filter: number of output filters.
            activation: when True, return ReLU(layer_norm(conv)); when False,
                return the raw convolution output.
        """
        conv = tf.layers.conv2d(input_X, filters=num_filter, kernel_size=[kernel_size, width], strides=1)

        if not activation :
            return conv

        norm = tf.contrib.layers.layer_norm(conv)
        return tf.nn.relu(norm)

    def _conv_branch(self, input_X, first_kernel, second_kernel, reduce_divisor) :
        """Build one conv -> conv -> 1x1 projection branch and flatten its output to 2-D."""
        branch = self.convolution(input_X, first_kernel, self.width, self.num_filter)
        branch = self.convolution(branch, second_kernel, 1, self.num_filter*2)
        branch = self.convolution(branch, 1, 1, self.num_filter//reduce_divisor, activation=False)
        _, out_height, out_width, out_depth = branch.get_shape().as_list()
        return tf.reshape(branch, [-1, out_height*out_width*out_depth])

    def build(self, batch_size, height, width, depth, is_embedding, emb_width, num_filter, is_fc, fc_num_unit, fc_activation, cost_function, output_dim) :
        """Create placeholders, the network, the loss, the optimizer and the metrics.

        Args:
            batch_size: batch size used by ``evaluate`` when iterating over data.
            height, width: the input placeholder has shape [None, height, width].
            depth: channel count used when reshaping the input to 4-D.
                NOTE(review): the placeholder is 3-D, so the reshape only keeps
                the batch dimension intact when depth == 1 -- confirm with callers.
            is_embedding: when True, a learned embedding of width ``emb_width``
                is concatenated to the input along the last axis and
                ``self.width`` grows by ``emb_width``.
            emb_width: width of the learned embedding.
            num_filter: base filter count of the convolution branches.
            is_fc: when True, insert a dense + layer-norm layer before the logits.
            fc_num_unit: units of that dense layer.
            fc_activation: stored on the instance but not used by the graph.
            cost_function: "f1" selects the soft-overlap loss below; any other
                value selects softmax cross-entropy.
            output_dim: number of classes.
        """
        with tf.variable_scope(self.name) :

            ## Setting ##
            self.batch_size = batch_size
            self.height = height
            self.width = width
            self.depth = depth
            self.is_embedding = is_embedding
            self.emb_width = emb_width
            self.num_filter = num_filter
            self.is_fc = is_fc
            self.fc_num_unit = fc_num_unit
            self.fc_activation = fc_activation
            self.output_dim = output_dim

            self.X = tf.placeholder(tf.float32, [None, self.height, self.width])
            self.Y = tf.placeholder(tf.float32, [None, self.output_dim])
            self.learning_rate =  tf.placeholder(tf.float32)
            # Fed by train/predict/evaluate, but no op in this graph reads it.
            self.training = tf.placeholder(tf.bool)
            #############


            ## Embedding ##
            if self.is_embedding :
                emb_dense = tf.layers.dense(self.X, self.emb_width*2)
                emb_norm = tf.contrib.layers.layer_norm(emb_dense)
                emb_relu = tf.nn.relu(emb_norm)
                emb_out = tf.layers.dense(emb_relu, self.emb_width)
                emb_concat = tf.concat([self.X, emb_out], axis=2)
                reshaped_X = tf.reshape(emb_concat, [-1, self.height, self.emb_width+self.width, self.depth])
                # From here on the convolutions must span the widened input.
                self.width = self.emb_width+self.width
            else :
                reshaped_X = tf.reshape(self.X, [-1, self.height, self.width, self.depth])
            ###############


            ## Convolution ##
            # conv2d is used with its default padding, so every kernel shrinks the
            # height; the last branch's first kernel alone spans 8 rows.
            branches = [
                self._conv_branch(reshaped_X, first_kernel, second_kernel, reduce_divisor)
                for first_kernel, second_kernel, reduce_divisor in self._BRANCH_SPECS
            ]
            #################


            ## Classifier ##
            conv_result = tf.concat(branches, axis=1)

            if is_fc :
                dense = tf.layers.dense(conv_result, self.fc_num_unit)
                norm = tf.contrib.layers.layer_norm(dense)
                # NOTE(review): the original code also built tf.nn.relu(norm) here but
                # never used it -- the logits have always been computed from `norm`.
                # The dead op is dropped and the forward pass kept unchanged so that
                # already-trained weights give the same outputs; switch the input
                # below to a ReLU only together with retraining.
                self.logit = tf.layers.dense(norm, self.output_dim)
            else :
                self.logit = tf.layers.dense(conv_result,  self.output_dim)

            self.softmax = tf.nn.softmax(self.logit)
            ################


            ## Learning ##
            if cost_function == "f1" :
                # Soft overlap between predicted probabilities and one-hot targets.
                # NOTE(review): the denominator is sum(p*y + y), not the sum(p) + sum(y)
                # of the usual soft-F1/Dice form -- confirm this is intended.
                self.numerator = tf.reduce_sum(self.softmax*self.Y)
                self.denominator = tf.reduce_sum(self.softmax*self.Y + self.Y)
                self.cost = -2 * self.numerator / self.denominator

            else :
                self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logit, labels=self.Y))

            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope=self.name)
            with tf.control_dependencies(update_ops):
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)

            self.prediction = tf.equal(tf.argmax(self.logit, 1), tf.argmax(self.Y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(self.prediction, tf.float32))
            ##############


    def train(self, X_input, Y_input, learning_rate, training=True):
        """Run one optimizer step on a batch.

        Returns:
            A 2-tuple ``(optimizer_result, cost)`` exactly as returned by
            ``sess.run([self.optimizer, self.cost])``.
        """
        feed_dict = {self.X: X_input, self.Y: Y_input, self.learning_rate: learning_rate, self.training: training}
        _, cost = self.sess.run([self.optimizer, self.cost], feed_dict=feed_dict)

        return _, cost

    def predict(self, X_input, training=False):
        """Return the logits for ``X_input`` as a one-element list (``[logits]``)."""
        feed_dict = {self.X: X_input, self.training: training}
        result = self.sess.run([self.logit], feed_dict=feed_dict)

        return result

    def evaluate(self, X_input, Y_input):
        """Compute the sample-weighted mean cost and accuracy over a data set.

        The data is processed in chunks of ``self.batch_size`` rows and every
        chunk's cost/accuracy is weighted by its row count, so a shorter last
        chunk is accounted for correctly.

        Returns:
            ``(mean_cost, mean_accuracy)``.
        """
        size = X_input.shape[0]

        total_loss = 0
        total_acc = 0

        for idx in range(0, size, self.batch_size):
            # Use the instance's batch size: the original sliced with a bare
            # `batch_size`, which is not defined in this method.
            X_batch = X_input[idx:idx + self.batch_size]
            Y_batch = Y_input[idx:idx + self.batch_size]
            feed_dict = {self.X: X_batch, self.Y: Y_batch, self.training: False}

            step_loss, step_acc = self.sess.run([self.cost, self.accuracy], feed_dict=feed_dict)

            total_loss += step_loss * X_batch.shape[0]
            total_acc += step_acc * X_batch.shape[0]

        total_loss /= size
        total_acc /= size

        return total_loss, total_acc

# Ensemble

In [None]:
def making_valid(cv, fold=5) :
    """Turn a long per-week activity frame into a 3-D array of shape (rows, 8, features).

    The "label" column is dropped, then for each week 1..8 the rows with
    ``wk == week`` are selected, "acc_id" and "wk" are removed, and the eight
    per-week matrices are stacked along a new axis 1.

    Rows are paired across weeks purely by position, so every week must
    contribute the same number of rows.
    NOTE(review): presumably the frame is ordered identically (e.g. by acc_id)
    within each week -- confirm against the loader.

    Args:
        cv: DataFrame with "label", "acc_id", "wk" and the feature columns.
        fold: unused; kept for interface compatibility.
    """
    features = cv.drop("label", axis=1)

    weekly_values = [
        features[features["wk"] == week].drop(["acc_id", "wk"], axis=1).values
        for week in range(1, 9)
    ]

    # Feature count is read from the first row of week 1.
    num_values = len(weekly_values[0][0])
    stacked = np.concatenate(
        [block.reshape([-1, 1, num_values]) for block in weekly_values], axis=1
    )

    return stacked

In [None]:
def making_ensemble(cv, pred, pred_id, fold=5, threshold=0.7) :
    """Select the accounts scored below ``threshold`` and build their model input.

    The predictions in ``pred`` are paired, by position, with the sorted unique
    acc_ids found in ``pred_id``; the pairing is left-merged onto ``cv`` and only
    rows whose score is strictly below ``threshold`` are kept (accounts without
    a score get NaN and are therefore dropped as well). The surviving rows are
    then stacked week by week exactly like ``making_valid`` does.

    NOTE(review): pairing by position assumes ``pred`` is ordered by ascending
    acc_id -- confirm against how the predictions are produced.

    Args:
        cv: DataFrame with "label", "acc_id", "wk" and the feature columns.
        pred: one-column DataFrame named "label" holding one score per account.
        pred_id: DataFrame with an "acc_id" column (duplicates allowed).
        fold: unused; kept for interface compatibility.
        threshold: exclusive upper bound on the score of the accounts to keep.

    Returns:
        ``(activity, acc_id)`` where ``activity`` has shape (rows, 8, features)
        and ``acc_id`` is the "acc_id" column of every kept row of ``cv``.
        When no account passes the filter, ``activity`` has shape
        (0, 8, features) instead of raising IndexError.
    """
    features = cv.drop("label", axis=1)

    unique_ids = pd.DataFrame(sorted(set(pred_id.acc_id.tolist())), columns = ["acc_id"])
    label = pd.concat([unique_ids, pred], axis=1)

    features = pd.merge(features, label, how='left', on='acc_id')
    features = features[features["label"] < threshold].drop("label", axis=1)
    acc_id = features[["acc_id"]]

    weekly_values = [
        features[features["wk"] == week].drop(["acc_id", "wk"], axis=1).values
        for week in range(1, 9)
    ]

    # Insert the week axis by indexing rather than by reading the first row's
    # length, so an empty selection still yields a well-formed (0, 8, n) array.
    activity = np.concatenate([block[:, np.newaxis, :] for block in weekly_values], axis=1)

    return activity, acc_id

In [None]:
def making_df(pred, pred_id) :
    """Attach predictions to account ids.

    Builds a frame of the sorted unique ``acc_id`` values from ``pred_id`` and
    concatenates ``pred`` next to it column-wise; rows are matched by index, so
    the i-th prediction is assigned to the i-th smallest acc_id.
    """
    unique_ids = sorted(set(pred_id.acc_id.tolist()))
    id_frame = pd.DataFrame(unique_ids, columns = ["acc_id"])

    return pd.concat([id_frame, pred], axis=1)

In [None]:
# Load the cross-validation data. Later cells index the result by fold number
# (cv_data[valid_num]) and expect each fold to be a DataFrame with "acc_id", "wk"
# and "label" columns.
# NOTE(review): get_data_cv is neither defined nor imported in the visible cells --
# make sure it is defined above this cell so Restart & Run All works.
cv_data = get_data_cv("OnlyExpanded.csv")

In [None]:
result_df_lst = []


def _run_softmax(model, inputs) :
    """Run ``model``'s softmax op on ``inputs`` with the training flag off."""
    return model.sess.run(model.softmax, feed_dict = {model.X : inputs, model.training : False})


def _positive_scores(model, inputs) :
    """Return column 1 of the model's softmax output as a Python list."""
    return list(_run_softmax(model, inputs)[:,1])


def _label_frame(scores) :
    """Wrap a list of scores in a one-column DataFrame named "label"."""
    return pd.DataFrame(scores).rename(columns = {0 : "label"})


def _tree_column(result_df, acc_id_df, column_name) :
    """Index predictions by acc_id and rename their "label" column to ``column_name``."""
    return making_df(result_df, acc_id_df).set_index("acc_id").rename(columns={"label" : column_name})


# NOTE(review): model_lst is not defined in the visible cells; it must hold, per
# cv_num, a sequence of 17 built models exposing .sess, .softmax, .X and .training.
# Only the first CV split is evaluated (range(1)).
for cv_num in range(1) :
    print("")

    # Validation folds are visited in reverse order: cv_num 0 -> fold 4, ..., 4 -> fold 0.
    valid_num = 4 - cv_num
    models = model_lst[cv_num]
    fold_df = cv_data[valid_num]


    # Initial branch: models 0-3 score every account in the validation fold.
    valid_cv_lst = making_valid(fold_df)
    valid_cv_acc_id = fold_df[["acc_id"]]

    result0 = _positive_scores(models[0], valid_cv_lst)
    result1 = _positive_scores(models[1], valid_cv_lst)
    result2 = _positive_scores(models[2], valid_cv_lst)
    result3 = _positive_scores(models[3], valid_cv_lst)

    result_df0 = _label_frame(result0)
    result_df1 = _label_frame(result1)
    result_df2 = _label_frame(result2)
    result_df3 = _label_frame(result3)


    # Retain branch: models 4-6 run only on the accounts that model 3 scored
    # below making_ensemble's cut-off.
    valid_cv_lst3, valid_cv_acc_id3 = making_ensemble(fold_df, result_df3, valid_cv_acc_id)

    result4 = _positive_scores(models[4], valid_cv_lst3)
    result5 = _positive_scores(models[5], valid_cv_lst3)
    result6 = _positive_scores(models[6], valid_cv_lst3)

    result_df4 = _label_frame(result4)
    result_df5 = _label_frame(result5)
    result_df6 = _label_frame(result6)

    # Third level of the retain branch: each model sees the accounts that the
    # paired second-level model (6 -> 7, 4 -> 8, 5 -> 9) scored below the cut-off.
    valid_cv_lst6, valid_cv_acc_id6 = making_ensemble(fold_df, result_df6, valid_cv_acc_id3)
    valid_cv_lst4, valid_cv_acc_id4 = making_ensemble(fold_df, result_df4, valid_cv_acc_id3)
    valid_cv_lst5, valid_cv_acc_id5 = making_ensemble(fold_df, result_df5, valid_cv_acc_id3)

    result7 = _positive_scores(models[7], valid_cv_lst6)
    result8 = _positive_scores(models[8], valid_cv_lst4)
    result9 = _positive_scores(models[9], valid_cv_lst5)

    result_df7 = _label_frame(result7)
    result_df8 = _label_frame(result8)
    result_df9 = _label_frame(result9)


    # Week branch: models 10-12 run only on the accounts that model 0 scored
    # below the cut-off.
    valid_cv_lst0, valid_cv_acc_id0 = making_ensemble(fold_df, result_df0, valid_cv_acc_id)

    result10 = _positive_scores(models[10], valid_cv_lst0)
    result11 = _positive_scores(models[11], valid_cv_lst0)
    result12 = _positive_scores(models[12], valid_cv_lst0)

    result_df10 = _label_frame(result10)
    result_df11 = _label_frame(result11)
    result_df12 = _label_frame(result12)

    # Third level of the week branch (12 -> 13, 10 -> 14, 11 -> 15).
    valid_cv_lst12, valid_cv_acc_id12 = making_ensemble(fold_df, result_df12, valid_cv_acc_id0)
    valid_cv_lst10, valid_cv_acc_id10 = making_ensemble(fold_df, result_df10, valid_cv_acc_id0)
    valid_cv_lst11, valid_cv_acc_id11 = making_ensemble(fold_df, result_df11, valid_cv_acc_id0)

    result13 = _positive_scores(models[13], valid_cv_lst12)
    result14 = _positive_scores(models[14], valid_cv_lst10)
    result15 = _positive_scores(models[15], valid_cv_lst11)

    result_df13 = _label_frame(result13)
    result_df14 = _label_frame(result14)
    result_df15 = _label_frame(result15)


    # Predict all labels at once: model 16 contributes every softmax column.
    result16 = list(_run_softmax(models[16], valid_cv_lst))

    result_df16 = pd.DataFrame(result16).rename(columns = {0 : "total_week", 1: "total_month", 2:"total_2month", 3:"total_retained"})

    # Join all predictions on acc_id. Frames built from filtered account sets
    # cover fewer accounts, so their columns are NaN for the accounts they skip.
    result_stack_df = pd.concat([_tree_column(result_df0, valid_cv_acc_id, "week-tree"),
                                 _tree_column(result_df1, valid_cv_acc_id, "month-tree"),
                                 _tree_column(result_df2, valid_cv_acc_id, "2month-tree"),
                                 _tree_column(result_df3, valid_cv_acc_id, "retained-tree"),
                                 _tree_column(result_df4, valid_cv_acc_id3, "retained-week-tree"),
                                 _tree_column(result_df5, valid_cv_acc_id3, "retained-month-tree"),
                                 _tree_column(result_df6, valid_cv_acc_id3, "retained-2month-tree"),
                                 _tree_column(result_df7, valid_cv_acc_id6, "retained-2month-week_month-tree"),
                                 _tree_column(result_df8, valid_cv_acc_id4, "retained-week-month_2month-tree"),
                                 _tree_column(result_df9, valid_cv_acc_id5, "retained-month-week_2month-tree"),
                                 _tree_column(result_df10, valid_cv_acc_id0, "week-month-tree"),
                                 _tree_column(result_df11, valid_cv_acc_id0, "week-2month-tree"),
                                 _tree_column(result_df12, valid_cv_acc_id0, "week-retained-tree"),
                                 _tree_column(result_df13, valid_cv_acc_id12, "week-retained-month_2month-tree"),
                                 _tree_column(result_df14, valid_cv_acc_id10, "week-month-2month_retained-tree"),
                                 _tree_column(result_df15, valid_cv_acc_id11, "week-2month-month_retained-tree"),
                                 # result_df16 has no "label" column (its columns are the
                                 # total_* names above), so the former rename to "total"
                                 # never matched anything and is omitted.
                                 making_df(result_df16, valid_cv_acc_id).set_index("acc_id")], axis=1)

    result_df_lst.append(result_stack_df)

In [None]:
# Write the stacked validation predictions to disk.
# NOTE(review): result_stack_df_label is not defined in the visible cells (the loop
# above produces result_stack_df / result_df_lst) -- confirm the cell that builds it
# runs before this one, otherwise this raises NameError on a fresh kernel.
result_stack_df_label.to_csv("final_result/valid_250_epoch.csv")