In [1]:
from tensorflow.contrib.layers import batch_norm
import tensorflow as tf
import pandas as pd
import numpy as np
import argparse
import time

## Preprocessing

In [2]:
## Data location
### test_fold selects which train / test split files are read.
test_fold = 0
data_name = 'politic_new'
data_base_dir = "../data/"
### Dataset directory: <data_base_dir><data_name>/
path = "{}{}/".format(data_base_dir, data_name)

fold_suffix = str(test_fold)
train_file_name = 'Train_ratings_fold_' + fold_suffix
test_file_name = 'Test_ratings_fold_' + fold_suffix

In [3]:
## Load raw ratings
### Each file is tab-separated with three columns (user, item, voting) and no header row.
### Every -1 in the loaded frame is replaced with 0.
### NOTE(review): replace(-1, 0) applies to all columns, not only 'voting' --
### presumably ids are never -1; confirm against the data.
def _read_ratings(file_name):
    """Read one ratings file located under `path` and map -1 values to 0."""
    frame = pd.read_csv(path + file_name, sep='\t', names=['user', 'item', 'voting'])
    return frame.replace(-1, 0)

trn = _read_ratings(train_file_name)
tst = _read_ratings(test_file_name)
for label, frame in (("trn shape:", trn), ("tst shape:", tst)):
    print(label, frame.shape)

trn shape: (2399876, 3)
tst shape: (599968, 3)


In [4]:
### Preview the first five rows of the training ratings.
trn.head(5)

Unnamed: 0,user,item,voting
0,653,6012,0
1,425,5908,1
2,654,1464,1
3,626,2505,1
4,1433,5696,1


In [5]:
## Pivot to user x item tables
### Reshape the long (user, item, voting) records of trn, tst and their union into
### user-by-item tables whose cells hold the vote; pairs without a vote become NaN.
trn_pv = trn.pivot(index='user', columns='item', values='voting')
tst_pv = tst.pivot(index='user', columns='item', values='voting')
all_pv = pd.concat([trn, tst]).pivot(index='user', columns='item', values='voting')
print("shape all:", all_pv.shape, "trn:", trn_pv.shape, 'tst:', tst_pv.shape)

### The train / test pivots can lack users or items that occur only in the other split,
### so both are aligned to the row and column labels of the combined pivot.
### reindex keeps every existing vote and fills labels missing from a split with NaN,
### which avoids building and summing against a full zero matrix.
trn_pv = trn_pv.reindex(index=all_pv.index, columns=all_pv.columns)
tst_pv = tst_pv.reindex(index=all_pv.index, columns=all_pv.columns)
print("shape all:", all_pv.shape, "trn:", trn_pv.shape, 'tst:', tst_pv.shape)

shape all: (1537, 7975) trn: (1536, 7975) tst: (1536, 7975)
shape all: (1537, 7975) trn: (1537, 7975) tst: (1537, 7975)


In [6]:
## Build model inputs
### Rating matrix: the pivot values with missing cells (NaN) filled with 0.
### Mask matrix: 0 votes are turned into 1 and NaN into 0, so (assuming votes are
### only 0 or 1) a cell is 1 when a vote exists and 0 when it does not.
def _rating_and_mask(pivot_table):
    """Return (rating values, observation-mask values) for a user x item pivot."""
    ratings = pivot_table.fillna(0).values
    observed = pivot_table.replace(0.0, 1).fillna(0).values
    return ratings, observed

R, mask_R = _rating_and_mask(all_pv)
train_R, train_mask_R = _rating_and_mask(trn_pv)
test_R, test_mask_R = _rating_and_mask(tst_pv)

In [7]:
### Reading the output: the user at index 2 voted 1 on the items at index 0 and 4.
### mask_R shows that this user also voted on the items at index 2 and 3; those votes
### were -1 in the raw data (stored as 0 in R).
### The item at index 1 has no vote from that user.
for title, matrix in (("R", R), ("mask_R", mask_R)):
    print(title)
    print(matrix[:5, :5])

R
[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 1.]
 [1. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 1.]]
mask_R
[[1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1.]
 [1. 0. 1. 1. 1.]
 [1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 1.]]


In [8]:
### Collect the distinct user / item ids of each split and record the dataset sizes.
user_train_set, item_train_set = set(trn['user']), set(trn['item'])
user_test_set, item_test_set = set(tst['user']), set(tst['item'])

### R is the user x item matrix, so its two dimensions give the user and item counts.
num_users, num_items = R.shape
num_train_ratings, num_test_ratings = len(trn), len(tst)
num_total_ratings = num_train_ratings + num_test_ratings

## Prepare model

In [9]:
## Hyper-parameters and training-state variables
hidden_neuron = 50  # width of the middle (bottleneck) layer
### Layer widths from input to output; a deeper alternative is kept for reference:
# layer_structure = [num_items, 512, 128, hidden_neuron, 128, 512, num_items]
layer_structure = [num_items, 128, hidden_neuron, 128, num_items]
lambda_value = 0.001  # multiplier of the regularization term
lr = 0.001            # learning rate handed to the optimizer
global_step = tf.Variable(0, trainable=False)
### Best-so-far bookkeeping used by the early-stopping logic in the training loop.
min_RMSE, min_epoch = 99999, -99999
patience, total_patience = 0, 20

In [10]:
## Placeholders
### model_mask_corruption receives a 0/1 matrix drawn with the chosen corruption probability.
model_mask_corruption = tf.placeholder(tf.float32, [None, num_items])
input_R = tf.placeholder(tf.float32, [None, num_items], "input_R")
input_mask_R = tf.placeholder(tf.float32, [None, num_items], "input_mask_R")

### Indices of the batch's users, used to look up rows of the user embedding.
model_batch_data_idx = tf.placeholder(tf.int32)
### Number of rows actually fed through input_R for the current batch.
real_batch_size = tf.cast(tf.shape(input_R)[0], tf.int32)

In [11]:
### Multiply the input and its mask element-wise by the corruption mask, which zeroes
### the dropped entries (similar to dropout on the input, without rescaling).
corrupted_R = model_mask_corruption * input_R
corrupted_input_mask_R = model_mask_corruption * input_mask_R

In [12]:
### Create the per-user embedding matrix (one row per user, as wide as the first hidden
### layer) and look up the rows belonging to the users of the current batch.
with tf.variable_scope("user_scopes", reuse=tf.AUTO_REUSE):
    V = tf.get_variable("User_embed",
                        [num_users, layer_structure[1]],
                        tf.float32,
                        tf.contrib.layers.xavier_initializer())
    batch_V = tf.nn.embedding_lookup(V, model_batch_data_idx)

In [13]:
### Weight initialization
def make_layer_weights(n_visible, n_hidden, itr):
    """Create (or fetch) the weight matrix and bias vector of one dense layer.

    Args:
        n_visible: input width of the layer (rows of the weight matrix).
        n_hidden: output width of the layer (columns of the weight matrix and
            length of the bias vector).
        itr: layer index; appended to the variable names "pre_W<itr>" / "pre_b<itr>".

    Returns:
        Tuple (pre_W, pre_b): a Xavier-initialized [n_visible, n_hidden] float32
        variable and a zero-initialized [n_hidden] float32 variable.
    """
    # AUTO_REUSE (as in the "user_scopes" block) returns the existing variables when the
    # cell is executed again instead of raising "Variable ... already exists".
    with tf.variable_scope("SDAE_Variable", reuse=tf.AUTO_REUSE):
        pre_W = tf.get_variable(name=("pre_W"+str(itr)), shape=[n_visible, n_hidden],
                                initializer=tf.contrib.layers.xavier_initializer(),
                                dtype=tf.float32)
        pre_b = tf.get_variable(name=("pre_b"+str(itr)), shape=[n_hidden],
                                initializer=tf.zeros_initializer(), dtype=tf.float32)
    return pre_W, pre_b

### Create the W and b variables of every layer, keyed by layer index.
n_layer = len(layer_structure)
Weight, bias = {}, {}
### Consecutive entries of layer_structure are the (input width, output width) of a layer.
layer_dims = zip(layer_structure[:-1], layer_structure[1:])
for itr, (fan_in, fan_out) in enumerate(layer_dims):
    Weight[itr], bias[itr] = make_layer_weights(fan_in, fan_out, itr)

In [14]:
### Display the per-layer weight variables, keyed by layer index.
Weight

{0: <tf.Variable 'SDAE_Variable/pre_W0:0' shape=(7975, 128) dtype=float32_ref>,
 1: <tf.Variable 'SDAE_Variable/pre_W1:0' shape=(128, 50) dtype=float32_ref>,
 2: <tf.Variable 'SDAE_Variable/pre_W2:0' shape=(50, 128) dtype=float32_ref>,
 3: <tf.Variable 'SDAE_Variable/pre_W3:0' shape=(128, 7975) dtype=float32_ref>}

## Calculate encoder and CDAE output (decoder)

In [15]:
### Network options.
### batch_normalization is a string flag: the network-building loop applies batch norm
### only when it equals "True".
batch_normalization = 'FALSE'
### f_act is applied in the encoder layers, g_act in the decoder layers.
f_act = g_act = tf.nn.sigmoid
keep_prob = 1.0

### The network input is the corrupted rating matrix.
hidden_value = corrupted_R

In [16]:
### Build the autoencoder layer by layer: matmul -> (batch norm) -> activation -> (dropout).
n_transforms = len(layer_structure) - 1
last_encoder_idx = int(len(layer_structure) / 2) - 1
for itr1 in range(n_transforms):
    ### Affine part. The first layer also adds the batch's user embedding (batch_V)
    ### before the bias.
    linear_part = tf.matmul(hidden_value, Weight[itr1])
    if itr1 == 0:
        linear_part = tf.add(linear_part, batch_V)
    before_activation = tf.add(linear_part, bias[itr1])
    ### Batch normalization runs only when the string flag equals "True".
    if batch_normalization == "True":
        before_activation = batch_norm(before_activation)
    ### Layers up to and including last_encoder_idx form the encoder (f_act);
    ### the remaining layers form the decoder (g_act).
    activation = f_act if itr1 <= last_encoder_idx else g_act
    hidden_value = activation(before_activation)
    ### Dropout follows every layer except the final one.
    if itr1 < n_transforms - 1:
        hidden_value = tf.nn.dropout(hidden_value, keep_prob)
    ### Keep the output of the last encoder layer as the encoded representation.
    if itr1 == last_encoder_idx:
        Encoded_X = hidden_value

### Output of the final layer: the reconstructed ratings.
Decoder = hidden_value

## Optimization

In [17]:
## avg reconstruction error term
### Binary cross-entropy between the (corrupted) ratings and the decoder output.
### The decoder output is clipped away from exactly 0 and 1 first: a saturated sigmoid
### would otherwise give log(0) = -inf, and 0 * -inf turns the whole cost into NaN.
log_eps = 1e-7
clipped_Decoder = tf.clip_by_value(Decoder, log_eps, 1.0 - log_eps)
pre_cost1 = -1 * tf.multiply(corrupted_R, tf.log(clipped_Decoder)) - \
                tf.multiply((1-corrupted_R) , tf.log(1-clipped_Decoder))
### Multiply by corrupted_input_mask_R so only observed, non-corrupted entries contribute.
pre_cost1 = tf.multiply(pre_cost1, corrupted_input_mask_R)
### Average the summed error over the rows of the batch.
cost1 = tf.reduce_sum(pre_cost1) / tf.cast(real_batch_size, tf.float32)

In [18]:
## regularization term
### Accumulate tf.nn.l2_loss over the weights and biases of every layer, then add the
### l2_loss of the user embeddings of the current batch.
pre_cost2 = tf.constant(0, dtype=tf.float32)
for itr in range(len(Weight)):
    layer_penalty = tf.add(tf.nn.l2_loss(Weight[itr]), tf.nn.l2_loss(bias[itr]))
    pre_cost2 = tf.add(pre_cost2, layer_penalty)
pre_cost2 = pre_cost2 + tf.nn.l2_loss(batch_V)
### Scale the accumulated penalty by lambda_value * 0.5 to get the final term.
cost2 = lambda_value * 0.5 * pre_cost2

In [19]:
## Total cost: reconstruction error plus regularization term
cost = tf.add(cost1, cost2)

In [20]:
## optimizer
### Adam on the total cost, with every gradient clipped element-wise to [-5, 5].
adam = tf.train.AdamOptimizer(lr)
gvs = adam.compute_gradients(cost)
capped_gvs = []
for gradient, variable in gvs:
    capped_gvs.append((tf.clip_by_value(gradient, -5., 5.), variable))
### `optimizer` names the training op (not the Adam object); applying it also
### increments global_step.
optimizer = adam.apply_gradients(capped_gvs, global_step=global_step)

## Evaluation

In [21]:
def evaluation(test_R,test_mask_R,Estimated_R,num_test_ratings):
    """Compute RMSE, MAE, accuracy and average log-likelihood over masked entries.

    Args:
        test_R: array of true votes; entries > 0 are treated as positive and
            entries == 0 as negative in the log-likelihood.
        test_mask_R: array multiplied element-wise into every metric, so entries
            with mask 0 do not contribute.
        Estimated_R: array of predictions with the same shape as test_R
            (presumably probabilities in [0, 1] -- the caller clips them).
        num_test_ratings: denominator used for every metric.

    Returns:
        Tuple (RMSE, MAE, ACC, AVG_loglikelihood).
    """
    denominator = float(num_test_ratings)

    # Prediction error restricted to the masked entries; shared by RMSE and MAE.
    masked_error = np.multiply((test_R - Estimated_R), test_mask_R)
    RMSE = np.sqrt(np.sum(np.square(masked_error)) / denominator)
    MAE = np.sum(np.abs(masked_error)) / denominator

    # Accuracy: compare the side of 0.5 that prediction and truth fall on.
    # A prediction of exactly 0.5 has sign 0 and therefore never counts as correct.
    predicted_side = np.sign(Estimated_R - 0.5)
    actual_side = np.sign(test_R - 0.5)
    ACC = np.sum(np.multiply((predicted_side == actual_side), test_mask_R)) / denominator

    # log(0) is expected for predictions of exactly 0 or 1; silence the divide
    # warning because the resulting -inf values are replaced with 0 right below,
    # so such entries add nothing to the log-likelihood sum.
    with np.errstate(divide='ignore'):
        log_p = np.log(Estimated_R)
        log_not_p = np.log(1 - Estimated_R)
    log_p[log_p == -np.inf] = 0
    log_not_p[log_not_p == -np.inf] = 0

    # log p where the true vote is positive, log (1 - p) where it is exactly 0.
    log_like = log_p * (test_R > 0) + log_not_p * (test_R == 0)
    AVG_loglikelihood = np.sum(np.multiply(log_like, test_mask_R)) / denominator

    return RMSE,MAE,ACC,AVG_loglikelihood

## Train model

In [22]:
## tf session start
### Build the variable-initialization op, open a session and run the op once.
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

In [23]:
### Training-run settings.
epoch = 50
batch_size = 64
display_step = 1

### Per-epoch histories of costs and evaluation metrics.
train_cost_list = []
test_cost_list = []
test_rmse_list = []
test_mae_list = []
test_acc_list = []
test_avg_loglike_list = []

### Ceiling division, so that no empty trailing batch is created when num_users is an
### exact multiple of batch_size (an empty batch would divide the cost by a batch
### size of 0).
num_batch = (num_users + batch_size - 1) // batch_size

In [24]:
start_time = time.time()
corruption_level = 0.3
earlystop_switch = False
### Evaluation feeds do not change between epochs: an all-ones corruption mask
### (i.e. no corruption during validation / test) and the indices of all users.
eval_mask_np = np.ones((num_users, num_items))
eval_user_idx = np.arange(num_users)

for itr in range(epoch):
    ## Draw a fresh corruption mask and a new user order for this epoch.
    mask_corruption_np = np.random.binomial(1, 1 - corruption_level, (num_users, num_items))
    random_perm_doc_idx = np.random.permutation(num_users)
    batch_cost = 0
    batches_run = 0
    for i in range(num_batch):
        ### The last batch takes every remaining user; the others take batch_size users.
        batch_stop = None if i == num_batch - 1 else (i + 1) * batch_size
        batch_set_idx = random_perm_doc_idx[i * batch_size : batch_stop]
        if len(batch_set_idx) == 0:
            ### An empty batch would make the cost divide by a batch size of 0.
            continue

        _, Cost = sess.run([optimizer, cost],
                           feed_dict={model_mask_corruption: mask_corruption_np[batch_set_idx, :],
                                      input_R: train_R[batch_set_idx, :],
                                      input_mask_R: train_mask_R[batch_set_idx, :],
                                      model_batch_data_idx: batch_set_idx})
        batch_cost = batch_cost + Cost
        batches_run += 1

    avg_train_cost = batch_cost / max(batches_run, 1)
    train_cost_list.append(avg_train_cost)

    ### Logging is gated on the epoch counter, not on the inner batch index.
    if itr % display_step == 0:
        print ("Training //", "Epoch %d //" % (itr+1),
               "Train cost = {:.2f}".format(avg_train_cost),
               "Elapsed time : %d sec" % (time.time() - start_time))

    ## Test
    ### NOTE(review): input_R is fed test_R, so the decoder reconstructs the test votes
    ### from the test votes themselves. Confirm this is the intended protocol; feeding
    ### train_R as input and scoring against test_R would avoid exposing the targets.
    Cost, decoder = sess.run([cost, Decoder],
                        feed_dict={model_mask_corruption: eval_mask_np,
                                   input_R: test_R,
                                   input_mask_R: test_mask_R,
                                   model_batch_data_idx: eval_user_idx})
    test_cost_list.append(Cost)
    Estimated_R = decoder.clip(min=0, max=1)
    RMSE, MAE, ACC, AVG_loglikelihood = evaluation(test_R, test_mask_R,
                                               Estimated_R, num_test_ratings)
    test_rmse_list.append(RMSE)
    test_mae_list.append(MAE)
    test_acc_list.append(ACC)
    test_avg_loglike_list.append(AVG_loglikelihood)

    if itr % display_step == 0:
        print("Testing //", "Epoch %d //" % (itr+1), " Test cost = {:.2f}".format(Cost))
        print("RMSE = {:.4f}".format(RMSE), "MAE = {:.4f}".format(MAE),
              "ACC = {:.10f}".format(ACC), "AVG Loglike = {:.4f}".format(AVG_loglikelihood))
        print("=" * 100)

    ### Track the best RMSE so far; count the epochs without improvement.
    if RMSE <= min_RMSE:
        min_RMSE = RMSE
        min_epoch = itr
        patience = 0
    else:
        patience = patience + 1

    ### Early stopping: append the metrics of the best epoch as the final entry of each
    ### history list and leave the loop.
    ### NOTE(review): the itr > 100 guard means this can only fire when more than 101
    ### epochs are configured.
    if (itr > 100) and (patience >= total_patience):
        test_rmse_list.append(test_rmse_list[min_epoch])
        test_mae_list.append(test_mae_list[min_epoch])
        test_acc_list.append(test_acc_list[min_epoch])
        test_avg_loglike_list.append(test_avg_loglike_list[min_epoch])
        earlystop_switch = True
        ### Parenthesized: "%d" % itr+1 would format first and then add 1 to a str.
        print ("========== Early Stopping at Epoch %d" % (itr+1))
        break

Training // Epoch 1 // Train cost = 581.02 Elapsed time : 1 sec
Testing // Epoch 1 //  Test cost = 154.60
RMSE = 0.3527 MAE = 0.2853 ACC = 0.8030378287 AVG Loglike = -0.3953
Training // Epoch 2 // Train cost = 398.45 Elapsed time : 6 sec
Testing // Epoch 2 //  Test cost = 138.92
RMSE = 0.3451 MAE = 0.2409 ACC = 0.8001610086 AVG Loglike = -0.3549
Training // Epoch 3 // Train cost = 391.33 Elapsed time : 9 sec
Testing // Epoch 3 //  Test cost = 136.85
RMSE = 0.3442 MAE = 0.2371 ACC = 0.8033845138 AVG Loglike = -0.3496
Training // Epoch 4 // Train cost = 398.86 Elapsed time : 16 sec
Testing // Epoch 4 //  Test cost = 136.67
RMSE = 0.3447 MAE = 0.2361 ACC = 0.8011877300 AVG Loglike = -0.3491
Training // Epoch 5 // Train cost = 384.07 Elapsed time : 24 sec
Testing // Epoch 5 //  Test cost = 136.53
RMSE = 0.3447 MAE = 0.2369 ACC = 0.8001676756 AVG Loglike = -0.3487
Training // Epoch 6 // Train cost = 367.76 Elapsed time : 31 sec
Testing // Epoch 6 //  Test cost = 135.97
RMSE = 0.3439 MAE = 0

Training // Epoch 31 // Train cost = 140.23 Elapsed time : 222 sec
Testing // Epoch 31 //  Test cost = 55.20
RMSE = 0.1983 MAE = 0.0819 ACC = 0.9473221905 AVG Loglike = -0.1364
Training // Epoch 32 // Train cost = 138.29 Elapsed time : 229 sec
Testing // Epoch 32 //  Test cost = 54.96
RMSE = 0.1978 MAE = 0.0814 ACC = 0.9474821991 AVG Loglike = -0.1357
Training // Epoch 33 // Train cost = 140.95 Elapsed time : 237 sec
Testing // Epoch 33 //  Test cost = 54.79
RMSE = 0.1977 MAE = 0.0803 ACC = 0.9473788602 AVG Loglike = -0.1352
Training // Epoch 34 // Train cost = 144.73 Elapsed time : 246 sec
Testing // Epoch 34 //  Test cost = 55.00
RMSE = 0.1981 MAE = 0.0802 ACC = 0.9474888661 AVG Loglike = -0.1357
Training // Epoch 35 // Train cost = 135.55 Elapsed time : 254 sec
Testing // Epoch 35 //  Test cost = 55.00
RMSE = 0.1983 MAE = 0.0790 ACC = 0.9470155075 AVG Loglike = -0.1355
Training // Epoch 36 // Train cost = 133.73 Elapsed time : 262 sec
Testing // Epoch 36 //  Test cost = 54.66
RMSE =