In [1]:
import tensorflow as tf
import numpy as np
import os
import tensorflow as tf
from sklearn import preprocessing #标准化数据模块
########## Hyperparameter ##########
BATCH_SIZE = 5
EPOCH_BOUND = 1000
EARLY_STOP_CHECK_EPOCH = 100
TAKE_CROSS_VALIDATION = True
LEARNING_RATE = 0.01
CROSS_VALIDATION = 10
########## Hyperparameter ##########

def loadTrainFile():
    tmp = np.loadtxt("train.csv", dtype=np.str, delimiter=",")
    userID = tmp[1:,0].astype(int)
    item1 = tmp[1:,1].astype(int)
    item2 = tmp[1:,2].astype(int)
    labels = tmp[1:,3].astype(int)
    return userID, item1, item2, labels
def loadTestFile():
    tmp = np.loadtxt("test.csv", dtype=np.str, delimiter=",")
    userID = tmp[1:,0].astype(int)
    item1 = tmp[1:,1].astype(int)
    item2 = tmp[1:,2].astype(int)
    return userID, item1, item2
def loadUserFile():
    tmp = np.loadtxt("users.csv", dtype=str, delimiter=",")
    return tmp[1:,1:]

def loadItemFile():
    tmp = np.loadtxt("items.csv", dtype=np.str, delimiter=",")
    return tmp[1:,1:]

def dnn(x):
    dense1 = tf.layers.dense(
        inputs=x,
        units=8,
        activation=tf.nn.relu,
        name='dense1'
    )
    dense2 = tf.layers.dense(
        inputs=dense1,
        units=8,
        activation=tf.nn.relu,
        name='dense2'
    )
    dense3 = tf.layers.dense(
        inputs=dense2,
        units=8,
        activation=tf.nn.relu,
        name='dense3'
    )
    dense4 = tf.layers.dense(
        inputs=dense3,
        units=8,
        activation=tf.nn.relu,
        name='dense4'
    )
    logits = tf.layers.dense(inputs=dense4, units=1, name='logits')
    
    return logits

# split dataset into training set and one validation set
def split_folds(indices, Inputs, Labels, cross_validation, fold):
    n = Inputs.shape[0]
    if fold == cross_validation:
        validation_size = n - (int(n/cross_validation) * (cross_validation-1))
        X_train_idx, X_validate_idx = indices[:(n-validation_size)], indices[(n-validation_size):]
        y_train_idx, y_validate_idx = indices[:(n-validation_size)], indices[(n-validation_size):]
    else:
        validation_size = int(n/cross_validation)
        X_train_idx, X_validate_idx = np.concatenate((indices[:validation_size*(fold-1)], indices[validation_size*fold:]), axis=0), indices[(validation_size*(fold-1)):(validation_size*fold)]
        y_train_idx, y_validate_idx = np.concatenate((indices[:validation_size*(fold-1)], indices[validation_size*fold:]), axis=0), indices[(validation_size*(fold-1)):(validation_size*fold)]
    X_train, X_validate = np.array(Inputs[X_train_idx,:]), np.array(Inputs[X_validate_idx,:])
    y_train, y_validate = np.array(Labels[y_train_idx]), np.array(Labels[y_validate_idx])
    return X_train, y_train, X_validate, y_validate

def train(X_train, y_train, X_validate, y_validate, optimizer, epoch_bound, stop_threshold, batch_size, testing=False):

    global saver
    global loss
    
    early_stop = 0
    winner_loss = np.infty
    
    for epoch in range(epoch_bound):

        # randomize training set
        indices_training = np.random.permutation(X_train.shape[0])
        X_train, y_train = X_train[indices_training,:], y_train[indices_training]

        # split training set into multiple mini-batches and start training
        total_batches = int(X_train.shape[0] / batch_size)
        for batch in range(total_batches):
            if batch == total_batches - 1:
                sess.run(optimizer, feed_dict={x: X_train[batch*batch_size:], 
                                               y: y_train[batch*batch_size:]})
            else:
                sess.run(optimizer, feed_dict={x: X_train[batch*batch_size : (batch+1)*batch_size], 
                                               y: y_train[batch*batch_size : (batch+1)*batch_size]})
        
        # validating
        cur_loss = 0.0
        total_batches = int(X_validate.shape[0] / batch_size)
        cur_loss = sess.run(loss, feed_dict={x:X_validate,
                                             y:y_validate})
#         print('Loss: ', cur_loss)
        # If the accuracy rate does not increase for many times, it will early stop epochs-loop 
        if cur_loss < winner_loss:
            early_stop = 0
            winner_loss = cur_loss
            
            save_path = saver.save(sess, "./saved_model/dnn.ckpt")
        else:
            early_stop += 1
        if early_stop == stop_threshold:
            break
    
    saver.restore(sess, "./saved_model/dnn.ckpt")
#     winner_accuracy = sess.run(accuracy, feed_dict={x:X_validate,
#                                                     y:y_validate})
    return winner_loss, epoch

  return f(*args, **kwds)


In [2]:
########### Data ###########
user_dic = loadUserFile()
item_dic = loadItemFile()
userID, item1, item2, labels = loadTrainFile()

preference = np.zeros([len(user_dic),10], dtype=int)

for idx, label in enumerate(labels):
    if(label==0):
        preference[userID[idx]-1][item1[idx]-1]+=1
        preference[userID[idx]-1][item2[idx]-1]-=1
    else:
        preference[userID[idx]-1][item1[idx]-1]-=1
        preference[userID[idx]-1][item2[idx]-1]+=1

preference = preprocessing.scale(preference, axis=1, copy=False)
X_train = []
y_train = []
for i in range(len(user_dic)):
    for j in range(len(item_dic)):
        X_train.append(np.concatenate([user_dic[i].astype(float),item_dic[j].astype(float)]))
        y_train.append(preference[i][j])
X_train = np.array(X_train)
y_train = np.array(y_train).reshape(len(y_train))
print(X_train)
print(y_train)

########### Data ###########

[[3.  4.  2.  ... 1.  2.5 2. ]
 [3.  4.  2.  ... 2.  5.5 1. ]
 [3.  4.  2.  ... 1.  4.5 2. ]
 ...
 [3.  2.  2.  ... 2.  2.5 1. ]
 [3.  2.  2.  ... 2.  3.5 2. ]
 [3.  2.  2.  ... 2.  4.5 2. ]]
[ 0.         -1.13592367  1.13592367  0.85194275  0.85194275 -0.56796183
 -1.98786642 -0.56796183  0.28398092  1.13592367  0.65938047 -0.65938047
  1.31876095 -0.32969024 -0.98907071 -0.65938047  1.64845118 -0.65938047
  0.98907071 -1.31876095 -0.56796183  1.13592367 -0.56796183 -0.28398092
 -1.41990459  0.56796183  1.98786642  0.56796183 -0.28398092 -1.13592367
  0.58722022 -1.17444044  1.76166066  0.88083033  0.29361011 -1.17444044
 -0.88083033 -1.17444044  0.88083033  0.          0.          0.
  0.         -1.18585412  1.18585412  0.79056942 -1.18585412  1.58113883
  0.39528471 -1.58113883  0.62017367 -1.24034735  1.24034735  0.31008684
  0.93026051  0.         -2.17060786 -0.62017367  0.31008684  0.62017367
  0.         -1.17444044  1.76166066  0.88083033  0.88083033 -1.17444044
 -0.29361011 



In [3]:
########### Model ###########
x = tf.placeholder(tf.float32, [None, X_train.shape[1]], name='x')
y = tf.placeholder(tf.float32, [None], name='y')

logits = dnn(x)
loss = tf.reduce_mean(tf.square(logits-y), name='loss')

# Training iteration
optimizer = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE)
train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())

# # Calculate Accuracy
# probabilities = tf.nn.softmax(logits, name="softmax_tensor")
# correct_prediction = tf.equal(y, tf.argmax(probabilities,1,output_type=tf.int32))
# accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

In [4]:
########## Train ##########
print("########## Start training ##########")
sess = tf.Session()
writer = tf.summary.FileWriter("./log", sess.graph)
init = tf.global_variables_initializer()
# init saver to save model
saver = tf.train.Saver()

# randomize dataset
indices = np.random.permutation(X_train.shape[0])

# start cross validation
avg_loss = 0.0

if TAKE_CROSS_VALIDATION == True:
    for fold in range(1, CROSS_VALIDATION+1):
        print("########## Fold:", fold, "##########")
        # init weights
        sess.run(init)
        # split inputs into training set and validation set for each fold
        X_train_fold, y_train_fold, X_validate_fold, y_validate_fold = split_folds(indices, X_train, y_train, CROSS_VALIDATION, fold)
        print('validate data: ', X_validate_fold.shape)
        print('validate label: ', y_validate_fold.shape)
        print('train data: ', X_train_fold.shape)
        print('train label: ', y_train_fold.shape)

        winner_loss, epoch = train(X_train_fold, y_train_fold, X_validate_fold, y_validate_fold
                                , train_op, EPOCH_BOUND, EARLY_STOP_CHECK_EPOCH, BATCH_SIZE, testing=False)
        avg_loss += winner_loss
        
        
        print("Epoch: ", epoch, " Loss: ", winner_loss)
    avg_loss /= CROSS_VALIDATION
    
    
    print("average loss: ", avg_loss)

writer.close()

########## Start training ##########
########## Fold: 1 ##########
validate data:  (60, 8)
validate label:  (60,)
train data:  (540, 8)
train label:  (540,)
INFO:tensorflow:Restoring parameters from ./saved_model/dnn.ckpt
Epoch:  164  Loss:  1.0672585
########## Fold: 2 ##########
validate data:  (60, 8)
validate label:  (60,)
train data:  (540, 8)
train label:  (540,)
INFO:tensorflow:Restoring parameters from ./saved_model/dnn.ckpt
Epoch:  144  Loss:  0.96185017
########## Fold: 3 ##########
validate data:  (60, 8)
validate label:  (60,)
train data:  (540, 8)
train label:  (540,)
INFO:tensorflow:Restoring parameters from ./saved_model/dnn.ckpt
Epoch:  148  Loss:  1.1721845
########## Fold: 4 ##########
validate data:  (60, 8)
validate label:  (60,)
train data:  (540, 8)
train label:  (540,)
INFO:tensorflow:Restoring parameters from ./saved_model/dnn.ckpt
Epoch:  140  Loss:  0.87871027
########## Fold: 5 ##########
validate data:  (60, 8)
validate label:  (60,)
train data:  (540, 8)
tr

In [8]:
########## Final Train ##########
sess = tf.Session()
writer = tf.summary.FileWriter("./log", sess.graph)
init = tf.global_variables_initializer()
# init saver to save model
saver = tf.train.Saver()
print("########## Start training ##########")
sess.run(init)

for epoch in range(160):

        # randomize training set
        indices_training = np.random.permutation(X_train.shape[0])
        X_train, y_train = X_train[indices_training,:], y_train[indices_training]
        
        # split training set into multiple mini-batches and start training
        total_batches = int(X_train.shape[0] / 1)
        for batch in range(total_batches):
            if batch == total_batches - 1:
                sess.run(train_op, feed_dict={x: X_train[batch*BATCH_SIZE:], 
                                               y: y_train[batch*BATCH_SIZE:]})
            else:
                sess.run(train_op, feed_dict={x: X_train[batch*BATCH_SIZE : (batch+1)*BATCH_SIZE], 
                                               y: y_train[batch*BATCH_SIZE : (batch+1)*BATCH_SIZE]})
writer.close()    

########## Start training ##########


In [9]:
hit = 0
user_preference = sess.run(logits, feed_dict={x:X_train})
print(user_preference)
X_train_item1 = np.concatenate([user_dic[userID-1].astype(float), item_dic[item1-1]], axis=1)
X_train_item2 = np.concatenate([user_dic[userID-1].astype(float), item_dic[item2-1]], axis=1)
user_preference_item1 = sess.run(logits, feed_dict={x:X_train_item1})
user_preference_item2 = sess.run(logits, feed_dict={x:X_train_item2})

for idx in range(userID.shape[0]):
    entry = str(int(userID[idx]))+'-'+str(int(item1[idx]))+'-'+str(int(item2[idx]))
    if(labels[idx]==0 and user_preference_item1[idx]>=user_preference_item2[idx]):
        hit+=1
    elif(labels[idx]==1 and user_preference_item1[idx]<user_preference_item2[idx]):
        hit+=1
print(hit/userID.shape[0])

[[-2.29446646e-02]
 [ 7.46844560e-02]
 [ 6.68972731e-03]
 [-2.82700658e-02]
 [ 1.13425545e-01]
 [ 1.00531988e-01]
 [ 1.50226131e-02]
 [ 6.78622127e-02]
 [-4.58576605e-02]
 [ 1.11639552e-01]
 [ 3.72695476e-02]
 [-6.97197169e-02]
 [ 6.43476099e-02]
 [ 1.63462967e-01]
 [-4.18127514e-02]
 [ 2.24442184e-02]
 [ 1.00429840e-02]
 [ 6.34210557e-02]
 [ 1.45401418e-01]
 [-5.53622469e-02]
 [ 1.66651011e-01]
 [ 9.13475230e-02]
 [-4.97684926e-02]
 [ 8.43014792e-02]
 [ 1.36394411e-01]
 [-4.89983708e-02]
 [-7.08011836e-02]
 [-4.96989563e-02]
 [-6.52272776e-02]
 [ 6.30768761e-02]
 [-4.53749001e-02]
 [ 9.16666761e-02]
 [ 4.97490615e-02]
 [-5.00613786e-02]
 [-1.86254978e-02]
 [-2.52128392e-02]
 [-6.52205721e-02]
 [ 7.21899197e-02]
 [ 8.77789035e-02]
 [-4.45183665e-02]
 [-5.01136445e-02]
 [-7.04809278e-02]
 [ 7.35320449e-02]
 [-5.93829080e-02]
 [-4.34970707e-02]
 [ 5.91310635e-02]
 [-6.47744983e-02]
 [ 3.34304050e-02]
 [ 1.06047668e-01]
 [ 1.00272276e-01]
 [ 8.29815194e-02]
 [-5.65277077e-02]
 [ 2.8420887

In [7]:
########## Test ##########
print("########## Start Test ##########")

userID, item1, item2 = loadTestFile()

X_test_item1 = np.concatenate([user_dic[userID-1].astype(float), item_dic[item1-1]], axis=1)
X_test_item2 = np.concatenate([user_dic[userID-1].astype(float), item_dic[item2-1]], axis=1)
user_preference_item1 = sess.run(logits, feed_dict={x:X_test_item1})
user_preference_item2 = sess.run(logits, feed_dict={x:X_test_item2})

test_output=[['User-Item1-Item2','Preference']]
for idx in range(userID.shape[0]):
    entry = str(int(userID[idx]))+'-'+str(int(item1[idx]))+'-'+str(int(item2[idx]))
    if(user_preference_item1[idx]>=user_preference_item2[idx]):
        value=0
    else:
        value=1
    test_output.append([entry,value])
print(test_output)
np.savetxt("output.csv", np.array(test_output, dtype=np.str), fmt='%s,%s', delimiter=",")
# print(X_test.shape)
# print(X_test.astype(float))
# for idx, label in enumerate(labels):
# #     print(user_preference[userID[idx]*item1[idx]-1])
# #     print(user_preference[userID[idx]*item2[idx]-1])
#     if(label==0 and (user_preference[userID[idx]*item1[idx]-1]>=user_preference[userID[idx]*item2[idx]-1])):
#         hit+=1
#     elif(label==1 and (user_preference[userID[idx]*item1[idx]-1]<=user_preference[userID[idx]*item2[idx]-1])):
#         hit+=1
        
# test_output=[['User-Item1-Item2','Preference']]
# for idx in range(pridict_output.shape[0]):
#     entry = str(int(userID[idx]))+'-'+str(int(item1[idx]))+'-'+str(int(item2[idx]))
#     value = pridict_output[idx]
#     test_output.append([entry,value])

# print(test_output)
# np.savetxt("output.csv", np.array(test_output, dtype=np.str), fmt='%s,%s', delimiter=",")

########## Start Test ##########
[['User-Item1-Item2', 'Preference'], ['31-4-10', 0], ['13-1-8', 0], ['15-4-9', 0], ['49-1-9', 0], ['14-9-5', 1], ['35-10-3', 1], ['38-2-6', 1], ['42-3-9', 0], ['44-4-10', 0], ['12-9-7', 0], ['28-5-8', 0], ['39-7-4', 1], ['43-4-5', 0], ['56-4-9', 0], ['39-9-7', 0], ['22-5-4', 0], ['37-5-10', 0], ['33-6-1', 0], ['60-1-4', 0], ['28-5-4', 0], ['42-9-5', 1], ['26-1-8', 0], ['34-7-4', 1], ['36-10-5', 1], ['48-10-2', 0], ['39-1-6', 0], ['19-4-3', 0], ['38-5-10', 0], ['49-9-5', 1], ['11-4-10', 0], ['51-3-9', 0], ['42-9-4', 1], ['21-6-1', 0], ['37-1-3', 0], ['34-4-1', 1], ['41-10-5', 0], ['44-8-2', 0], ['31-7-4', 1], ['37-10-2', 0], ['20-10-4', 1], ['37-5-9', 0], ['48-4-5', 0], ['35-8-6', 0], ['46-6-2', 0], ['7-3-1', 0], ['16-5-10', 0], ['38-8-6', 1], ['31-10-2', 0], ['18-9-5', 1], ['33-5-8', 1], ['27-6-8', 1], ['39-5-4', 0], ['16-2-10', 1], ['58-9-7', 0], ['45-1-9', 0], ['60-5-9', 0], ['41-9-4', 0], ['14-9-3', 1], ['9-6-2', 0], ['36-1-9', 0], ['47-1-6', 0], ['5