In [1]:
import tensorflow as tf
import numpy as np
import os
import tensorflow as tf
from sklearn import preprocessing #标准化数据模块
########## Hyperparameter ##########
BATCH_SIZE = 5
EPOCH_BOUND = 1000
EARLY_STOP_CHECK_EPOCH = 100
TAKE_CROSS_VALIDATION = True
LEARNING_RATE = 0.01
CROSS_VALIDATION = 10
########## Hyperparameter ##########

def loadTrainFile():
    tmp = np.loadtxt("train.csv", dtype=np.str, delimiter=",")
    userID = tmp[1:,0].astype(int)
    item1 = tmp[1:,1].astype(int)
    item2 = tmp[1:,2].astype(int)
    labels = tmp[1:,3].astype(int)
    return userID, item1, item2, labels
def loadTestFile():
    tmp = np.loadtxt("test.csv", dtype=np.str, delimiter=",")
    userID = tmp[1:,0].astype(int)
    item1 = tmp[1:,1].astype(int)
    item2 = tmp[1:,2].astype(int)
    return userID, item1, item2
def loadUserFile():
    tmp = np.loadtxt("users.csv", dtype=str, delimiter=",")
    return tmp[1:,1:]

def loadItemFile():
    tmp = np.loadtxt("items.csv", dtype=np.str, delimiter=",")
    return tmp[1:,1:]

def dnn(x):
    dense1 = tf.layers.dense(
        inputs=x,
        units=16,
        activation=tf.nn.relu,
        name='dense1'
    )
    dense2 = tf.layers.dense(
        inputs=dense1,
        units=8,
        activation=tf.nn.relu,
        name='dense2'
    )
    dense3 = tf.layers.dense(
        inputs=dense2,
        units=8,
        activation=tf.nn.relu,
        name='dense3'
    )
    dense4 = tf.layers.dense(
        inputs=dense3,
        units=4,
        activation=tf.nn.relu,
        name='dense4'
    )
    logits = tf.layers.dense(inputs=dense4, units=1, name='logits')
    
    return logits

# split dataset into training set and one validation set
def split_folds(indices, Inputs, Labels, cross_validation, fold):
    n = Inputs.shape[0]
    if fold == cross_validation:
        validation_size = n - (int(n/cross_validation) * (cross_validation-1))
        X_train_idx, X_validate_idx = indices[:(n-validation_size)], indices[(n-validation_size):]
        y_train_idx, y_validate_idx = indices[:(n-validation_size)], indices[(n-validation_size):]
    else:
        validation_size = int(n/cross_validation)
        X_train_idx, X_validate_idx = np.concatenate((indices[:validation_size*(fold-1)], indices[validation_size*fold:]), axis=0), indices[(validation_size*(fold-1)):(validation_size*fold)]
        y_train_idx, y_validate_idx = np.concatenate((indices[:validation_size*(fold-1)], indices[validation_size*fold:]), axis=0), indices[(validation_size*(fold-1)):(validation_size*fold)]
    X_train, X_validate = np.array(Inputs[X_train_idx,:]), np.array(Inputs[X_validate_idx,:])
    y_train, y_validate = np.array(Labels[y_train_idx]), np.array(Labels[y_validate_idx])
    return X_train, y_train, X_validate, y_validate

def train(X_train, y_train, X_validate, y_validate, optimizer, epoch_bound, stop_threshold, batch_size, testing=False):

    global saver
    global loss
    
    early_stop = 0
    winner_loss = np.infty
    
    for epoch in range(epoch_bound):

        # randomize training set
        indices_training = np.random.permutation(X_train.shape[0])
        X_train, y_train = X_train[indices_training,:], y_train[indices_training]

        # split training set into multiple mini-batches and start training
        total_batches = int(X_train.shape[0] / batch_size)
        for batch in range(total_batches):
            if batch == total_batches - 1:
                sess.run(optimizer, feed_dict={x: X_train[batch*batch_size:], 
                                               y: y_train[batch*batch_size:]})
            else:
                sess.run(optimizer, feed_dict={x: X_train[batch*batch_size : (batch+1)*batch_size], 
                                               y: y_train[batch*batch_size : (batch+1)*batch_size]})
        
        # validating
        cur_loss = 0.0
        total_batches = int(X_validate.shape[0] / batch_size)
        cur_loss = sess.run(loss, feed_dict={x:X_validate,
                                             y:y_validate})
#         print('Loss: ', cur_loss)
        # If the accuracy rate does not increase for many times, it will early stop epochs-loop 
        if cur_loss < winner_loss:
            early_stop = 0
            winner_loss = cur_loss
            
            save_path = saver.save(sess, "./saved_model/dnn.ckpt")
        else:
            early_stop += 1
        if early_stop == stop_threshold:
            break
    
    saver.restore(sess, "./saved_model/dnn.ckpt")
#     winner_accuracy = sess.run(accuracy, feed_dict={x:X_validate,
#                                                     y:y_validate})
    return winner_loss, epoch

  return f(*args, **kwds)


In [2]:
########### Data ###########
user_dic = loadUserFile()
item_dic = loadItemFile()
userID, item1, item2, labels = loadTrainFile()

preference = np.zeros([len(user_dic),10], dtype=int)

for idx, label in enumerate(labels):
    if(label==0):
        preference[userID[idx]-1][item1[idx]-1]+=1
#         preference[userID[idx]-1][item2[idx]-1]-=1
    else:
#         preference[userID[idx]-1][item1[idx]-1]-=1
        preference[userID[idx]-1][item2[idx]-1]+=1

preference = preprocessing.scale(preference, axis=1, copy=False)
X_train = []
y_train = []
for i in range(len(user_dic)):
    for j in range(len(item_dic)):
        X_train.append(np.concatenate([user_dic[i].astype(float),item_dic[j].astype(float)]))
        y_train.append(preference[i][j])
X_train = np.array(X_train)
X_train = preprocessing.scale(X_train, axis=0, copy=False)
y_train = np.array(y_train).reshape(len(y_train))
print(X_train)
print(y_train)

########### Data ###########

[[ 0.72166713  1.47393266  0.97112381 ... -1.22474487 -1.28527737
   0.81649658]
 [ 0.72166713  1.47393266  0.97112381 ...  0.81649658  1.33773767
  -1.22474487]
 [ 0.72166713  1.47393266  0.97112381 ... -1.22474487  0.46339932
   0.81649658]
 ...
 [ 0.72166713 -0.34948919  0.97112381 ...  0.81649658 -1.28527737
  -1.22474487]
 [ 0.72166713 -0.34948919  0.97112381 ...  0.81649658 -0.41093902
   0.81649658]
 [ 0.72166713 -0.34948919  0.97112381 ...  0.81649658  0.46339932
   0.81649658]]
[-0.19324699 -0.83740361  1.73922289  0.45090964  1.09506626 -0.83740361
 -1.48156024 -0.83740361 -0.19324699  1.09506626  0.37907125 -0.16245911
  1.46213197 -0.70398947 -0.70398947 -0.70398947  2.00366234 -0.70398947
  0.37907125 -1.24551983 -0.60547036  1.25751537 -0.13972393 -0.60547036
 -1.0712168   0.3260225   2.18900823  0.3260225  -0.60547036 -1.0712168
  0.41758499 -0.77551498  2.20723495  0.41758499  0.41758499 -1.37206497
 -0.178965   -1.37206497  0.41758499 -0.178965   -0.22298824  0.5203059



In [3]:
########### Model ###########
x = tf.placeholder(tf.float32, [None, X_train.shape[1]], name='x')
y = tf.placeholder(tf.float32, [None], name='y')

logits = dnn(x)
loss = tf.reduce_mean(tf.square(logits-y), name='loss')

# Training iteration
optimizer = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE)
train_op = optimizer.minimize(
            loss=loss,
            global_step=tf.train.get_global_step())

# # Calculate Accuracy
# probabilities = tf.nn.softmax(logits, name="softmax_tensor")
# correct_prediction = tf.equal(y, tf.argmax(probabilities,1,output_type=tf.int32))
# accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")

In [4]:
########## Train ##########
print("########## Start training ##########")
sess = tf.Session()
writer = tf.summary.FileWriter("./log", sess.graph)
init = tf.global_variables_initializer()
# init saver to save model
saver = tf.train.Saver()

# randomize dataset
indices = np.random.permutation(X_train.shape[0])

# start cross validation
avg_loss = 0.0

if TAKE_CROSS_VALIDATION == True:
    for fold in range(1, CROSS_VALIDATION+1):
        print("########## Fold:", fold, "##########")
        # init weights
        sess.run(init)
        # split inputs into training set and validation set for each fold
        X_train_fold, y_train_fold, X_validate_fold, y_validate_fold = split_folds(indices, X_train, y_train, CROSS_VALIDATION, fold)
        print('validate data: ', X_validate_fold.shape)
        print('validate label: ', y_validate_fold.shape)
        print('train data: ', X_train_fold.shape)
        print('train label: ', y_train_fold.shape)

        winner_loss, epoch = train(X_train_fold, y_train_fold, X_validate_fold, y_validate_fold
                                , train_op, EPOCH_BOUND, EARLY_STOP_CHECK_EPOCH, BATCH_SIZE, testing=False)
        avg_loss += winner_loss
        
        
        print("Epoch: ", epoch, " Loss: ", winner_loss)
    avg_loss /= CROSS_VALIDATION
    
    
    print("average loss: ", avg_loss)

writer.close()

########## Start training ##########
########## Fold: 1 ##########
validate data:  (60, 8)
validate label:  (60,)
train data:  (540, 8)
train label:  (540,)
INFO:tensorflow:Restoring parameters from ./saved_model/dnn.ckpt
Epoch:  119  Loss:  0.8400482
########## Fold: 2 ##########
validate data:  (60, 8)
validate label:  (60,)
train data:  (540, 8)
train label:  (540,)
INFO:tensorflow:Restoring parameters from ./saved_model/dnn.ckpt
Epoch:  109  Loss:  1.0964218
########## Fold: 3 ##########
validate data:  (60, 8)
validate label:  (60,)
train data:  (540, 8)
train label:  (540,)
INFO:tensorflow:Restoring parameters from ./saved_model/dnn.ckpt
Epoch:  221  Loss:  0.8552751
########## Fold: 4 ##########
validate data:  (60, 8)
validate label:  (60,)
train data:  (540, 8)
train label:  (540,)
INFO:tensorflow:Restoring parameters from ./saved_model/dnn.ckpt
Epoch:  101  Loss:  0.8963422
########## Fold: 5 ##########
validate data:  (60, 8)
validate label:  (60,)
train data:  (540, 8)
trai

In [5]:
########## Hyperparameter ##########
BATCH_SIZE = 5
EPOCH = 150
LEARNING_RATE = 0.05
########## Hyperparameter ##########
########## Final Train ##########
sess = tf.Session()
writer = tf.summary.FileWriter("./log", sess.graph)
init = tf.global_variables_initializer()
# init saver to save model
saver = tf.train.Saver()
print("########## Start training ##########")
sess.run(init)

for epoch in range(EPOCH):

        # randomize training set
        indices_training = np.random.permutation(X_train.shape[0])
        X_train, y_train = X_train[indices_training,:], y_train[indices_training]
        
        # split training set into multiple mini-batches and start training
        total_batches = int(X_train.shape[0] / BATCH_SIZE)
        for batch in range(total_batches):
            if batch == total_batches - 1:
                sess.run(train_op, feed_dict={x: X_train[batch*BATCH_SIZE:], 
                                               y: y_train[batch*BATCH_SIZE:]})
            else:
                sess.run(train_op, feed_dict={x: X_train[batch*BATCH_SIZE : (batch+1)*BATCH_SIZE], 
                                               y: y_train[batch*BATCH_SIZE : (batch+1)*BATCH_SIZE]})
writer.close()    

########## Start training ##########


In [6]:
hit = 0
user_preference = sess.run(logits, feed_dict={x:X_train})
print(user_preference)
X_train_item1 = np.concatenate([user_dic[userID-1].astype(float), item_dic[item1-1].astype(float)], axis=1)
X_train_item2 = np.concatenate([user_dic[userID-1].astype(float), item_dic[item2-1].astype(float)], axis=1)
user_preference_item1 = sess.run(logits, feed_dict={x:X_train_item1})
user_preference_item2 = sess.run(logits, feed_dict={x:X_train_item2})

for idx in range(userID.shape[0]):
    entry = str(int(userID[idx]))+'-'+str(int(item1[idx]))+'-'+str(int(item2[idx]))
    if(labels[idx]==0 and user_preference_item1[idx]>=user_preference_item2[idx]):
        hit+=1
    elif(labels[idx]==1 and user_preference_item1[idx]<user_preference_item2[idx]):
        hit+=1
print(hit/userID.shape[0])

[[-1.47659481e-01]
 [-2.97247842e-02]
 [-6.79320768e-02]
 [-7.66169950e-02]
 [-2.64696851e-02]
 [ 3.33014876e-02]
 [-2.88119167e-03]
 [ 1.12030305e-01]
 [ 1.12030305e-01]
 [ 1.33619532e-02]
 [-4.44168076e-02]
 [-5.37030399e-04]
 [-1.38820350e-01]
 [-4.94357869e-02]
 [ 1.12030305e-01]
 [ 1.12030305e-01]
 [-8.32894668e-02]
 [ 1.12030305e-01]
 [-9.37804058e-02]
 [ 1.12030305e-01]
 [ 6.49837926e-02]
 [-5.75906411e-02]
 [-5.67488447e-02]
 [ 3.88700441e-02]
 [ 3.27541828e-02]
 [ 7.46758729e-02]
 [ 5.66167720e-02]
 [-1.18446924e-01]
 [ 5.69018871e-02]
 [ 1.12030305e-01]
 [-6.74689561e-03]
 [-3.91212925e-02]
 [ 1.12030305e-01]
 [ 1.12030305e-01]
 [-1.61073878e-02]
 [ 3.09761912e-02]
 [ 1.12030305e-01]
 [-5.06874993e-02]
 [ 1.06181212e-01]
 [ 1.12030305e-01]
 [ 1.12030305e-01]
 [ 1.12030305e-01]
 [-3.11673954e-02]
 [ 1.12030305e-01]
 [-4.83555421e-02]
 [-1.20097153e-01]
 [-7.75989667e-02]
 [ 8.92842337e-02]
 [ 1.12030305e-01]
 [ 1.54736489e-02]
 [ 1.12030305e-01]
 [-1.63464069e-01]
 [ 8.3900317

In [None]:
########## Test ##########
print("########## Start Test ##########")

userID, item1, item2 = loadTestFile()

X_test_item1 = np.concatenate([user_dic[userID-1].astype(float), item_dic[item1-1]], axis=1)
X_test_item2 = np.concatenate([user_dic[userID-1].astype(float), item_dic[item2-1]], axis=1)
user_preference_item1 = sess.run(logits, feed_dict={x:X_test_item1})
user_preference_item2 = sess.run(logits, feed_dict={x:X_test_item2})

test_output=[['User-Item1-Item2','Preference']]
for idx in range(userID.shape[0]):
    entry = str(int(userID[idx]))+'-'+str(int(item1[idx]))+'-'+str(int(item2[idx]))
    if(user_preference_item1[idx]>=user_preference_item2[idx]):
        value=0
    else:
        value=1
    test_output.append([entry,value])
print(test_output)
np.savetxt("output.csv", np.array(test_output, dtype=np.str), fmt='%s,%s', delimiter=",")
# print(X_test.shape)
# print(X_test.astype(float))
# for idx, label in enumerate(labels):
# #     print(user_preference[userID[idx]*item1[idx]-1])
# #     print(user_preference[userID[idx]*item2[idx]-1])
#     if(label==0 and (user_preference[userID[idx]*item1[idx]-1]>=user_preference[userID[idx]*item2[idx]-1])):
#         hit+=1
#     elif(label==1 and (user_preference[userID[idx]*item1[idx]-1]<=user_preference[userID[idx]*item2[idx]-1])):
#         hit+=1
        
# test_output=[['User-Item1-Item2','Preference']]
# for idx in range(pridict_output.shape[0]):
#     entry = str(int(userID[idx]))+'-'+str(int(item1[idx]))+'-'+str(int(item2[idx]))
#     value = pridict_output[idx]
#     test_output.append([entry,value])

# print(test_output)
# np.savetxt("output.csv", np.array(test_output, dtype=np.str), fmt='%s,%s', delimiter=",")