In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import sys

## Preprocessing

In [2]:
## RRN params
batch_size = 50
hidden_size = 128
out_size = 64
n_step = 1
lr = 0.01
verbose = 10

dataName = 'ml-1m'
dataPath = "data/" + dataName + "/"

In [3]:
## getRatingInfo
dataPath = dataPath + "ratings.dat"

ratings_title = ['UserID', 'MovieID', 'Rating', 'TimeStamp']
ratings = pd.read_table(dataPath, sep='::', header=None, names=ratings_title, engine='python')
ratings = ratings.sort_values(by=['TimeStamp'])
ratings[:5]

Unnamed: 0,UserID,MovieID,Rating,TimeStamp
1000138,6040,858,4,956703932
1000153,6040,2384,4,956703954
999873,6040,593,5,956703954
1000007,6040,1961,4,956703977
1000192,6040,2019,5,956703977


In [4]:
train = ratings.values

num_users = max(train[:,0])
num_movies = max(train[:,1])

## Prepare Model

In [5]:
## placeholder
userID = tf.placeholder(tf.int32, shape=[None, 1], name="userID")
movieID = tf.placeholder(tf.int32, shape=[None, 1], name="movieID")
rating = tf.placeholder(tf.float32, shape=[None, 1], name="rating")
dropout = tf.placeholder(tf.float32, name='dropout')

In [6]:
## user and movie embedding layer & lookup
### user, movie 각각 embedding matirx를 생성하고, 각 ID에 lookup 후 relu activation을 취한다.
### uid_layer & mid_layer shape: [None, 1, hidden_size]
with tf.variable_scope("userID_embedding", reuse=tf.AUTO_REUSE):
    # user id embedding
    Embedding_User = tf.get_variable(name="embedding_users", shape=[num_users, hidden_size],
                                    initializer=tf.glorot_uniform_initializer())
    uid_layer = tf.nn.embedding_lookup(Embedding_User, userID)
    uid_layer = tf.nn.relu(uid_layer)
    
with tf.variable_scope("movie_embedding", reuse=tf.AUTO_REUSE):
    # movie id embedding
    Embedding_Item = tf.get_variable(name="embedding_items", shape=[num_movies, hidden_size],
                                    initializer=tf.glorot_uniform_initializer())
    mid_layer = tf.nn.embedding_lookup(Embedding_Item, movieID)
    mid_layer = tf.nn.relu(mid_layer)

In [7]:
## feedfoward to rnn_layer
### input에 transepose [1, 0, 2]를 하면, input shape은 [1, None, hidden_size]로 변환된다.
### None은 batch_size 또는 입력 데이터 크기이고, RNN model에서 RNN size가 된다.
### 즉, RNN size가 달라질 수 있으므로, dynamic_rnn 모듈에 넣어준다.
### 과거(학습 시작점)부터 user의 시간 순서 item의 시간 순서에 대하여 각각의 output을 계산하므로,
### 최근 업데이트 시점에서 user 및 item에 대한 최근 트랜드가 반영될 것으로 생각된다.
with tf.variable_scope("user_rnn_cell", reuse=tf.AUTO_REUSE):
    userCell = tf.nn.rnn_cell.GRUCell(num_units=hidden_size)
    userInput = tf.transpose(uid_layer, [1, 0, 2])
    userOutputs, userStates = tf.nn.dynamic_rnn(userCell, userInput, dtype=tf.float32)
    userOutput = userOutputs[-1]
    
with tf.variable_scope("movie_rnn_cell", reuse=tf.AUTO_REUSE):
    movieCell = tf.nn.rnn_cell.GRUCell(num_units=hidden_size)
    movieInput = tf.transpose(mid_layer, [1, 0, 2])
    movieOutputs, movieStates = tf.nn.dynamic_rnn(movieCell, movieInput, dtype=tf.float32)
    movieOutput = movieOutputs[-1]

In [8]:
## predict ratings
### user, movie 각 rnn output에서 fullyConnect를 통해 final output vector를 구한다.
### user, movie final vector의 곱의 합(즉, 내적)을 최종 예측 rating 값으로 정의한다.
with tf.variable_scope("pred_layer", reuse=tf.AUTO_REUSE):
    userVector = tf.layers.dense(userOutput, out_size, activation=None)
    movieVector = tf.layers.dense(movieOutput, out_size, activation=None)
    pred = tf.reduce_sum(tf.multiply(userVector, movieVector), axis=1, keepdims=True)

In [9]:
## loss & optimizer
### mean squared error와 adam optimizer를 수행한다.
loss = tf.reduce_mean(
    tf.losses.mean_squared_error(rating, pred)
)
optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

## Training

In [10]:
## session start
#config = tf.ConfigProto()
#config.gpu_options.allow_growth = True
#config.allow_soft_placement = True
#tf.Session(config=self.config) <-- gpu mode
sess = tf.Session() 
sess.run(tf.global_variables_initializer())

In [11]:
## training
length = len(train)
batches = length // batch_size + 1

train_loss = []
for i in range(batches):
    minIdx = i * batch_size
    maxIdx = min(length, (i+1)*batch_size)
    train_batch = train[minIdx:maxIdx]
    ### raw data의 user, item id가 1부터 시작하므로 -1을 한다.
    inputs = np.array([(i[0]-1, i[1]-1, float(i[2])) for i in train_batch])
    feed_dict = {userID: np.expand_dims(inputs[:,0], 1), 
                 movieID: np.expand_dims(inputs[:,1], 1), 
                 rating: np.expand_dims(inputs[:,2], 1), dropout: 1.}

    _, batch_loss = sess.run([optimizer, loss], feed_dict=feed_dict)
    train_loss.append(batch_loss)

    ### 최근 20개 batch_loss의 평균을 프린트한다.
    if verbose and i % verbose == 0:
        sys.stdout.write('\r{} / {}： loss = {}'.format(
            i, batches, np.sqrt(np.mean(train_loss[-20:]))
        ))
        sys.stdout.flush()

### 최근 2000개 loss의 평균을 프린트한다.
print("Training Finish, Last 2000 batches loss is {}.".format(
    np.sqrt(np.mean(train_loss[-2000:]))
))

20000 / 20005： loss = 0.9554230570793152Training Finish, Last 2000 batches loss is 0.9868500232696533.


In [87]:
## last 1000 evaluation
### 최근 1000개의 데이터로 예측을 수행하고 Dataframe으로 만든다.
train_batch = train[-1000:]
inputs = np.array([(i[0]-1, i[1]-1, float(i[2])) for i in train_batch])
feed_dict = {userID: np.expand_dims(inputs[:,0], 1), 
             movieID: np.expand_dims(inputs[:,1], 1), 
             rating: np.expand_dims(inputs[:,2], 1), dropout: 1.}

p = sess.run(pred, feed_dict=feed_dict)
dt = pd.DataFrame({'act': inputs[:,2], 'pred': p.reshape(-1)})
dt[:5]

Unnamed: 0,act,pred
0,5.0,4.344459
1,4.0,4.419746
2,5.0,3.846145
3,5.0,4.287585
4,5.0,3.925497


In [88]:
#### rmse, mae, mape 각 지표를 계산한다.
a = dt['act'].values - dt['pred'].values
rmse = np.mean(np.power(a, 2))
mae = np.mean(np.abs(a))
mape = np.mean(np.abs(a/dt['act'].values))*100
print("last 1000 evaluation")
print("rmse:", rmse, "mae:", mae, "mape:", mape)

last 1000 evaluation
rmse: 0.8654721555151278 mae: 0.7396353408098221 mape: 27.624654208421706
