In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import time
import os

In [2]:
DATA_TYPE = 'sample'
PATH_TO_TRAIN = './rsc15_train_{}.txt'.format(DATA_TYPE)
PATH_TO_TEST = './rsc15_test_{}.txt'.format(DATA_TYPE)
checkpoint_dir = './checkpoint'
if not os.path.exists(checkpoint_dir): os.mkdir(checkpoint_dir)

In [3]:
layers = 1
rnn_size = 100
batch_size = 50
drop_keep_prob = 0.7

n_epochs = 3
learning_rate = 0.001
decay = 0.96
decay_steps = 1e4
grad_cap = 0
print_step = 1e3

In [4]:
## load data
data = pd.read_csv(PATH_TO_TRAIN, sep='\t', dtype={'ItemId': np.int64})
valid = pd.read_csv(PATH_TO_TEST, sep='\t', dtype={'ItemId': np.int64})

In [5]:
data.head(10)

Unnamed: 0,SessionId,ItemId,timestamp
0,6,214701242,1396804000.0
1,6,214826623,1396804000.0
2,21,214838503,1396861000.0
3,21,214838503,1396861000.0
4,21,214838503,1396861000.0
5,21,214838503,1396862000.0
6,21,214838503,1396862000.0
7,21,214548744,1396862000.0
8,36,214551594,1396814000.0
9,36,214586970,1396814000.0


In [6]:
itemids = data['ItemId'].unique()
n_items = len(itemids)

In [7]:
n_items

21878

In [8]:
# item 에 unique index 생성  
itemidmap = pd.Series(data=np.arange(n_items), index=itemids).to_dict()
itemidmap

{214701242: 0,
 214826623: 1,
 214838503: 2,
 214548744: 3,
 214551594: 4,
 214586970: 5,
 214821277: 6,
 214544355: 7,
 214601212: 8,
 214832557: 9,
 214559660: 10,
 214819520: 11,
 214586540: 12,
 214587797: 13,
 214835775: 14,
 214844109: 15,
 214560099: 16,
 214832750: 17,
 214555869: 18,
 214537185: 19,
 214712272: 20,
 214820450: 21,
 214826810: 22,
 214821013: 23,
 214572182: 24,
 214820252: 25,
 214582502: 26,
 214836802: 27,
 214672963: 28,
 214573312: 29,
 214829257: 30,
 214834987: 31,
 214829282: 32,
 214826874: 33,
 214821399: 34,
 214774685: 35,
 214839997: 36,
 214839313: 37,
 214835561: 38,
 214821290: 39,
 214585395: 40,
 214826700: 41,
 214826801: 42,
 214691366: 43,
 214821377: 44,
 214567410: 45,
 214643657: 46,
 214826608: 47,
 214684513: 48,
 214829312: 49,
 214685795: 50,
 214821298: 51,
 214717007: 52,
 214717005: 53,
 214839870: 54,
 214839866: 55,
 214705787: 56,
 214819385: 57,
 214819378: 58,
 214710090: 59,
 214820938: 60,
 214821022: 61,
 214695345: 62,
 2

In [9]:
data['ItemIdx'] = data['ItemId'].map(lambda x: itemidmap[x])

In [10]:
data = data.drop_duplicates(['SessionId','ItemId'])

In [11]:
data[-20:]

Unnamed: 0,SessionId,ItemId,timestamp,ItemIdx
3140287,11562072,214638478,1411661000.0,1770
3140288,11562077,214718199,1411709000.0,3636
3140289,11562077,214569917,1411709000.0,11335
3140290,11562083,214552595,1411576000.0,6230
3140291,11562083,214552629,1411576000.0,7897
3140292,11562087,214534580,1411637000.0,8144
3140293,11562087,214853173,1411637000.0,19974
3140294,11562087,214828882,1411637000.0,1729
3140296,11562087,214609350,1411638000.0,15666
3140297,11562099,214854819,1411589000.0,21532


In [12]:
# 각 session 길이 / session 첫 시작 index 리스트 
offset_sessions = np.zeros(data['SessionId'].nunique()+1, dtype=np.int32)
offset_sessions[1:] = data.groupby('SessionId').size().cumsum()

In [13]:
offset_sessions

array([      0,       2,       4, ..., 2504714, 2504715, 2504717],
      dtype=int32)

In [14]:
import tensorflow.compat.v1 as tf 
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [15]:
X = tf.placeholder(tf.int32, [batch_size], name='input')
Y = tf.placeholder(tf.int32, [batch_size], name='output')
States = [tf.placeholder(tf.float32, [batch_size, rnn_size], name='rnn_state') for _ in range(layers)]
global_step = tf.Variable(0, name='global_step', trainable=False)
lr = tf.maximum(1e-5,tf.train.exponential_decay(
    learning_rate, global_step, decay_steps, decay, staircase=True
))


In [16]:
# gru weights 
### input item에 대한 embedding matrix 와
### next item 즉 output을 위한 softmax W, b matrix를 구성한다.
with tf.variable_scope('gru_layer', reuse=tf.AUTO_REUSE):
    #sigma = sigma if sigma != 0 else np.sqrt(6.0 / (n_items + rnn_size))
    #initializer = tf.random_uniform_initializer(minval=-sigma, maxval=sigma)
    initializer = tf.glorot_uniform_initializer()
    embedding = tf.get_variable('embedding', [n_items, rnn_size], initializer=initializer)
    softmax_W = tf.get_variable('softmax_w', [n_items, rnn_size], initializer=initializer)
    softmax_b = tf.get_variable('softmax_b', [n_items], initializer=tf.zeros_initializer())

In [17]:
# gru_cell
### ㅁt => ㅁt+1 => ㅁt+2 => ... 
### 위와 같은 recurrent network에서 ㅁ. 즉, 단일 gru cell을 말한다.
with tf.variable_scope('gru_cell', reuse=tf.AUTO_REUSE):
    cell = tf.nn.rnn_cell.GRUCell(rnn_size, activation=tf.nn.tanh)
    drop_cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=drop_keep_prob)
    stacked_cell = tf.nn.rnn_cell.MultiRNNCell([drop_cell] * layers)






In [18]:
inputs = tf.nn.embedding_lookup(embedding, X)
output, state_ = stacked_cell(inputs, tuple(States))
final_state = state_

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor




In [19]:
print(inputs[0])

Tensor("strided_slice:0", shape=(100,), dtype=float32)


In [20]:
## calculate cost(loss)
### 학습일 경우 negative sampling을 통해 
### cross-entropy loss로 계산하였다. bpt, top1 loss는 주석처리 하였다. 

### for training
sampled_W = tf.nn.embedding_lookup(softmax_W, Y)
sampled_b = tf.nn.embedding_lookup(softmax_b, Y)
logits = tf.matmul(output, sampled_W, transpose_b=True) + sampled_b
### cross-entropy loss
yhat = tf.nn.softmax(logits)
cost = tf.reduce_mean(-tf.log(tf.diag_part(yhat)+1e-24))

### bpr loss
# yhat = logits
# yhatT = tf.transpose(yhat)
# cost = tf.reduce_mean(-tf.log(tf.nn.sigmoid(tf.diag_part(yhat)-yhatT)))

### top1 loss
# yhat = logits
# yhatT = tf.transpose(yhat)
# term1 = tf.reduce_mean(tf.nn.sigmoid(-tf.diag_part(yhat)+yhatT)+tf.nn.sigmoid(yhatT**2), axis=0)
# term2 = tf.nn.sigmoid(tf.diag_part(yhat)**2) / batch_size
# cost = tf.reduce_mean(term1 - term2)


In [21]:
### for prediction
logits_all = tf.matmul(output, softmax_W, transpose_b=True) + softmax_b
yhat_all = tf.nn.softmax(logits_all)

In [22]:
## optimize
### Adam optimizer를 사용한다.
optimizer = tf.train.AdamOptimizer(lr)
### grad_cap>0 다면, minimize시 gradient cliping을 수행한다.
### gradient cliping을 수행하는 이유는 다음 블로그 참조 (https://dhhwang89.tistory.com/90)
### 간략하게 학습 중에 gradient가 급격하게 변하는 지점이 발생할 수 있는데, 이는 기존 minima를 찾아가는 방향이 
### 급변할 수 있기 때문에, 이를 방지하기 위해 수행한다.
### 본 학습에서는 cliping을 하지 않는데, 유사하게 learning rate decay을 사용하기 때문인 것으로 생각됨.
tvars = tf.trainable_variables()
gvs = optimizer.compute_gradients(cost, tvars)
if grad_cap > 0:
    capped_gvs = [(tf.clip_by_norm(grad, grad_cap), var) for grad, var in gvs]
else:
    capped_gvs = gvs 
train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)

In [23]:
## session start
sess = tf.Session()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())

In [24]:
### 1. 초기 세팅으로 batch_size 만큼 index array를 만들고 maxiter값을 저장한다.
### 2. start는 offset_session(sessionid의 시작 index) 에서 iters를 추출한다.
###    즉, 첫 50개 sessionid의 시작 index를 추출다.
### 3. end는 각 세션에서 다음 세션의 시작되는 index를 추출한다.
batch_size=50
iters = np.arange(batch_size)
maxiter = iters.max()
print(iters)
print(maxiter)
start = offset_sessions[iters]
end = offset_sessions[iters+1]
print(start)
print(end)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49]
49
[  0   2   4   6   7   9  13  16  18  20  22  23  25  29  32  33  35  38
  39  52  54  57  62  67  75  78  79  82  83  85  89  90  93 100 102 104
 106 108 110 113 114 118 120 122 127 128 130 132 133 134]
[  2   4   6   7   9  13  16  18  20  22  23  25  29  32  33  35  38  39
  52  54  57  62  67  75  78  79  82  83  85  89  90  93 100 102 104 106
 108 110 113 114 118 120 122 127 128 130 132 133 134 139]


In [25]:
offset_sessions[:10]

array([ 0,  2,  4,  6,  7,  9, 13, 16, 18, 20], dtype=int32)

In [26]:
### 1. end - start의 최소 값을 추출한다.
### 만약 최소값이 3라면 즉, 하나의 세션의 item이 두개라면,
### 첫번째 item은 input으로 사용되고 두번째 item은 output으로 사용된다.
### 그리고 다음 배체에서 두번째 item은 input으로 사용되고 세번째 item이 output으로 사용된다.
### 해당 세션은 배치가 두번 돈 후 더이상 학습 할 수 없으므로 다음 세션으로 교체되어야 한다.
### 만약 최소값으 2라면, 해당 세션은 1번 배치 후 다음 세션으로 교체되어야 한다.
### 즉, end - start - 1의 최소 값은 현재 배치된 session의 반복 수를 의미한다.
### 2. out_idx는 각 session의 첫 itemidx를 나타낸다.
minlen = (end-start).min()
out_idx = data.ItemIdx.values[start]
print(minlen)
print(out_idx)

1
[  0   2   4   6   7   9  13  16  18  20  22  23  25  29  32  33  35  38
  39  52  54  57  62  67  74  77  78  81  82  52  87  88  91  96  97  99
 100 101 103  45 105 109 111 111 117 118 120 121 122 123]


In [27]:
data[:10]

Unnamed: 0,SessionId,ItemId,timestamp,ItemIdx
0,6,214701242,1396804000.0,0
1,6,214826623,1396804000.0,1
2,21,214838503,1396861000.0,2
7,21,214548744,1396862000.0,3
8,36,214551594,1396814000.0,4
9,36,214586970,1396814000.0,5
10,41,214821277,1396773000.0,6
15,53,214544355,1396449000.0,7
16,53,214601212,1396449000.0,8
19,56,214832557,1396543000.0,9


In [28]:

### 위에서 설명한 것과 같이, 각 세션의 첫번째 아이템이 in, 두번째 아이템이 out이 된 후 학습에 사용된다.
### minlen - 1 의 수만큼 반복(i)되어 학습한다.
i = 1
in_idx = out_idx
out_idx = data.ItemIdx.values[start+i+1]
print(in_idx)
print(out_idx)

[  0   2   4   6   7   9  13  16  18  20  22  23  25  29  32  33  35  38
  39  52  54  57  62  67  74  77  78  81  82  52  87  88  91  96  97  99
 100 101 103  45 105 109 111 111 117 118 120 121 122 123]
[  2   4   6   8   9  11  15  18  20  22  24  25  27  31  34  35  37  40
  41  54  56  59  64  68  76  79  80  83  52  85  89  90  93  97  99 100
 101 103 100 106 107 111 111 114 119 120 121 123 124  78]


In [29]:
start = start+minlen-1
mask = np.arange(len(iters))[(end-start)<=1]
print(end[:5])
print(start[:5])
print(mask[:5])

[2 4 6 7 9]
[0 2 4 6 7]
[ 3 10 14 17 25]


In [30]:
tic = time.time()
for epoch in range(n_epochs):
    epoch_cost = []
    state = [np.zeros([batch_size, rnn_size], dtype=np.float32) for _ in range(layers)]
    iters = np.arange(batch_size)
    maxiter = iters.max()
    
    start = offset_sessions[iters]
    end = offset_sessions[iters+1]
    
    finished = False
    while not finished:
        minlen = (end-start).min()
        out_idx = data.ItemIdx.values[start]
        for i in range(minlen-1):
            in_idx = out_idx
            out_idx = data.ItemIdx.values[start+i+1]
            # prepare inputs, targeted outputs and hidden states
            fetches = [cost, final_state, global_step, lr, train_op]
            feed_dict = {X: in_idx, Y: out_idx}
            for j in range(layers): 
                feed_dict[States[j]] = state[j]
            
            cost_, state, step, lr_, _ = sess.run(fetches, feed_dict)
            epoch_cost.append(cost_)
                
            if step == 1 or step % print_step == 0:
                avgc = np.mean(epoch_cost)
                print('Epoch {}\tStep {}\tlr: {:.5f}\tloss: {:.4f}\tElapsed: {:.1f}'.
                      format(epoch, step, lr_, avgc, time.time()-tic))

        start = start+minlen-1
        mask = np.arange(len(iters))[(end-start)<=1]
        for idx in mask:
            maxiter += 1
            if maxiter >= len(offset_sessions)-1:
                finished = True
                break
            iters[idx] = maxiter
            start[idx] = offset_sessions[maxiter]
            end[idx] = offset_sessions[maxiter+1]
        if len(mask):
            for i in range(layers):
                state[i][mask] = 0
        
    avgc = np.mean(epoch_cost)
    if np.isnan(avgc):
        print('Epoch {}: Nan error!'.format(epoch, avgc))
        break
    saver.save(sess, '{}/gru-model'.format(checkpoint_dir), global_step=epoch)
print("1 epoch elapsed time:", time.time() - tic)

Epoch 0	Step 1	lr: 0.00100	loss: 3.9120	Elapsed: 0.2
Epoch 0	Step 1000	lr: 0.00100	loss: 3.5049	Elapsed: 9.4
Epoch 0	Step 2000	lr: 0.00100	loss: 3.2726	Elapsed: 18.2
Epoch 0	Step 3000	lr: 0.00100	loss: 3.1093	Elapsed: 26.3
Epoch 0	Step 4000	lr: 0.00100	loss: 3.0285	Elapsed: 34.3
Epoch 0	Step 5000	lr: 0.00100	loss: 2.9560	Elapsed: 42.3
Epoch 0	Step 6000	lr: 0.00100	loss: 2.8879	Elapsed: 50.3
Epoch 0	Step 7000	lr: 0.00100	loss: 2.8253	Elapsed: 58.3
Epoch 0	Step 8000	lr: 0.00100	loss: 2.7765	Elapsed: 66.2
Epoch 0	Step 9000	lr: 0.00100	loss: 2.7323	Elapsed: 74.3
Epoch 0	Step 10000	lr: 0.00100	loss: 2.6896	Elapsed: 82.2
Epoch 0	Step 11000	lr: 0.00096	loss: 2.6459	Elapsed: 90.2
Epoch 0	Step 12000	lr: 0.00096	loss: 2.6174	Elapsed: 98.1
Epoch 0	Step 13000	lr: 0.00096	loss: 2.5881	Elapsed: 106.1
Epoch 0	Step 14000	lr: 0.00096	loss: 2.5645	Elapsed: 114.0
Epoch 0	Step 15000	lr: 0.00096	loss: 2.5406	Elapsed: 122.0
Epoch 0	Step 16000	lr: 0.00096	loss: 2.5239	Elapsed: 130.0
Epoch 0	Step 17000	lr: 0.

KeyboardInterrupt: 