# Reimplementing RETAIN
Notes and test code for setting up the model and data for my reimplementation of the RETAIN model.  

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import os.path
import sys
import time
import math
from datetime import datetime
from six.moves import xrange

import cPickle as pickle
import numpy as np
import tensorflow as tf

# Import from util.py...  
from util import *


In [None]:
# Basic model configuration and graph inputs.

# Clear the default graph before any op is defined.
tf.reset_default_graph()

# Tensor dimensions.
input_dim = 15851      # last axis of the input placeholder
output_dim = 2         # not referenced by the cells shown in this notebook
batch_size = 10
max_length = 20        # time axis of the input and mask placeholders
embedding_dim = 100
hidden_dim = 50        # GRU state size

# Filesystem locations (not referenced by the cells shown in this notebook).
data_dir = '/data1/stride6/data'
log_dir = '/home/kjung/projects/tf-recipes/logs'

# Numerical and optimisation constants.
EPS = 1e-10
l2_coeff = 0.0001
init_lr = 0.001

# Graph inputs.  All of them are fixed to a full batch of `batch_size` rows.
input_placeholder = tf.placeholder(dtype=tf.float32,
                                   shape=(batch_size, max_length, input_dim))
target_placeholder = tf.placeholder(dtype=tf.float32, shape=(batch_size,))
seqlen_placeholder = tf.placeholder(dtype=tf.int32, shape=(batch_size,))
loss_mask_placeholder = tf.placeholder(dtype=tf.float32,
                                       shape=(batch_size, max_length))

In [None]:

# Embedding layer: one dense projection shared by every time step.
embeddings = tf.get_variable(
    name='embeddings',
    shape=[input_dim, embedding_dim],
    initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))

# Fold (batch, time) into a single axis so one matmul covers every step,
# then unfold back to [batch_size, max_length, embedding_dim].
reshaped_input = tf.reshape(input_placeholder, shape=[-1, input_dim])
input_embeddings = tf.reshape(
    tf.matmul(reshaped_input, embeddings),
    shape=[batch_size, max_length, embedding_dim])

# Inspect the static shapes.
print(embeddings)
print(input_embeddings)


In [None]:
# rnn_alpha: a GRU over the embedded sequence whose per-step outputs are mapped
# to one scalar score per time step; a softmax over the time axis turns those
# scores into the attention weights `alpha` of shape [batch_size, max_length].
with tf.variable_scope('rnn_alpha') :
    rnn_cell_alpha = tf.nn.rnn_cell.GRUCell(hidden_dim)
    alpha_initial_state = rnn_cell_alpha.zero_state(batch_size, tf.float32)
    rnn_alpha_output, rnn_alpha_final_state = tf.nn.dynamic_rnn(cell=rnn_cell_alpha, 
                                                                dtype=tf.float32, 
                                                                sequence_length=seqlen_placeholder, 
                                                                initial_state=alpha_initial_state,
                                                                inputs=input_embeddings)
print(rnn_alpha_output)

# Flatten (batch, time) so a single matmul scores every step.
rnn_alpha_flat = tf.reshape(rnn_alpha_output, [-1, hidden_dim])
print(rnn_alpha_flat)

W_alpha = tf.get_variable('W_alpha', 
                          [hidden_dim,1],
                          initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))
bias_alpha = tf.get_variable('bias_alpha', [1,])
print(W_alpha)
print(bias_alpha)

alpha_scores = tf.matmul(rnn_alpha_flat, W_alpha) + bias_alpha
alpha_scores = tf.reshape(alpha_scores, [batch_size, max_length])

# Masked softmax over variable-length sequences.
# Setting padded scores to ~0 (score * mask + EPS) does NOT remove them from
# the softmax: exp(0) = 1, so padding would still receive attention weight.
# Instead push padded scores to a very large negative value so that their
# softmax weight underflows to zero and the valid steps sum to one.
# Assumes loss_mask_placeholder is 1.0 on valid steps and 0.0 on padding
# (the previous code multiplied the scores by it to zero the invalid ones).
MASKED_SCORE_OFFSET = -1e9
alpha_scores = alpha_scores + (1.0 - loss_mask_placeholder) * MASKED_SCORE_OFFSET
print(alpha_scores)

alpha = tf.nn.softmax(alpha_scores)
print(alpha)
  


In [None]:
# rnn_beta: a second GRU over the same embedded sequence.  Its per-step
# outputs are projected to embedding_dim and squashed with tanh, giving one
# gate vector per (batch, step) row.
with tf.variable_scope('rnn_beta'):
    rnn_cell_beta = tf.nn.rnn_cell.GRUCell(hidden_dim)
    beta_initial_state = rnn_cell_beta.zero_state(batch_size, tf.float32)
    rnn_beta_output, rnn_beta_final_state = tf.nn.dynamic_rnn(
        cell=rnn_cell_beta,
        inputs=input_embeddings,
        sequence_length=seqlen_placeholder,
        initial_state=beta_initial_state,
        dtype=tf.float32)
print(rnn_beta_output)

W_beta = tf.get_variable(
    name='W_beta',
    shape=[hidden_dim, embedding_dim],
    initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))
print(W_beta)
bias_beta = tf.get_variable(name='bias_beta', shape=[embedding_dim])

# Flatten (batch, time), project, apply tanh; the final reshape pins the
# result to [batch_size * max_length, embedding_dim].
beta_preactivation = tf.matmul(
    tf.reshape(rnn_beta_output, [-1, hidden_dim]), W_beta) + bias_beta
beta = tf.reshape(tf.tanh(beta_preactivation),
                  [batch_size * max_length, embedding_dim])
print(beta)


In [None]:
# Context node: gate every embedded step with beta, then scale it by alpha.
# `beta` and the flattened embeddings are both
# [batch_size * max_length, embedding_dim], with row index = b * max_length + t.
context = beta * tf.reshape(input_embeddings, [-1, embedding_dim])
print(context)

# NOTE(review): keep_prob is fixed at 0.5 and the op is part of the graph on
# every run, so dropout is also active when evaluating - confirm this is wanted.
context = tf.nn.dropout(context, tf.constant(0.5))

# Scale each row by the attention weight of its (batch, step) pair.
# alpha is [batch_size, max_length]; flattening it to a column vector yields
# the same b * max_length + t row order, so broadcasting lines the weights up
# with the rows of `context`.  (Reshaping `context` to
# [embedding_dim, batch_size*max_length] is NOT a transpose - it reinterprets
# the buffer and would pair alpha with the wrong elements.)
context = context * tf.reshape(alpha, [-1, 1])
print(context)


In [None]:
# Prediction node.
# `context` holds one row per (batch, step) pair, i.e. it is
# [batch_size*max_length, embedding_dim].  What we ultimately need is a single
# context vector per example ([batch_size, embedding_dim]) and from that a
# scalar score.

# TODO - reduce context to right shape : [batch_size, embedding_dim]...  use tf.gather_nd?  

# Linear classifier applied to every row, producing one logit per row.
W_classify = tf.get_variable(
    name='W_classify',
    shape=[embedding_dim, 1],
    initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))
bias_classify = tf.get_variable(name='bias_classify', shape=[1])
logits = tf.add(tf.matmul(context, W_classify), bias_classify)
print(logits)



In [None]:
# Reshape logits to [batch_size, max_length], then keep, for every example,
# the logit at its last valid step (index seqlen - 1).
logits = tf.reshape(logits, [batch_size, max_length])
print(logits)

# Build [batch_size, 2] (row, column) index pairs for gather_nd.
# tf.stack replaces tf.pack, which was removed in TensorFlow 1.0.
# NOTE(review): a sequence length of 0 produces column index -1, which
# gather_nd does not accept - assumes every example has at least one step.
last_step_indices = tf.stack([tf.range(batch_size), seqlen_placeholder - 1], axis=1)
logits = tf.gather_nd(logits, last_step_indices)
print(logits)


In [None]:
# Now apply sigmoid to logits to get probs...  
output_probs = tf.nn.sigmoid(logits)
print(output_probs)

# Loss: binary cross-entropy for 0/1 labels,
#     -( t * log(p) + (1 - t) * log(1 - p) ).
# Both terms need the log and both sit inside the negation; the earlier form
# added (1 - t) * (1 - p) un-logged and un-negated, which is not a
# cross-entropy.  EPS keeps the logs finite when p saturates at 0 or 1.
positive_term = target_placeholder * tf.log(output_probs + EPS)
negative_term = (1. - target_placeholder) * tf.log(1. - output_probs + EPS)
classification_loss = -1. * (positive_term + negative_term)
print(classification_loss)

# Average over the batch and add L2 regularisation on the classifier weights.
classification_loss = tf.reduce_mean(classification_loss)
l2_loss = l2_coeff * tf.nn.l2_loss(W_classify)
total_loss = classification_loss + l2_loss
print(classification_loss)
print(total_loss)


In [None]:
# Training setup: step counter, optimizer and train op, followed by a session
# whose variables have been initialised.
global_step = tf.Variable(initial_value=0, trainable=False, name='global_step')

# NOTE(review): the learning rate is hard-coded to 0.0001 here, while the
# config cell defines init_lr = 0.001 and never uses it - confirm which one
# is intended.
optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
train_op = optimizer.minimize(loss=total_loss, global_step=global_step)
print(train_op)

init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)





In [2]:
import numpy as np
# Scratch: start from an empty one-dimensional float32 array.
foo = np.zeros(0, dtype=np.float32)

In [3]:
# Show the empty array; it prints as [].
print(foo)

[]


In [4]:
# np.append returns a new array holding foo's contents followed by 0..9;
# foo itself is not modified.
bar = np.append(foo, np.arange(10))

In [5]:
# Show the appended values; they print as floats.
print(bar)

[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9.]
