In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import math
import spacy
from splitstream import splitfile
import json

In [5]:
jsons = []
f = open('/home/ubuntu/RecSys/reviews_Musical_Instruments_5.json')
for jsonstr in splitfile(f, format="json"):
    jsons.append(json.loads(jsonstr))

df = pd.DataFrame(jsons)
df['tid'] = np.arange(len(df))

reviewerMappings = dict(zip(df.reviewerID.unique(),range(len(df.reviewerID.unique()))))
reviewerRevMappings = {userId:userIdx for (userIdx,userId) in reviewerMappings.items()}

df['reviewer_idx'] = df.reviewerID.apply(lambda x: reviewerMappings[x])

itemMappings = dict(zip(df.asin.unique(),range(len(df.asin.unique()))))
itemRevMapping = {itemId:itemIdx for (itemIdx,itemId) in itemMappings.items()}
df['item_idx'] = df.asin.apply(lambda x: itemMappings[x])

def convertToFrac(x):
    if(x[1] == 0):
        return -1
    return float(x[0]) / float(x[1])
df['helpfulness'] = df['helpful'].apply(convertToFrac)

def tokenize(s,nlp):
    doc = nlp(s)
    tokSen = []
    for word in doc:
        tokSen.append(word.text)
    return tokSen

nlp = spacy.load('en')
reviews = df.reviewText.values
tokenizedReviews = [tokenize(review.lower(),nlp) for review in reviews]

wordToIdx = {}
wordCounts = {}
wordToIdx['PAD'] = 0
wordCounts['PAD'] = 0
wordToIdx['END'] = 1
wordCounts['END'] = 0
idx = 2
for review in tokenizedReviews:
    for word in review:
        if(word in wordCounts):
            wordCounts[word] += 1
            continue
        wordToIdx[word] = idx
        wordCounts[word] = 1
        idx += 1
        
def tokenizeAndIndex(review):
    global nlp,wordToIdx
    words = tokenize(review.lower(),nlp)
    retVal = [wordToIdx[word] for word in words]
    retVal.append(wordToIdx['END'])
    return retVal
df['reviewIndexed'] = df.reviewText.apply(tokenizeAndIndex)
df['reviewLength'] = df.reviewIndexed.apply(len)

print "%age records with word length > 500: {0}".format(len(df[df.reviewLength>500]) / float(len(df)))
maxSeqLength = 500
df = df[df.reviewLength <= maxSeqLength]
def normalizeSequenceLength(sequence):
    if(len(sequence) < maxSeqLength):
        padding = [wordToIdx['PAD'] for i in range(maxSeqLength - len(sequence))]
        sequence.extend(padding)
    return sequence

df.reviewIndexed = df.reviewIndexed.apply(normalizeSequenceLength)

#Creating train and test sets
def sampleFrom(x):
    size = int(math.ceil(0.7*len(x)))
    return x.iloc[np.random.choice(range(len(x)),size=size)]
trainData = df.groupby('reviewerID').apply(sampleFrom)

trainData = df[df.tid.isin(trainData.tid.values) == True]
testData = df[df.tid.isin(trainData.tid.values) == False]

temp = testData[testData.asin.isin(trainData.asin.values) == False]
testData = testData[testData.asin.isin(trainData.asin.values) == True]
trainData = trainData.append(temp)

print("Size of training set: {0}".format(len(trainData)))
print("Size of test set: {0}".format(len(testData)))
print("Ratio: {0}/{1}".format(int(100 * float(len(trainData)) / len(df)),
                              int(math.ceil(100 * float(len(testData)) / len(df)))))

%age records with word length > 500: 0.0188090829354
Size of training set: 5698
Size of test set: 4370
Ratio: 56/44


In [14]:
tf.reset_default_graph()

latentFactors = 50
vocab_size = len(wordToIdx)
embedding_size = latentFactors

W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))

reviewInput = tf.placeholder(tf.int32, [None, maxSeqLength], name="review")
revEmbedding = tf.nn.embedding_lookup(W, reviewInput)
mask = tf.placeholder(tf.float32, [None, maxSeqLength, embedding_size], name="reviewMask")
revEmbedding = tf.multiply(revEmbedding,mask)
revEmbedding = tf.reduce_sum(revEmbedding, 1)

#mu = tf.Variable(tf.zeros(1),name='global_bias',dtype=tf.float32)

noUsers = len(df.reviewerID.unique())
#userBias = tf.Variable(tf.random_uniform([noUsers],minval=0,maxval=1),name='user_bias',dtype=tf.float32)

noItems = len(df.asin.unique())
#itemBias = tf.Variable(tf.random_uniform([noItems],minval=0,maxval=1),name='item_bias',dtype=tf.float32)

userFactors = tf.Variable(tf.random_uniform([noUsers, latentFactors],minval=0,maxval=1),
                          name='user_factors',dtype=tf.float32)
itemFactors = tf.Variable(tf.random_uniform([noItems, latentFactors],minval=0,maxval=1),
                          name='item_factors',dtype=tf.float32)

userId = tf.placeholder(tf.int32, [None], name='user_id')
itemId = tf.placeholder(tf.int32, [None], name='item_id')

#userBiasLU = tf.nn.embedding_lookup(userBias, userId)
userFactorLU = tf.nn.embedding_lookup(userFactors, userId)

#itemBiasLU = tf.nn.embedding_lookup(itemBias, itemId)
itemFactorLU = tf.nn.embedding_lookup(itemFactors, itemId)

predRatings = tf.reduce_sum(tf.multiply(itemFactorLU,userFactorLU),axis=1)
userRatings = tf.reduce_sum(tf.multiply(revEmbedding,userFactorLU),axis=1)
itemRatings = tf.reduce_sum(tf.multiply(revEmbedding,itemFactorLU),axis=1)

topKItems = tf.nn.top_k(predRatings,k=10,name='recommendations')
actRatings = tf.placeholder(tf.float32,[None],name='actual_ratings')

In [15]:
squaredPredLoss = tf.losses.mean_squared_error(actRatings,predRatings)
squaredUserLoss = tf.losses.mean_squared_error(actRatings,userRatings)
squaredItemLoss = tf.losses.mean_squared_error(actRatings,itemRatings)

rmse = tf.sqrt(squaredPredLoss)
maeLoss = tf.reduce_mean(tf.abs(actRatings - predRatings))

userReg = tf.reduce_mean(tf.square(userFactorLU))
itemReg = tf.reduce_mean(tf.square(itemFactorLU))
embReg = tf.reduce_mean(tf.square(revEmbedding))
beta = 0.05
loss = squaredPredLoss + squaredUserLoss + squaredItemLoss + beta*userReg + beta*itemReg
train_op = tf.contrib.layers.optimize_loss(loss=loss,
                                           global_step=tf.contrib.framework.get_global_step(),
                                           learning_rate=0.01,
                                           optimizer="Adam")

In [23]:
sess = tf.InteractiveSession()

sess.run(tf.global_variables_initializer())

fetches = {'eval_op':train_op}
noEpisodes = 100
batchSize = 1000
noEpochs = len(trainData) / batchSize

testFeed = {userId:testData.reviewer_idx.values,
            itemId:testData.item_idx.values,
            actRatings:testData.overall.values}

prevMAE = 10000
prevRMSE = 10000

print("episode no, train loss, test rmse, test mae")
for episode in range(noEpisodes):
    episodeData = trainData.iloc[np.random.permutation(len(trainData))]
    startIdx = 0
    episodeLoss = 0
    for epoch in range(noEpochs):
        batch = episodeData.iloc[startIdx:startIdx+batchSize]
        startIdx += batchSize

        labels = batch.overall.values
        userIds = batch.reviewer_idx.values
        itemIds = batch.item_idx.values
        
        reviewSequence = np.array(list(batch.reviewIndexed.values),dtype=np.int32)
        maskSequence = [np.append(np.ones(revLen),np.zeros(maxSeqLength-revLen)) for revLen in batch.reviewLength]
        feed_dict = {userId:userIds,itemId:itemIds,actRatings:labels,reviewInput:reviewSequence,mask:maskSequence}
        trainMetrics = sess.run(fetches,feed_dict)
        episodeLoss += trainMetrics['eval_op']

    episodeLoss /= noEpochs

    fetches = {'rmse':rmse,'mae':maeLoss}
    testMetrics = sess.run(fetches,testFeed)

    print "{},{},{},{}".format(episode,episodeLoss,testMetrics['rmse'],testMetrics['mae'])
        
    prevRMSE = testMetrics['rmse']
    prevMAE = testMetrics['mae']

    fetches = {'eval_op':train_op}

ResourceExhaustedError: OOM when allocating tensor with shape[29419,50]
	 [[Node: random_uniform/RandomUniform = RandomUniform[T=DT_INT32, dtype=DT_FLOAT, seed=0, seed2=0, _device="/job:localhost/replica:0/task:0/gpu:0"](random_uniform/shape)]]

Caused by op u'random_uniform/RandomUniform', defined at:
  File "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python2.7/dist-packages/traitlets/config/application.py", line 589, in launch_instance
    app.start()
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelapp.py", line 442, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python2.7/dist-packages/tornado/ioloop.py", line 883, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 391, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/ipkernel.py", line 199, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2723, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2825, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2885, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-14-a32cb3644b30>", line 7, in <module>
    W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/random_ops.py", line 244, in random_uniform
    seed2=seed2)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_random_ops.py", line 220, in _random_uniform
    seed=seed, seed2=seed2, name=name)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 763, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2327, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1226, in __init__
    self._traceback = _extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[29419,50]
	 [[Node: random_uniform/RandomUniform = RandomUniform[T=DT_INT32, dtype=DT_FLOAT, seed=0, seed2=0, _device="/job:localhost/replica:0/task:0/gpu:0"](random_uniform/shape)]]
