In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import math
import spacy
from splitstream import splitfile
import json

In [3]:
jsons = []
f = open('/home/ubuntu/RecSys/reviews_Musical_Instruments_5.json')
for jsonstr in splitfile(f, format="json"):
    jsons.append(json.loads(jsonstr))

df = pd.DataFrame(jsons)
df['tid'] = np.arange(len(df))

reviewerMappings = dict(zip(df.reviewerID.unique(),range(len(df.reviewerID.unique()))))
reviewerRevMappings = {userId:userIdx for (userIdx,userId) in reviewerMappings.items()}

df['reviewer_idx'] = df.reviewerID.apply(lambda x: reviewerMappings[x])

itemMappings = dict(zip(df.asin.unique(),range(len(df.asin.unique()))))
itemRevMapping = {itemId:itemIdx for (itemIdx,itemId) in itemMappings.items()}
df['item_idx'] = df.asin.apply(lambda x: itemMappings[x])

def convertToFrac(x):
    if(x[1] == 0):
        return -1
    return float(x[0]) / float(x[1])
df['helpfulness'] = df['helpful'].apply(convertToFrac)

def tokenize(s,nlp):
    doc = nlp(s)
    tokSen = []
    for word in doc:
        tokSen.append(word.text)
    return tokSen

nlp = spacy.load('en')
reviews = df.reviewText.values
tokenizedReviews = [tokenize(review.lower(),nlp) for review in reviews]

wordToIdx = {}
wordCounts = {}
wordToIdx['PAD'] = 0
wordCounts['PAD'] = 0
wordToIdx['END'] = 1
wordCounts['END'] = 0
idx = 2
for review in tokenizedReviews:
    for word in review:
        if(word in wordCounts):
            wordCounts[word] += 1
            continue
        wordToIdx[word] = idx
        wordCounts[word] = 1
        idx += 1
        
def tokenizeAndIndex(review):
    global nlp,wordToIdx
    words = tokenize(review.lower(),nlp)
    retVal = [wordToIdx[word] for word in words]
    retVal.append(wordToIdx['END'])
    return retVal
df['reviewIndexed'] = df.reviewText.apply(tokenizeAndIndex)
df['reviewLength'] = df.reviewIndexed.apply(len)

print("%age records with word length > 500: {0}".format(len(df[df.reviewLength>500]) / float(len(df))))
maxSeqLength = 500
df = df[df.reviewLength <= maxSeqLength]
def normalizeSequenceLength(sequence):
    if(len(sequence) < maxSeqLength):
        padding = [wordToIdx['PAD'] for i in range(maxSeqLength - len(sequence))]
        sequence.extend(padding)
    return sequence

df.reviewIndexed = df.reviewIndexed.apply(normalizeSequenceLength)

#Creating train and test sets
def sampleFrom(x):
    size = int(math.ceil(0.7*len(x)))
    return x.iloc[np.random.choice(range(len(x)),size=size)]
trainData = df.groupby('reviewerID').apply(sampleFrom)

trainData = df[df.tid.isin(trainData.tid.values) == True]
testData = df[df.tid.isin(trainData.tid.values) == False]

temp = testData[testData.asin.isin(trainData.asin.values) == False]
testData = testData[testData.asin.isin(trainData.asin.values) == True]
trainData = trainData.append(temp)

print("Size of training set: {0}".format(len(trainData)))
print("Size of test set: {0}".format(len(testData)))
print("Ratio: {0}/{1}".format(int(100 * float(len(trainData)) / len(df)),
                              int(math.ceil(100 * float(len(testData)) / len(df)))))

%age records with word length > 500: 0.0188090829354
Size of training set: 5665
Size of test set: 4403
Ratio: 56/44


In [113]:
tf.reset_default_graph()

latentFactors = 20

mu = tf.Variable(tf.zeros(1),name='global_bias',dtype=tf.float32)

noUsers = len(df.reviewerID.unique())
userBias = tf.Variable(tf.random_uniform([noUsers],minval=0,maxval=1),name='user_bias',dtype=tf.float32)

noItems = len(df.asin.unique())
itemBias = tf.Variable(tf.random_uniform([noItems],minval=0,maxval=1),name='item_bias',dtype=tf.float32)

userFactors = tf.Variable(tf.random_uniform([noUsers, latentFactors],minval=0,maxval=1),
                          name='user_factors',dtype=tf.float32)
itemFactors = tf.Variable(tf.random_uniform([noItems, latentFactors],minval=0,maxval=1),
                          name='item_factors',dtype=tf.float32)

userId = tf.placeholder(tf.int32, [None], name='user_id')
itemId = tf.placeholder(tf.int32, [None], name='item_id')

userBiasLU = tf.nn.embedding_lookup(userBias, userId)
userFactorLU = tf.nn.embedding_lookup(userFactors, userId)

itemBiasLU = tf.nn.embedding_lookup(itemBias, itemId)
itemFactorLU = tf.nn.embedding_lookup(itemFactors, itemId)

vocab_size = len(wordToIdx)
embedding_size = latentFactors

W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
wordBias = tf.Variable(tf.random_uniform([vocab_size],minval=0,maxval=1),name='word_bias',dtype=tf.float32)

reviewInput = tf.placeholder(tf.int32, [None, maxSeqLength], name="review")
revEmbedding = tf.nn.embedding_lookup(W, reviewInput,name='embedding_lu')
revBias = tf.nn.embedding_lookup(wordBias, reviewInput,name='wordBias_lu')

mask = tf.placeholder(tf.float32, [None, maxSeqLength, 1], name="reviewMask")
revEmbedding = tf.multiply(revEmbedding,mask, name='multiplyMaskEmbedding')
revBias = tf.multiply(revBias,tf.squeeze(mask,[2]),name='multiplyMaskBias') #1000,500

wordProbs = tf.matmul(revEmbedding, tf.expand_dims(itemFactorLU,1), transpose_b=True, name='wordProbs')
wordProbs = tf.squeeze(wordProbs,axis=2)

reviewProb = tf.add(wordProbs,revBias,name='reviewProbUnnorm')
reviewProb = tf.reduce_mean(reviewProb, axis=1,name='reviewProbNorm')

predRatings = tf.reduce_sum(tf.multiply(itemFactorLU,userFactorLU),axis=1)

topKItems = tf.nn.top_k(predRatings,k=10,name='recommendations')
actRatings = tf.placeholder(tf.float32,[None],name='actual_ratings')

In [114]:
squaredPredLoss = tf.losses.mean_squared_error(actRatings,predRatings)

reviewLoss = tf.reduce_mean(reviewProb)

rmse = tf.sqrt(squaredPredLoss)
maeLoss = tf.reduce_mean(tf.abs(actRatings - predRatings))

userReg = tf.reduce_mean(tf.square(userFactorLU))
itemReg = tf.reduce_mean(tf.square(itemFactorLU))

beta = 0.05
alpha = 0.7
loss = alpha*squaredPredLoss + (1-alpha)*reviewLoss

train_op = tf.contrib.layers.optimize_loss(loss=loss,
                                           global_step=tf.contrib.framework.get_global_step(),
                                           learning_rate=0.001,
                                           optimizer="Adam")

In [115]:
sess = tf.InteractiveSession()

sess.run(tf.global_variables_initializer())

fetches = {'eval_op':train_op}
noEpisodes = 50
batchSize = 500
noEpochs = len(trainData) / batchSize

testFeed = {userId:testData.reviewer_idx.values,
            itemId:testData.item_idx.values,
            actRatings:testData.overall.values}

prevMAE = 10000
prevRMSE = 10000

print("episode no, train loss, test rmse, test mae")
for episode in range(noEpisodes):
    episodeData = trainData.iloc[np.random.permutation(len(trainData))]
    startIdx = 0
    episodeLoss = 0
    for epoch in range(noEpochs):
        batch = episodeData.iloc[startIdx:startIdx+batchSize]
        startIdx += batchSize

        labels = batch.overall.values
        userIds = batch.reviewer_idx.values
        itemIds = batch.item_idx.values
        
        reviewSequence = np.array(list(batch.reviewIndexed.values),dtype=np.int32)
        maskSequence = [np.append(np.ones((revLen,1)),np.zeros((maxSeqLength-revLen,1)),axis=0) 
                        for revLen in batch.reviewLength]
        feed_dict = {userId:userIds,itemId:itemIds,actRatings:labels,reviewInput:reviewSequence,mask:maskSequence}
        trainMetrics = sess.run(fetches,feed_dict)
        episodeLoss += trainMetrics['eval_op']

    episodeLoss /= noEpochs

    fetches = {'rmse':rmse,'mae':maeLoss}
    testMetrics = sess.run(fetches,testFeed)

    print "{},{},{},{}".format(episode,episodeLoss,testMetrics['rmse'],testMetrics['mae'])
        
    prevRMSE = testMetrics['rmse']
    prevMAE = testMetrics['mae']

    fetches = {'eval_op':train_op}

episode no, train loss, test rmse, test mae
0,1.4666889039,1.42086398602,1.06803679466
1,1.3302303661,1.37972462177,1.03280842304
2,1.2275851098,1.34334719181,1.00186002254
3,1.13113185492,1.31036734581,0.973746776581
4,1.05937282606,1.28090667725,0.948532998562
5,0.972280020064,1.25534307957,0.926683366299
6,0.91068086299,1.23178958893,0.906675934792
7,0.857886569066,1.21160614491,0.889737665653
8,0.797970858487,1.19337904453,0.87424916029
9,0.754103102467,1.1772313118,0.860660552979
10,0.703573021022,1.16249465942,0.848532497883
11,0.660977623679,1.14997339249,0.838210225105
12,0.620899541812,1.13856780529,0.828559577465
13,0.593140591275,1.12852919102,0.820092201233
14,0.557861401276,1.11939561367,0.81252104044
15,0.517745524645,1.11122846603,0.80577236414
16,0.497221616181,1.10389828682,0.79976940155
17,0.466213898225,1.09740257263,0.794289886951
18,0.446199864149,1.09167599678,0.789517104626
19,0.418082733046,1.08645522594,0.785197138786
20,0.396627017043,1.08186554909,0.781481266

In [17]:
x = np.append(np.ones((revLen,1)),np.ones((500-revLen,1)),axis=0)
np.shape(x)

(500, 1)