In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import math
import spacy
from splitstream import splitfile
import json

In [8]:
jsons = []
f = open('../reviews_Musical_Instruments_5.json')
for jsonstr in splitfile(f, format="json"):
    jsons.append(json.loads(jsonstr))

df = pd.DataFrame(jsons)
df['tid'] = np.arange(len(df))

reviewerMappings = dict(zip(df.reviewerID.unique(),range(len(df.reviewerID.unique()))))
reviewerRevMappings = {userId:userIdx for (userIdx,userId) in reviewerMappings.items()}

df['reviewer_idx'] = df.reviewerID.apply(lambda x: reviewerMappings[x])

itemMappings = dict(zip(df.asin.unique(),range(len(df.asin.unique()))))
itemRevMapping = {itemId:itemIdx for (itemIdx,itemId) in itemMappings.items()}
df['item_idx'] = df.asin.apply(lambda x: itemMappings[x])

def convertToFrac(x):
    if(x[1] == 0):
        return -1
    return float(x[0]) / float(x[1])
df['helpfulness'] = df['helpful'].apply(convertToFrac)

def tokenize(s,nlp):
    doc = nlp(s)
    tokSen = []
    for word in doc:
        tokSen.append(word.text)
    return tokSen

nlp = spacy.load('en')
reviews = df.reviewText.values
tokenizedReviews = [tokenize(review.lower(),nlp) for review in reviews]

wordToIdx = {}
wordCounts = {}
wordToIdx['PAD'] = 0
wordCounts['PAD'] = 0
wordToIdx['END'] = 1
wordCounts['END'] = 0
idx = 2
for review in tokenizedReviews:
    for word in review:
        if(word in wordCounts):
            wordCounts[word] += 1
            continue
        wordToIdx[word] = idx
        wordCounts[word] = 1
        idx += 1
        
def tokenizeAndIndex(review):
    global nlp,wordToIdx
    words = tokenize(review.lower(),nlp)
    retVal = [wordToIdx[word] for word in words]
    retVal.append(wordToIdx['END'])
    return retVal
df['reviewIndexed'] = df.reviewText.apply(tokenizeAndIndex)
df['reviewLength'] = df.reviewIndexed.apply(len)

print("%age records with word length > 500: {0}".format(len(df[df.reviewLength>500]) / float(len(df))))
maxSeqLength = 500
df = df[df.reviewLength <= maxSeqLength]
def normalizeSequenceLength(sequence):
    if(len(sequence) < maxSeqLength):
        padding = [wordToIdx['PAD'] for i in range(maxSeqLength - len(sequence))]
        sequence.extend(padding)
    return sequence

df.reviewIndexed = df.reviewIndexed.apply(normalizeSequenceLength)

#Creating train and test sets
def sampleFrom(x):
    size = int(math.ceil(0.7*len(x)))
    return x.iloc[np.random.choice(range(len(x)),size=size)]
trainData = df.groupby('reviewerID').apply(sampleFrom)

trainData = df[df.tid.isin(trainData.tid.values) == True]
testData = df[df.tid.isin(trainData.tid.values) == False]

temp = testData[testData.asin.isin(trainData.asin.values) == False]
testData = testData[testData.asin.isin(trainData.asin.values) == True]
trainData = trainData.append(temp)

print("Size of training set: {0}".format(len(trainData)))
print("Size of test set: {0}".format(len(testData)))
print("Ratio: {0}/{1}".format(int(100 * float(len(trainData)) / len(df)),
                              int(math.ceil(100 * float(len(testData)) / len(df)))))

TypeError: the JSON object must be str, not 'bytes'

In [4]:
tf.reset_default_graph()

latentFactors = 50

mu = tf.Variable(tf.zeros(1),name='global_bias',dtype=tf.float32)

noUsers = 20000
userBias = tf.Variable(tf.random_uniform([noUsers],minval=0,maxval=1),name='user_bias',dtype=tf.float32)

noItems = 20000
itemBias = tf.Variable(tf.random_uniform([noItems],minval=0,maxval=1),name='item_bias',dtype=tf.float32)

userFactors = tf.Variable(tf.random_uniform([noUsers, latentFactors],minval=0,maxval=1),
                          name='user_factors',dtype=tf.float32)
itemFactors = tf.Variable(tf.random_uniform([noItems, latentFactors],minval=0,maxval=1),
                          name='item_factors',dtype=tf.float32)

userId = tf.placeholder(tf.int32, [None], name='user_id')
itemId = tf.placeholder(tf.int32, [None], name='item_id')

userBiasLU = tf.nn.embedding_lookup(userBias, userId)
userFactorLU = tf.nn.embedding_lookup(userFactors, userId)

itemBiasLU = tf.nn.embedding_lookup(itemBias, itemId)
itemFactorLU = tf.nn.embedding_lookup(itemFactors, itemId)

maxSeqLength = 500
vocab_size = 1000
latentFactors = 50
embedding_size = latentFactors

W = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
wordBias = tf.Variable(tf.random_uniform([vocab_size],minval=0,maxval=1),name='word_bias',dtype=tf.float32)

reviewInput = tf.placeholder(tf.int32, [None, maxSeqLength], name="review")
revEmbedding = tf.nn.embedding_lookup(W, reviewInput,name='embedding_lu')
revBias = tf.nn.embedding_lookup(wordBias, reviewInput,name='wordBias_lu')

mask = tf.placeholder(tf.float32, [None, maxSeqLength, 1], name="reviewMask")
revEmbedding = tf.multiply(revEmbedding,mask)
revBias = tf.multiply(revBias,tf.squeeze(mask,[2]))

revEmbedding = tf.reshape(revEmbedding, [-1, 50])
wordProbs = tf.reshape(tf.matmul(revEmbedding,itemFactorLU,transpose_b=True), [-1, 500])
reviewProb = tf.reduce_mean(wordProbs + revBias, axis=1)

predRatings = tf.reduce_sum(tf.multiply(itemFactorLU,userFactorLU),axis=1)

topKItems = tf.nn.top_k(predRatings,k=10,name='recommendations')
actRatings = tf.placeholder(tf.float32,[None],name='actual_ratings')

In [6]:
squaredPredLoss = tf.losses.mean_squared_error(actRatings,predRatings)

reviewLoss = -tf.reduce_mean(reviewProb)

rmse = tf.sqrt(squaredPredLoss)
maeLoss = tf.reduce_mean(tf.abs(actRatings - predRatings))

userReg = tf.reduce_mean(tf.square(userFactorLU))
itemReg = tf.reduce_mean(tf.square(itemFactorLU))

beta = 0.05
alpha = 0.1
loss = alpha*squaredPredLoss + (1-alpha)*reviewLoss

train_op = tf.contrib.layers.optimize_loss(loss=loss,
                                           global_step=tf.contrib.framework.get_global_step(),
                                           learning_rate=0.01,
                                           optimizer="Adam")

In [None]:
sess = tf.InteractiveSession()

sess.run(tf.global_variables_initializer())

fetches = {'eval_op':train_op}
noEpisodes = 100
batchSize = 1000
noEpochs = len(trainData) / batchSize

testFeed = {userId:testData.reviewer_idx.values,
            itemId:testData.item_idx.values,
            actRatings:testData.overall.values}

prevMAE = 10000
prevRMSE = 10000

print("episode no, train loss, test rmse, test mae")
for episode in range(noEpisodes):
    episodeData = trainData.iloc[np.random.permutation(len(trainData))]
    startIdx = 0
    episodeLoss = 0
    for epoch in range(noEpochs):
        batch = episodeData.iloc[startIdx:startIdx+batchSize]
        startIdx += batchSize

        labels = batch.overall.values
        userIds = batch.reviewer_idx.values
        itemIds = batch.item_idx.values
        
        reviewSequence = np.array(list(batch.reviewIndexed.values),dtype=np.int32)
        maskSequence = [np.append(np.ones(revLen),np.zeros(maxSeqLength-revLen)) for revLen in batch.reviewLength]
        feed_dict = {userId:userIds,itemId:itemIds,actRatings:labels,reviewInput:reviewSequence,mask:maskSequence}
        trainMetrics = sess.run(fetches,feed_dict)
        episodeLoss += trainMetrics['eval_op']

    episodeLoss /= noEpochs

    fetches = {'rmse':rmse,'mae':maeLoss}
    testMetrics = sess.run(fetches,testFeed)

    print "{},{},{},{}".format(episode,episodeLoss,testMetrics['rmse'],testMetrics['mae'])
        
    prevRMSE = testMetrics['rmse']
    prevMAE = testMetrics['mae']

    fetches = {'eval_op':train_op}