In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import math

Data prep

In [2]:
filePath = "~/RecSys/ratings_Musical_Instruments.csv"
df = pd.read_csv(filePath,delimiter=",")

df['tid'] = np.arange(len(df))

userMappings = dict(zip(df.user_id.unique(),range(len(df.user_id.unique()))))
userRevMapping = {userId:userIdx for (userIdx,userId) in userMappings.items()}

df['user_idx'] = df.user_id.apply(lambda x: userMappings[x])

itemMappings = dict(zip(df.item_id.unique(),range(len(df.item_id.unique()))))
itemRevMapping = {itemId:itemIdx for (itemIdx,itemId) in itemMappings.items()}
df['item_idx'] = df.item_id.apply(lambda x: itemMappings[x])

Creating train and test sets

In [5]:
itemGroups = df.groupby('item_id')
noRatingsPerItem = itemGroups.apply(len)
userGroups = df.groupby('user_id')
noRatingsPerUser = userGroups.apply(len)
                                    
def sampleFrom(x):
    #p = noRatingsPerUser.loc[x.user_id]
    #p = p / p.sum()
    size = int(math.ceil(0.5*len(x)))
    return x.iloc[np.random.choice(range(len(x)),size=size)]
trainData = df.groupby('user_id').apply(sampleFrom)
trainData = df[df.tid.isin(trainData.tid.values) == True]
testData = df[df.tid.isin(trainData.tid.values) == False]

temp = testData[testData.item_id.isin(trainData.item_id.values) == False]
testData = testData[testData.item_id.isin(trainData.item_id.values) == True]
trainData = trainData.append(temp)

print("Size of training set: {0}".format(len(trainData)))
print("Size of test set: {0}".format(len(testData)))
print("Ratio: {0}/{1}".format(int(100 * float(len(trainData)) / len(df)),
                              int(math.ceil(100 * float(len(testData)) / len(df)))))

Creating tensorflow network

In [8]:
tf.reset_default_graph()

mu = tf.Variable(tf.zeros(1),name='global_bias',dtype=tf.float32)

noUsers = len(df.user_id.unique())
userBias = tf.Variable(tf.random_uniform([noUsers],minval=0,maxval=1),name='user_bias',dtype=tf.float32)

noItems = len(df.item_id.unique())
itemBias = tf.Variable(tf.random_uniform([noItems],minval=0,maxval=1),name='item_bias',dtype=tf.float32)

latentFactors = 5
userFactors = tf.Variable(tf.random_uniform([noUsers, latentFactors],minval=0,maxval=1),
                          name='user_factors',dtype=tf.float32)
itemFactors = tf.Variable(tf.random_uniform([noItems, latentFactors],minval=0,maxval=1),
                          name='item_factors',dtype=tf.float32)

userId = tf.placeholder(tf.int32, [None], name='user_id')
itemId = tf.placeholder(tf.int32, [None], name='item_id')

userBiasLU = tf.nn.embedding_lookup(userBias, userId)
userFactorLU = tf.nn.embedding_lookup(userFactors, userId)

itemBiasLU = tf.nn.embedding_lookup(itemBias, itemId)
itemFactorLU = tf.nn.embedding_lookup(itemFactors, itemId)

predRatings = mu + itemBiasLU + userBiasLU + tf.reduce_sum(tf.multiply(itemFactorLU,userFactorLU),axis=1)
topKItems = tf.nn.top_k(predRatings,k=10,name='recommendations')
actRatings = tf.placeholder(tf.float32,[None],name='actual_ratings')

Creating Loss functions

In [9]:
squaredLoss = tf.losses.mean_squared_error(actRatings,predRatings)
rmse = tf.sqrt(squaredLoss)
maeLoss = tf.reduce_mean(tf.abs(actRatings - predRatings))

userReg = tf.nn.l2_loss(userFactors)
itemReg = tf.nn.l2_loss(itemFactors)

beta = 0.5
loss = maeLoss + beta*userReg + beta*itemReg
train_op = tf.contrib.layers.optimize_loss(loss=loss,
                                           global_step=tf.contrib.framework.get_global_step(),
                                           learning_rate=0.001,
                                           optimizer="Adam")

Start training

In [19]:
sess = tf.InteractiveSession()

sess.run(tf.global_variables_initializer())

fetches = {'eval_op':train_op}
noEpisodes = 100
batchSize = 1000
noEpochs = len(trainData) / batchSize

testFeed = {userId:testData.user_idx.values,
            itemId:testData.item_idx.values,
            actRatings:testData.rating.values}

prevMAE = 10000
prevRMSE = 10000

for episode in range(noEpisodes):
    episodeData = trainData.iloc[np.random.permutation(len(trainData))]
    startIdx = 0
    episodeLoss = 0
    for epoch in range(noEpochs):
        batch = episodeData.iloc[startIdx:startIdx+batchSize]
        startIdx += batchSize

        labels = batch.rating.values
        userIds = batch.user_idx.values
        itemIds = batch.item_idx.values
        feed_dict = {userId:userIds,itemId:itemIds,actRatings:labels}
        trainMetrics = sess.run(fetches,feed_dict)
        episodeLoss += trainMetrics['eval_op']

    episodeLoss /= noEpochs

    if(episode % 10 == 0):
        print "Episode: ",episode
        print "\t Training Loss: {0}".format(episodeLoss)
        print "\n"

        fetches = {'rmse':rmse,'mae':maeLoss}
        testMetrics = sess.run(fetches,testFeed)
        
        if(prevRMSE < testMetrics['rmse'] or prevMAE < testMetrics['mae']):
            break
        prevRMSE = testMetrics['rmse']
        prevMAE = testMetrics['mae']
        
        fetches = {'eval_op':train_op}
        print "\t Test RMSE Loss: {0}".format(testMetrics['rmse'])
        print "\t      MAE Loss: {0}".format(testMetrics['mae'])

Episode:  0
	 Training Loss: 101617.01384


	 Test RMSE Loss: 2.81927466393
	      MAE Loss: 2.64410972595
Episode:  10
	 Training Loss: 0.799484325696


	 Test RMSE Loss: 1.11149597168
	      MAE Loss: 0.777261018753
Episode:  20
	 Training Loss: 0.591310235903


	 Test RMSE Loss: 1.08298504353
	      MAE Loss: 0.722121059895
Episode:  30
	 Training Loss: 0.464577709585


	 Test RMSE Loss: 1.07900643349
	      MAE Loss: 0.716177523136
Episode:  40
	 Training Loss: 0.37633929239


	 Test RMSE Loss: 1.08269011974
	      MAE Loss: 0.722053468227
Episode:  50
	 Training Loss: 0.311710931852


	 Test RMSE Loss: 1.09157216549
	      MAE Loss: 0.732465028763


KeyboardInterrupt: 

Generating recommendations

In [65]:
l = 0
for uid in range(1000):
    noItemsInTest = len(testData.item_idx.unique())
    userFeed = {userId:np.ones((noItemsInTest)) * uid,
                itemId:testData.item_idx.unique()}
    fetches = {'topK':topKItems}
    retVal = sess.run(fetches=fetches,feed_dict=userFeed)
    topKRecommendations = retVal['topK']
    x = userGroups.get_group(userRevMapping[uid])
    lTemp = len(set(x.item_idx.values) & set(topKRecommendations.indices))
    if(l < lTemp):
        l = lTemp