In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import math

Prepare data from original file

In [None]:
filePath = "~/RecSys/ratings_Musical_Instruments.csv"
df = pd.read_csv(filePath,delimiter=",")

df['tid'] = np.arange(len(df))

userMappings = dict(zip(df.user_id.unique(),range(len(df.user_id.unique()))))
userRevMapping = {userId:userIdx for (userIdx,userId) in userMappings.items()}

df['user_idx'] = df.user_id.apply(lambda x: userMappings[x])

itemMappings = dict(zip(df.item_id.unique(),range(len(df.item_id.unique()))))
itemRevMapping = {itemId:itemIdx for (itemIdx,itemId) in itemMappings.items()}
df['item_idx'] = df.item_id.apply(lambda x: itemMappings[x])

itemGroups = df.groupby('item_id')
noRatingsPerItem = itemGroups.apply(len)
userGroups = df.groupby('user_id')
noRatingsPerUser = userGroups.apply(len)

#Creating train and test sets

def sampleFrom(x):
    #p = noRatingsPerUser.loc[x.user_id]
    #p = p / p.sum()
    size = int(math.ceil(0.5*len(x)))
    return x.iloc[np.random.choice(range(len(x)),size=size)]
trainData = df.groupby('user_id').apply(sampleFrom)
trainData = df[df.tid.isin(trainData.tid.values) == True]
testData = df[df.tid.isin(trainData.tid.values) == False]

temp = testData[testData.item_id.isin(trainData.item_id.values) == False]
testData = testData[testData.item_id.isin(trainData.item_id.values) == True]
trainData = trainData.append(temp)

print("Size of training set: {0}".format(len(trainData)))
print("Size of test set: {0}".format(len(testData)))
print("Ratio: {0}/{1}".format(int(100 * float(len(trainData)) / len(df)),
                              int(math.ceil(100 * float(len(testData)) / len(df)))))

Or load from saved files...

In [2]:
filePath = "~/RecSys/ratings_Musical_Instruments.csv"
df = pd.read_csv(filePath,delimiter=",")
userGroups = df.groupby('user_id')

trainData = pd.read_csv('/home/ubuntu/RecSys/trainData.csv')
testData = pd.read_csv('/home/ubuntu/RecSys/testData.csv')

In [3]:
userMappings = dict(zip(df.user_id.unique(),range(len(df.user_id.unique()))))
userRevMapping = {userId:userIdx for (userIdx,userId) in userMappings.items()}

df['user_idx'] = df.user_id.apply(lambda x: userMappings[x])

itemMappings = dict(zip(df.item_id.unique(),range(len(df.item_id.unique()))))
itemRevMapping = {itemId:itemIdx for (itemIdx,itemId) in itemMappings.items()}
df['item_idx'] = df.item_id.apply(lambda x: itemMappings[x])

Creating tensorflow network

In [44]:
tf.reset_default_graph()

mu = tf.Variable(tf.zeros(1),name='global_bias',dtype=tf.float32)

noUsers = len(df.user_id.unique())
userBias = tf.Variable(tf.random_uniform([noUsers],minval=0,maxval=1),name='user_bias',dtype=tf.float32)

noItems = len(df.item_id.unique())
itemBias = tf.Variable(tf.random_uniform([noItems],minval=0,maxval=1),name='item_bias',dtype=tf.float32)

latentFactors = 50
userFactors = tf.Variable(tf.random_uniform([noUsers, latentFactors],minval=0,maxval=1),
                          name='user_factors',dtype=tf.float32)
itemFactors = tf.Variable(tf.random_uniform([noItems, latentFactors],minval=0,maxval=1),
                          name='item_factors',dtype=tf.float32)

userId = tf.placeholder(tf.int32, [None], name='user_id')
itemId = tf.placeholder(tf.int32, [None], name='item_id')

userBiasLU = tf.nn.embedding_lookup(userBias, userId)
userFactorLU = tf.nn.embedding_lookup(userFactors, userId)

itemBiasLU = tf.nn.embedding_lookup(itemBias, itemId)
itemFactorLU = tf.nn.embedding_lookup(itemFactors, itemId)

predRatings = mu + itemBiasLU + userBiasLU + tf.reduce_sum(tf.multiply(itemFactorLU,userFactorLU),axis=1)
topKItems = tf.nn.top_k(predRatings,k=10,name='recommendations')
actRatings = tf.placeholder(tf.float32,[None],name='actual_ratings')

Creating Loss functions

In [45]:
squaredLoss = tf.losses.mean_squared_error(actRatings,predRatings)
rmse = tf.sqrt(squaredLoss)
maeLoss = tf.reduce_mean(tf.abs(actRatings - predRatings))

userReg = tf.reduce_mean(tf.square(userFactorLU))
itemReg = tf.reduce_mean(tf.square(itemFactorLU))

beta = 0.05
loss = maeLoss + beta*userReg + beta*itemReg
train_op = tf.contrib.layers.optimize_loss(loss=loss,
                                           global_step=tf.contrib.framework.get_global_step(),
                                           learning_rate=0.01,
                                           optimizer="Adam")

Start training

In [46]:
sess = tf.InteractiveSession()

sess.run(tf.global_variables_initializer())

fetches = {'eval_op':train_op}
noEpisodes = 100
batchSize = 1000
noEpochs = len(trainData) / batchSize

testFeed = {userId:testData.user_idx.values,
            itemId:testData.item_idx.values,
            actRatings:testData.rating.values}

prevMAE = 10000
prevRMSE = 10000

print("episode no, train loss, test rmse, test mae")
for episode in range(noEpisodes):
    episodeData = trainData.iloc[np.random.permutation(len(trainData))]
    startIdx = 0
    episodeLoss = 0
    for epoch in range(noEpochs):
        batch = episodeData.iloc[startIdx:startIdx+batchSize]
        startIdx += batchSize

        labels = batch.rating.values
        userIds = batch.user_idx.values
        itemIds = batch.item_idx.values
        feed_dict = {userId:userIds,itemId:itemIds,actRatings:labels}
        trainMetrics = sess.run(fetches,feed_dict)
        episodeLoss += trainMetrics['eval_op']

    episodeLoss /= noEpochs

    fetches = {'rmse':rmse,'mae':maeLoss}
    testMetrics = sess.run(fetches,testFeed)

    print "{},{},{},{}".format(episode,episodeLoss,testMetrics['rmse'],testMetrics['mae'])
        
    prevRMSE = testMetrics['rmse']
    prevMAE = testMetrics['mae']

    fetches = {'eval_op':train_op}

episode no, train loss, test rmse, test mae
0,3.6383128502,2.20157527924,1.75790774822
1,1.74262724492,2.14853167534,1.6839094162
2,1.71459503464,2.03091835976,1.60903608799
3,1.49034933632,1.95286226273,1.54688000679
4,1.37099198399,1.81477129459,1.44539916515
5,1.21037630049,1.72896134853,1.37855124474
6,1.11121500718,1.62254393101,1.29827749729
7,1.00514532542,1.55774867535,1.24690651894
8,0.948956213173,1.47882175446,1.18306171894
9,0.870989158674,1.43192255497,1.14648580551
10,0.84964560948,1.37449169159,1.09818899632
11,0.789433303521,1.33490741253,1.05992686749
12,0.7816856682,1.29897844791,1.0244781971
13,0.737846340927,1.27742052078,1.00519573689
14,0.735366341578,1.2526216507,0.9807690382
15,0.7017516872,1.23736250401,0.963768005371
16,0.699468786946,1.21871197224,0.94155395031
17,0.67787554497,1.20814108849,0.926775693893
18,0.673996503734,1.20088040829,0.916720569134
19,0.656768002335,1.18900370598,0.898796081543
20,0.653934389504,1.19100415707,0.90294867754
21,0.6413482444

KeyboardInterrupt: 

Generating recommendations

In [47]:
precisionAtK = 0
fetches = {'topK':topKItems,'predRatings':predRatings}
for uid in range(noUsers):
    testItems = np.random.randint(noItems,size=(1000))
    x = userGroups.get_group(userRevMapping[0])
    ratedItems = x.item_idx.unique()
    testItems = np.append(testItems,ratedItems)
    testItems = np.array(list(set(testItems)),dtype=np.int32)
    userFeed = {userId:np.ones((len(testItems))) * uid,
                itemId:testItems}
    retVal = sess.run(fetches=fetches,feed_dict=userFeed)
    topKRecommendations = retVal['topK']
    topKIds = testItems[topKRecommendations.indices]
    precisionAtK += len(set(ratedItems) & set(topKIds))

precisionAtK = 100.0 * precisionAtK / (10.0 * noUsers)
print("Precision@10: {0}".format(precisionAtK))

Precision@10: 0.0305986186404


In [None]:
temp = trainData[['user_idx','item_idx','rating']]
#temp.to_csv('/home/ubuntu/RecSys/trainRatings.csv')
f = open('/home/ubuntu/RecSys/trainRatings.csv','w')
np.savetxt(f,temp.values,fmt='%d',delimiter=',')

In [None]:
temp = testData[['user_idx','item_idx','rating']]
f = open('/home/ubuntu/RecSys/testRatings.csv','w')
np.savetxt(f,temp.values,fmt='%d',delimiter=',')