In [1]:
# Imports
import pandas as pd
import autograd.numpy as np
from matplotlib import pyplot as plt
from numpy.random import multivariate_normal as N
from scipy.stats import norm as N1
from autograd import grad
import math
import random

In [2]:
# Load the ratings file.
# Every row has the layout [User ID] [Joke ID] [Rating], separated by single spaces;
# the file has no header row and all three fields are cast to integers.
ratingsFile = "ratings.dat"
allData = pd.read_csv(ratingsFile, sep=" ", header=None).astype(int)

In [3]:
# Name the three columns after the file layout: [User ID] [Joke ID] [Rating].
allData.columns = ['UserID', 'JokeID', 'Rating']

In [4]:
# Split by row position: the first numTrain rows form the training set and the
# next numTest rows form the test set. No shuffling is done.
numTrain, numTest = 100000, 100000
train = allData.iloc[:numTrain]
test = allData.iloc[numTrain:numTrain + numTest]

In [5]:
# Sorted lists of the distinct user and joke IDs that occur in each split.
trainUserIDs = sorted(train.UserID.unique())
trainJokeIDs = sorted(train.JokeID.unique())
testUserIDs = sorted(test.UserID.unique())
testJokeIDs = sorted(test.JokeID.unique())

In [6]:
# Compare the ID lists of the two splits (both are already sorted).
# Note that not all users in the testing set are in the training set!
trainJokeIDs == testJokeIDs, trainUserIDs == testUserIDs

(True, False)

In [7]:
# We precompute the ratings each user gave to the jokes he or she rated
def createMappings(data=None):
    '''
    Given a data set, creates the mappings necessary for quick access of results.

    data: a DataFrame with the columns UserID, JokeID and Rating. When omitted,
          the notebook-level `train` frame is used, so the zero-argument call
          keeps working.

    Returns four dictionaries, in this order:
      userToRatings - userID -> numpy array of the ratings given by that user
      userToJokeIDs - userID -> numpy array of the joke IDs rated by that user
      jokeToRatings - jokeID -> numpy array of the ratings given to that joke
      jokeToUserIDs - jokeID -> numpy array of the user IDs that rated that joke
    Within each array the entries follow the row order of `data`, so the i-th
    rating of a user belongs to the i-th joke ID of that user (and likewise for
    jokes and users).
    '''
    if data is None:
        data = train

    # One grouped pass per direction instead of one full-frame boolean mask
    # per ID; groupby keeps the original row order inside every group.
    userToRatings = {}
    userToJokeIDs = {}
    for userID, rows in data.groupby('UserID'):
        userToRatings[userID] = np.array(rows.Rating.values)
        userToJokeIDs[userID] = np.array(rows.JokeID.values)

    # Same as above, but in reverse.
    jokeToRatings = {}
    jokeToUserIDs = {}
    for jokeID, rows in data.groupby('JokeID'):
        jokeToRatings[jokeID] = np.array(rows.Rating.values)
        jokeToUserIDs[jokeID] = np.array(rows.UserID.values)

    return userToRatings, userToJokeIDs, jokeToRatings, jokeToUserIDs

In [None]:
# Build the four lookup dictionaries (user -> ratings, user -> joke IDs,
# joke -> ratings, joke -> user IDs) for the training split.
trainUtoR, trainUToJ, trainJtoR, trainJToU = createMappings()

In [None]:
# We're going to create a mapping from userID to row_index in U
# Also, a mapping from jokeID to row_index in V
# Additionally, a mapping from userID to row_index in V of jokes rated by the user
# and a mapping from jokeID to row_index in U of the users that rated that joke
def createInitialParams(K, sigma2u, sigma2v, users, jokes, UtoJ, JtoU):
    '''
    Builds the starting distribution parameters for U and V together with the
    ID-to-row lookups described in the comments above this function.

    Arguments, in order:
      K        - the number of latent variables
      sigma2u  - variance used for every user distribution
      sigma2v  - variance used for every joke distribution
      users    - list of user IDs
      jokes    - list of joke IDs
      UtoJ     - mapping userID -> joke IDs rated by that user
      JtoU     - mapping jokeID -> user IDs that rated that joke

    Returns (mus, nus, Sigmas, Taus, UserToU, UserToV, JokeToU, JokeToV):
      mus / nus       - ID -> zero mean vector of length K
      Sigmas / Taus   - ID -> variance times the K x K identity matrix
      UserToU         - userID -> row index in U
      JokeToV         - jokeID -> row index in V
      UserToV         - userID -> array of V row indexes of the jokes the user rated
      JokeToU         - jokeID -> array of U row indexes of the users that rated the joke
    '''
    # Row positions follow the order in which the IDs are listed.
    UserToU = {}
    for rowIndex, userID in enumerate(users):
        UserToU[userID] = rowIndex
    JokeToV = {}
    for rowIndex, jokeID in enumerate(jokes):
        JokeToV[jokeID] = rowIndex

    # Per-user entries: rated-joke rows plus the normal-distribution parameters.
    UserToV, mus, Sigmas = {}, {}, {}
    for userID in users:
        ratedRows = [JokeToV[jokeID] for jokeID in UtoJ[userID]]
        UserToV[userID] = np.array(ratedRows)
        mus[userID] = np.zeros(K)
        Sigmas[userID] = sigma2u * np.identity(K)

    # Per-joke entries, mirroring the user side.
    JokeToU, nus, Taus = {}, {}, {}
    for jokeID in jokes:
        raterRows = [UserToU[userID] for userID in JtoU[jokeID]]
        JokeToU[jokeID] = np.array(raterRows)
        nus[jokeID] = np.zeros(K)
        Taus[jokeID] = sigma2v * np.identity(K)

    return mus, nus, Sigmas, Taus, UserToU, UserToV, JokeToU, JokeToV

In [None]:
# Constant values that won't change. The only parameter that changes is K.
# sigma2u and sigma2v are handed to createInitialParams as the variances of the
# user and joke distributions; sigma2 is presumably the variance of the rating
# noise -- TODO confirm.
sigma2u, sigma2v, sigma2 = 5.0, 5.0, 1.0

# Start epoch iterations.
# We do Gibbs sampling on the specified value of K, the number of latent variables to consider.
def gibbsSampling(K, EPOCHS = 100):
    '''
    Runs EPOCHS sweeps of Gibbs sampling over the user matrix U and the joke
    matrix V, with K latent variables per row.

    The function reads these notebook-level names:
    trainUserIDs = a list of ids in the training set for users
    trainJokeIDs = a list of ids in the training set for jokes
    trainUtoR - training user to list of ratings
    trainUToJ - training user to list of joke ids rated
    trainJtoR - training joke to list of ratings
    trainJToU - training joke to list of user ids that rated it
    sigma2u, sigma2v - prior variances of the user and joke vectors
    sigma2 - variance of the rating noise

    Each sweep draws every user vector from its Gaussian conditional given the
    current joke vectors, then every joke vector from its conditional given the
    user vectors that were just drawn. The priors stay fixed for the whole run.

    Returns (Us, Vs, UserToU, UserToV, JokeToU, JokeToV). Us and Vs are
    dictionaries keyed by epoch number: key 0 holds the initial draw from the
    prior and key e + 1 holds the sample produced by sweep e, so each has
    EPOCHS + 1 entries. The remaining four values are the ID-to-row mappings
    returned by createInitialParams.
    '''
    print("Initializing prior distributions...")
    mus, nus, Sigmas, Taus, UserToU, UserToV, JokeToU, JokeToV = createInitialParams(
        K,sigma2u, sigma2v, trainUserIDs, trainJokeIDs, trainUToJ, trainJToU)

    # Prior precision matrices. They are computed once and never modified:
    # every conditional below is "prior + evidence from the current sample".
    inverseSigmas = {user : np.linalg.inv(Sigmas[user]) for user in Sigmas}
    inverseTaus = {joke : np.linalg.inv(Taus[joke]) for joke in Taus}

    print("Initialized prior distributions...")

    print("Initializing random samples of U,V...")
    U0 = np.zeros((len(trainUserIDs), K))
    V0 = np.zeros((len(trainJokeIDs), K))
    for user in trainUserIDs:
        U0[UserToU[user], :] = N(mus[user], Sigmas[user])

    for joke in trainJokeIDs:
        V0[JokeToV[joke], :] = N(nus[joke], Taus[joke])

    Us = {}
    Vs = {}
    Us[0] = U0
    Vs[0] = V0
    print("Finished initializing U,V...")

    print("Initializing epoch iterations!....")
    for epoch in range(EPOCHS):
        # Iterate over the users
        U = np.zeros(Us[epoch].shape)
        for user in trainUserIDs:
            Vi = Vs[epoch][UserToV[user], :] # The jokes rated by user i
            Rij = trainUtoR[user] # The ratings by user i

            # Conditional of this user's vector given the joke vectors:
            # precision = prior precision + Vi^T Vi / sigma2.
            # The prior is NOT replaced by this posterior afterwards; doing so
            # would count the same ratings again on every sweep and shrink the
            # sampling covariance towards zero.
            inverseSigmaNew = inverseSigmas[user] + Vi.T.dot(Vi) / sigma2
            SigmaNew = np.linalg.inv(inverseSigmaNew)
            muNew = SigmaNew.dot(Vi.T.dot(Rij) / sigma2 + inverseSigmas[user].dot(mus[user]))

            # Sample a new latent vector for this user.
            U[UserToU[user], :] = N(muNew, SigmaNew)

        print("Finished updating the U matrix...")

        # Iterate over the jokes
        V = np.zeros(Vs[epoch].shape)
        for joke in trainJokeIDs:
            # Condition on the user vectors drawn in THIS sweep so that the
            # stored (U, V) pair of an epoch is one state of a single chain.
            Ui = U[JokeToU[joke], :] # The users that rated our joke
            Rij = trainJtoR[joke] # The ratings of the joke
            inverseTauNew = inverseTaus[joke] + Ui.T.dot(Ui) / sigma2
            TauNew = np.linalg.inv(inverseTauNew)
            nuNew = TauNew.dot(Ui.T.dot(Rij) / sigma2 + inverseTaus[joke].dot(nus[joke]))

            # Sample a new latent vector for this joke.
            V[JokeToV[joke], :] = N(nuNew, TauNew)

        print("Finished updating the V matrix....")

        # Store the results!
        Us[epoch + 1] = U
        Vs[epoch + 1] = V

        print("Finished updating the results!...")
        print("Done with Epoch {}".format(epoch))

    print("Finished Epochs!")

    # We return the stack of Us and Vs as well as the mappings from Users to U/V and from Jokes to U/V
    return Us, Vs, UserToU, UserToV, JokeToU, JokeToV


In [None]:
# We run 100 epochs with K = 2 latent variables. Us and Vs are keyed by epoch
# number (0 is the initial draw), so each holds 101 matrices.
Us, Vs, UserToU, UserToV, JokeToU, JokeToV = gibbsSampling(2, 100)

In [None]:
def createRatingSet(data, nusers, njokes, UserToU, JokeToV, filename='test.out'):
    '''
    Given a pandas data frame, creates a matrix R with the ratings of each user filled in.

    data     - DataFrame with the columns UserID, JokeID and Rating
    nusers   - number of rows of R
    njokes   - number of columns of R
    UserToU  - mapping userID -> row index of R
    JokeToV  - mapping jokeID -> column index of R
    filename - text file used as a cache for R

    Returns an (nusers, njokes) float array in which R[i, j] is the rating of
    user i for joke j. Cells without a rating stay 0, so a genuine rating of 0
    cannot be told apart from "not rated".

    If `filename` can be loaded and has the expected shape it is returned as is,
    without looking at `data`. Otherwise R is rebuilt from `data` and written to
    `filename`. Rows whose user or joke has no entry in the mappings are skipped
    and counted in the printed message.
    '''
    try:
        R = np.loadtxt(filename)
        if R.shape == (nusers, njokes):
            return R
        # A cache written for different mappings must not be reused silently.
        print("Cached ratings in {} have shape {} instead of {}; rebuilding".format(
            filename, R.shape, (nusers, njokes)))
    except IOError:
        # No cache file yet: fall through and build the matrix.
        pass

    R = np.zeros((nusers, njokes))

    # Translate every row's IDs to matrix positions in one pass; IDs missing
    # from the mappings become NaN and are filtered out.
    rowIndex = data.UserID.map(UserToU)
    colIndex = data.JokeID.map(JokeToV)
    known = rowIndex.notnull() & colIndex.notnull()
    R[rowIndex[known].astype(int).values,
      colIndex[known].astype(int).values] = data.Rating[known].values

    ignored = len(data) - int(known.sum())
    print("Ignored a total of {} users/ratings that don't exist in training set".format(ignored))

    np.savetxt(filename, R)
    return R

In [None]:
def nonZeroSubtract(x,y):
    '''
    Computes x - y if x is not 0, otherwise returns 0.
    '''
    return x-y if x != 0 else 0

def vectSub(x, y):
    '''
    Element-wise version of nonZeroSubtract for arrays: x - y where x is
    non-zero and 0.0 everywhere else. x and y are broadcast against each other.

    The result is always a float array. Wrapping nonZeroSubtract in
    np.vectorize without otypes would take the output type from the first
    element only, so a leading zero in x would turn every difference into a
    truncated integer.
    '''
    x = np.asarray(x, dtype=float)
    return np.where(x != 0, x - y, 0.0)

In [None]:
# We can calculate these after as long as we return our stack of Us 
def getTrainTestMSE(Us, Vs, UserToU, JokeToV):
    '''
    Given the sequence of Us and Vs created during the epochs, calculates the
    mean squared error of the predictions U V^T on the training and testing data.

    Us, Vs   - dictionaries keyed by epoch holding the sampled U and V matrices;
               both must have the same, non-empty set of keys
    UserToU  - mapping userID -> row index in U
    JokeToV  - mapping jokeID -> row index in V

    Reads the notebook-level `train` and `test` frames and builds (or loads from
    'R_Train.out' / 'R_Test.out') their rating matrices with createRatingSet.
    Only cells with a non-zero rating contribute to the error, and each error
    is averaged over the number of such cells.

    Returns (test_mse, train_mse, utest_mse, utrain_mse), four lists with one
    entry per epoch in ascending epoch order. The "u" lists hold the MSE of
    each epoch on its own; the other two hold the running mean of those values
    up to and including that epoch.
    '''
    # We calculate the results progressively, so let us iterate over the sorted keys for the epochs.
    assert(sorted(Us.keys()) == sorted(Vs.keys()))
    epochs = sorted(Us.keys())
    assert(len(epochs) > 0)
    n = Us[epochs[0]].shape[0]
    m = Vs[epochs[0]].shape[0]

    # Create R (note that this function tries to read from disk if R already exists)
    RTrain = createRatingSet(train, n, m, UserToU, JokeToV, 'R_Train.out')
    RTest = createRatingSet(test, n, m, UserToU, JokeToV, 'R_Test.out')
    assert(np.count_nonzero(RTrain) == len(train))
    trainN = np.count_nonzero(RTrain)
    print("Training samples: {}.".format(trainN))
    testN = np.count_nonzero(RTest)
    print("Testing samples: {}.".format(testN))

    # Test rows whose user or joke is unknown never made it into RTest, so the
    # divisor is the number of ratings actually scored, not len(test).
    trainDivisor = float(trainN) if trainN > 0 else float('nan')
    testDivisor = float(testN) if testN > 0 else float('nan')

    train_mse = []
    utrain_mse = []
    train_avg = 0
    test_mse = []
    utest_mse = []
    test_avg = 0
    # `seen` counts the epochs averaged so far; it is used instead of the epoch
    # key so the running mean stays correct even if the keys do not start at 0.
    for seen, epoch in enumerate(epochs):
        # We DO NOT AVERAGE Us, we average the errors instead
        predictions = np.dot(Us[epoch], Vs[epoch].T)

        print("Calculating MSE of training data for epoch {}".format(epoch))

        res = np.sum(vectSub(RTrain, predictions) ** 2) / trainDivisor
        utrain_mse.append(res)
        train_avg = (res + seen * train_avg) / (seen + 1)
        train_mse.append(train_avg)
        print("Train MSE for Epoch {} is {}".format(epoch, train_avg))

        print("Calculating MSE of testing data for epoch {}".format(epoch))
        res = np.sum(vectSub(RTest, predictions) ** 2) / testDivisor
        utest_mse.append(res)
        test_avg = (res + seen * test_avg) / (seen + 1)
        test_mse.append(test_avg)
        print("Test MSE for Epoch {} is {}".format(epoch, test_avg))

    return (test_mse, train_mse, utest_mse, utrain_mse)

In [None]:
# Compute the running-average and per-epoch MSE lists for the test and training data.
(testMSE, trainMSE, uTestMSE, uTrainMSE) = getTrainTestMSE(Us, Vs, UserToU, JokeToV)

In [None]:
# One x position per recorded MSE value, so the plot follows the number of
# epochs that were actually run instead of assuming exactly 101 entries.
epochAxis = range(len(testMSE))
plt.scatter(epochAxis, testMSE, color='b', label="Test Data (Averaged)")
plt.scatter(epochAxis, trainMSE, color='r', label="Train Data (Averaged)")
plt.scatter(epochAxis, uTestMSE, color='g', label="Test Data")
plt.scatter(epochAxis, uTrainMSE, color='y', label="Train Data")
plt.xlim((0, max(len(testMSE) - 1, 1)))
plt.title("MSE of Test and Training Data (Averaged and Unaveraged)")
plt.xlabel("Epoch")
plt.ylabel("Mean Squared Error")
plt.legend()

In [None]:
# Display the MSE figure.
plt.show()