In [None]:
# Imports
import pandas as pd
import autograd.numpy as np
from matplotlib import pyplot as plt
from autograd.numpy.random import multivariate_normal as N
from autograd.scipy.stats import norm as N1
from autograd import grad
import math
import random

In [None]:
# Load the ratings file into a data frame of integers.
# Each row is formatted as [User ID] [Joke ID] [Rating], separated by single spaces.
ratingsFile = "ratings.dat"
allData = pd.read_csv(ratingsFile, sep=" ", header=None).astype(int)

In [None]:
# The file has no header row, so name the three columns explicitly.
allData.columns = ['UserID', 'JokeID', 'Rating']

In [None]:
# The first numTrain rows are used for training, the following numTest rows for testing.
numTrain = 100000
numTest = 100000
train = allData.iloc[:numTrain]
test = allData.iloc[numTrain:numTrain + numTest]

In [None]:
# Sorted lists of the distinct user and joke IDs that appear in each split.
trainUserIDs = sorted(train['UserID'].unique())
trainJokeIDs = sorted(train['JokeID'].unique())
testUserIDs = sorted(test['UserID'].unique())
testJokeIDs = sorted(test['JokeID'].unique())

In [None]:
# Note that not all users in the testing set are in the training set!
# (The ID lists were already sorted when they were built, so they can be compared directly.)
(trainJokeIDs == testJokeIDs, trainUserIDs == testUserIDs)

In [None]:
# We precompute the ratings each user gave to the jokes he or she rated
# (and, in reverse, the ratings each joke received and from whom).
def createMappings(data=None):
    '''
    Given a data set, creates the mappings necessary for quick access of results.

    Parameters:
        data -- pandas DataFrame with 'UserID', 'JokeID' and 'Rating' columns.
                Defaults to the module-level `train` frame, which is what the
                original zero-argument call operated on.

    Returns a tuple of four dicts:
        userToRatings -- userID -> numpy array of the ratings given by that user
        userToJokeIDs -- userID -> numpy array of the joke IDs rated by that user
        jokeToRatings -- jokeID -> numpy array of the ratings received by that joke
        jokeToUserIDs -- jokeID -> numpy array of the user IDs that rated that joke
    Within every array the entries follow the row order of `data`, so the i-th
    rating of a user lines up with the i-th joke ID of that user (and likewise
    per joke).
    '''
    if data is None:
        data = train

    # This is a mapping from userID to a numpy array of ratings for that user.
    userToRatings = {}
    # This is a mapping from userID to the IDs of the jokes rated by the user.
    userToJokeIDs = {}

    # Same as above, but in reverse.
    jokeToRatings = {}
    jokeToUserIDs = {}

    # A single groupby pass replaces one boolean scan of the whole frame per ID.
    # groupby visits the keys in sorted order and keeps the original row order
    # inside each group.
    for userID, userRows in data.groupby('UserID'):
        userToRatings[userID] = np.array(userRows.Rating.values)
        userToJokeIDs[userID] = np.array(userRows.JokeID.values)

    for jokeID, jokeRows in data.groupby('JokeID'):
        jokeToRatings[jokeID] = np.array(jokeRows.Rating.values)
        jokeToUserIDs[jokeID] = np.array(jokeRows.UserID.values)

    return userToRatings, userToJokeIDs, jokeToRatings, jokeToUserIDs

In [None]:
# Build the per-user and per-joke lookup tables for the training split.
trainUtoR, trainUToJ, trainJtoR, trainJToU = createMappings()

In [None]:
# Four lookup tables are built here:
#   * userID -> row index in U
#   * jokeID -> row index in V
#   * userID -> row indices in V of the jokes rated by that user
#   * jokeID -> row indices in U of the users that rated that joke
def createInitialParams(K, sigma2u, sigma2v, users, jokes, UtoJ, JtoU):
    '''
    Creates the initial parameters of the U and V distributions together with the
    index mappings listed in the comment above.

    Inputs, in order: K, the number of latent variables; sigma2u and sigma2v, the
    variances used for the U and V distributions; the list of user IDs; the list of
    joke IDs; a mapping from user ID to the joke IDs that user rated; and a mapping
    from joke ID to the user IDs that rated it.

    Returns (mus, nus, Sigmas, Taus, UserToU, UserToV, JokeToU, JokeToV). mus / nus
    hold one random K-vector per user / joke, drawn from N(1, I); Sigmas / Taus hold
    sigma2u * I / sigma2v * I per user / joke.
    '''
    # Row index of every user in U and of every joke in V (position in the input lists).
    UserToU = dict((user, position) for position, user in enumerate(users))
    JokeToV = dict((joke, position) for position, joke in enumerate(jokes))

    # Rows of V touched by each user, and rows of U touched by each joke.
    UserToV = {}
    for user in users:
        UserToV[user] = np.array([JokeToV[joke] for joke in UtoJ[user]])
    JokeToU = {}
    for joke in jokes:
        JokeToU[joke] = np.array([UserToU[user] for user in JtoU[joke]])

    # Random starting means: all user draws first, then all joke draws.
    mus = {}
    for user in users:
        mus[user] = N(np.ones(K), np.identity(K))
    nus = {}
    for joke in jokes:
        nus[joke] = N(np.ones(K), np.identity(K))

    # Isotropic starting covariances.
    Sigmas = dict((user, sigma2u * np.identity(K)) for user in users)
    Taus = dict((joke, sigma2v * np.identity(K)) for joke in jokes)

    return mus, nus, Sigmas, Taus, UserToU, UserToV, JokeToU, JokeToV

In [None]:
# Now we try to code up Part 3!
def LogLikelihood(theta, rij, k, sigmaepsilon2, z1, z2):
    '''
    Return the log-likelihood of a single rating for one reparameterised sample.

    Parameters:
        theta         -- length 4*k vector laid out as [mui | nuj | logsigmai | logtauj]
        rij           -- the observed rating
        k             -- number of latent variables
        sigmaepsilon2 -- variance of the rating noise
        z1, z2        -- length-k noise vectors used to build the samples
                         ui = mui + z1 * exp(logsigmai) and vj = nuj + z2 * exp(logtauj)

    Returns log N(rij; (ui . vj) / k, sigmaepsilon2).
    '''
    assert(len(theta) == k * 4)
    mui, nuj, logsigmai, logtauj = theta[:k], theta[k:2*k], theta[2*k:3*k], theta[3*k:]

    # Reparameterisation trick: sample = mean + noise * standard deviation.
    ui = mui + z1 * np.exp(logsigmai)
    vj = nuj + z2 * np.exp(logtauj)

    # NOTE(review): the mean is the inner product divided by k, whereas
    # getTrainTestResults scores predictions with the unscaled product U V^T.
    # The scaling is kept as it was; confirm which of the two is intended.
    mean = np.sum(ui * vj) / k

    # logpdf expects a standard deviation as its scale argument, not a variance.
    return N1.logpdf(rij, mean, np.sqrt(sigmaepsilon2))

# Gradient of LogLikelihood; autograd's grad differentiates with respect to the
# first positional argument (theta) by default.
gradLogLikelihood = grad(LogLikelihood)

In [None]:
def MCExpectedGradLogLikelihood(theta, rij, sigmaepsilon2, K, S):
    '''
    Monte Carlo estimate of the expected gradient of the log likelihood of one rating.

    theta is the parameter vector handed to gradLogLikelihood, rij the observed rating,
    sigmaepsilon2 the noise variance, K the number of latent variables and S the number
    of samples averaged. Each sample uses two fresh standard-normal K-vectors.
    '''
    noiseMean = np.zeros(K)
    noiseCov = np.identity(K)
    gradientSum = np.zeros(len(theta))
    for _ in range(S):
        z1 = N(noiseMean, noiseCov)
        z2 = N(noiseMean, noiseCov)
        gradientSum = gradientSum + gradLogLikelihood(theta, rij, K, sigmaepsilon2, z1, z2)
    return gradientSum / S

In [None]:
# We now code up our gradient ascent algorithm, on the data, using multiple epochs!
EPOCHS = 10
sigmaU2, sigmaV2, sigmaepsilon2 = 5.0, 5.0, 1.0
def optimzeLowerBound(alpha, K, S, NSamples):
    '''
    Optimizes the lower bound by stochastic gradient ascent and records the parameters per epoch.

    Every user i has q(u_i) = N(mu_i, diag(exp(2 * logsigma_i))) and every joke j has
    q(v_j) = N(nu_j, diag(exp(2 * logtau_j))); the priors are N(0, sigmaU2 * I) and
    N(0, sigmaV2 * I).

    Parameters:
        alpha    -- learning rate for stochastic gradient ascent
        K        -- number of latent variables
        S        -- number of Monte Carlo samples per rating for the likelihood gradient
        NSamples -- number of training ratings drawn (without replacement) in each epoch

    Returns (userThetas, jokeThetas, UserToU, UserToV, JokeToU, JokeToV).
    userThetas[e] / jokeThetas[e] are the parameter matrices after e epochs (key 0 holds
    the initial values, so there are EPOCHS + 1 entries). Each row is laid out as
    [mean (K entries) | log standard deviation (K entries)] and is indexed through
    UserToU / JokeToV.
    '''
    # Initialize parameters
    print("Initializing initial distributions...")
    mus, nus, Sigmas, Taus, UserToU, UserToV, JokeToU, JokeToV = createInitialParams(
        K, sigmaU2, sigmaV2, trainUserIDs, trainJokeIDs, trainUToJ, trainJToU)

    # Convert the covariance diagonals (variances) to log standard deviations:
    # log(sigma) = 0.5 * log(sigma^2).
    logSigmas = {user: 0.5 * np.log(np.diag(cov)) for user, cov in Sigmas.items()}
    logTaus = {joke: 0.5 * np.log(np.diag(cov)) for joke, cov in Taus.items()}

    # Number of ratings per joke / per user, used to spread each prior+entropy
    # gradient over the ratings that share the parameter.
    numUsers = {joke: len(JokeToU[joke]) for joke in trainJokeIDs}
    numJokes = {user: len(UserToV[user]) for user in trainUserIDs}

    print("Initialized prior distributions...")

    print("Initializing random samples of params...")
    thetasUser = np.zeros((len(trainUserIDs), 2*K))
    thetasJoke = np.zeros((len(trainJokeIDs), 2*K))
    for user in trainUserIDs:
        thetasUser[UserToU[user], :] = np.concatenate((mus[user], logSigmas[user]))
    for joke in trainJokeIDs:
        thetasJoke[JokeToV[joke], :] = np.concatenate((nus[joke], logTaus[joke]))

    userThetas = {0: np.copy(thetasUser)}
    jokeThetas = {0: np.copy(thetasJoke)}
    print("Finished initializing parameters...")

    print("Initializing epoch iterations!....")
    for epoch in range(EPOCHS):
        # Each epoch starts from the previous epoch's parameters and updates a copy.
        userThetas[epoch + 1] = np.copy(userThetas[epoch])
        jokeThetas[epoch + 1] = np.copy(jokeThetas[epoch])
        currentUsers = userThetas[epoch + 1]
        currentJokes = jokeThetas[epoch + 1]

        # select a random subset of the training data!
        # random.sample needs a plain sequence; .loc selects by index label.
        rows = random.sample(list(train.index), NSamples)
        trainSample = train.loc[rows]
        for i, (rating_Num, row) in enumerate(trainSample.iterrows()):
            # Get the rating, user and joke for this row
            rij = row.Rating
            user = row.UserID
            joke = row.JokeID
            userRow = UserToU[user]
            jokeRow = JokeToV[joke]

            # Grab parameters for this user/joke in the epoch we're updating!
            mui = currentUsers[userRow, :K]
            logsigmai = currentUsers[userRow, K:]
            nuj = currentJokes[jokeRow, :K]
            logtauj = currentJokes[jokeRow, K:]

            # Calculate gradient of likelihood term for mui, nuj, logsigmai, logtauj.
            theta = np.concatenate((mui, nuj, logsigmai, logtauj))
            assert(len(theta) == 4 * K)
            gradL = MCExpectedGradLogLikelihood(theta, rij, sigmaepsilon2, K, S)

            # Analytical gradient of the prior and entropy terms (i.e. of -KL(q || prior)), scaled
            # by the number of ratings sharing each parameter:
            #   d/d mean    = -mean / priorVariance
            #   d/d log std = 1 - exp(2 * log std) / priorVariance   (the 1 comes from the entropy)
            gradKL = np.concatenate((-mui / (float(sigmaU2) * numJokes[user]),
                                     -nuj / (float(sigmaV2) * numUsers[joke]),
                                     (1.0 - np.exp(2 * logsigmai) / float(sigmaU2)) / numJokes[user],
                                     (1.0 - np.exp(2 * logtauj) / float(sigmaV2)) / numUsers[joke]))

            # Now we can combine them; fall back to the likelihood gradient alone if the
            # prior/entropy part is not a number.
            gradient = gradL + gradKL
            if np.isnan(gradKL).any():
                gradient = gradL

            # Skip the update entirely when the gradient is unusable.
            if not np.isnan(gradient).any():
                # Update the results using gradient ascent!
                currentUsers[userRow, :K] += alpha * (gradient[:K])
                currentJokes[jokeRow, :K] += alpha * (gradient[K:2*K])
                currentUsers[userRow, K:] += alpha * (gradient[2*K:3*K])
                currentJokes[jokeRow, K:] += alpha * (gradient[3*K:])

            if i % 1000 == 0:
                print("Processed {} ratings".format(i))

        print("Finished the results for epoch {}".format(epoch))

    print("Finished Epochs!")

    return userThetas, jokeThetas, UserToU, UserToV, JokeToU, JokeToV
    

In [None]:
def createRatingSet(data, nusers, njokes, UserToU, JokeToV, filename='test.out'):
    '''
    Given a pandas data frame, creates a matrix R with the ratings of each user filled in.

    Parameters:
        data     -- data frame with UserID, JokeID and Rating columns
        nusers   -- number of rows of R
        njokes   -- number of columns of R
        UserToU  -- mapping from user ID to row index of R
        JokeToV  -- mapping from joke ID to column index of R
        filename -- text file used as an on-disk cache of R

    Returns the (nusers, njokes) matrix R. Entries without a rating stay 0. Rows whose
    user or joke is missing from the mappings are skipped and counted in a printed message.
    If `filename` already holds a matrix of the expected shape it is returned as is;
    otherwise R is rebuilt from `data` and written to `filename`.
    '''
    expectedShape = (nusers, njokes)
    try:
        # ndmin=2 keeps single-row / single-column matrices two-dimensional.
        R = np.loadtxt(filename, ndmin=2)
        if R.shape == expectedShape:
            return R
        # A cache left over from a different split must not be reused silently.
        print("Cached ratings in {} have shape {}, expected {}; rebuilding".format(
            filename, R.shape, expectedShape))
    except IOError:
        # No cache on disk yet: fall through and build the matrix.
        pass

    R = np.zeros(expectedShape)
    ignored = 0
    for row in data.itertuples(index=False):
        try:
            userRow = UserToU[row.UserID]
            jokeCol = JokeToV[row.JokeID]
        except KeyError:
            # We don't have this user or joke in our training set, so we just ignore it.
            ignored += 1
            continue
        R[userRow, jokeCol] = row.Rating

    print("Ignored a total of {} users/ratings that don't exist in training set".format(ignored))

    np.savetxt(filename, R)
    return R

In [None]:
def nonZeroSubtract(x,y):
    '''
    Computes x-y iff x \neq 0, otherwise returns 0.
    '''
    return x-y if x != 0 else 0

def vectSub(x, y):
    '''
    Element-wise version of nonZeroSubtract: x - y where x is non-zero, 0 elsewhere.

    Implemented with np.where rather than np.vectorize(nonZeroSubtract): np.vectorize
    picks its output dtype from the first element it evaluates, so a leading zero in x
    (which yields the integer 0) made every later floating-point difference get
    truncated to an integer.
    '''
    x = np.asarray(x)
    return np.where(x != 0, x - y, 0)

In [None]:
# We can calculate these after as long as we return our stack of Us 
def getTrainTestResults(userTheta, jokeTheta, UserToU, JokeToV,k, nsamples = 100):
    '''
    Estimates the expected log likelihood and the lower bound per epoch, on train and test data.

    Parameters:
        userTheta, jokeTheta -- dicts epoch -> parameter matrix as returned by optimzeLowerBound;
                                every row is [mean (k entries) | log standard deviation (k entries)]
        UserToU, JokeToV     -- mappings from user / joke ID to row index of those matrices
        k                    -- number of latent variables
        nsamples             -- number of (U, V) samples drawn per epoch for the Monte Carlo estimate

    Returns (train_likelihood, train_l, test_likelihood, test_l): four lists with one entry for
    each epoch in range(EPOCHS), holding the expected log likelihood and the lower bound for
    the training and the testing ratings. The parameters stored under key EPOCHS are not evaluated.

    A rating matrix entry of 0 is treated as "not rated".
    '''
    assert(sorted(userTheta.keys()) == sorted(jokeTheta.keys()))
    n, m = len(trainUserIDs), len(trainJokeIDs)

    # Precompute the rows of the users/jokes of the testing set that also exist in training
    testUserUIndex = [UserToU[user] for user in testUserIDs if user in UserToU]
    testJokeVIndex = [JokeToV[joke] for joke in testJokeIDs if joke in JokeToV]

    # Create R (note that this function tries to read from disk if R already exists)
    RTrain = createRatingSet(train, n, m, UserToU, JokeToV, 'R_Train.out')
    RTest = createRatingSet(test, n, m, UserToU, JokeToV, 'R_Test.out')

    # Masks of the observed entries, computed once instead of per sample.
    trainObserved = RTrain != 0
    testObserved = RTest != 0

    assert(np.count_nonzero(RTrain) == len(train))
    trainN = np.count_nonzero(RTrain)
    print("Training samples: {}.".format(trainN))
    testN = np.count_nonzero(RTest)
    print("Testing samples: {}.".format(testN))

    train_likelihood = []
    train_l = []
    test_likelihood = []
    test_l = []
    print("Beginning epochs!")
    for epoch in range(EPOCHS):
        userParams = userTheta[epoch]
        jokeParams = jokeTheta[epoch]
        userMeans, userStds = userParams[:, :k], np.exp(userParams[:, k:])
        jokeMeans, jokeStds = jokeParams[:, :k], np.exp(jokeParams[:, k:])

        # Sample so we can estimate the likelihood term!
        tll = 0
        tll_test = 0

        print("Starting sampling from distribution!")
        for _ in range(nsamples):
            # Sample U,V. The covariances are diagonal, so every entry is an independent
            # normal draw: mean + standard deviation * standard normal noise.
            U = userMeans + userStds * np.random.randn(n, k)
            V = jokeMeans + jokeStds * np.random.randn(m, k)

            # We have a new sample, so calculate predictions
            predictions = np.dot(U, V.T)

            # Calculate likelihood
            tll += _gaussianLogLikelihood(RTrain, trainObserved, trainN, predictions, sigmaepsilon2)
            tll_test += _gaussianLogLikelihood(RTest, testObserved, testN, predictions, sigmaepsilon2)

        print("Finished sampling! Calculating values!")

        tll = tll / float(nsamples)
        print("Train Log Likelihood for Epoch {} is {}".format(epoch, tll))
        tll_test = tll_test / float(nsamples)
        print("Test Log Likelihood for Epoch {} is {}".format(epoch, tll_test))

        # Now that we have the expected likelihood, use closed form solution to calculate entropy and prior terms
        # for the lower bound!
        train_bound = tll + _negativeKL(userParams, k, sigmaU2) + _negativeKL(jokeParams, k, sigmaV2)
        print("Train Lower Bound for Epoch {} is {}".format(epoch, train_bound))

        # For the test, we repeat the same as the train but we can only do it with the parameters where
        # the user/joke exists in the test set.
        test_bound = (tll_test
                      + _negativeKL(userParams[testUserUIndex, :], k, sigmaU2)
                      + _negativeKL(jokeParams[testJokeVIndex, :], k, sigmaV2))
        print("Test Lower Bound for Epoch {} is {}".format(epoch, test_bound))

        # Store the results!
        train_likelihood.append(tll)
        train_l.append(train_bound)
        test_likelihood.append(tll_test)
        test_l.append(test_bound)

    return train_likelihood, train_l, test_likelihood, test_l


def _gaussianLogLikelihood(R, observed, numObserved, predictions, noiseVariance):
    '''
    Log likelihood of the observed entries of R under independent N(prediction, noiseVariance) noise.
    The normalising constant is counted once per observed rating.
    '''
    residuals = (R - predictions)[observed]
    return (-0.5 * numObserved * np.log(2 * math.pi * noiseVariance)
            - 0.5 * np.sum(residuals ** 2) / float(noiseVariance))


def _negativeKL(theta, k, priorVariance):
    '''
    Sum over the rows of theta of -KL(q || p), with q = N(mean, diag(exp(2 * logstd))) read from
    a row laid out as [mean | logstd], and p = N(0, priorVariance * I).
    '''
    means, logStds = theta[:, :k], theta[:, k:]
    return np.sum(0.5 - 0.5 * np.log(priorVariance) + logStds
                  - (np.exp(2 * logStds) + means ** 2) / (2.0 * priorVariance))

In [None]:
# NOTE(review): scratch inspection of the initial parameters of the first user row.
# `userThetas` is only assigned by the training loop in a later cell, so this cell raises
# NameError when the notebook is run top to bottom on a fresh kernel. The :1 / 1: split
# presumably assumes the K = 1 row layout -- confirm before relying on it.
(userThetas[0][0, :1], np.diag(np.exp(2 * userThetas[0][0, 1:])))

In [None]:
# Fit and evaluate one model for each number of latent variables a = 1..4.
# Entry a-1 of every *_k list holds the per-epoch results for that model.
train_ll_k, train_lb_k, test_ll_k, test_lb_k = [], [], [], []
for a in range(1,5):
    # Learning rate 0.1, a latent variables, 1 Monte Carlo sample per rating, 1 rating per epoch.
    userThetas, jokeThetas, UserToU, UserToV, JokeToU, JokeToV = optimzeLowerBound(0.1, a, 1, 1)
    # Evaluate with 10 samples of (U, V) per epoch.
    train_ll , train_lb, test_ll, test_lb = getTrainTestResults(userThetas, jokeThetas, UserToU, JokeToV, a, 10)
    train_ll_k.append(train_ll)
    train_lb_k.append(train_lb)
    test_ll_k.append(test_ll)
    test_lb_k.append(test_lb)

In [None]:
# Display the per-epoch training lower bounds of every fitted model.
train_lb_k

In [None]:
# Plot the per-epoch curves of the first fitted model (K = 1).
# The x axis is derived from the number of recorded epochs instead of a hard-coded 10,
# so the plot keeps working when EPOCHS changes.
epochAxis = range(len(train_lb_k[0]))
plt.scatter(epochAxis, train_lb_k[0], color='b', label="Train Data LowerBound")
plt.scatter(epochAxis, train_ll_k[0], color='r', label="Train Data Log Likelihood" )
plt.scatter(epochAxis, test_lb_k[0], color='g', label="Test Data Lower Bound")
plt.scatter(epochAxis, test_ll_k[0], color='y', label="Test Data Log Likelihood")
plt.xlim((0, len(epochAxis)))
plt.title("Log Likelihood and LowerBound Plots for K = 1")
plt.xlabel("Epoch")
plt.ylabel("Log Likelihood/LowerBound")
plt.legend()

In [None]:
# Render the figure built in the previous cell.
plt.show()