In [28]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from numpy.random import multivariate_normal as N
from numpy.random import normal as N1

In [2]:
# Load the ratings file.
# Every line holds three space-separated fields: [User ID] [Joke ID] [Rating]
ratingsFile = "ratings.dat"
# No header row in the file; cast every column to int straight after parsing.
allData = pd.read_csv(ratingsFile, sep=" ", header=None).astype(int)

In [3]:
# Sanity check: all three columns should report an integer dtype after the cast
allData.dtypes

0    int64
1    int64
2    int64
dtype: object

In [4]:
# Name the columns after the [User ID] [Joke ID] [Rating] layout of each row
allData.columns = ['UserID', 'JokeID', 'Rating']

In [5]:
# Hold out the first `numTest` rows as the test set; train on everything else.
numTest = 100000
test, train = allData[:numTest], allData[numTest:]

# The largest ID seen is used as the matrix dimension (later cells index
# the matrices with ID - 1).
numUsers = int(allData['UserID'].max())
numJokes = int(allData['JokeID'].max())

In [None]:
# An n x m matrix containing the training ratings: R[i, j] is the rating
# assigned by user (i + 1) to joke (j + 1). Pairs without a rating stay 0.
# A single vectorised indexed assignment replaces a row-by-row iterrows() loop,
# which is extremely slow on a data set of this size.
# NOTE(review): assumes each (user, joke) pair occurs at most once in `train`;
# numpy does not guarantee which value wins for repeated indices.
R = np.zeros((numUsers, numJokes))
R[train['UserID'].values - 1, train['JokeID'].values - 1] = train['Rating'].values

In [None]:
# Same layout as the training matrix, filled from the held-out test rows:
# RTest[i, j] is the rating of user (i + 1) for joke (j + 1), 0 where absent.
# Vectorised indexed assignment instead of a per-row iterrows() loop.
# NOTE(review): assumes each (user, joke) pair occurs at most once in `test`.
RTest = np.zeros((numUsers, numJokes))
RTest[test['UserID'].values - 1, test['JokeID'].values - 1] = test['Rating'].values

In [6]:
# Load the ratings matrices from text files on disk so we don't have to
# recalculate them (presumably saved from an earlier run -- the code that
# writes these files is not shown here).
R = np.loadtxt('ratings_matrix.out')
RTest = np.loadtxt('ratings_matrix_test.out')

In [None]:
# Model parameters and initial values
def initialValues(K):
    """Draw random starting factors for the plain factorisation model.

    Each row is sampled from a zero-mean K-dimensional Gaussian with
    covariance 0.0001 * I.

    Returns a pair (U0, V0) with one row per user (numUsers rows) and one
    row per joke (numJokes rows) respectively, K columns each.
    """
    mean = np.zeros(K)
    cov = 0.0001 * np.identity(K)
    # Users are drawn first, then jokes.
    U0 = N(mean=mean, cov=cov, size=numUsers)
    V0 = N(mean=mean, cov=cov, size=numJokes)
    return U0, V0

def nonZeroOp(x, y):
    """Scalar masked difference: return x - y, or 0.0 when x is exactly 0.

    A zero x marks a missing rating, so it contributes no error.
    """
    return 0.0 if x == 0.0 else x-y


def vectorizedNonZeroSub(x, y):
    """Element-wise version of nonZeroOp for arrays (with broadcasting).

    Returns an array holding x - y wherever x is non-zero and 0.0 elsewhere.
    Implemented with np.where rather than np.vectorize: np.vectorize makes one
    Python-level call per element (very slow on a full ratings matrix) and
    refuses size-0 inputs unless `otypes` is given.
    """
    x = np.asarray(x)
    return np.where(x == 0.0, 0.0, x - y)

In [8]:
def gradientDescent(T,K,sigma2, learnRate=0.05):
    """Fit the factor matrices with T epochs of full-batch gradient steps.

    This is batch (not stochastic) descent: every epoch computes the residual
    over the whole ratings matrix R and updates all rows of U and V at once.

    Parameters
    ----------
    T : number of epochs to run.
    K : number of latent dimensions.
    sigma2 : divisor applied to the learning rate (step = learnRate / sigma2).
    learnRate : base step size.

    Returns
    -------
    (U, V) : the user and joke factor matrices after T epochs.
    """
    U, V = initialValues(K)
    step = learnRate / sigma2
    for epoch in range(T):
        # Residual on rated entries only; entries where R is 0 contribute 0.
        G = vectorizedNonZeroSub(R, U.dot(V.T))
        # Matrix form of the per-row updates. The right-hand side is evaluated
        # in full before rebinding, so both updates see this epoch's starting
        # U and V.
        U, V = U + step * G.dot(V), V + step * G.T.dot(U)
        print("Epoch {} Finished".format(epoch))
    return U,V

def stochasticDescent(T,K, sigma2, learnRate=0.05):
    """Fit the factor matrices with per-rating gradient updates.

    Every epoch walks the rows of `train` in their stored order (no
    shuffling) and, for each (user, joke, rating) triple, nudges that user's
    row of U and that joke's row of V along the prediction error.

    Parameters
    ----------
    T : number of passes over the training data.
    K : number of latent dimensions.
    sigma2 : divisor applied to the learning rate (step = learnRate / sigma2).
    learnRate : base step size.

    Returns
    -------
    (U, V) : the user and joke factor matrices after T epochs.
    """
    U, V = initialValues(K)
    step = learnRate / sigma2
    # Convert the frame once; it is not modified during training.
    samples = np.array(train)
    for epoch in range(T):
        for row in samples:
            i = row[0] - 1
            j = row[1] - 1
            rating = int(row[2])
            # Snapshot only the two rows this update touches. Copying the
            # whole of U and V per rating would cost O(numUsers * K) each step.
            u_old = U[i,:].copy()
            v_old = V[j,:].copy()
            err = rating - u_old.dot(v_old)
            U[i,:] = u_old + step * err * v_old
            V[j,:] = v_old + step * err * u_old
        print("Epoch {} Finished!".format(epoch))
    return U,V

In [9]:
# U,V = stochasticDescent(10,2,1.0)

In [10]:
def RMSE(data, U, V):
    """Root-mean-square error of the predictions U[i] . V[j] over `data`.

    Parameters
    ----------
    data : DataFrame with 'UserID', 'JokeID' and 'Rating' columns; IDs are
        shifted by one to obtain row indices into U and V.
    U, V : user and joke factor matrices.

    Returns
    -------
    float : sqrt(mean((prediction - rating) ** 2)).

    Raises
    ------
    ZeroDivisionError : if `data` has no rows.

    Ratings are truncated to int before comparison. The computation is
    vectorised rather than using iterrows(), which is slow and upcasts every
    row to a common dtype (turning the IDs into floats that cannot be used as
    indices when any column is float).
    """
    import math
    users = data['UserID'].values.astype(int) - 1
    jokes = data['JokeID'].values.astype(int) - 1
    ratings = data['Rating'].values.astype(int)
    # Row-wise dot products U[i] . V[j] for every rated pair.
    predictions = np.sum(U[users] * V[jokes], axis=1)
    acc = float(np.sum((predictions - ratings) ** 2))
    return math.sqrt(acc / len(data))

def RMSE2(ranks, U, V):
    """RMSE of the predictions U.dot(V.T) over the non-zero entries of `ranks`.

    Parameters
    ----------
    ranks : ratings matrix; a 0 entry means "no rating" and is ignored.
    U, V : user and joke factor matrices.

    Returns
    -------
    sqrt(sum of squared residuals on rated entries / number of rated entries).
    The result is nan or inf (with a numpy warning) when `ranks` has no
    non-zero entry.
    """
    ranks = np.asarray(ranks)
    observed = ranks != 0
    # Zero out the residual wherever there is no rating.
    residuals = np.where(observed, ranks - U.dot(V.T), 0.0)
    return np.sqrt(np.sum(residuals ** 2) / np.count_nonzero(observed))

In [11]:
# RMSE2(R, U, V)

In [14]:
# Evaluate for different values of K
def parametrizeK(K, T=10, sigma2=1.0):
    """Train one model per latent dimension k = 1..K and record its RMSE.

    Parameters
    ----------
    K : largest number of latent dimensions to try.
    T : epochs passed to stochasticDescent for each k.
    sigma2 : sigma2 passed to stochasticDescent for each k.

    Returns
    -------
    (train_res, test_res) : two lists of length K; entry k-1 holds the RMSE
    on R and on RTest for the model with k dimensions.
    """
    train_res = []
    test_res = []
    for k in range(1, K + 1):
        U, V = stochasticDescent(T, k, sigma2)
        train_rmse = RMSE2(R, U, V)
        test_rmse = RMSE2(RTest, U, V)
        train_res.append(train_rmse)
        test_res.append(test_rmse)
        print("Finished for K={}".format(k))
        print("RMSE: {}, RMSETEST: {}".format(train_rmse, test_rmse))
    return train_res, test_res

In [15]:
# Train models for K = 1..5 and keep their train/test RMSE for plotting
rmse_train, rmse_test = parametrizeK(5)
# x-axis values are derived from the results so they always match in length
x = range(1, len(rmse_train) + 1)

Epoch 0 Finished!
Epoch 1 Finished!
Epoch 2 Finished!
Epoch 3 Finished!
Epoch 4 Finished!
Epoch 5 Finished!
Epoch 6 Finished!
Epoch 7 Finished!
Epoch 8 Finished!
Epoch 9 Finished!
Finished for K=1
RMSE: 1.09369672092, RMSETEST: 1.13372062941
Epoch 0 Finished!
Epoch 1 Finished!
Epoch 2 Finished!
Epoch 3 Finished!
Epoch 4 Finished!
Epoch 5 Finished!
Epoch 6 Finished!
Epoch 7 Finished!
Epoch 8 Finished!
Epoch 9 Finished!
Finished for K=2
RMSE: 1.06647749029, RMSETEST: 1.12749220839
Epoch 0 Finished!
Epoch 1 Finished!
Epoch 2 Finished!
Epoch 3 Finished!
Epoch 4 Finished!
Epoch 5 Finished!
Epoch 6 Finished!
Epoch 7 Finished!
Epoch 8 Finished!
Epoch 9 Finished!
Finished for K=3
RMSE: 1.06073598136, RMSETEST: 1.14074728584
Epoch 0 Finished!
Epoch 1 Finished!
Epoch 2 Finished!
Epoch 3 Finished!
Epoch 4 Finished!
Epoch 5 Finished!
Epoch 6 Finished!
Epoch 7 Finished!
Epoch 8 Finished!
Epoch 9 Finished!
Finished for K=4
RMSE: 1.06128292497, RMSETEST: 1.14412184569
Epoch 0 Finished!
Epoch 1 Finish

In [19]:
# Plot train and test RMSE against the number of latent dimensions
plt.plot(x, rmse_train, 'bs', label="Train")
plt.plot(x, rmse_test, 'g^', label="Test")
# Labelled series plus a legend so the two marker styles can be told apart
plt.legend()
plt.title("RMSE of Testing/Training Data for Multiple Dimensions")
plt.ylabel("RMSE of Test/Train Data")
plt.xlabel("Number of Dimensions")

<matplotlib.text.Text at 0x7f9b5356a050>

In [20]:
# Render the figure assembled by the preceding plotting calls
plt.show()

In [86]:
def initialValues2(K):
    """Draw random starting parameters for the model with bias terms.

    Returns (U0, V0, A0, B0, g):
      U0 : numUsers x K user factors, rows ~ Normal(0, 0.0001 * I)
      V0 : numJokes x K joke factors, rows ~ Normal(0, 0.0001 * I)
      A0 : numUsers x 1 per-user offsets
      B0 : numJokes x 1 per-joke offsets
      g  : length-1 array holding the global offset
    """
    mean = np.zeros(K)
    cov = 0.0001 * np.identity(K)
    U0 = N(mean=mean, cov=cov, size=numUsers)
    V0 = N(mean=mean, cov=cov, size=numJokes)
    # NOTE(review): `scale` is a standard deviation whereas `cov` above is a
    # variance, so the offsets start 100x tighter (std 0.0001) than the
    # factors (std 0.01) -- confirm this is intended.
    A0 = N1(loc=0, scale=0.0001, size=numUsers).reshape(numUsers, 1)
    B0 = N1(loc=0, scale=0.0001, size=numJokes).reshape(numJokes, 1)
    g = N1(loc=0, scale=0.0001, size=1)
    return U0, V0, A0, B0, g

def stochasticDescent2(T,K, sigma2, learnRate=0.05):
    """Per-rating gradient updates for the factor model with bias terms.

    A rating by user i for joke j is predicted as
        U[i] . V[j] + A[i] + B[j] + g
    and every training triple moves each of those five terms along the
    prediction error. Rows of `train` are visited in stored order each epoch
    (no shuffling).

    Parameters
    ----------
    T : number of passes over the training data.
    K : number of latent dimensions.
    sigma2 : divisor applied to the learning rate (step = learnRate / sigma2).
    learnRate : base step size.

    Returns
    -------
    (U, V, A, B, g) : fitted factors, per-user offsets, per-joke offsets and
    the global offset.
    """
    U, V, A, B, g = initialValues2(K)
    step = learnRate / sigma2
    # Convert the frame once; it is not modified during training.
    samples = np.array(train)
    for epoch in range(T):
        for row in samples:
            i = row[0] - 1
            j = row[1] - 1
            rating = int(row[2])
            # Snapshot only the rows being updated. Copying all of U, V, A
            # and B for every rating would dominate the run time.
            u_old = U[i,:].copy()
            v_old = V[j,:].copy()
            # Error is computed from the pre-update parameters; it is a
            # length-1 array because A[i], B[j] and g are length-1 arrays.
            value = rating - u_old.dot(v_old) - A[i] - B[j] - g
            U[i,:] = u_old + step * value * v_old
            V[j,:] = v_old + step * value * u_old
            A[i] = A[i] + step * value
            B[j] = B[j] + step * value
            g = g + step * value
        print("Epoch {} Finished!".format(epoch))
    return (U, V, A, B, g)

In [87]:
# Train the bias-augmented model: T=10 epochs, K=2 latent dimensions, sigma2=1.0
res = stochasticDescent2(10,2,1.0)

Epoch 0 Finished!
Epoch 1 Finished!
Epoch 2 Finished!
Epoch 3 Finished!
Epoch 4 Finished!
Epoch 5 Finished!
Epoch 6 Finished!
Epoch 7 Finished!
Epoch 8 Finished!
Epoch 9 Finished!


In [83]:
def RMSE2(ranks, U, V, A=None, B=None, g=0.0):
    """RMSE over the non-zero entries of `ranks`, with optional bias terms.

    The prediction for entry (i, j) is U[i] . V[j] + A[i] + B[j] + g.
    Omitting A, B and g gives the error of the plain factor model, so the
    three-argument call style keeps working alongside the six-argument one.

    Parameters
    ----------
    ranks : ratings matrix; a 0 entry means "no rating" and is ignored.
    U, V : user and joke factor matrices.
    A : per-user offsets, one per row of `ranks` (any shape holding that many
        values), or None for no user offset.
    B : per-joke offsets, one per column of `ranks`, or None.
    g : global offset (scalar or length-1 array).

    Returns
    -------
    sqrt(sum of squared residuals on rated entries / number of rated entries).
    The result is nan or inf (with a numpy warning) when `ranks` has no
    non-zero entry.

    The offset arrays are reshaped into new views sized from the data; the
    caller's A and B are never modified and no matrix size is hard-coded.
    """
    ranks = np.asarray(ranks)
    predictions = U.dot(V.T)
    if A is not None:
        # Column vector: one offset per user, broadcast across jokes.
        predictions = predictions + np.reshape(A, (-1, 1))
    if B is not None:
        # Row vector: one offset per joke, broadcast across users.
        predictions = predictions + np.reshape(B, (1, -1))
    predictions = predictions + g
    observed = ranks != 0
    residuals = np.where(observed, ranks - predictions, 0.0)
    return np.sqrt(np.sum(residuals ** 2) / np.count_nonzero(observed))

In [88]:
# RMSE of the fitted model over the non-zero entries of R
RMSE2(R, *res)

0.96700845685836978

In [91]:
# RMSE of the fitted model over the non-zero entries of RTest
RMSE2(RTest, *res)

1.0617056375912579

In [92]:
# Unpack the fitted parameters so they can be inspected individually
U, V, A, B, g = res

In [93]:
# Fitted global offset (a length-1 array)
g

array([ 3.32293717])

In [95]:
# List (index, offset) pairs for every joke, lowest fitted offset first.
# The index is the 0-based row of B, i.e. JokeID - 1.
sorted(enumerate(B), key=lambda x: x[1])

[(43, array([-1.1853655])),
 (140, array([-1.07480545])),
 (123, array([-1.02470262])),
 (74, array([-0.90325422])),
 (15, array([-0.85890571])),
 (73, array([-0.84389152])),
 (57, array([-0.82579844])),
 (56, array([-0.70571756])),
 (100, array([-0.62642493])),
 (4, array([-0.57625035])),
 (23, array([-0.54043951])),
 (19, array([-0.49597958])),
 (6, array([-0.48233627])),
 (135, array([-0.4380327])),
 (12, array([-0.43660089])),
 (66, array([-0.43340397])),
 (7, array([-0.4093237])),
 (102, array([-0.39951215])),
 (32, array([-0.39774513])),
 (36, array([-0.39041573])),
 (50, array([-0.3888212])),
 (122, array([-0.32849726])),
 (16, array([-0.3282077])),
 (17, array([-0.28395518])),
 (63, array([-0.27810327])),
 (14, array([-0.24521994])),
 (76, array([-0.23966555])),
 (94, array([-0.23012247])),
 (22, array([-0.22489168])),
 (40, array([-0.22118206])),
 (45, array([-0.21598571])),
 (59, array([-0.213346])),
 (42, array([-0.20879787])),
 (84, array([-0.20806483])),
 (127, array([-0.2