In [1]:
import numpy as np
import pandas as pd

In [39]:
def train_test_split(train_fraction):
    rating_df_columns = ["UserId", "MovieId", "Rating", "TimeStamp"]
    rating_df = pd.read_table("ml-1m/ratings.dat", sep="::", names=rating_df_columns)

    rating_df.drop(index= range(10000,1000209), inplace=True)

    train_rating_df = rating_df.sample(frac = train_fraction, random_state=201)
    test_rating_df = rating_df.drop(train_rating_df.index)

    train_users = max(train_rating_df["UserId"])
    train_movies = max(train_rating_df["MovieId"])

    test_users = max(test_rating_df["UserId"])
    test_movies = max(test_rating_df["MovieId"])

    assert train_users == test_users
    users = train_users
    movies = max(train_movies, test_movies)

    train_rating_df = train_rating_df.pivot(index = 'UserId', columns='MovieId', values='Rating').fillna(0)
    test_rating_df = test_rating_df.pivot(index = 'UserId', columns='MovieId', values='Rating').fillna(0)

    zero_mat = np.zeros(train_users)
    for i in range(1, train_movies + 1):
        print(i, end = "\r")
        if i not in train_rating_df.columns:   
            zero_df = pd.DataFrame(zero_mat,columns=[i],index=range(1, users + 1))
            train_rating_df = train_rating_df.join(zero_df)
        if i not in test_rating_df.columns:
            zero_df = pd.DataFrame(zero_mat,columns=[i],index=range(1, users + 1))
            test_rating_df = test_rating_df.join(zero_df)

    train_rating_df = train_rating_df[range(1, movies + 1)]
    test_rating_df = test_rating_df[range(1, movies + 1)]

    train_rating_matrix = np.asarray(train_rating_df)
    test_rating_matrix = np.asarray(test_rating_df)

    assert train_rating_matrix.shape == test_rating_matrix.shape

    return train_rating_matrix, test_rating_matrix

In [40]:
train_rating_matrix, test_rating_matrix = train_test_split(0.8)

  This is separate from the ipykernel package so we can avoid doing imports until


3952

In [41]:
def mean_center(train_rating_matrix, test_rating_matrix):
    assert train_rating_matrix.shape == test_rating_matrix.shape
    A = train_rating_matrix + test_rating_matrix
    
    train_rating_matrix_centered = np.zeros(A.shape)
    test_rating_matrix_centered = np.zeros(A.shape)
    
    for row in range(A.shape[0]):
        ratings_sum = A[row].sum()
        movies_rated = (A[row] > 0).sum()
        mean = ratings_sum/movies_rated
        for i in train_rating_matrix[row].nonzero():
            train_rating_matrix_centered[row][i] = A[row][i] - mean
        for j in test_rating_matrix[row].nonzero():
            test_rating_matrix_centered[row][j] = A[row][j] - mean
    return train_rating_matrix_centered, test_rating_matrix_centered

In [42]:
train_rating_matrix_centered , test_rating_matrix_centered = mean_center(train_rating_matrix, test_rating_matrix)

In [43]:
def forbenius_norm(A):
    return (A **2).sum()

In [44]:
u,s,vh = np.linalg.svd(train_rating_matrix_centered)
sig = np.zeros(train_rating_matrix.shape)
sig[:s.shape[0], : s.shape[0]] = np.diag(s)

In [45]:
A = train_rating_matrix_centered

In [55]:
### k = latent factors in p and q matrix

k = 3
np.random.seed(201)
Q = np.random.uniform(size = (A.shape[0], k)) 
P = np.random.uniform(size = (A.shape[1],k )) 

# Q = np.copy(u)
# P = sig.dot(vh).T

In [56]:
pred_A_lf = Q.dot(P.T)
error = 0
cnt = 0
for i in range(test_rating_matrix_centered.shape[0]):
    for j in range(test_rating_matrix_centered.shape[1]):
        if test_rating_matrix[i][j] != 0:
            error += ((test_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
            cnt += 1
print("Initial error on test dataset")
mse = np.sqrt(error/cnt)
print("MSE = ", mse)

Initial error on test dataset
MSE =  1.334287817652585


In [57]:
Q_prev = 0
P_prev = 0
mse_train_prev = 1000
mse_train = 0
mse_test = 0
n = 6e-3
lambda1 = 0
loop = 0

while abs(mse_train - mse_train_prev) >= 1e-4 :

    mse_train_prev = mse_train
    
    for i in range(train_rating_matrix.shape[0]):
        print(i , end = "\r")
        for j in range(train_rating_matrix.shape[1]):
            if train_rating_matrix[i][j] != 0:
                Apred_lf = Q.dot(P.T)
                eij = train_rating_matrix_centered[i][j] - Apred_lf[i][j]
                Q[i,:] = Q[i,:] + n * (eij * P[j,:] - lambda1 * Q[i,:])
                P[j,:] = P[j,:] + n * (eij * Q[i,:] - lambda1 * P[j,:])
                
    loop += 1
    print(loop)
    print("mod1 = ", forbenius_norm(Q_prev - Q))
    print("mod2 = ", forbenius_norm(P_prev - P))
    
    pred_A_lf = Q.dot(P.T)
    error = 0
    cnt = 0
    for i in range(train_rating_matrix_centered.shape[0]):
        for j in range(train_rating_matrix_centered.shape[1]):
            if train_rating_matrix[i][j] != 0:
                error += ((train_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
                cnt += 1
    mse_train = np.sqrt(error/cnt)
    print("Mse on train = ", mse_train)
    
    error = 0
    cnt = 0
    for i in range(test_rating_matrix_centered.shape[0]):
        for j in range(test_rating_matrix_centered.shape[1]):
            if test_rating_matrix[i][j] != 0:
                error += ((test_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
                cnt += 1
    mse_test = np.sqrt(error/cnt)
#     print("Error on test= ", error)
    print("Mse on test = ", mse_test)
    
    
    

19
mod1 =  38.62049794679021
mod2 =  3915.2554174612546
Mse on train =  1.0610329009596156
Mse on test =  1.0947206238509044
29
mod1 =  28.70660555328073
mod2 =  3905.75069042598
Mse on train =  1.0224404939881955
Mse on test =  1.0584211031387314
39
mod1 =  23.48657102234385
mod2 =  3900.88835409894
Mse on train =  1.007810178872521
Mse on test =  1.0457650286376616
49
mod1 =  20.27224288413565
mod2 =  3898.008931562537
Mse on train =  0.9999464652179053
Mse on test =  1.0394444682880961
59
mod1 =  18.171887558257964
mod2 =  3896.2309970980514
Mse on train =  0.9949178822473063
Mse on test =  1.035753013253849
69
mod1 =  16.77875513286458
mod2 =  3895.15294028325
Mse on train =  0.9913314819035086
Mse on test =  1.0334306562232942
79
mod1 =  15.874086923588557
mod2 =  3894.559024032374
Mse on train =  0.9885581228073603
Mse on test =  1.031920291414505
89
mod1 =  15.328191757052185
mod2 =  3894.32152724104
Mse on train =  0.9862672111948069
Mse on test =  1.0309343242571891
99
mod1 = 

KeyboardInterrupt: 

In [17]:
pred_A_lf = Q.dot(P.T)
error = 0
cnt = 0

## For test
for i in range(test_rating_matrix_centered.shape[0]):
    for j in range(test_rating_matrix_centered.shape[1]):
        if test_rating_matrix[i][j] != 0:
            error += ((test_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
            cnt += 1

mse = np.sqrt(error/cnt)
print("cnt  = ", cnt)
print("RMSE on test dataset: ", mse)

error = 0
cnt = 0
## for train
for i in range(train_rating_matrix_centered.shape[0]):
    for j in range(train_rating_matrix_centered.shape[1]):
        if train_rating_matrix[i][j] != 0:
            error += ((train_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
            cnt += 1
mse = np.sqrt(error/cnt)
print("cnt = ", cnt)
print("RMSE on train dataset", mse)

cnt  =  4000
RMSE on test dataset:  1.0329177080970617
cnt =  16000
RMSE on train dataset 1.0214107899993436


In [13]:
#### Alternating least squares method

Q_prev = 0
P_prev = 0
mse_train_prev = 1000
mse_train = 0
mse_test = 0
n = 10e-3
lambda1 = 1
loop = 0
while abs(mse_train - mse_train_prev) > 10e-2:

    mse_train_prev = mse_train
    
    #Modify Q keeping P constant
    for i in range(train_rating_matrix.shape[0]):
        print(i , end = "\r")
        for j in range(train_rating_matrix.shape[1]):
            if train_rating_matrix[i][j] != 0:
                Apred_lf = Q.dot(P.T)
                eij = train_rating_matrix_centered[i][j] - Apred_lf[i][j]
                Q[i,:] = Q[i,:] + n * (eij * P[j,:] - lambda1 * Q[i,:])
                #P[j,:] = P[j,:] + n * (eij * Q[i,:] - lambda1 * P[j,:])
    
    #Modify P keeping Q constant
    for i in range(train_rating_matrix.shape[0]):
        print(i , end = "\r")
        for j in range(train_rating_matrix.shape[1]):
            if train_rating_matrix[i][j] != 0:
                Apred_lf = Q.dot(P.T)
                eij = train_rating_matrix_centered[i][j] - Apred_lf[i][j]
                #Q[i,:] = Q[i,:] + n * (eij * P[j,:] - lambda1 * Q[i,:])
                P[j,:] = P[j,:] + n * (eij * Q[i,:] - lambda1 * P[j,:])
    
        

    loop += 1
    print("\n", loop)
    print("mod1 = ", forbenius_norm(Q_prev - Q))
    print("mod2 = ", forbenius_norm(P_prev - P))
    
    pred_A_lf = Q.dot(P.T)
    error = 0
    cnt = 0
    for i in range(train_rating_matrix_centered.shape[0]):
        for j in range(train_rating_matrix_centered.shape[1]):
            if train_rating_matrix[i][j] != 0:
                error += ((train_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
                cnt += 1
    mse_train = np.sqrt(error/cnt)
    print("error_train = ", error)
    print("Mse_train = ", mse_train)
    
    error = 0
    cnt = 0
    for i in range(test_rating_matrix_centered.shape[0]):
        for j in range(test_rating_matrix_centered.shape[1]):
            if test_rating_matrix[i][j] != 0:
                error += ((test_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
                cnt += 1
    mse_test = np.sqrt(error/cnt)
    print("error_test= ", error)
    print("Mse_test = ", mse_test)
    
    
    

148
 1
mod1 =  6.265478348887985
mod2 =  3258.387424801629
error_train =  16743.30592303949
Mse_train =  1.0229646231370702
error_test=  4285.989408086015
Mse_test =  1.0351315626631736
148
 2
mod1 =  3.7953369107972934
mod2 =  3123.4417050482784
error_train =  16705.512685023128
Mse_train =  1.0218094454515212
error_test=  4269.514557956364
Mse_test =  1.0331401838516838


In [65]:
pred_A_lf = Q.dot(P.T)
error = 0
cnt = 0

## For test
for i in range(test_rating_matrix_centered.shape[0]):
    for j in range(test_rating_matrix_centered.shape[1]):
        if test_rating_matrix[i][j] != 0:
            error += ((test_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
            cnt += 1

mse = np.sqrt(error/cnt)
print("cnt  = ", cnt)
print(mse)

error = 0
cnt = 0
## for train
for i in range(train_rating_matrix_centered.shape[0]):
    for j in range(train_rating_matrix_centered.shape[1]):
        if train_rating_matrix[i][j] != 0:
            error += ((train_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
            cnt += 1
mse = np.sqrt(error/cnt)
print("cnt = ", cnt)
print(mse)


cnt  =  2000
1.043230665710321
cnt =  8000
0.9093231125527568
