In [5]:
import numpy as np
import pandas as pd

In [6]:
def mean_center(train_rating_matrix, test_rating_matrix):
    assert train_rating_matrix.shape == test_rating_matrix.shape
    
    A = train_rating_matrix + test_rating_matrix
    
    train_rating_matrix_centered = np.zeros(A.shape)
    test_rating_matrix_centered = np.zeros(A.shape)
    
    for row in range(A.shape[0]):
        ratings_sum = A[row].sum()
        movies_rated = (A[row] > 0).sum()
        mean = ratings_sum/movies_rated
        for i in train_rating_matrix[row].nonzero():
            train_rating_matrix_centered[row][i] = A[row][i] - mean
        for j in test_rating_matrix[row].nonzero():
            test_rating_matrix_centered[row][j] = A[row][j] - mean
    return train_rating_matrix_centered, test_rating_matrix_centered

In [7]:
def train_test_split(train_fraction):
    rating_df_columns = ["UserId", "MovieId", "Rating", "TimeStamp"]
    rating_df = pd.read_table("ml-1m/ratings.dat", sep="::", names=rating_df_columns)

    rating_df.drop(index= range(20000,1000209), inplace=True)

    train_rating_df = rating_df.sample(frac = train_fraction, random_state=201)
    test_rating_df = rating_df.drop(train_rating_df.index)

    train_users = max(train_rating_df["UserId"])
    train_movies = max(train_rating_df["MovieId"])

    test_users = max(test_rating_df["UserId"])
    test_movies = max(test_rating_df["MovieId"])

    assert train_users == test_users
    assert train_movies == test_movies

    train_rating_df = train_rating_df.pivot(index = 'UserId', columns='MovieId', values='Rating').fillna(0)
    test_rating_df = test_rating_df.pivot(index = 'UserId', columns='MovieId', values='Rating').fillna(0)

    zero_mat = np.zeros(train_users)
    for i in range(1, train_movies + 1):
        print(i, end = "\r")
        if i not in train_rating_df.columns:   
            zero_df = pd.DataFrame(zero_mat,columns=[i],index=range(1,train_users +1))
            train_rating_df = train_rating_df.join(zero_df)
        if i not in test_rating_df.columns:
            zero_df = pd.DataFrame(zero_mat,columns=[i],index=range(1, train_users +1))
            test_rating_df = test_rating_df.join(zero_df)

    train_rating_df = train_rating_df[range(1, train_movies + 1)]
    test_rating_df = test_rating_df[range(1, test_movies + 1)]

    train_rating_matrix = np.asarray(train_rating_df)
    test_rating_matrix = np.asarray(test_rating_df)

    assert train_rating_matrix.shape == test_rating_matrix.shape

    return train_rating_matrix, test_rating_matrix

In [8]:
train_rating_matrix, test_rating_matrix = train_test_split(0.8)
train_rating_matrix_centered , test_rating_matrix_centered = mean_center(train_rating_matrix, test_rating_matrix)

  This is separate from the ipykernel package so we can avoid doing imports until


3952

In [3]:
def forbenius_norm(A):
    return (A ** 2).sum()

In [4]:
#### Latent factors

# Q = np.copy(U)
# P = (sigma.dot(V.T)).T
k = 10
Q = np.random.normal(size = (A.shape[0], k))
P = np.random.normal(size = (A.shape[1],k ))

NameError: name 'np' is not defined

In [None]:
pred_A_lf = Q.dot(P.T)
error = 0
cnt = 0
for i in range(test_rating_matrix_centered.shape[0]):
    for j in range(test_rating_matrix_centered.shape[1]):
        if test_rating_matrix[i][j] != 0:
            error += ((test_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
            cnt += 1

mse = np.sqrt(error/cnt)
print(mse)

In [None]:
P_prev = 0
q = 0
Q_prev = 0
p = 0
n = 10e-5
lambda1 = 10e-6
loop = 0
while mod(Q_prev - Q) >= 10e-7 or mod(P_prev - P) >= 10e-7:
    Q_prev = np.copy(Q)
    P_prev = np.copy(P)
    
#     for i in range(Q.shape[0]):
#         for k in range(Q.shape[1]):
#             for x in range(A.shape[1]):
#                 if(train_rating_matrix[i][x] != 0):
#                     q[i,k] = -2 * (train_rating_matrix_centered[i][x] - Q[i].dot(P[x,:])) + 2*lambda1*Q[i,k]
#     for x in range(P.shape[0]):
#         for k in range(P.shape[1]):
#             for i in range(A.shape[0]):
#                 if(train_rating_matrix[i][x] != 0):
#                     p[x,k] = -2 * (train_rating_matrix_centered[i][k])
    
    q = -2 * ((A - Q.dot(P.T)).dot(P))
    p = -2 * (Q.T.dot((A - Q.dot(P.T)))).T
         
    Q = Q - n*q
    P = P - n*p
    loop += 1
    print(loop, end = "\r")
    #print("mod1 = ", forbenius_norm(Q_prev - Q))
    #print("mod2 = ", forbenius_norm(P_prev - P), end = '\r')
    
    

In [None]:
Q_prev = 0
q = 0
p_prev = 0
p = 0
n = 10e-4
lambda1 = 0.01
loop = 0
while mod(Q_prev - Q) >= 10e-6 or mod(P_prev - P) >= 10e-6:
    Q_prev = np.copy(Q)
    P_prev = np.copy(P)
    
    for i in range(train_rating_matrix.shape[0]):
        print(i , end = "\r")
        for j in range(train_rating_matrix.shape[1]):
            if train_rating_matrix[i][j] != 0:
                Apred_lf = Q.dot(P.T)
                eij = train_rating_matrix_centered[i][j] - Apred_lf[i][j]
                Q[i,:] = Q[i,:] + n * (eij * P[j,:] - lambda1 * Q[i,:])
                P[j,:] = P[j,:] + n * (eij * Q[i,:] - lambda1 * P[j,:])
        
#     q = -2 * ((A - Q.dot(P.T)).dot(P)) #+ (2 * lambda1 * Q)
#     p = (-2 * (Q.T.dot((A - Q.dot(P.T)))) #+ (2 * lambda1 * P.T)).T
    loop += 1
    print(loop)
    
    

In [None]:
pred_A_lf = Q.dot(P.T)
error = 0
cnt = 0

## For test
for i in range(test_rating_matrix_centered.shape[0]):
    for j in range(test_rating_matrix_centered.shape[1]):
        if test_rating_matrix[i][j] != 0:
            error += ((test_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
            cnt += 1

mse = np.sqrt(error/cnt)
print("cnt  = ", cnt)
print(mse)

error = 0
cnt = 0
## for train
for i in range(train_rating_matrix_centered.shape[0]):
    for j in range(train_rating_matrix_centered.shape[1]):
        if train_rating_matrix[i][j] != 0 and np.isinf(pred_A_lf[i][j]) == False:
            error += ((train_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
            cnt += 1
mse = np.sqrt(error/cnt)
print("cnt = ", cnt)
print(mse)

In [None]:
error = 0
cnt = 0
for i in range(test_rating_matrix_centered.shape[0]):
    for j in range(test_rating_matrix_centered.shape[1]):
        if test_rating_matrix[i][j] != 0:
            error += ((test_rating_matrix_centered[i][j] - Apred[i][j]) ** 2)
            cnt += 1

mse = np.sqrt(error/cnt)
print(mse)