In [1]:
import numpy as np
import pandas as pd

In [17]:

def train_test_split(train_fraction):
    rating_df_columns = ["UserId", "MovieId", "Rating", "TimeStamp"]
    rating_df = pd.read_table("ml-1m/ratings.dat", sep="::", names=rating_df_columns)

    rating_df.drop(index= range(20000,1000209), inplace=True)

    train_rating_df = rating_df.sample(frac = train_fraction, random_state=201)
    test_rating_df = rating_df.drop(train_rating_df.index)

    train_users = max(train_rating_df["UserId"])
    train_movies = max(train_rating_df["MovieId"])

    test_users = max(test_rating_df["UserId"])
    test_movies = max(test_rating_df["MovieId"])

    assert train_users == test_users
    assert train_movies == test_movies

    train_rating_df = train_rating_df.pivot(index = 'UserId', columns='MovieId', values='Rating').fillna(0)
    test_rating_df = test_rating_df.pivot(index = 'UserId', columns='MovieId', values='Rating').fillna(0)

    zero_mat = np.zeros(train_users)
    for i in range(1, train_movies + 1):
        print(i, end = "\r")
        if i not in train_rating_df.columns:   
            zero_df = pd.DataFrame(zero_mat,columns=[i],index=range(1,150))
            train_rating_df = train_rating_df.join(zero_df)
        if i not in test_rating_df.columns:
            zero_df = pd.DataFrame(zero_mat,columns=[i],index=range(1,150))
            test_rating_df = test_rating_df.join(zero_df)

    train_rating_df = train_rating_df[range(1, train_movies + 1)]
    test_rating_df = test_rating_df[range(1, test_movies + 1)]

    train_rating_matrix = np.asarray(train_rating_df)
    test_rating_matrix = np.asarray(test_rating_df)

    assert train_rating_matrix.shape == test_rating_matrix.shape

    return train_rating_matrix, test_rating_matrix

In [18]:
train_rating_matrix, test_rating_matrix = Train_Test_Split.train_test_split(0.8)

  after removing the cwd from sys.path.


3952

In [19]:
def mean_center(train_rating_matrix, test_rating_matrix):
    assert train_rating_matrix.shape == test_rating_matrix.shape
    A = train_rating_matrix + test_rating_matrix
    
    train_rating_matrix_centered = np.zeros(A.shape)
    test_rating_matrix_centered = np.zeros(A.shape)
    
    for row in range(A.shape[0]):
        ratings_sum = A[row].sum()
        movies_rated = (A[row] > 0).sum()
        mean = ratings_sum/movies_rated
        for i in train_rating_matrix[row].nonzero():
            train_rating_matrix_centered[row][i] = A[row][i] - mean
        for j in test_rating_matrix[row].nonzero():
            test_rating_matrix_centered[row][j] = A[row][j] - mean
    return train_rating_matrix_centered, test_rating_matrix_centered

In [20]:
train_rating_matrix_centered , test_rating_matrix_centered = mean_center(train_rating_matrix, test_rating_matrix)

In [23]:
(train_rating_matrix_centered != 0).sum()

15936

In [24]:
(test_rating_matrix_centered != 0).sum()

3984

In [72]:
def mod(A):
    return (A **2).sum()

In [108]:
u,s,vh = np.linalg.svd(train_rating_matrix_centered)
sig = np.zeros(train_rating_matrix.shape)
sig[:149, :149] = np.diag(s)


In [134]:
k = 40
Q = np.random.uniform(size = (A.shape[0], k))
P = np.random.uniform(size = (A.shape[1],k ))

# Q = np.copy(u)
# P = sig.dot(vh).T

In [135]:
pred_A_lf = Q.dot(P.T)
error = 0
cnt = 0
for i in range(test_rating_matrix_centered.shape[0]):
    for j in range(test_rating_matrix_centered.shape[1]):
        if test_rating_matrix[i][j] != 0:
            error += ((test_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
            cnt += 1

mse = np.sqrt(error/cnt)
print(mse)

9.91600722935245


In [140]:
Q_prev = 0
P_prev = 0

n = 10e-2
lambda1 = 0.1
loop = 0
while mod(Q_prev - Q) >= 10e-3 or mod(P_prev - P) >= 10e-3:
    Q_prev = np.copy(Q)
    P_prev = np.copy(P)
    
    for i in range(train_rating_matrix.shape[0]):
        print(i , end = "\r")
        for j in range(train_rating_matrix.shape[1]):
            if train_rating_matrix[i][j] != 0:
                Apred_lf = Q.dot(P.T)
                eij = train_rating_matrix_centered[i][j] - Apred_lf[i][j]
                Q[i,:] = Q[i,:] + n * (eij * P[j,:] - lambda1 * Q[i,:])
                P[j,:] = P[j,:] + n * (eij * Q[i,:] - lambda1 * P[j,:])
        
#     q = -2 * ((A - Q.dot(P.T)).dot(P)) #+ (2 * lambda1 * Q)
#     p = (-2 * (Q.T.dot((A - Q.dot(P.T)))) #+ (2 * lambda1 * P.T)).T
    loop += 1
    print(loop)
    print("mod1 = ", mod(Q_prev - Q))
    print("mod2 = ", mod(P_prev - P))
    
    pred_A_lf = Q.dot(P.T)
    error = 0
    cnt = 0
    for i in range(train_rating_matrix_centered.shape[0]):
        for j in range(train_rating_matrix_centered.shape[1]):
            if train_rating_matrix[i][j] != 0:
                error += ((train_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
                cnt += 1
    mse = np.sqrt(error/cnt)
    print("cnt = ", error)
    print("Mse = ", mse)
    
    
    

148
mod1 =  126.06200172072101
mod2 =  38.55775205664647
cnt =  1989.1080947441985
Mse =  0.3525893587752081
248
mod1 =  4.149253411081398
mod2 =  21.183068781226176
cnt =  1892.147496386531
Mse =  0.3438883809089196
348
mod1 =  1.0183009996014085
mod2 =  11.58739500043155
cnt =  1814.4805812460716
Mse =  0.336756642589095
448
mod1 =  0.45970531210236343
mod2 =  7.368390698431138
cnt =  1765.5771183192471
Mse =  0.33218755228778957
548
mod1 =  0.28046659156405107
mod2 =  5.192641737097416
cnt =  1733.8840632603774
Mse =  0.32919257882548564
21

KeyboardInterrupt: 

In [141]:
pred_A_lf = Q.dot(P.T)
error = 0
cnt = 0

## For test
for i in range(test_rating_matrix_centered.shape[0]):
    for j in range(test_rating_matrix_centered.shape[1]):
        if test_rating_matrix[i][j] != 0:
            error += ((test_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
            cnt += 1

mse = np.sqrt(error/cnt)
print("cnt  = ", cnt)
print(mse)

error = 0
cnt = 0
## for train
for i in range(train_rating_matrix_centered.shape[0]):
    for j in range(train_rating_matrix_centered.shape[1]):
        if train_rating_matrix[i][j] != 0:
            error += ((train_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
            cnt += 1
mse = np.sqrt(error/cnt)
print("cnt = ", cnt)
print(mse)

cnt  =  4000
1.0248614099669615
cnt =  16000
0.331000926214057


In [None]:
Q_prev = 0
P_prev = 0

n = 10e-3
lambda1 = 10
loop = 0
while mod(Q_prev - Q) >= 10e-1 or mod(P_prev - P) >= 10e-1:
    Q_prev = np.copy(Q)
    P_prev = np.copy(P)
    
    for i in range(train_rating_matrix.shape[0]):
        print(i , end = "\r")
        for j in range(train_rating_matrix.shape[1]):
            if train_rating_matrix[i][j] != 0:
                Apred_lf = Q.dot(P.T)
                eij = train_rating_matrix_centered[i][j] - Apred_lf[i][j]
                Q[i,:] = Q[i,:] + n * (eij * P[j,:] - lambda1 * Q[i,:])
                #P[j,:] = P[j,:] + n * (eij * Q[i,:] - lambda1 * P[j,:])
    
    for i in range(train_rating_matrix.shape[0]):
        print(i , end = "\r")
        for j in range(train_rating_matrix.shape[1]):
            if train_rating_matrix[i][j] != 0:
                Apred_lf = Q.dot(P.T)
                eij = train_rating_matrix_centered[i][j] - Apred_lf[i][j]
                #Q[i,:] = Q[i,:] + n * (eij * P[j,:] - lambda1 * Q[i,:])
                P[j,:] = P[j,:] + n * (eij * Q[i,:] - lambda1 * P[j,:])
    
        

    loop += 1
    print(loop)
    print("mod1 = ", mod(Q_prev - Q))
    print("mod2 = ", mod(P_prev - P))
    
    pred_A_lf = Q.dot(P.T)
    error = 0
    cnt = 0
    for i in range(train_rating_matrix_centered.shape[0]):
        for j in range(train_rating_matrix_centered.shape[1]):
            if train_rating_matrix[i][j] != 0:
                error += ((train_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
                cnt += 1
    mse = np.sqrt(error/cnt)
    print("error_train = ", error)
    print("Mse_train = ", mse)
    
    error = 0
    cnt = 0
    for i in range(test_rating_matrix_centered.shape[0]):
        for j in range(test_rating_matrix_centered.shape[1]):
            if test_rating_matrix[i][j] != 0:
                error += ((test_rating_matrix_centered[i][j] - pred_A_lf[i][j]) ** 2)
                cnt += 1
    mse = np.sqrt(error/cnt)
    print("error_test= ", error)
    print("Mse_test = ", mse)
    
    
    