In [1]:
import numpy as np
import pandas as pd

In [2]:
rating_df_columns = ["UserId", "MovieId", "Rating", "TimeStamp"]
rating_df = pd.read_table("ml-1m/ratings.dat", sep="::", names=rating_df_columns)

  


In [3]:
rating_df.drop(index =range(100000,1000209), inplace=True)

In [4]:
rating_matrix = np.asarray(rating_df.pivot(index = "UserId", columns = "MovieId", values = "Rating").fillna(0))

In [5]:
### Mean center

def mean_center(A):
    mean_centered_A = np.copy(A)
    for row in range(A.shape[0]):
        ratings_sum = A[row].sum()
        movies_rated = (A[row] > 0).sum()
        mean = ratings_sum/movies_rated
        for i in A[row].nonzero():
            mean_centered_A[row][i] -= mean
    return mean_centered_A

In [6]:
def svd(A):
    
    transpose_flag = 0
    if A.shape[0] > A.shape[1]:
        transpose_flag = 1
        A = A.T
    
    AAt = np.matmul(A, A.T)
    AtA = np.matmul(A.T, A)
    
    eigenvalue_AAt, eigenvector_AAt = np.linalg.eig(AAt)
    eigenvalue_AtA, eigenvector_AtA = np.linalg.eig(AtA)
    
    eigenvector_AAt_transpose = eigenvector_AAt.T
    eigenvector_AtA_transpose = eigenvector_AtA.T
    
    eigenvector_AAt_transpose[eigenvalue_AAt < 10e-2] = 0
    eigenvector_AtA_transpose[eigenvalue_AtA < 10e-2] = 0
    eigenvalue_AAt[eigenvalue_AAt < 10e-2] = 0
    eigenvalue_AtA[eigenvalue_AtA < 10e-2] = 0
    
    eigenvalue_AAt = np.sqrt(eigenvalue_AAt.real)
    eigenvalue_AtA = np.sqrt(eigenvalue_AtA.real)
    
    eigenvector_AAt = eigenvector_AAt.real
    eigenvector_AtA = eigenvector_AtA.real
    
    argsort_eigenval_AAt = np.argsort(-eigenvalue_AAt)
    argsort_eigenval_AtA = np.argsort(-eigenvalue_AtA)
    
    U = np.zeros(eigenvector_AAt.shape)
    sigma1 = np.sort(eigenvalue_AAt)[::-1]
    
    V = np.zeros(eigenvector_AtA.shape)
    sigma2 = (np.sort(eigenvalue_AtA))[::-1]
    for i,j in enumerate(argsort_eigenval_AtA):
        V[:,i] = eigenvector_AtA[:,j]
        
    sigma = np.zeros(A.shape)
    sigma[:, :A.shape[0]] = np.diag(sigma1)
        
    for i in range(U.shape[1]):
        if sigma1[i] != 0:
            U[:,i] = (A.dot(V[:,i]))/sigma1[i]
        else:
            U[:,i] = 0
        
    if transpose_flag == 0:
        return U, sigma, V.T
    else:
        return V, sigma.T, U.T

In [7]:
A = rating_matrix

In [8]:
A.shape

(669, 3264)

In [11]:
column_len = []
for i in range(A.shape[1]):
    length = np.dot(A[:,i], A[:,i])
    column_len.append(length)
    
column_len = np.asarray(column_len)
column_prob = column_len/column_len.sum()

row_len = []
for i in range(A.shape[0]):
    length = np.dot(A[i,:], A[i,:])
    row_len.append(length)
row_len = np.asarray(row_len)
row_prob = row_len/row_len.sum()

In [12]:
columns = 1500
rows = 300
np.random.seed(201)
selected_columns = np.random.choice(range(column_len.shape[0]), size = (columns), replace = True, p = column_prob)
selected_rows = np.random.choice(range(row_len.shape[0]), size = (rows), replace = True, p = row_prob)


In [13]:
selected_rows.sort()
selected_columns.sort()

In [14]:
C = np.zeros((A.shape[0], columns))
R = np.zeros((rows, A.shape[1]))
W = np.zeros((rows, columns))

In [16]:
for i,j in enumerate(selected_columns):
    C[:,i] = A[:,j]/(np.sqrt(columns * column_len[j]))
    
for i,j in enumerate(selected_rows):
    R[i,:] = A[j,:]/(np.sqrt(rows * row_len[j]))

In [17]:
for i,j in enumerate(selected_rows):
    for k,l in enumerate(selected_columns):
        W[i,k] = A[j,l]

In [18]:
X, Z, Y_transpose = svd(W)


In [19]:
#  x, s , y_transpose = np.linalg.svd(W)
# z = np.zeros(W.shape)
# z[:rows, :rows] = np.diag(s)
# Z = z

In [20]:
W_pred = X.dot(Z.dot(Y_transpose))

In [21]:
(((W_pred - W) ** 2).sum()/ (W.shape[0] * W.shape[1]))

1.4755692478892702e-25

In [22]:
Z_plus = np.zeros(Z.shape)
for i in range(Z.shape[0]):
    for j in range(Z.shape[1]):
        if Z[i][j] != 0:
            Z_plus[i][j] = 1/Z[i][j]

In [23]:
U = (Y_transpose.T).dot((Z_plus.T).dot(X.T))

In [24]:
A_pred = C.dot(U.dot(R))

In [25]:
mse = np.sqrt( ((A_pred - A)**2).sum()/ (A.shape[0]*A.shape[1]) )
print(mse)

0.8084692882038685
