In [212]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt
import math
import sys

In [213]:
#Users data
usersData = pd.read_csv('users.dat', sep='::', header=None, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], engine='python')

#Ratings data
ratingsData = pd.read_csv('ratings.dat', sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')
ratings_testData = pd.read_csv('ratings_test.dat', sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')

#Movies data
moviesData = pd.read_csv('movies.dat', sep='::', header=None, names=['MovieID', 'Title', 'Genres'], engine='python', encoding='latin1')

In [214]:
#Users data
print(usersData.head())

#Ratings data
print(ratingsData.head())
print(ratings_testData.head())

#Movies data
print(moviesData.head())

   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)         

In [229]:
#First, convert data into a numpy array
df = ratings_testData.pivot(index = "UserID", columns ="MovieID", values = "Rating") 
X = np.array(df.to_numpy())

print("Data as matrix (no numpy-array yet):")
print(df)
print("\nData as matrix:")
print(X)


Data as matrix (no numpy-array yet):
MovieID  1     2     3     4     5     6     7     8     9     10    ...  \
UserID                                                               ...   
1         5.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         NaN   NaN   NaN   NaN   NaN   2.0   NaN   NaN   NaN   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
996       4.0   NaN   NaN   NaN   NaN   4.0   NaN   NaN   NaN   NaN  ...   
997       4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
998       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
999       NaN   NaN   NaN   NaN   NaN   2.0   NaN   NaN   4.0   NaN  ...   
1000      5.0   NaN   NaN   NaN   NaN   NaN   NaN  

In [250]:
def matrix_factorization(X, K, num_iter, r, l):
    """
    Function that performs matrix factorization to obtain the empty (NaN) points in the given matrix

    Arguments:
        X: Full User-Movie rating matrix 
        K: Number of latent features
        num_iter: Number of iterations
        r: Regularization parameter
        l: Learning rate
        

    Returns:
        U: User-feature matrix
        M: Feature-Movie matrix
            np.dot(U, M.T) will then give the predicted Rating Matrix 
        
    """
    
    #We will follow the steps given in chapter 3.1 of the gravity-Tikk.pdf paper:
    I, J = X.shape
    U = np.random.rand(I, K) #U is an I x K matrix with randomly distributed weights 
    M = np.random.rand(J, K) #M is an K x J matrix with randomly distributed weights 
    
    SE = 0
     
    for step in range(num_iter):
        print("Iteration: ", step)
        for i in range(I):
            for j in range(J):
                if X[i, j] > 0: #i.e. non-NaN
                    eij = X[i, j] - np.dot(U[i, :], M[j, :])
                    #eij denotes the training error on the (i, j)th example
                    
                    for k in range(K):
                        #For all latent features, apply eq.6&7 in gravity-Tikk.pdf
                        U[i, k] = U[i, k] + l * (2 * eij * M[j, k] - r * U[i, k])
                        M[j, k] = M[j, k] + l * (2 * eij * U[i, k] - r * M[j, k])

                    SE = SE + (X[i,j] - np.dot(U, M.T)[i,j])**2 
                    
    print("Total SE:", SE)
    return U, M, SE

#----------------------------------------------------------------------------------------------------------------
#To test on the array H that is given in the book chapter 9.4:
H = np.array([[5,2,4,4,3],[3,1,2,4,1],[2,np.nan,3,1,4],[2,5,4,3,5],[4,4,5,4,np.nan]]) #let the blank elements be NaN
X = H

#----------------------------------------------------------------------------------------------------------------
#To test on larger array, we can introduce C
# C = np.random.randint(10, 50, size=(10, 10)) #generate values between 10 and 50 because we multiply by 0.1 later

# n_b = 80 # number of nan we want to add 
# C = C*0.1 # converting the data to float as nan is also of type float
 
# index_C = np.random.choice(C.size, n_b, replace=False) # choosing random indexes to put NaN
# C.ravel()[index_C] = np.nan # adding nan to the data.

# C_new = np.round(C) #To get integer values between 1 and 5
# print(C)
# print(C_new)
# X = C
#----------------------------------------------------------------------------------------------------------------

U, M, SE = matrix_factorization(X, K=10, num_iter=10, r=0.005, l=0.05)
predicted_X = np.dot(U, M.T)

print("Original Rating Matrix:")
print(X)
print("\nPredicted Rating Matrix:")
print(predicted_X)

count = np.count_nonzero(~np.isnan(X)) #number of non-NaN values in the original ratings matrix X
RMSE = np.sqrt(SE/count)
print("Total SE:", SE)
print("Total number of non-NaN values in the original ratings matrix X:", count)
print("Total RMSE:", RMSE)

Iteration:  0
Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Total SE: 9.17123948161988
Original Rating Matrix:
[[ 5.  2.  4.  4.  3.]
 [ 3.  1.  2.  4.  1.]
 [ 2. nan  3.  1.  4.]
 [ 2.  5.  4.  3.  5.]
 [ 4.  4.  5.  4. nan]]

Predicted Rating Matrix:
[[4.94073198 1.92450561 3.85455283 3.99760528 3.00345002]
 [2.98878392 1.01611739 2.09375618 4.02619407 1.01109607]
 [1.9865499  2.25853714 2.87813124 1.00815012 4.00849808]
 [1.94602007 4.9552487  4.02353752 3.0430264  5.01412546]
 [4.01805576 4.0274248  5.03137966 4.00711476 4.86527101]]
Total SE: 9.17123948161988
Total number of non-NaN values in the original ratings matrix X: 23
Total RMSE: 0.6314661849055143
