In [420]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt
import math
import sys

import datetime

In [568]:
#Users data
usersData = pd.read_csv('users.dat', sep='::', header=None, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], engine='python')

#Ratings data
ratingsData = pd.read_csv('ratings.dat', sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')
ratings_testData = pd.read_csv('ratings_test.dat', sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')

#Movies data
moviesData = pd.read_csv('movies.dat', sep='::', header=None, names=['MovieID', 'Title', 'Genres'], engine='python', encoding='latin1')

In [569]:
#Users data
print(usersData.head())

#Ratings data
print(ratingsData.head())
print(ratings_testData.head())

#Movies data
print(moviesData.head())

   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)         

In [639]:
totalRMSE = 0
totalRMSE_function = 0

def rmse_func(test_matrix, pred_matrix):
    """
    Function that calculates the RMSE between the test and predicted matrices
    
    Arguments:
        test_matrix: The test matrix from 5-fold split
        pred_matrix: The prtedicted training matrix from 5-fold split
        
    """
    #initialize the errors of the previous and current step such that the condition is held for the first loop 
    rmse_old = float('inf')
    rmse_new = np.sqrt(np.nanmean(test_matrix-pred_matrix)**2)
    
    while rmse_new > 0.1 and (rmse_old - rmse_new > 1e-5):

        diff = test_matrix-pred_matrix
        rmse_old = rmse_new
        rmse_new = np.sqrt(np.nanmean(diff**2)) #compute the RMSE between the test and predicted matrices
    
    print("RMSE from the function:", rmse_new)    
    return rmse_new 

def matrix_factorization(X_train, X_test, K, num_iter, r, l):
    totalRMSE = 0
    """
    Function that performs matrix factorization to obtain the empty (NaN) points in the given matrix

    Arguments:
        X: Full User-Movie rating matrix 
        K: Number of latent features
        num_iter: Number of iterations
        r: Regularization parameter
        l: Learning rate
        

    Returns:
        U: User-feature matrix
        M: Feature-Movie matrix
            np.dot(U, M.T) will then give the predicted Rating Matrix 
        
    """
    
    #We will follow the steps given in chapter 3.1 of the gravity-Tikk.pdf paper:
    I, J = X_train.shape
    U = np.random.rand(I, K) #U is an I x K matrix with randomly distributed weights 
    M = np.random.rand(J, K) #M is an K x J matrix with randomly distributed weights 
     
    for iteration in range(num_iter):
        print("\nIteration: ", iteration)
        squared_error = 0
        num_ratings = 0
        
        """
        Because the test matrix and training matrix will not always be the same shape, we cannot subtract them from each
        other to calculate the RMSE in rmse_func on top. Typically the test matrix is smaller than the training matrix,
        so we will add rows and columns of NaN values to make their shapes equal. Since any NaN values are not used when
        calculating the RMSE, this does not change its value. 
        """
        
        X_test_orig = X_test
        
        rows_extra = X_train.shape[0]-X_test.shape[0] #extra rows needed to make shape of X_test same as X_train
        columns_extra = X_train.shape[1]-X_test.shape[1] #extra columns needed to make shape of X_test same as X_train

        nan_matrix_col = np.empty((X_test.shape[0], columns_extra)) #make empty matrix of right shape (extra columns)
        nan_matrix_col[:] = np.nan #fill empty matrix with NaNs
        X_test = np.column_stack((X_test, nan_matrix_col)) #add the extra columns to X_test
        nan_matrix_row = np.empty((rows_extra, X_test.shape[1])) #make empty matrix of right shape (extra rows)
        nan_matrix_row[:] = np.nan #fill empty matrix with NaNs
        X_test = np.row_stack((X_test, nan_matrix_row)) #add the extra rows to X_test
        
        print("Original shape of X_test:", X_test_orig.shape, ", while shape of X_train is:", 
              X_train.shape)
        print("After adding empty (NaN) columns and rows, the shape of X_test is now:", X_test.shape)
        
        for i in range(I):
            for j in range(J):
                if X_train[i, j] >0: #i.e. non-NaN
                    eij = X_train[i, j] - np.dot(U[i, :], M[j, :])
                    #eij denotes the training error on the (i, j)th example
                    squared_error += eij**2
                    num_ratings += 1
                    
                    for k in range(K):
                        #For all latent features, apply eq.6&7 in gravity-Tikk.pdf
                        U[i, k] = U[i, k] + l * (2 * eij * M[j, k] - r * U[i, k])
                        M[j, k] = M[j, k] + l * (2 * eij * U[i, k] - r * M[j, k])
                    

        rmse = np.sqrt(squared_error / num_ratings) #This is the RMSE as calculated from the training error eij
        
        if rmse < 0.001: #if rmse falls below a 0.001 threshold, the algorithm stops, indicating that it has 
                         #converged to a satisfactory solution.
            break 
            
            
        
        
        predicted_X_train = np.dot(U, M.T) #the predicted training matrix

        rmse_function = rmse_func(X_test, predicted_X_train) #rmse determined from the function on top
        print("RMSE from rmse_func from iteration ", iteration, ": ", rmse_function)   
        print("RMSE from the training error in the for-loop after iteration ", iteration, ": ", rmse)    
        
        print("Original training set X_train:\n", X_train) #Print the original training matrix 
        print("Predicted training set predicted_X_train:\n", predicted_X_train) #Print the predicted training matrix
        
    now = datetime.datetime.now()
    print ("Time at end of iteration: ", now.strftime("%Y-%m-%d %H:%M:%S")) #To check how long it takes to run  
   
    return U, M, rmse, rmse_function



"""
To switch between the ratings_testData and the 'complete' ratingsData, switch the '#' in the lines below and in 
the next for-loop.
"""
df_total = ratings_testData.pivot(index = "UserID", columns ="MovieID", values = "Rating") 
#df_total = ratingsData.pivot(index = "UserID", columns ="MovieID", values = "Rating") 
X_total = np.array(df_total.to_numpy()) #Make numpy array of the selected ratings data

count_kfold = 1
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in kf.split(ratings_testData):
#for train_index, test_index in kf.split(ratingsData):
    now = datetime.datetime.now() #To check how long it takes to run 
    print ("Time at start of fold number", count_kfold, " :", now.strftime("%Y-%m-%d %H:%M:%S"))   
    
    train_set = ratingsData.iloc[train_index]
    test_set = ratingsData.iloc[test_index]
    
    #First, convert data into a numpy array
    df_train = train_set.pivot(index = "UserID", columns ="MovieID", values = "Rating")
    df_test = test_set.pivot(index = "UserID", columns ="MovieID", values = "Rating")
    X_train = np.array(df_train.to_numpy())
    X_test = np.array(df_test.to_numpy())
    
    U, M, rmse, rmse_function= matrix_factorization(X_train, X_test, K=10, num_iter=10, r=0.05, l=0.005)
    
    predicted_X_train = np.dot(U, M.T)

    print("RMSE estimate (from rmse-func) of fold number ", count_kfold, ": ", rmse_function)
    print("RMSE estimate (the training error in the for-loop) of fold number ", count_kfold, ": ", rmse)
    
    totalRMSE += rmse
    totalRMSE_function += rmse_function
    #np.save("predicted_U_"+str(count_kfold), U)
    #np.save("predicted_M_"+str(count_kfold), M)
    count_kfold += 1
    

# Calculate the average RMSE over all folds
averageRMSE = totalRMSE / 5
averageRMSE_function = totalRMSE_function / 5

print("The Average RMSE is", averageRMSE)
print("The Average RMSE (from function) is", averageRMSE_function)




Time at start of fold number 1  : 2023-10-20 11:39:58

Iteration:  0
Original shape of X_test: (998, 2906) , while shape of X_train is: (1000, 3334)
After adding empty (NaN) columns and rows, the shape of X_test is now: (1000, 3334)
RMSE from the function: 1.328206723066356
RMSE from rmse_func from iteration  0 :  1.328206723066356
RMSE from the training error in the for-loop after iteration  0 :  1.0751694672790995
Original training set X_train:
 [[ 5. nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [ 5. nan nan ... nan nan nan]]
Predicted training set predicted_X_train:
 [[5.41306494 4.21991563 4.49929691 ... 3.25670909 3.11784126 3.96363694]
 [5.9017544  4.02864089 4.43467412 ... 3.18574516 3.10224204 4.16638062]
 [5.41273355 4.33703483 4.04443202 ... 2.9967844  2.7475083  4.13588792]
 ...
 [5.6452486  3.7162858  4.35638817 ... 3.2867016  3.32231399 3.68313811]
 [4.76522359 3.8