**TASK 1 - Naive Approaches**

In [10]:
#Install necessary libraries
!pip install scikit-learn
!pip install pandas

Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/c3/6c/ea362eef61f05553aaf1a24b3e96b2d0603f5dc71a3bd35688a24ed88843/pandas-2.0.3-cp38-cp38-win_amd64.whl.metadata
  Downloading pandas-2.0.3-cp38-cp38-win_amd64.whl.metadata (18 kB)
Collecting pytz>=2020.1 (from pandas)
  Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packages/32/4d/aaf7eff5deb402fd9a24a1449a8119f00d74ae9c2efa79f8ef9994261fc2/pytz-2023.3.post1-py2.py3-none-any.whl.metadata
  Downloading pytz-2023.3.post1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.1 (from pandas)
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)
Downloading pandas-2.0.3-cp38-cp38-win_amd64.whl (10.8 MB)
   ---------------------------------------- 0.0/10.8 MB ? eta -:--:--
   - -------------------------------------- 0.3/10.8 MB 5.9 MB/s eta 0:00:02
   -- ------------------------------------- 0.8/10.8 MB 8.0 MB/s eta 0:00:02
   ---- 

In [1]:
#Import necessary libraries
from sklearn.metrics import mean_squared_error,mean_absolute_error
import pandas as pd
import numpy as np

# Load the MovieLens 1M dataset using pandas library, We use the seperator as :: to separate the columns whose names were mentioned in the README file provided.
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', engine='python',
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
print(ratings.shape,ratings.head(2))


users = pd.read_csv('ml-1m/users.dat', sep='::', engine='python',
                    names=['UserID', 'Gender','Age','Occupation','Zip-code'])
print(users.shape,users.head(2))


movies = pd.read_csv('ml-1m/movies.dat', sep='::', engine='python', encoding='latin-1',
                     names=['MovieID','Title','Genres'])
print(movies.shape,movies.head(2))


"""
Custom K-fold cross-validation script for a dataset.

This script divides a given dataset into 'n_splits' random folds for cross-validation. 
The option to shuffle the data is available, similar to the shuffle parameter in Scikit-Learn's KFold.
Yielding can be more memory-efficient than returning when dealing with large datasets hence we use yielding instead of returning the indices.

Parameters:
- n_splits: Number of cross-validation splits (folds).
- seed: Random seed for reproducibility.
- data: The dataset to be split (Pandas DataFrame).
- shuffle: Determines whether to shuffle the data before splitting (True for random shuffling, False for ordered splitting).

Yields:
- train_indices: Indices for the training set of the current fold.
- test_indices: Indices for the testing set of the current fold.
"""

def K_fold(n_splits, seed,data,shuffle):
    subset_size = len(data) // n_splits  # Calculate the size of each fold
    if shuffle:
        indices = data.index.to_numpy()  # Get the DataFrame's index as a NumPy array
        np.random.seed(seed)  # Set the random seed for reproducibility
        np.random.shuffle(indices)  # Shuffle the indices randomly
    for fold in range(n_splits):
        start = fold * subset_size  # Calculate the start index of the current fold
        end = (fold + 1) * subset_size if fold < n_splits - 1 else len(data)    # Calculate the end index
        test_indices = indices[start:end]                                       # Select indices for the testing set
        train_indices = np.concatenate([indices[:start], indices[end:]])        # Select indices for the training set
        #subset = data[start:end]                                                
        yield train_indices,test_indices

cv = K_fold(n_splits=5, seed=42,data=ratings,shuffle=True) 

(1000209, 4)    UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
(6040, 5)    UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
(3883, 3)    MovieID             Title                        Genres
0        1  Toy Story (1995)   Animation|Children's|Comedy
1        2    Jumanji (1995)  Adventure|Children's|Fantasy


In [57]:
#Testing the K_fold script.
for i, (train,test) in enumerate(cv):
    print(i, train, test)

0 [149989 416292 683230 ... 131932 671155 121958] [895536 899739  55687 ... 477775 424188 293600]
1 [895536 899739  55687 ... 131932 671155 121958] [149989 416292 683230 ... 180197 451837 259541]
2 [895536 899739  55687 ... 131932 671155 121958] [  8213 946406 997350 ...  66898 559054 969525]
3 [895536 899739  55687 ... 131932 671155 121958] [783546 277503 820674 ... 829520  58689 100373]
4 [895536 899739  55687 ... 829520  58689 100373] [607117 537829 803592 ... 131932 671155 121958]


In [58]:
'''
Global Mean Rating: Global mean is the average rating of all the movie ratings in our dataset.
This term acts as the baseline prediction rating for all movies and users.
'''

global_mean_rating = ratings['Rating'].mean()   #Fallback value for recommendation
print('Global mean rating = ', global_mean_rating)

'''
Any movie rating that is greater than the global mean rating value can be recommended to the user
First split the data using custom K_fold split script.
Check for the rating and compare it with the global mean rating
Calculate the mse and mae
'''

# Create a simple recommendation model that recommends movies based on global mean ratings.
def recommend_movies_by_global_mean_ratings(user_id, num_recommendations=10):
    # Filter movies with a rating greater than or equal to the global mean.
    recommended_movies = [movie_id for movie_id in ratings["MovieID"] if  ratings["Rating"]>=global_mean_rating][:num_recommendations]
    return recommended_movies

# Function to calculate Root Mean Square Error (RMSE).
# RMSE measures the root squared of the mean of the error between predicted and actual ratings.
def calculate_rmse(predictions, actual):
    mse = mean_squared_error(actual, predictions)
    rmse = np.sqrt(mse)
    return rmse

# Function to calculate Mean Absolute Error (MAE).
# MAE measures the average absolute difference between predicted and actual ratings.
def calculate_mae(predictions, actual):
    mae = mean_absolute_error(actual, predictions)
    return mae

# Lists to store RMSE and MAE scores for each fold
rmse_scores = []
mae_scores = []

# Split the data using K-fold cross-validation script
cv = K_fold(n_splits=5, seed=42, data=ratings, shuffle=True)

for i, (train_index, test_index) in enumerate(cv):
    # Split the data into training and testing sets for the current fold.
    train_data, test_data = ratings.iloc[train_index], ratings.iloc[test_index]
    
    y_true = test_data['Rating']
    y_pred = [global_mean_rating] * len(test_data["Rating"])

     # Calculate RMSE and MAE for the fold and store the score
    rmse = calculate_rmse(y_pred, y_true)
    rmse_scores.append(rmse)

    mae = calculate_mae(y_pred, y_true)
    mae_scores.append(mae)

#The results for the average RMSE and MAE scores over all folds   
print('The root mean squared error is = ', np.average(rmse_scores))
print('The mean absolute error score is = ', np.average(mae_scores))

Global mean rating: 3.581564453029317
1.1171002038673066
0.9338608614369477


In [71]:
# Create a simple recommendation model based on item mean ratings
# Calculate item mean ratings for each movie
movie_means = ratings.groupby('MovieID')['Rating'].mean().reset_index()
df = ratings.merge(movie_means, on='MovieID', suffixes=('', '_mean')) # Merge the movie mean ratings with the original ratings dataset
print(df.head(2))

# Function to recommend movies based on movie mean ratings
def recommend_movies_by_movie_mean_ratings(movie_id, train_data):
    pred = []
    movie_ids_set = set(train_data["MovieID"]) #We create a set to reduce the search time.
    
    for i in movie_id:
         # Check if the movie ID is in the training dataset else use the global mean rating as a fallback.
        if i in movie_ids_set:
            prediction = (train_data.loc[train_data['MovieID'] == i, 'Rating_mean'].iloc[0])
        else:
            prediction = (global_mean_rating)
        # Round predicted ratings within the valid range [1, 5]
        pred.append(max(1, min(5, prediction)))
    return pred

# Lists to store RMSE and MAE scores for each fold
rmse_scores = []
mae_scores = []

# Split the data using K-fold cross-validation script
cv = K_fold(n_splits=5, seed=42, data=df, shuffle=True)

for i, (train_index, test_index) in enumerate(cv):
    # Split the data into training and testing sets for the current fold.
    train_data, test_data = df.iloc[train_index], df.iloc[test_index]

    y_true =  test_data['Rating']
    y_pred = recommend_movies_by_movie_mean_ratings(test_data["MovieID"], train_data)  #Generate predicted ratings based on movie mean ratings

    # Calculate RMSE and MAE for the fold and store the score
    rmse = calculate_rmse(y_pred, y_true)
    rmse_scores.append(rmse)

    mae = calculate_mae(y_pred, y_true)
    mae_scores.append(mae)
    
# The results for the average RMSE and MAE scores over all folds
print('The root mean squared error is = ', np.average(rmse_scores))
print('The mean absolute error score is = ', np.average(mae_scores))

   UserID  MovieID  Rating  Timestamp  Rating_mean
0       1     1193       5  978300760     4.390725
1       2     1193       5  978298413     4.390725
The root mean squared error is =  0.974876592847774
The mean absolute error score is =  0.7789571731412801


In [72]:
# Create a simple recommendation model based on user mean ratings
# Calculate user mean ratings for each movie
user_means = ratings.groupby('UserID')['Rating'].mean().reset_index()
df = ratings.merge(user_means, on='UserID', suffixes=('', '_mean')) # Merge the user mean ratings with the original ratings dataset
print(df.head(2))

# Function to recommend movies based on movie mean ratings
def recommend_movies_by_user_mean_ratings(user_id, train_data):
    pred = []
    user_ids_set = set(train_data["UserID"])
    
    for i in user_id:
        if i in user_ids_set:
            # Check if the user ID is in the training dataset else use the global mean rating as a fallback.
            pred.append(train_data.loc[train_data['UserID'] == i, 'Rating_mean'].iloc[0])
        else:
            pred.append(global_mean_rating)
    return pred

# Lists to store RMSE and MAE scores for each fold
rmse_scores = []
mae_scores = []

# Split the data using K-fold cross-validation script
cv = K_fold(n_splits=5, seed=42, data=df, shuffle=True)

for i, (train_index, test_index) in enumerate(cv):
    # Split the data into training and testing sets for the current fold.
    train_data, test_data = df.iloc[train_index], df.iloc[test_index]

    y_true =  test_data['Rating']
    y_pred = recommend_movies_by_user_mean_ratings(test_data["UserID"], train_data)  #Generate predicted ratings based on user mean ratings

    # Calculate RMSE and MAE for the fold and store the score
    rmse = calculate_rmse(y_pred, y_true)
    rmse_scores.append(rmse)

    mae = calculate_mae(y_pred, y_true)
    mae_scores.append(mae)
    
#The results for the average RMSE and MAE scores over all folds   
print('The root mean squared error is = ', np.average(rmse_scores))
print('The mean absolute error score is = ', np.average(mae_scores))

   UserID  MovieID  Rating  Timestamp  Rating_mean
0       1     1193       5  978300760     4.188679
1       1      661       3  978302109     4.188679
The root mean squared error is =  1.0284465595659256
The mean absolute error score is =  0.8234169238414688


In [74]:
# Calculate item mean ratings for each movie
movie_means = ratings.groupby('MovieID')['Rating'].mean().reset_index()

# Calculate user mean ratings for each user
user_means = ratings.groupby('UserID')['Rating'].mean().reset_index()

# Merge movie mean ratings with the original ratings dataset
df = ratings.merge(movie_means, on='MovieID', suffixes=('', '_movie_mean'))

# Merge user mean ratings with the dataset
df = df.merge(user_means, on='UserID', suffixes=('', '_user_mean'))
print(df.head(2))

# Function to get the coefficients for the linear regression model
def build_linear_regression_model(train_data):
    """
    Build a linear regression model to estimate alpha, beta, and gamma coefficients.

    Parameters:
    - train_data: Training data containing 'Rating', 'Rating_movie_mean', and 'Rating_user_mean'.

    Returns:
    - coefficients: Coefficients [alpha, beta, gamma] estimated by the model.
    """

    # Create a design matrix X with columns: avguser (alpha), avgmovie (beta), and a constant term (1 for gamma)
    X = np.column_stack((train_data['Rating_user_mean'], train_data['Rating_movie_mean'], np.ones(len(train_data))))

    #NumPy's lstsq function to estimate alpha, beta, and gamma
    coefficients, residuals, rank, singular_values = np.linalg.lstsq(X, train_data['Rating'], rcond=None)
    return coefficients  # coefficients contains [alpha, beta, gamma]

def recommend_movies_by_linear_regression(movie_data, user_data, model):
    """
    Predict using a linear regression model.

    Parameters:
    - user_avg: Average user rating of test data.
    - movie_avgs: Average movie ratings of test data.
    - model: Linear regression model coefficients [alpha, beta, gamma].

    Returns:
    - predictions: Predicted movie ratings based on the model.
    """
    predictions = []
    for i, j in zip(movie_data,user_data):
        if np.isnan(i) or np.isnan(j):
            # Use the global mean rating as the fall-back value when either user or movie data is missing
            predictions.append(global_mean_rating)
        else:
            # Predict ratings using the linear regression model: alpha * user_data + beta * movie_data + gamma
            prediction = model[0] * j + model[1] * i + model[2]
            # Round predictions to be within the valid range [1, 5]
            predictions.append(np.maximum(1, np.minimum(5, prediction)))
    return predictions

# Lists to store RMSE and MAE scores for each fold
rmse_scores = []
mae_scores = []

# Split the data using K-fold cross-validation script
cv = K_fold(n_splits=5, seed=42, data=df, shuffle=True)

for i, (train_index, test_index) in enumerate(cv):
    # Split the data into training and testing sets for the current fold.
    train_data, test_data = df.iloc[train_index], df.iloc[test_index]
    # Build the regression model to get the coefficients 
    model = build_linear_regression_model(train_data)

    # Make movie recommendations for each user in the test set
    y_pred = recommend_movies_by_linear_regression(test_data["Rating_movie_mean"], test_data["Rating_user_mean"], model)
    y_true = test_data['Rating']
    
    # Calculate RMSE and MAE for the fold and store the score
    rmse = calculate_rmse(y_pred, y_true)
    rmse_scores.append(rmse)

    mae = calculate_mae(y_pred,y_true)
    mae_scores.append(mae)
    
# The results for the average RMSE and MAE scores over all folds
print('The root mean squared error is = ', np.average(rmse_scores))
print('The mean absolute error score is = ', np.average(mae_scores))

   UserID  MovieID  Rating  Timestamp  Rating_movie_mean  Rating_user_mean
0       1     1193       5  978300760           4.390725          4.188679
1       1      661       3  978302109           3.464762          4.188679
The root mean squared error is =  0.9155312414686204
The mean absolute error score is =  0.7256333945260849


**TASK 2 - The UV matrix decomposition**
In this task, we used a method called UV-decomposition, which is a dimensionality reduction approach for estimating the missing values in a utility matrix. This approach assumes that the utility matrix $M$ can be represented as the product of two matrices, $U$ as user-related features and $V$ as item-related features.

By decomposing the utility matrix $M$ into two matrices, $U$ and $V$, we are aiming to closely approximate $M$ in the non-blank entries. This allows us to estimate the missing values in the $M$ matrix by computing corresponding entries in the product $UV$.

The final goal is to find the entries of $U$ and $V$ that best approximate the known values of $M$. However, decomposition may not yield a perfect match for all non-blank entries in $M$, especially when there are more known entries than the total entries in $U$ and $V$ combined.

In summary, the decomposition could be outlined in four key areas:
- Preprocessing the Matrix, in our case by applying mean centering to the matrix.
- Initializing Matrices $U$ and $V$, in our case to numbers one.
- Optimizing order of values in U and V matrices, which denote two following notation respectively:
$u_{rs}= \frac{ \sum_{j}v_{sj}(m_{rj}- \sum_{k \neq s}u_{rk}v_{kj})}{ \sum_{j}v^2_{sj}}$ ; $v_{rs}= \frac{ \sum_{i}u_{ir}(m_{is}- \sum_{k \neq r}u_{ik}v_{ks})}{ \sum_{i}u^2_{ir}}$
- Ending the attempt at optimization.

In [2]:
'''
Create a utility matrix M from a user-item ratings. The resulting matrix is designed to represent user ratings for items, where missing values are denoted as NaN.

Parameters:
- ratings: Pandas df with columns 'user', 'movie', and 'rating'.

Returns:
- M: Utility matrix where missing values are represented as NaN.
'''
def makeM(ratings):
    M = np.empty((6040, 3952))
    M[:] = np.nan
    for (u,m,r,_) in ratings:
        M[u-1,m-1] = r
    return M


'''
Function perform an update of the U matrix (part of UV-decomposition), based on the V matrix and the original utility matrix M.

Parameters:
- U: The U matrix
- V: The V matrix
- M: The original M matrix
- n: The number of user
- f: The number of features
'''
def decomposeU(U,V,M,n,f):
    for r in range(n):
        for s in range(f):
            m_array = np.array(M[r,:])
            v_array=np.array(V[s,:])
            v_array[np.isnan(m_array)]=np.nan
            denominator=np.nansum(np.square(v_array))
            if denominator == 0:
                denominator = 1
            sum_array=np.matmul(U[r,:],V[:])-(U[r,s]*V[s,:])
            numerator=np.nansum(V[s,:]*(m_array-sum_array))
            U[r,s]=numerator/denominator


'''
Function perform an update of the V matrix (part of UV-decomposition), based on the U matrix and the original utility matrix M.

Parameters:
- U: The U matrix
- V: The V matrix
- M: The original M matrix
- m: The number of movies
- f: The number of features
'''
def decomposeV(U,V,M,m,f):
    for s in range(0,m):
        for r in range(f):
            m_array=np.array(M[:,s])
            u_array=np.array(U[:,r])
            u_array[np.isnan(m_array)]=np.nan
            denominator=np.nansum(np.square(u_array))
            if denominator == 0:
                denominator = 1
            sum_array = np.matmul(U[:], V[:,s]) - (V[r, s] * U[:, r])
            numerator = np.nansum(U[:, r] * (m_array - sum_array))
            V[r,s]=numerator/denominator

'''
RMSE is used to measure the accuracy of predictive model, in our case in context of UV-decomposition and Matrix Factorization. MSE quantifies the average magnitude of the errors or the discrepancies between the actual and predicted values in a utility matrix M, where lower RMSE values indicate a more accurate model.
'''
def calculate_rmse(matrix, test_data):
    pred = []
    actual = []
    for (user_id, movie_id, rating, _) in test_data:
        pred.append(matrix[user_id-1, movie_id-1])
        actual.append(rating)
    mae_value = mean_absolute_error(actual, pred)
    rmse_value = np.sqrt(mean_squared_error(actual, pred))
    return rmse_value, mae_value


'''
Function is capable of preprocessing a given utility matrix by applying a mean centering technique. This approach calculates and subtracts user and item averages to center the data around zero.

Parameters:
- M: Utility matrix, with blank entries containing missing values (NaN).

Returns:
- M: Normalized matrix that has been adjusted to have a center or mean of zero, removing biases.
- user_average: Matrix storing the calculated average user ratings, where missing values are replaced with zeros.
- movie_average: A matrix storing the calculated average item ratings, where missing values are replaced with zeros.
'''
def normalize_M(M):
    user_average = np.empty(M.shape)
    movie_average = np.empty(M.shape)
    n, m = M.shape

    for i in range(n):
        row = M[i, :]
        if np.any(~np.isnan(row)):
            user_average[i,:] = np.nanmean(row)
        else:
            user_average[i, :] = 0
    for j in range(m):
        col = M[:, j]
        if np.any(~np.isnan(col)):
            movie_average[:,j] = np.nanmean(col)
        else:
            movie_average[:, j] = 0

    M = M - 0.5*user_average - 0.5*movie_average
    return M, user_average, movie_average

In [3]:
cv = K_fold(n_splits=5, seed=42, data=ratings, shuffle=True)

rmse_list = []
mae_list = []

for i, (train_index, test_index) in enumerate(cv):
    train_data, test_data = ratings.iloc[train_index], ratings.iloc[test_index]
    M = makeM(train_data.to_numpy()) # Create a utility matrix M from the training data
    n,m = M.shape # Get the number of users (n) and items (m) in the utility matrix
    f = 2 # Set the number of features (f) to 2

    # Initialize matrices U and V with ones, representing user and item features
    U = np.ones((n, f))
    V = np.ones((f, m))
    M, user_avg, movie_avg = normalize_M(M)

    # Optimize matrix U and V using collaborative filtering and the normalized utility matrix M
    decomposeU(U,V,M,n,f)
    decomposeV(U,V,M,m,f)

    # Compute the predicted utility matrix UV and add back the mean-centered user and item averages
    UV = np.dot(U,V) + 0.5*user_avg + 0.5*movie_avg

    # Calculate the RMSE (Root Mean Square Error) and MSE between the predicted UV and the test data
    rmse_val, mae_val = calculate_rmse(UV, test_data.to_numpy())
    rmse_list.append(rmse_val)
    mae_list.append(mae_val)

print("MAE list:", mae_list)
print("Average MAE:", sum(mae_list) / len(mae_list))
print("RMSE list:", rmse_list)
print("Average RMSE:", sum(rmse_list) / len(rmse_list))

MAE list: [0.7240876673484067, 0.7183742510197288, 0.7185300144505843, 0.7196128560955047, 0.7201305838591022]
Average MAE: 0.7201470745546653
RMSE list: [0.9164468022700587, 0.9110241978208942, 0.9106784848578212, 0.9120150742493378, 0.9135895904985155]
Average RMSE: 0.9127508299393255


**TASK 3 - The Matrix Factorization**

In this subtask, we have tackled the MovieLens 1M dataset problem with Matrix Factorization approach presented in research paper by Gábor Takács et al. MF is a fundamental and yet powerful technique used in collaborative filtering to approximate a user-item matrix as the product of two matrices.

In our case we denoted the user-movie matrix as $X$ and $U$, $M$ for users and movies respectively:
- $U = I $ x $ K$
- $M = K $ x $ J$

For the given problem, where $X$ has many unknown elements, the goal is to find $U$ and $M$ that minimize the sum of squared errors (RMSE) on known elements of $X$.
The optimization process involves gradient descent to update $U$ and $M$ matrices based on calculated error along with defined regularization parameter.

Whole algorithm could be described as a set of following steps:
1. Initialize $U$ and $M$ randomly with small positive values, in our case numbers spanning between 0 and 1. Additionally, we settle the learning rate $λ$ to 0.005 and regularization factor $η$ to 0.05.
2. Iterate, for our purposes - 75 times, with optional condition to terminate when RMSE doesn't decrease during two next iterations:
- Iterate over each known element of X not in the probe subset:
i. Compute the error, where $e_{ij} = x_{ij} - \hat{x}_{ij} \text{ for } (i; j) \in R$
ii. Compute the gradient of the error = $e_{ij}^2$, which can be denoted as:
$\frac{∂}{∂u_{ik}}e_{ij}^2 = -2e_{ij} * m_{kj}$ ; $\frac{∂}{∂m_{kj}}e_{ij}^2 = -2e_{ij} * m_{ik}$
iii. Update the ith row of U and the jth column of M, with respect to following equations:
$u'_{ik} = u_{ik} + \lambda \cdot (2e_{ij} \cdot m_{kj} - \eta \cdot u_{ik})$ ; $m'_{kj} = m_{kj} + \lambda \cdot (2e_{ij} \cdot u_{ik} - \eta \cdot m_{kj})$
- Finally, calculate the RMSE on the probe subset.

In [1]:
'''
Create a user-movie rating matrix from a list of ratings, where rows represent users,
columns represent movies, and each cell contains the corresponding user's rating for movie.

Parameters:
- ratings: Pandas df with columns 'user', 'movie', and 'rating'.

Returns:
- X: User-movie rating matrix where X[i, j] represents the rating of user i for movie j.
'''
def makeX(ratings):
    X = np.empty((6040, 3952))
    M[:] = 0 # Initialize X with zeros
    for (u,m,r,_) in ratings:
        M[u-1,m-1] = r # Fill in the matrix X with known user rating
    return X

In [4]:
cv = K_fold(n_splits=5, seed=42, data=ratings, shuffle=True)

num_factors = 10
num_iter=75
regularization = 0.05
learning_rate=0.005

# Initialize variables to track the best results
lowest_rmse = float('inf')
best_U = None
best_M = None

rmse_list = []
mae_list = []

for split, (train_index, test_index) in enumerate(cv):
    train_data, test_data = ratings.iloc[train_index], ratings.iloc[test_index]

    # Initialize user and movie matrices U and M
    U = np.random.random((6040,num_factors))
    M = np.random.random((num_factors,3952))
    X = train_data.to_numpy()

    rmse_split = []
    mae_split = []

    prev_rmse = float('inf')
    consecutive_iterations = 0

    for iter in range(num_iter):
        print('Iteration:', iter)
        for (u,m,r,_) in X:
            i,j = u-1,m-1

            # Calculate the error eij
            eij = r - np.dot(U[i,:],M[:,j])

            # Update user and movie matrices using gradient descent
            for k in range(num_factors):
                U[i][k] = U[i][k] + learning_rate * (2 * eij * M[k][j] - regularization * U[i][k])
                M[k][j] = M[k][j] + learning_rate * (2 * eij * U[i][k] - regularization * M[k][j])

        # Calculate RMSE and store it
        rmse_val, mae_val = calculate_rmse(np.dot(U,M),test_data.to_numpy())
        rmse_split.append(rmse_val)
        mae_split.append(mae_val)
        print("RMSE:", rmse_val)
        print("MAE:", mae_val)

        # Check for RMSE convergence
        if rmse_val >= prev_rmse:
            consecutive_iterations += 1
            if consecutive_iterations == 2:
                break # Terminate the loop if RMSE doesn't decrease for two consecutive iterations
        else:
            consecutive_iterations = 0
        prev_rmse = rmse_val

    rmse_list.append(rmse_split[-1])
    mae_list.append(mae_split[-1])

    # Update the best results if a lower RMSE is found
    if min(rmse_split) < lowest_rmse:
        lowest_rmse = min(rmse_split)
        best_U = U
        best_M = M

Iteration: 0
RMSE: 0.9395865772397065
MAE: 0.7455756351852879
Iteration: 1
RMSE: 0.9317407850122721
MAE: 0.7386081457396757
Iteration: 2
RMSE: 0.9290093550298499
MAE: 0.736191119072965
Iteration: 3
RMSE: 0.9259107988370959
MAE: 0.733560039735887
Iteration: 4
RMSE: 0.9208048806766637
MAE: 0.7291634101200668
Iteration: 5
RMSE: 0.9147143422418723
MAE: 0.7236768650968995
Iteration: 6
RMSE: 0.9093540206004179
MAE: 0.718747376390171
Iteration: 7
RMSE: 0.9049227591453665
MAE: 0.7146397601560858
Iteration: 8
RMSE: 0.9011702143183328
MAE: 0.711168682662196
Iteration: 9
RMSE: 0.8979411293823012
MAE: 0.7081285902254577
Iteration: 10
RMSE: 0.8951566130148998
MAE: 0.705484729609081
Iteration: 11
RMSE: 0.8927563209818494
MAE: 0.703181561690356
Iteration: 12
RMSE: 0.8906824542259497
MAE: 0.7011708725727439
Iteration: 13
RMSE: 0.8888827554423918
MAE: 0.6994007780097301
Iteration: 14
RMSE: 0.8873128819167571
MAE: 0.6978475593503194
Iteration: 15
RMSE: 0.8859360008845675
MAE: 0.6964868618094879
Iteratio

In [5]:
import os

print("MAE list:", mae_list)
print("Average MAE:", sum(mae_list) / len(mae_list))
print("RMSE list:", rmse_list)
print("Average RMSE:", sum(rmse_list) / len(rmse_list))

Save the U and M matrices weights from the best performed iteration
print("Lowest rmse:", lowest_rmse)
os.makedirs("feature_matrices", exist_ok=True)
np.savetxt(os.path.join("feature_matrices", "movies.csv"), best_M.transpose(), delimiter=',')
np.savetxt(os.path.join("feature_matrices", "users.csv"), best_U, delimiter=',')

MAE list: [0.6832686411084417, 0.6844264088019703, 0.6844609785418375, 0.6827723554788754, 0.681490900904747]
Average MAE: 0.6832838569671744
RMSE list: [0.8730343927259353, 0.8752211592905211, 0.874758117569701, 0.8719011702900236, 0.8716164504475197]
Average RMSE: 0.8733062580647403
