In [6]:
import csv
import math
import os
import pickle
import urllib
import urllib.request
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [None]:
# Raw inputs: the cleaned user ratings and the movie metadata columns used below.
rating_df = pd.read_csv('../data/cleaned/cleaned_user_rating.csv')
movies_df = pd.read_csv(
    '../data/cleaned/cleaned_movies_details.csv',
    usecols=['movie_id', 'title', 'genres', 'overview', 'director', 'stars', 'img_url'],
)

# Working copy with every raw ID replaced by its integer category code
# (codes follow the sorted order of the distinct IDs).
rating_df_copy = rating_df.assign(
    user_id_number=rating_df['user_id'].astype('category').cat.codes.values,
    movie_id_number=rating_df['movie_id'].astype('category').cat.codes.values,
)
# Optional pruning that is currently disabled: keep only movies with >= 30 ratings
# (groupby('movie_id').filter(lambda x: len(x) >= 30)).

# One row per rating: [user number, movie number, rating].
train_data = rating_df_copy[['user_id_number', 'movie_id_number', 'rating']].values
print(train_data)

In [14]:
# Turn the comma-separated genre list into space-separated tokens and remove the
# hyphen from compound genre names so each genre stays a single word.
movies_df['genres'] = (
    movies_df['genres']
    .str.replace(',', ' ')
    .str.replace('Sci-Fi', 'SciFi')
    .str.replace('Film-Noir', 'FilmNoir')
    .str.replace('Reality-TV', 'RealityTV')
    .str.replace('Talk-Show', 'TalkShow')
)

In [None]:
# Pre-trained sentence encoder used to embed the movie descriptions.
model = SentenceTransformer('all-mpnet-base-v2')

# One descriptive sentence per movie, assembled from its metadata columns.
movies_df['content'] = movies_df.apply(
    lambda row: (
        f"{row['title']} is a {row['genres']} movie "
        f"directed by {row['director']} starring {row['stars']}. "
        f"Overview: {row['overview']}"
    ),
    axis=1,
)

# Embed every description, then store the pairwise cosine similarities on disk.
content_sentences = movies_df['content'].tolist()
transformer_embeddings = model.encode(content_sentences, show_progress_bar=True)
sim_matrix = cosine_similarity(transformer_embeddings)
np.save('./sim_matrix.npy', sim_matrix)

In [None]:
display(rating_df_copy)

# Lookup tables between raw IDs and their integer category codes, in both directions.
number_to_user_id = {
    code: raw_id
    for code, raw_id in enumerate(rating_df_copy['user_id'].astype('category').cat.categories)
}
user_id_to_number = dict(zip(number_to_user_id.values(), number_to_user_id.keys()))
number_to_movie_id = {
    code: raw_id
    for code, raw_id in enumerate(rating_df_copy['movie_id'].astype('category').cat.categories)
}
movie_id_to_number = dict(zip(number_to_movie_id.values(), number_to_movie_id.keys()))

# Example: raw ID -> number, for one user and one movie.
user_id = 'ur127508339'
user_id_number = user_id_to_number[user_id]
print(user_id_number)
print(movie_id_to_number['tt7737800'])

# Example: number -> raw user ID.
user_id_number = 35310
user_id = number_to_user_id[user_id_number]
print(user_id)

In [None]:
def get_movieURL(movie_id):
    """Return the ``img_url`` of the first ``movies_df`` row whose ``movie_id`` matches.

    Raises IndexError when no row matches.
    """
    is_match = movies_df['movie_id'] == movie_id
    matching_urls = movies_df.loc[is_match, 'img_url']
    return matching_urls.values[0]

# Smoke check: print the poster URL stored for one hard-coded movie ID.
print(get_movieURL('tt7737800'))

In [17]:
def split_ratings_by_user(train_data, random_state=420):
    """
    Split every user's ratings into train, validation and test sets (~60:20:20).

    Parameters
    ----------
    train_data : array-like with rows ``[user, movie, rating]``
        Column 0 identifies the user, column 2 holds the rating value.
    random_state : int, default 420
        Passed to ``np.random.seed``. The split itself draws no random numbers;
        the call is kept because it seeds NumPy's global generator for code
        that runs afterwards.

    Returns
    -------
    (train, val, test) : tuple of ndarrays
        For each user (in ascending user order) the rows are ordered by
        ascending rating value and cut positionally: the first
        ``ceil(0.6 * n)`` rows go to train, the next ``ceil(0.2 * n)`` to
        validation and the remainder to test. Because of the rounding, users
        with very few ratings may contribute nothing to validation/test.
        Empty input yields three empty ``(0, n_columns)`` arrays.
    """
    np.random.seed(random_state)

    train_data = np.asarray(train_data)
    if train_data.shape[0] == 0:
        # np.vstack([]) would raise; return well-formed empty splits instead.
        n_cols = train_data.shape[1] if train_data.ndim == 2 else 3
        return tuple(np.empty((0, n_cols), dtype=train_data.dtype) for _ in range(3))

    # Group the rows by user with one stable sort instead of re-scanning the
    # whole array for every user. Stability keeps each user's rows in their
    # original relative order.
    by_user = train_data[np.argsort(train_data[:, 0], kind='stable')]
    _, starts = np.unique(by_user[:, 0], return_index=True)
    bounds = np.append(starts, len(by_user))

    train_data_list = []
    val_data_list = []
    test_data_list = []

    for start, stop in zip(bounds[:-1], bounds[1:]):
        user_ratings = by_user[start:stop]
        # NOTE: this orders by the rating VALUE (there is no timestamp column),
        # so a user's lowest ratings land in train and the highest in test.
        user_ratings = user_ratings[user_ratings[:, 2].argsort()]
        total_ratings = len(user_ratings)
        train_end = math.ceil(total_ratings * 0.6)
        val_end = train_end + math.ceil(total_ratings * 0.2)

        train_data_list.append(user_ratings[:train_end])
        val_data_list.append(user_ratings[train_end:val_end])
        test_data_list.append(user_ratings[val_end:])

    return (
        np.vstack(train_data_list),
        np.vstack(val_data_list),
        np.vstack(test_data_list),
    )

In [9]:
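# One-off step, kept for reference: create the per-user split and cache it on disk.
# NOTE: this writes to '../cleaned_data/data.npz' while the loading cell below reads
# '../checkpoints/data.npz' - move the file or align the two paths before re-running.
# train_data, val_data, test_data = split_ratings_by_user(train_data)
# np.savez_compressed('../cleaned_data/data.npz', train=train_data, val=val_data, test=test_data)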

In [18]:
def evaluate_recommendation(cf_model, test_data, n_top=5, n_negatives=99, min_rating=8):
    """
    Leave-one-out style ranking evaluation (hit ratio and DCG at ``n_top``).

    For every user with at least one test rating >= ``min_rating``, up to
    ``n_negatives`` movies the user rated in neither train nor test are sampled
    once. Each highly rated test movie is then ranked together with those
    sampled movies by ``cf_model.pred``; it counts as a hit when it appears in
    the top ``n_top``, contributing ``1 / log2(rank + 1)`` to the DCG.

    Parameters
    ----------
    cf_model : object exposing ``train_data`` (rows ``[user, movie, rating]``),
        ``n_movies`` and ``pred(u, i)``.
    test_data : ndarray with rows ``[user, movie, rating]``.
    n_top : int, length of the recommendation list.
    n_negatives : int, number of unrated movies sampled per user (default 99).
    min_rating : number, threshold for a "highly rated" test movie (default 8).

    Returns
    -------
    (hit_ratio, ndcg) : per-user averages, averaged over users. Both are NaN
    when no user has a highly rated test movie.
    """
    hit_ratios, ndcg_scores = [], []

    for u in np.unique(test_data[:, 0]):
        user_test_ratings = test_data[test_data[:, 0] == u]
        high_rated_test_movies = user_test_ratings[user_test_ratings[:, 2] >= min_rating, 1]
        if len(high_rated_test_movies) == 0:
            continue

        # Movies the user has interacted with in either split are never negatives.
        train_movies = cf_model.train_data[cf_model.train_data[:, 0] == u, 1]
        all_movies = np.arange(cf_model.n_movies)
        seen_movies = np.concatenate([train_movies, user_test_ratings[:, 1]])
        unrated_movies = all_movies[~np.isin(all_movies, seen_movies)]

        # Sampling without replacement cannot draw more items than exist, so
        # clamp the request for users who have rated almost the whole catalogue.
        n_sampled = min(n_negatives, len(unrated_movies))
        if n_sampled > 0:
            sampled_unrated_movies = np.random.choice(unrated_movies, n_sampled, replace=False)
        else:
            sampled_unrated_movies = unrated_movies[:0]

        # Data arrays may be float typed; factor matrices need integer indices.
        user = int(u)
        hit_ratio = 0
        dcg = 0
        for high_rated_movie in high_rated_test_movies:
            target = int(high_rated_movie)
            candidate_movies = [target] + [int(m) for m in sampled_unrated_movies]
            movie_scores = {m: cf_model.pred(user, m) for m in candidate_movies}

            # Highest predicted score first; keep the top of the list only.
            sorted_movies = sorted(movie_scores, key=movie_scores.get, reverse=True)[:n_top]

            if target in sorted_movies:
                hit_ratio += 1
                rank = sorted_movies.index(target) + 1
                dcg += 1 / math.log2(rank + 1)  # Discount for rank
        hit_ratios.append(hit_ratio / len(high_rated_test_movies))
        ndcg_scores.append(dcg / len(high_rated_test_movies))

    if not hit_ratios:
        # Same value np.mean([]) would give, without the RuntimeWarning.
        return float('nan'), float('nan')
    return np.mean(hit_ratios), np.mean(ndcg_scores)

In [19]:
def load_model(filename):
    """Unpickle and return the object stored in ``filename``.

    The file is read with ``pickle``, so only open files you trust.
    """
    with open(filename, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [12]:
# Load the cached train/validation/test split. np.load returns a lazily read
# NpzFile that keeps the archive open, so the arrays are pulled out inside a
# `with` block and the file is closed afterwards.
with np.load('../checkpoints/data.npz') as data:
    train_data = data['train']
    val_data = data['val']
    test_data = data['test']

In [None]:
# Quick look at the loaded training split.
print(train_data)

## ISMF

In [13]:
class ISMF(object):
    """
    Incremental matrix factorisation trained with plain SGD (no regularisation).

    The rating of user ``u`` for movie ``i`` is predicted as ``P[u] . Q[i]``.
    Data arrays hold one ``[user_number, movie_number, rating]`` row per rating.

    ``incremental_update`` and ``print_recommendation`` translate between raw
    IDs and row numbers through the notebook-level dictionaries
    ``user_id_to_number``, ``number_to_user_id``, ``movie_id_to_number`` and
    ``number_to_movie_id``; ``incremental_update`` adds entries to them.
    """

    # Early stopping: stop after this many epochs without an improvement of at
    # least _MIN_DELTA in the loss on ``test_data``.
    _PATIENCE = 5
    _MIN_DELTA = 1e-6
    # Spare factor rows allocated ahead of time for users/movies added later.
    _SPARE_ROWS = 100

    def __init__(self, train_data, test_data, n_factors=10, learning_rate=0.01, n_epochs=10):
        """
        Parameters
        ----------
        train_data : ndarray, rows ``[user_number, movie_number, rating]`` used for SGD.
        test_data : ndarray of the same layout, used only for the early-stopping loss.
        n_factors : int, number of latent factors.
        learning_rate : float, SGD step size.
        n_epochs : int, maximum number of passes over ``train_data``.
        """
        self.train_data = train_data
        self.test_data = test_data
        self.n_factors = n_factors
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.n_users = int(np.max(self.train_data[:, 0])) + 1  # +1 because numbering starts at 0
        self.n_movies = int(np.max(self.train_data[:, 1])) + 1
        # P and Q get spare rows so a few new users/movies fit without resizing.
        self.P = np.random.normal(scale=1.0 / self.n_factors,
                                  size=(self.n_users + self._SPARE_ROWS, self.n_factors))
        self.Q = np.random.normal(scale=1.0 / self.n_factors,
                                  size=(self.n_movies + self._SPARE_ROWS, self.n_factors))

    def save(self, filename):
        """Pickle the whole model (including its data arrays) to ``filename``."""
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    def _sgd_step(self, u, i, r):
        """Apply one SGD update for the rating ``r`` of user ``u`` on movie ``i``."""
        error = r - self.pred(u, i)
        # Both gradients must be evaluated at the pre-update factors, so keep
        # the old user vector for the movie update.
        p_u = self.P[u, :].copy()
        self.P[u, :] += self.learning_rate * error * self.Q[i, :]
        self.Q[i, :] += self.learning_rate * error * p_u

    def _with_capacity(self, factors, n_rows):
        """Return ``factors`` extended with fresh random rows so it has at least ``n_rows`` rows."""
        shortfall = n_rows - factors.shape[0]
        if shortfall <= 0:
            return factors
        extra = np.random.normal(scale=1.0 / self.n_factors,
                                 size=(shortfall + self._SPARE_ROWS, self.n_factors))
        return np.vstack((factors, extra))

    def fit(self):
        """Train with SGD for up to ``n_epochs`` epochs, printing the RMSE after each epoch.

        Training stops early when the loss on ``test_data`` has not improved by
        more than ``_MIN_DELTA`` for ``_PATIENCE`` consecutive epochs.
        """
        best_loss = float('inf')
        no_improve_epochs = 0
        n_ratings = len(self.train_data)

        for epoch in range(self.n_epochs):
            # Visit the ratings in a fresh random order each epoch without
            # reordering the (possibly shared) training array itself.
            order = np.random.permutation(n_ratings)
            for idx in tqdm(order, desc=f'Epoch {epoch + 1}/{self.n_epochs}', unit='rating'):
                u, i, r = self.train_data[idx]
                self._sgd_step(int(u), int(i), r)

            train_loss = self.loss(self.train_data)
            test_loss = self.loss(self.test_data)
            print(f"Train loss: {train_loss:.4f}")
            print(f"Test loss: {test_loss:.4f}")

            # Only a decrease larger than the tolerance counts as progress.
            if test_loss < best_loss - self._MIN_DELTA:
                best_loss = test_loss
                no_improve_epochs = 0
            else:
                no_improve_epochs += 1
                if no_improve_epochs >= self._PATIENCE:
                    print("Early stopping: No improvement for 5 consecutive epochs.")
                    break

    def incremental_update(self, new_ratings):
        """
        Register new ratings and take one SGD step for each of them.

        Parameters
        ----------
        new_ratings : iterable of ``(user_id, movie_id, rating)``
            Raw IDs; ``rating`` may be a number or a numeric string (as happens
            when the triples are packed into a NumPy string array). Unknown
            IDs are added to the global ID<->number dictionaries.
        """
        if len(new_ratings) == 0:
            return

        # New IDs must not reuse a number that is already taken, either by the
        # model or by an ID in the global mapping that never occurred in train.
        next_user = max(self.n_users, max(user_id_to_number.values(), default=-1) + 1)
        next_movie = max(self.n_movies, max(movie_id_to_number.values(), default=-1) + 1)

        processed_ratings = []
        for user_id, movie_id, rating in new_ratings:
            if user_id not in user_id_to_number:
                user_id_to_number[user_id] = next_user
                number_to_user_id[next_user] = user_id
                next_user += 1
            if movie_id not in movie_id_to_number:
                movie_id_to_number[movie_id] = next_movie
                number_to_movie_id[next_movie] = movie_id
                next_movie += 1

            u = int(user_id_to_number[user_id])
            i = int(movie_id_to_number[movie_id])
            self.n_users = max(self.n_users, u + 1)
            self.n_movies = max(self.n_movies, i + 1)
            processed_ratings.append((u, i, float(rating)))

        # Make sure the factor matrices have a row for every referenced index.
        self.P = self._with_capacity(self.P, self.n_users)
        self.Q = self._with_capacity(self.Q, self.n_movies)

        # Append the new rows as numbers; keep an integer training array integer
        # when the new ratings are whole numbers.
        new_rows = np.array(processed_ratings, dtype=float)
        if np.issubdtype(self.train_data.dtype, np.integer) and np.all(new_rows == np.round(new_rows)):
            new_rows = new_rows.astype(self.train_data.dtype)
        self.train_data = np.vstack((self.train_data, new_rows))

        for u, i, r in processed_ratings:
            self._sgd_step(u, i, r)

    def pred(self, u, i):
        """Return the predicted rating of user number ``u`` for movie number ``i``."""
        return self.P[u, :].dot(self.Q[i, :].T)

    def print_recommendation(self, user_id, number=10):
        """Print the top ``number`` recommendations for ``user_id`` and plot their posters.

        Posters are downloaded from the URLs returned by ``get_movieURL`` and
        laid out on a grid with at most five columns.
        """
        recommended_items = self.recommend(user_id_to_number[user_id])[:max(number, 0)]

        print(f'Recommended movie(s) for user {user_id} : {recommended_items}')

        # There may be fewer unrated movies than requested.
        number = len(recommended_items)
        if number == 0:
            return

        cols = 5 if number > 5 else number
        rows = math.ceil(number / cols)

        # squeeze=False keeps `axes` a 2-D array even for a single poster.
        fig, axes = plt.subplots(rows, cols, figsize=(cols * 3, rows * 3), squeeze=False)
        urls = [get_movieURL(number_to_movie_id[i]) for i in recommended_items]

        for idx, ax in enumerate(axes.flat):
            if idx < number:
                with urllib.request.urlopen(urls[idx]) as response:
                    ax.imshow(np.array(Image.open(response)))
            ax.axis('off')
        fig.tight_layout()

    def recommend(self, u):
        """
        Return every movie number user ``u`` has not rated in ``train_data``,
        ordered by predicted rating (highest first).
        """
        rated_by_u = set(self.train_data[self.train_data[:, 0] == u, 1].tolist())
        scores = {i: self.pred(u, i) for i in range(self.n_movies) if i not in rated_by_u}
        return sorted(scores, key=scores.get, reverse=True)

    def loss(self, data):
        """Return the RMSE of the predictions on ``data`` (NaN when ``data`` is empty)."""
        if len(data) == 0:
            return float('nan')
        squared_error = 0
        for u, i, r in data:
            squared_error += (r - self.pred(int(u), int(i))) ** 2
        return math.sqrt(squared_error / len(data))

In [14]:
# cf_model = ISMF(train_data, n_factors=2, learning_rate=1e-4, n_epochs=100)
# cf_model.print_recommendation('ur127508339')
# # Add new ratings incrementally
# new_ratings = np.array([['ur127508339', 'tt0062292', 3], ['ur6969', 'tt6969', 7]])  # Example new ratings
# cf_model.incremental_update(new_ratings)

In [None]:
# Hyperparameter grid
n_factors_list = [5, 10, 20, 50] # [5, 10, 20, 50]
learning_rates = [1e-3, 1e-2, 5e-2, 1e-1] # [1e-3, 1e-2, 5e-2, 1e-1]

# Open the CSV file and write the header once
csv_file_path = 'results/ismf_results.csv'
with open(csv_file_path, 'w', newline='') as csvfile:
    fieldnames = ['n_factors', 'learning_rate', 'HR', 'NDCG']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

# Evaluate across all hyperparameter combinations
for n_factors, learning_rate in product(n_factors_list, learning_rates):
    # Initialize the model
    cf_model = ISMF(train_data, val_data, n_factors=n_factors, learning_rate=learning_rate, n_epochs=100)
    print(f"ismf, n_factors: {n_factors}, learning_rate: {learning_rate}")
    
    # Train the model
    cf_model.fit()
    
    # Evaluate the model
    hit_ratio, ndcg = evaluate_recommendation(cf_model, test_data, n_top=5)
    print(f"Results - n_factors: {n_factors}, learning_rate: {learning_rate}, HR: {hit_ratio:.4f}, NDCG: {ndcg:.4f}")
    
    # Save the model
    model_filename = f"../models/ismf/ismf{n_factors}_lr{learning_rate}.pkl"
    cf_model.save(model_filename)
    
    # Append the result to the CSV file
    with open(csv_file_path, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['n_factors', 'learning_rate', 'HR', 'NDCG'])
        writer.writerow({
            'n_factors': n_factors, 
            'learning_rate': learning_rate, 
            'HR': f"{hit_ratio:.4f}", 
            'NDCG': f"{ndcg:.4f}"
        })

## RISMF

In [None]:
class RISMF(object):
    """
    Regularised incremental matrix factorisation trained with SGD.

    The rating of user ``u`` for movie ``i`` is predicted as ``P[u] . Q[i]``;
    every SGD step also shrinks the touched factor rows by ``lambda_reg``.
    Data arrays hold one ``[user_number, movie_number, rating]`` row per rating.

    ``incremental_update`` and ``print_recommendation`` translate between raw
    IDs and row numbers through the notebook-level dictionaries
    ``user_id_to_number``, ``number_to_user_id``, ``movie_id_to_number`` and
    ``number_to_movie_id``; ``incremental_update`` adds entries to them.
    """

    # Early stopping: stop after this many epochs without an improvement of at
    # least _MIN_DELTA in the loss on ``test_data``.
    _PATIENCE = 5
    _MIN_DELTA = 1e-6
    # Spare factor rows allocated ahead of time for users/movies added later.
    _SPARE_ROWS = 100

    def __init__(self, train_data, test_data, n_factors=10, learning_rate=0.01, lambda_reg=0.1, n_epochs=10):
        """
        Parameters
        ----------
        train_data : ndarray, rows ``[user_number, movie_number, rating]`` used for SGD.
        test_data : ndarray of the same layout, used only for the early-stopping loss.
        n_factors : int, number of latent factors.
        learning_rate : float, SGD step size.
        lambda_reg : float, L2 regularisation strength.
        n_epochs : int, maximum number of passes over ``train_data``.
        """
        self.train_data = train_data
        self.test_data = test_data
        self.n_factors = n_factors
        self.learning_rate = learning_rate
        self.lambda_reg = lambda_reg
        self.n_epochs = n_epochs
        self.n_users = int(np.max(self.train_data[:, 0])) + 1  # +1 because numbering starts at 0
        self.n_movies = int(np.max(self.train_data[:, 1])) + 1

        # P and Q get spare rows so a few new users/movies fit without resizing.
        self.P = np.random.normal(scale=1.0 / self.n_factors,
                                  size=(self.n_users + self._SPARE_ROWS, self.n_factors))
        self.Q = np.random.normal(scale=1.0 / self.n_factors,
                                  size=(self.n_movies + self._SPARE_ROWS, self.n_factors))

    def save(self, filename):
        """Pickle the whole model (including its data arrays) to ``filename``."""
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    def _sgd_step(self, u, i, r):
        """Apply one regularised SGD update for the rating ``r`` of user ``u`` on movie ``i``."""
        error = r - self.pred(u, i)
        # Both gradients must be evaluated at the pre-update factors, so keep
        # the old user vector for the movie update.
        p_u = self.P[u, :].copy()
        self.P[u, :] += self.learning_rate * (error * self.Q[i, :] - self.lambda_reg * self.P[u, :])
        self.Q[i, :] += self.learning_rate * (error * p_u - self.lambda_reg * self.Q[i, :])

    def _with_capacity(self, factors, n_rows):
        """Return ``factors`` extended with fresh random rows so it has at least ``n_rows`` rows."""
        shortfall = n_rows - factors.shape[0]
        if shortfall <= 0:
            return factors
        extra = np.random.normal(scale=1.0 / self.n_factors,
                                 size=(shortfall + self._SPARE_ROWS, self.n_factors))
        return np.vstack((factors, extra))

    def fit(self):
        """Train with SGD for up to ``n_epochs`` epochs, printing the RMSE after each epoch.

        Training stops early when the loss on ``test_data`` has not improved by
        more than ``_MIN_DELTA`` for ``_PATIENCE`` consecutive epochs.
        """
        best_loss = float('inf')
        no_improve_epochs = 0
        n_ratings = len(self.train_data)

        for epoch in range(self.n_epochs):
            # Visit the ratings in a fresh random order each epoch without
            # reordering the (possibly shared) training array itself.
            order = np.random.permutation(n_ratings)
            for idx in tqdm(order, desc=f'Epoch {epoch + 1}/{self.n_epochs}', unit='rating'):
                u, i, r = self.train_data[idx]
                self._sgd_step(int(u), int(i), r)

            train_loss = self.loss(self.train_data)
            test_loss = self.loss(self.test_data)
            print(f"Train loss: {train_loss:.4f}")
            print(f"Test loss: {test_loss:.4f}")

            # Only a decrease larger than the tolerance counts as progress.
            if test_loss < best_loss - self._MIN_DELTA:
                best_loss = test_loss
                no_improve_epochs = 0
            else:
                no_improve_epochs += 1
                if no_improve_epochs >= self._PATIENCE:
                    print("Early stopping: No improvement for 5 consecutive epochs.")
                    break

    def incremental_update(self, new_ratings):
        """
        Register new ratings and take one SGD step for each of them.

        Parameters
        ----------
        new_ratings : iterable of ``(user_id, movie_id, rating)``
            Raw IDs; ``rating`` may be a number or a numeric string (as happens
            when the triples are packed into a NumPy string array). Unknown
            IDs are added to the global ID<->number dictionaries.
        """
        if len(new_ratings) == 0:
            return

        # New IDs must not reuse a number that is already taken, either by the
        # model or by an ID in the global mapping that never occurred in train.
        next_user = max(self.n_users, max(user_id_to_number.values(), default=-1) + 1)
        next_movie = max(self.n_movies, max(movie_id_to_number.values(), default=-1) + 1)

        processed_ratings = []
        for user_id, movie_id, rating in new_ratings:
            if user_id not in user_id_to_number:
                user_id_to_number[user_id] = next_user
                number_to_user_id[next_user] = user_id
                next_user += 1
            if movie_id not in movie_id_to_number:
                movie_id_to_number[movie_id] = next_movie
                number_to_movie_id[next_movie] = movie_id
                next_movie += 1

            u = int(user_id_to_number[user_id])
            i = int(movie_id_to_number[movie_id])
            self.n_users = max(self.n_users, u + 1)
            self.n_movies = max(self.n_movies, i + 1)
            processed_ratings.append((u, i, float(rating)))

        # Make sure the factor matrices have a row for every referenced index.
        self.P = self._with_capacity(self.P, self.n_users)
        self.Q = self._with_capacity(self.Q, self.n_movies)

        # Append the new rows as numbers; keep an integer training array integer
        # when the new ratings are whole numbers.
        new_rows = np.array(processed_ratings, dtype=float)
        if np.issubdtype(self.train_data.dtype, np.integer) and np.all(new_rows == np.round(new_rows)):
            new_rows = new_rows.astype(self.train_data.dtype)
        self.train_data = np.vstack((self.train_data, new_rows))

        for u, i, r in processed_ratings:
            self._sgd_step(u, i, r)

    def pred(self, u, i):
        """Return the predicted rating of user number ``u`` for movie number ``i``."""
        return self.P[u, :].dot(self.Q[i, :].T)

    def print_recommendation(self, user_id, number=10):
        """Print the top ``number`` recommendations for ``user_id`` and plot their posters.

        Posters are downloaded from the URLs returned by ``get_movieURL`` and
        laid out on a grid with at most five columns.
        """
        recommended_items = self.recommend(user_id_to_number[user_id])[:max(number, 0)]

        print(f'Recommended movie(s) for user {user_id} : {recommended_items}')

        # There may be fewer unrated movies than requested.
        number = len(recommended_items)
        if number == 0:
            return

        cols = 5 if number > 5 else number
        rows = math.ceil(number / cols)

        # squeeze=False keeps `axes` a 2-D array even for a single poster.
        fig, axes = plt.subplots(rows, cols, figsize=(cols * 3, rows * 3), squeeze=False)
        urls = [get_movieURL(number_to_movie_id[i]) for i in recommended_items]

        for idx, ax in enumerate(axes.flat):
            if idx < number:
                with urllib.request.urlopen(urls[idx]) as response:
                    ax.imshow(np.array(Image.open(response)))
            ax.axis('off')
        fig.tight_layout()

    def recommend(self, u):
        """
        Return every movie number user ``u`` has not rated in ``train_data``,
        ordered by predicted rating (highest first).
        """
        rated_by_u = set(self.train_data[self.train_data[:, 0] == u, 1].tolist())
        scores = {i: self.pred(u, i) for i in range(self.n_movies) if i not in rated_by_u}
        return sorted(scores, key=scores.get, reverse=True)

    def loss(self, data):
        """Return the RMSE of the predictions on ``data`` (NaN when ``data`` is empty)."""
        if len(data) == 0:
            return float('nan')
        squared_error = 0
        for u, i, r in data:
            squared_error += (r - self.pred(int(u), int(i))) ** 2
        return math.sqrt(squared_error / len(data))

In [None]:
# Hyperparameter grid
n_factors_list = [5, 10, 20, 50]
learning_rates = [1e-3, 1e-2, 5e-2, 1e-1]

# Start a fresh results file containing only the header row.
csv_file_path = 'results/rismf_results.csv'
fieldnames = ['n_factors', 'learning_rate', 'HR', 'NDCG']
# Create the output folder up front so the run cannot fail on a missing directory.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)
with open(csv_file_path, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

# Train, evaluate and store one model per hyperparameter combination.
for n_factors, learning_rate in product(n_factors_list, learning_rates):
    # The validation split drives early stopping; the test split is used for HR/NDCG.
    cf_model = RISMF(train_data, val_data, n_factors=n_factors, learning_rate=learning_rate, lambda_reg=0.1, n_epochs=100)
    print(f"rismf, n_factors: {n_factors}, learning_rate: {learning_rate}")

    cf_model.fit()

    hit_ratio, ndcg = evaluate_recommendation(cf_model, test_data, n_top=5)
    print(f"Results - n_factors: {n_factors}, learning_rate: {learning_rate}, HR: {hit_ratio:.4f}, NDCG: {ndcg:.4f}")

    # Save the model; make sure its folder exists so a long training run is not lost.
    model_filename = f"../checkpoints/rismf/rismf_nf{n_factors}_lr{learning_rate}.pkl"
    os.makedirs(os.path.dirname(model_filename), exist_ok=True)
    cf_model.save(model_filename)

    # Append immediately so finished results survive an interrupted grid search.
    with open(csv_file_path, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow({
            'n_factors': n_factors,
            'learning_rate': learning_rate,
            'HR': f"{hit_ratio:.4f}",
            'NDCG': f"{ndcg:.4f}"
        })

In [None]:
# cf_model = load_model("../models/rismf_model_nf2_lr0.01.pkl")
# cf_model.print_recommendation('ur127508339')

## BRISMF

In [None]:
class BRISMF(object):
    """
    Biased, regularised incremental matrix factorisation trained with SGD.

    The rating of user ``u`` for movie ``i`` is predicted as ``P[u] . Q[i]``.
    Biases are obtained by holding two factor COLUMNS constant: column 0 of
    ``P`` and column 1 of ``Q`` are always 1, so ``Q[i, 0]`` acts as a movie
    bias and ``P[u, 1]`` as a user bias. At least two factors are therefore
    required.

    Data arrays hold one ``[user_number, movie_number, rating]`` row per rating.
    ``incremental_update`` and ``print_recommendation`` translate between raw
    IDs and row numbers through the notebook-level dictionaries
    ``user_id_to_number``, ``number_to_user_id``, ``movie_id_to_number`` and
    ``number_to_movie_id``; ``incremental_update`` adds entries to them.
    """

    # Early stopping: stop after this many epochs without an improvement of at
    # least _MIN_DELTA in the loss on ``test_data``.
    _PATIENCE = 5
    _MIN_DELTA = 1e-6
    # Spare factor rows allocated ahead of time for users/movies added later.
    _SPARE_ROWS = 100
    # Columns held at the constant 1 (see class docstring).
    _P_FIXED_COL = 0
    _Q_FIXED_COL = 1

    def __init__(self, train_data, test_data, n_factors=10, learning_rate=0.01, lambda_reg=0.1, n_epochs=10):
        """
        Parameters
        ----------
        train_data : ndarray, rows ``[user_number, movie_number, rating]`` used for SGD.
        test_data : ndarray of the same layout, used only for the early-stopping loss.
        n_factors : int >= 2, number of latent factors (two of them carry the biases).
        learning_rate : float, SGD step size.
        lambda_reg : float, L2 regularisation strength.
        n_epochs : int, maximum number of passes over ``train_data``.

        Raises
        ------
        ValueError
            If ``n_factors`` is smaller than 2.
        """
        if n_factors < 2:
            raise ValueError("BRISMF needs n_factors >= 2 (one column each for the user and movie bias)")
        self.train_data = train_data
        self.test_data = test_data
        self.n_factors = n_factors
        self.learning_rate = learning_rate
        self.lambda_reg = lambda_reg
        self.n_epochs = n_epochs
        self.n_users = int(np.max(self.train_data[:, 0])) + 1  # +1 because numbering starts at 0
        self.n_movies = int(np.max(self.train_data[:, 1])) + 1

        # P and Q get spare rows so a few new users/movies fit without resizing.
        self.P = np.random.normal(scale=1.0 / self.n_factors,
                                  size=(self.n_users + self._SPARE_ROWS, self.n_factors))
        self.Q = np.random.normal(scale=1.0 / self.n_factors,
                                  size=(self.n_movies + self._SPARE_ROWS, self.n_factors))

        # Constant bias columns: for every user and every movie, not for a
        # single user row / movie row.
        self.P[:, self._P_FIXED_COL] = 1
        self.Q[:, self._Q_FIXED_COL] = 1

    def save(self, filename):
        """Pickle the whole model (including its data arrays) to ``filename``."""
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    def _sgd_step(self, u, i, r):
        """Apply one regularised SGD update for the rating ``r`` of user ``u`` on movie ``i``."""
        error = r - self.pred(u, i)
        # Both gradients must be evaluated at the pre-update factors, so keep
        # the old user vector for the movie update.
        p_u = self.P[u, :].copy()
        self.P[u, :] += self.learning_rate * (error * self.Q[i, :] - self.lambda_reg * self.P[u, :])
        self.Q[i, :] += self.learning_rate * (error * p_u - self.lambda_reg * self.Q[i, :])
        # The constant columns are excluded from learning: restore them.
        self.P[u, self._P_FIXED_COL] = 1
        self.Q[i, self._Q_FIXED_COL] = 1

    def _with_capacity(self, factors, n_rows, fixed_col):
        """Return ``factors`` extended so it has at least ``n_rows`` rows.

        New rows are random except for ``fixed_col``, which is set to 1.
        """
        shortfall = n_rows - factors.shape[0]
        if shortfall <= 0:
            return factors
        extra = np.random.normal(scale=1.0 / self.n_factors,
                                 size=(shortfall + self._SPARE_ROWS, self.n_factors))
        extra[:, fixed_col] = 1
        return np.vstack((factors, extra))

    def fit(self):
        """Train with SGD for up to ``n_epochs`` epochs, printing the RMSE after each epoch.

        Training stops early when the loss on ``test_data`` has not improved by
        more than ``_MIN_DELTA`` for ``_PATIENCE`` consecutive epochs.
        """
        best_loss = float('inf')
        no_improve_epochs = 0
        n_ratings = len(self.train_data)

        for epoch in range(self.n_epochs):
            # Visit the ratings in a fresh random order each epoch without
            # reordering the (possibly shared) training array itself.
            order = np.random.permutation(n_ratings)
            for idx in tqdm(order, desc=f'Epoch {epoch + 1}/{self.n_epochs}', unit='rating'):
                u, i, r = self.train_data[idx]
                self._sgd_step(int(u), int(i), r)

            train_loss = self.loss(self.train_data)
            test_loss = self.loss(self.test_data)
            print(f"Train loss: {train_loss:.4f}")
            print(f"Test loss: {test_loss:.4f}")

            # Only a decrease larger than the tolerance counts as progress.
            if test_loss < best_loss - self._MIN_DELTA:
                best_loss = test_loss
                no_improve_epochs = 0
            else:
                no_improve_epochs += 1
                if no_improve_epochs >= self._PATIENCE:
                    print("Early stopping: No improvement for 5 consecutive epochs.")
                    break

    def incremental_update(self, new_ratings):
        """
        Register new ratings and take one SGD step for each of them.

        Parameters
        ----------
        new_ratings : iterable of ``(user_id, movie_id, rating)``
            Raw IDs; ``rating`` may be a number or a numeric string (as happens
            when the triples are packed into a NumPy string array). Unknown
            IDs are added to the global ID<->number dictionaries.
        """
        if len(new_ratings) == 0:
            return

        # New IDs must not reuse a number that is already taken, either by the
        # model or by an ID in the global mapping that never occurred in train.
        next_user = max(self.n_users, max(user_id_to_number.values(), default=-1) + 1)
        next_movie = max(self.n_movies, max(movie_id_to_number.values(), default=-1) + 1)

        processed_ratings = []
        for user_id, movie_id, rating in new_ratings:
            if user_id not in user_id_to_number:
                user_id_to_number[user_id] = next_user
                number_to_user_id[next_user] = user_id
                next_user += 1
            if movie_id not in movie_id_to_number:
                movie_id_to_number[movie_id] = next_movie
                number_to_movie_id[next_movie] = movie_id
                next_movie += 1

            u = int(user_id_to_number[user_id])
            i = int(movie_id_to_number[movie_id])
            self.n_users = max(self.n_users, u + 1)
            self.n_movies = max(self.n_movies, i + 1)
            processed_ratings.append((u, i, float(rating)))

        # Make sure the factor matrices have a row for every referenced index.
        self.P = self._with_capacity(self.P, self.n_users, self._P_FIXED_COL)
        self.Q = self._with_capacity(self.Q, self.n_movies, self._Q_FIXED_COL)

        # Append the new rows as numbers; keep an integer training array integer
        # when the new ratings are whole numbers.
        new_rows = np.array(processed_ratings, dtype=float)
        if np.issubdtype(self.train_data.dtype, np.integer) and np.all(new_rows == np.round(new_rows)):
            new_rows = new_rows.astype(self.train_data.dtype)
        self.train_data = np.vstack((self.train_data, new_rows))

        for u, i, r in processed_ratings:
            self._sgd_step(u, i, r)

    def pred(self, u, i):
        """Return the predicted rating of user number ``u`` for movie number ``i``."""
        return self.P[u, :].dot(self.Q[i, :].T)

    def print_recommendation(self, user_id, number=10):
        """Print the top ``number`` recommendations for ``user_id`` and plot their posters.

        Posters are downloaded from the URLs returned by ``get_movieURL`` and
        laid out on a grid with at most five columns.
        """
        recommended_items = self.recommend(user_id_to_number[user_id])[:max(number, 0)]

        print(f'Recommended movie(s) for user {user_id} : {recommended_items}')

        # There may be fewer unrated movies than requested.
        number = len(recommended_items)
        if number == 0:
            return

        cols = 5 if number > 5 else number
        rows = math.ceil(number / cols)

        # squeeze=False keeps `axes` a 2-D array even for a single poster.
        fig, axes = plt.subplots(rows, cols, figsize=(cols * 3, rows * 3), squeeze=False)
        urls = [get_movieURL(number_to_movie_id[i]) for i in recommended_items]

        for idx, ax in enumerate(axes.flat):
            if idx < number:
                with urllib.request.urlopen(urls[idx]) as response:
                    ax.imshow(np.array(Image.open(response)))
            ax.axis('off')
        fig.tight_layout()

    def recommend(self, u):
        """
        Return every movie number user ``u`` has not rated in ``train_data``,
        ordered by predicted rating (highest first).
        """
        rated_by_u = set(self.train_data[self.train_data[:, 0] == u, 1].tolist())
        scores = {i: self.pred(u, i) for i in range(self.n_movies) if i not in rated_by_u}
        return sorted(scores, key=scores.get, reverse=True)

    def loss(self, data):
        """Return the RMSE of the predictions on ``data`` (NaN when ``data`` is empty)."""
        if len(data) == 0:
            return float('nan')
        squared_error = 0
        for u, i, r in data:
            squared_error += (r - self.pred(int(u), int(i))) ** 2
        return math.sqrt(squared_error / len(data))

In [None]:
# Hyperparameter grid
n_factors_list = [5, 10, 20, 50]
learning_rates = [1e-3, 1e-2, 5e-2, 1e-1]

# Start a fresh results file containing only the header row.
csv_file_path = 'results/brismf_results.csv'
fieldnames = ['n_factors', 'learning_rate', 'HR', 'NDCG']
# Create the output folder up front so the run cannot fail on a missing directory.
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)
with open(csv_file_path, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

# Train, evaluate and store one model per hyperparameter combination.
for n_factors, learning_rate in product(n_factors_list, learning_rates):
    # The validation split drives early stopping; the test split is used for HR/NDCG.
    cf_model = BRISMF(train_data, val_data, n_factors=n_factors, learning_rate=learning_rate, lambda_reg=0.1, n_epochs=100)
    print(f"brismf, n_factors: {n_factors}, learning_rate: {learning_rate}")

    cf_model.fit()

    hit_ratio, ndcg = evaluate_recommendation(cf_model, test_data, n_top=5)
    print(f"Results - n_factors: {n_factors}, learning_rate: {learning_rate}, HR: {hit_ratio:.4f}, NDCG: {ndcg:.4f}")

    # Save the model; make sure its folder exists so a long training run is not lost.
    model_filename = f"../checkpoints/brismf/brismf_nf{n_factors}_lr{learning_rate}.pkl"
    os.makedirs(os.path.dirname(model_filename), exist_ok=True)
    cf_model.save(model_filename)

    # Append immediately so finished results survive an interrupted grid search.
    with open(csv_file_path, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow({
            'n_factors': n_factors,
            'learning_rate': learning_rate,
            'HR': f"{hit_ratio:.4f}",
            'NDCG': f"{ndcg:.4f}"
        })