# Collaborative Filtering
Consisting of: 
- Memory-based Collaborative Filtering
- Model-based Collaborative Filtering

### Import

In [1]:
# Install the surprise package
!pip install -q -U scikit-surprise
from surprise import Dataset, Reader
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import arff
from surprise import KNNWithMeans
from surprise.dataset import DatasetAutoFolds
from surprise.model_selection import GridSearchCV
from surprise import KNNBasic
from surprise import SVD
from surprise import SVDpp
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split
import random
import re
import numpy as np
from surprise import AlgoBase
from collections import defaultdict
from surprise import Prediction
import json


### Method definition 
Here are evaluation methods defined for later use

In [2]:
def precision_recall_f1_at_k(predictions, k=10, threshold=3.5):
    """
    Return precision, recall, and F1-score at k metrics for each user.
    
    Args:
    - predictions: List of tuples (uid, iid, true_r, est, _), where:
        - uid: User ID
        - iid: Item ID
        - true_r: True rating
        - est: Estimated rating
        - _: Not used (can be any value)
    - k: Number of top items to consider
    - threshold: Threshold rating for considering an item as relevant
    
    Returns:
    - precisions: Dictionary mapping user IDs to precision@k scores
    - recalls: Dictionary mapping user IDs to recall@k scores
    - f1_scores: Dictionary mapping user IDs to F1-score@k scores
    """
    
    # Map predictions to each user
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    # Initialize precision, recall, and F1-score dictionaries
    precisions = dict()
    recalls = dict()
    f1_scores = dict()
    
    # Calculate precision, recall, and F1-score for each user
    for uid, user_ratings in user_est_true.items():
        
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        
        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        
        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        
        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])
        
        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        
        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
        
        # F1-score@K: Harmonic mean of precision and recall
        precision = precisions[uid]
        recall = recalls[uid]
        f1_scores[uid] = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    
    # Calculate mean metrics
    mean_precision = np.mean(list(precisions.values()))
    mean_recall = np.mean(list(recalls.values()))
    mean_f1_score = np.mean(list(f1_scores.values()))
    
    # Print the metrics
    print("Precision@{}: {:.4f}".format(k, mean_precision))
    print("Recall@{}: {:.4f}".format(k, mean_recall))
    print("F1-score@{}: {:.4f}".format(k, mean_f1_score))
    
    return precisions, recalls, f1_scores


In [3]:
def precision_recall_f1(predictions, threshold=3.5):
    """
    Return precision, recall, and F1-score metrics for all recommendations.
    
    Args:
    - predictions: List of tuples (uid, iid, true_r, est, _), where:
        - uid: User ID
        - iid: Item ID
        - true_r: True rating
        - est: Estimated rating
        - _: Not used (can be any value)
    - threshold: Threshold rating for considering an item as relevant
    
    Returns:
    - precision: Precision score
    - recall: Recall score
    - f1_score: F1-score
    """
    
    # Map predictions to each user
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    # Initialize lists to store relevant and recommended items
    relevant_items = []
    recommended_items = []
    
    # Calculate precision, recall, and F1-score for each user
    for uid, user_ratings in user_est_true.items():
        
        # Extract true ratings for the user
        true_ratings = [true_r for (_, true_r) in user_ratings]
        
        # Extract estimated ratings for the user
        estimated_ratings = [est for (est, _) in user_ratings]
        
        # Identify relevant items
        relevant_items.extend([true_r >= threshold for true_r in true_ratings])
        
        # Identify recommended items
        recommended_items.extend([est >= threshold for est in estimated_ratings])
    
    # Calculate precision, recall, and F1-score
    precision = np.mean(relevant_items) if relevant_items else 0
    recall = np.mean(recommended_items) if recommended_items else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    # Print the metrics
    print("Precision: {:.4f}".format(precision))
    print("Recall: {:.4f}".format(recall))
    print("F1-score: {:.4f}".format(f1_score))
    
    return precision, recall, f1_score


In [4]:
def serialize_predictions(predictions, filename):
    """
    Serialize a list of Prediction objects to JSON and save it to a file.

    Args:
    - predictions (list): List of Prediction objects.
    - filename (str): Name of the file to save the serialized predictions.

    Returns:
    - None
    """
    serialized_predictions = []
    for pred in predictions:
        serialized_prediction = {
            "uid": pred.uid,
            "iid": pred.iid,
            "r_ui": pred.r_ui,
            "est": pred.est,
            "details": pred.details
        }
        serialized_predictions.append(serialized_prediction)

    with open(filename, 'w') as json_file:
        json.dump(serialized_predictions, json_file)

In [56]:
from collections import defaultdict
from surprise import Prediction

def get_top_predictions(predictions, user_id, top_n=5):
    """
    Generates predictions using Surprise and prints the top n most recommended books for a specific user.

    Parameters:
    - predictions (list of Prediction objects): List of predictions generated by Surprise.
    - user_id (str): ID of the user for whom recommendations are generated.
    - top_n (int): Number of top recommendations to generate. Default is 10.

    Returns:
    - recommendations (list): List of the top N most recommended books for the user.
    """
    # Filter predictions for the specified user
    user_predictions = [pred for pred in predictions if pred.uid == user_id]

    if not user_predictions:
        print(f"No predictions found for user {user_id}")
        return []

    # Sort predictions for the user by estimated rating (descending order)
    user_predictions.sort(key=lambda x: x.est, reverse=True)

    # Extract top N recommended books for the user
    top_books = [pred.iid for pred in user_predictions[:top_n]]
    print(f"User {user_id}: Top {top_n} Recommended Books - {top_books}")

    return top_books

In [45]:
def count_users_by_predictions(predictions):
    """
    Count the number of users with a specific number of predictions.

    Parameters:
    - predictions (list of Prediction objects): List of prediction objects.

    Returns:
    - user_prediction_count (dict): A dictionary where the keys are the number of predictions
      and the values are the count of users with that many predictions.
    """
    # Initialize a defaultdict to count occurrences of each uid
    uid_count = defaultdict(int)

    # Loop through the predictions and count uids
    for prediction in predictions:
        uid_count[prediction.uid] += 1

    # Initialize a defaultdict to count the number of users with a specific number of predictions
    user_prediction_count = defaultdict(int)

    # Loop through uid_count to populate user_prediction_count
    for count in uid_count.values():
        user_prediction_count[count] += 1

    # Convert to a regular dictionary for cleaner output
    user_prediction_count = dict(user_prediction_count)

    # Sort the user_prediction_count dictionary by the number of predictions in descending order
    sorted_user_prediction_count = sorted(user_prediction_count.items(), key=lambda x: x[0], reverse=True)

    # Print the sorted user prediction counts
    for num_predictions, num_users in sorted_user_prediction_count:
        print(f"{num_users} users have {num_predictions} predictions")

    #return user_prediction_count

### Data Preparation

In [6]:
path = os.path.expanduser('../data/data_kindle_preprocessed.xlsx')
data_preprocessed = pd.read_excel(path, index_col=[0], dtype={'publication_year': str})

In [55]:
reader = Reader(rating_scale=(1,5)) # rating scale range

# Load the dataset from DataFrame
data = Dataset.load_from_df(data_preprocessed[["reviewerID", "asin", "rating"]], reader)

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)


## Collaborative Filtering Recommender System (Memory-Based)

### 1. Item-based collaborative filtering with KNNWithMeans

In [8]:
def item_based_KNNWithMeans_recommender_system(trainset, testset, data):

    # Define the parameter grid with an increased range of k and more CV folds
    param_grid = {'k': [1, 3, 10, 15, 20, 25, 30, 35, 40],
                  'sim_options': {'name': ['cosine', 'pearson'],
                                  'user_based': [False]}
                  }
    
    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)
    
    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = KNNWithMeans(k=gs.best_params['rmse']['k'], sim_options=gs.best_params['rmse']['sim_options'])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)
    serialize_predictions(test_pred, '../data/item_based_KNNWithMeans_recommender_system.json')

    # Calculate evaluation metrics
    print("Item-based Model with KNNWithMeans: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    precision_k, recall_k, f1_score_k = precision_recall_f1_at_k(test_pred)
    precision, recall, f1_score = precision_recall_f1(test_pred)
    
    return test_pred

### 2. Item-based collaborative filtering with KNNBasic

In [9]:
def item_based_KNNBasic_recommender_system(trainset, testset, data):

    # Define the parameter grid with an increased range of k and more CV folds
    param_grid = {'k': [1, 3, 10, 15, 20, 25, 30, 35, 40],
                  'sim_options': {'name': ['cosine', 'pearson'],
                                  'user_based': [False]}
                  }
    
    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)
    
    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = KNNBasic(k=gs.best_params['rmse']['k'], sim_options=gs.best_params['rmse']['sim_options'])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)
    serialize_predictions(test_pred, '../data/item_based_KNNBasic_recommender_system.json')

    # Calculate evaluation metrics
    print("Item-based Model with KNNBasic: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    precision_k, recall_k, f1_score_k = precision_recall_f1_at_k(test_pred)
    precision, recall, f1_score = precision_recall_f1(test_pred)
    
    return test_pred

### 3. User-Based Collaborative Filtering with KNNWithMeans

In [10]:
def user_based_KNNWithMeans_recommender_system(trainset, testset, data):
    
    param_grid = {'k': [1, 3, 10, 15, 20, 25, 30, 35, 40],
                  'sim_options': {'name': ['pearson', 'cosine'],
                                  'user_based': [True]}
                 }


    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)
    
    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = KNNWithMeans(k=gs.best_params['rmse']['k'], sim_options=gs.best_params['rmse']['sim_options'])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)
    serialize_predictions(test_pred, '../data/user_based_KNNWithMeans_recommender_system.json')

    # Calculate evaluation metrics
    print("User-based Model with KNNWithMeans: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    precision_k, recall_k, f1_score_k = precision_recall_f1_at_k(test_pred)
    precision, recall, f1_score = precision_recall_f1(test_pred)
    
    return test_pred

### 4. User-Based Collaborative Filtering with KNNBasic

In [11]:
def user_based_KNNBasic_recommender_system(trainset, testset, data):
    
    param_grid = {'k': [1, 3, 10, 15, 20, 25, 30, 35, 40],
              'sim_options': {'name': ['pearson', 'cosine'],
                              'user_based': [True]}
              }

    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)

    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = KNNBasic(k=gs.best_params['rmse']['k'], sim_options=gs.best_params['rmse']['sim_options'])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)
    serialize_predictions(test_pred, '../data/user_based_KNNBasic_recommender_system.json')


    # Calculate evaluation metrics
    print("User-based Model with KNNBasic: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    precision_k, recall_k, f1_score_k = precision_recall_f1_at_k(test_pred)
    precision, recall, f1_score = precision_recall_f1(test_pred)
    
    return test_pred

## Model-Based Recommender System

### Singular Value Decomposition (SVD)
 SVD is a matrix factorization technique commonly used in collaborative filtering-based recommender systems. 

In [12]:
def SVD_recommender_system(trainset, testset, data):
    """
    Trains an SVD-based recommender system using grid search for hyperparameter tuning,
    and evaluates it on the provided test set.
    
    Parameters:
    - trainset (surprise.Trainset): Training set for the recommender system.
    - testset (list of tuples): Test set for evaluation.
    - data (surprise.Dataset): Full dataset for grid search.
    
    Returns:
    - test_pred (list of Prediction objects): Predictions on the test set.
    """
    
    # Define the parameter grid for grid search
    param_grid = {
        "n_factors": [25, 30, 40, 55, 60, 75, 90, 100, 110],  # Number of latent factors in SVD model
        "n_epochs": [10, 20, 30], # Number of training epochs
        "lr_all": [0.005, 0.025, 0.125], # Learning rate for all parameters
        "reg_all": [0.08, 0.16, 0.32], # Regularization term for all parameters
        "random_state": [0], # Seed for reproducibility
    }

    # Perform grid search
    gs = GridSearchCV(SVD, 
                      param_grid, 
                      measures=["rmse", "mae"], 
                      cv=5, 
                      refit=True, # Refit on the entire dataset using best params
                      n_jobs=-1,  # Use all available CPUs for parallel processing
                      joblib_verbose=2 # Verbosity level for parallel processing
                     )
    gs.fit(data)

    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    params = gs.best_params['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = SVD(n_epochs = params['n_epochs'], 
               lr_all = params['lr_all'], 
               n_factors = params['n_factors'],
               reg_all = params['reg_all'],
               random_state = params["random_state"])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)
    serialize_predictions(test_pred, '../data/SVD_recommender_system.json')


    # Calculate evaluation metrics
    print("SVD: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    precision_k, recall_k, f1_score_k = precision_recall_f1_at_k(test_pred)
    precision, recall, f1_score = precision_recall_f1(test_pred)
    
    return test_pred


### Singular Value Decomposition Plus Plus (SVD++)
SVD++ extends the basic SVD approach to include implicit feedback and user biases.

In [13]:
def SVDpp_recommender_system(trainset, testset, data):
    """
    Trains an SVD++-based recommender system using grid search for hyperparameter tuning,
    and evaluates it on the provided test set.

    Parameters:
    - trainset (surprise.Trainset): Training set for the recommender system.
    - testset (list of tuples): Test set for evaluation.
    - data (surprise.Dataset): Full dataset for grid search.

    Returns:
    - test_pred (list of Prediction objects): Predictions on the test set.
    """
    
    # Define the parameter grid for grid search
    param_grid = {
        "n_factors": [25, 30, 40, 55, 60, 75, 90, 100, 110],  # Number of latent factors in SVD model
        "n_epochs": [10, 20, 30], # Number of training epochs
        "lr_all": [0.005, 0.025, 0.125], # Learning rate for all parameters
        "reg_all": [0.08, 0.16, 0.32], # Regularization term for all parameters
        "random_state": [0], # Seed for reproducibility
    }

    # Perform grid search
    gs = GridSearchCV(SVDpp, 
                      param_grid, 
                      measures=["rmse", "mae"], 
                      cv=5, 
                      refit=True, # Refit on the entire dataset using best params
                      n_jobs=-1,  # Use all available CPUs for parallel processing
                      joblib_verbose=2 # Verbosity level for parallel processing
                     )
    gs.fit(data)

    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    params = gs.best_params['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = SVDpp(n_epochs = params['n_epochs'], 
               lr_all = params['lr_all'], 
               n_factors = params['n_factors'],
               reg_all = params['reg_all'],
               random_state = params["random_state"])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)
    serialize_predictions(test_pred, '../data/SVDpp_recommender_system.json')


    # Calculate evaluation metrics
    print("SVD++: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    precision_k, recall_k, f1_score_k = precision_recall_f1_at_k(test_pred)
    precision, recall, f1_score = precision_recall_f1(test_pred)
    
    return test_pred


## Run predictions

In [14]:
prediction_item_based_KNNWithMeans = item_based_KNNWithMeans_recommender_system(trainset, testset, data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computi

In [46]:
top_item_based_with_KNNWithMeans = get_top_predictions(prediction_item_based_KNNWithMeans, 'A143XY6COA69A')
count_users_by_predictions(prediction_item_based_KNNWithMeans)

User A143XY6COA69A: Top 10 Recommended Books - ['B00NNEIL0Q', 'B00OAGWXDW', 'B00JFHKEOG']
1 users have 16 predictions
1 users have 13 predictions
2 users have 9 predictions
1 users have 8 predictions
3 users have 7 predictions
10 users have 6 predictions
22 users have 5 predictions
62 users have 4 predictions
187 users have 3 predictions
985 users have 2 predictions
8668 users have 1 predictions


In [15]:
prediction_item_based_KNNBasic = item_based_KNNBasic_recommender_system(trainset, testset, data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computi

In [47]:
top_item_based_with_KNNBasic = get_top_predictions(prediction_item_based_KNNBasic, 'A143XY6COA69A')
count_users_by_predictions(prediction_item_based_KNNBasic)

User A143XY6COA69A: Top 10 Recommended Books - ['B00JFHKEOG', 'B00OAGWXDW', 'B00NNEIL0Q']
1 users have 16 predictions
1 users have 13 predictions
2 users have 9 predictions
1 users have 8 predictions
3 users have 7 predictions
10 users have 6 predictions
22 users have 5 predictions
62 users have 4 predictions
187 users have 3 predictions
985 users have 2 predictions
8668 users have 1 predictions


In [16]:
prediction_user_based_KNNWithMeans = user_based_KNNWithMeans_recommender_system(trainset, testset, data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Comp

In [49]:
top_user_based_with_KNNWithMeans = get_top_predictions(prediction_user_based_KNNWithMeans, 'A143XY6COA69A')
count_users_by_predictions(prediction_user_based_KNNWithMeans)

User A143XY6COA69A: Top 10 Recommended Books - ['B00JFHKEOG', 'B00OAGWXDW', 'B00NNEIL0Q']
1 users have 16 predictions
1 users have 13 predictions
2 users have 9 predictions
1 users have 8 predictions
3 users have 7 predictions
10 users have 6 predictions
22 users have 5 predictions
62 users have 4 predictions
187 users have 3 predictions
985 users have 2 predictions
8668 users have 1 predictions


In [17]:
prediction_user_based_KNNBasic = user_based_KNNBasic_recommender_system(trainset, testset, data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Comp

In [50]:
top_user_based_with_KNNBasic = get_top_predictions(prediction_user_based_KNNBasic, 'A143XY6COA69A')
count_users_by_predictions(prediction_user_based_KNNBasic)

User A143XY6COA69A: Top 10 Recommended Books - ['B00JFHKEOG', 'B00OAGWXDW', 'B00NNEIL0Q']
1 users have 16 predictions
1 users have 13 predictions
2 users have 9 predictions
1 users have 8 predictions
3 users have 7 predictions
10 users have 6 predictions
22 users have 5 predictions
62 users have 4 predictions
187 users have 3 predictions
985 users have 2 predictions
8668 users have 1 predictions


In [19]:
prediction_SVD = SVD_recommender_system(trainset, testset, data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   45.5s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 1215 out of 1215 | elapsed:  3.7min finished


Best RMSE: 0.919304715973112
Best Parameters: {'n_factors': 30, 'n_epochs': 20, 'lr_all': 0.025, 'reg_all': 0.32, 'random_state': 0}
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9302  0.9155  0.9233  0.9121  0.9179  0.9198  0.0063  
MAE (testset)     0.7001  0.6921  0.6983  0.6916  0.6908  0.6946  0.0038  
Fit time          0.40    0.40    0.44    0.49    0.52    0.45    0.05    
Test time         0.09    0.11    0.07    0.09    0.13    0.10    0.02    
SVD: Test Set
RMSE: 0.9503
MAE:  0.7066
Precision@10: 0.8379
Recall@10: 0.9878
F1-score@10: 0.8310
Precision: 0.8217
Recall: 0.9698
F1-score: 0.8896


In [51]:
top_SVD = get_top_predictions(prediction_SVD, 'A143XY6COA69A')
count_users_by_predictions(prediction_SVD)

User A143XY6COA69A: Top 10 Recommended Books - ['B00NNEIL0Q', 'B00OAGWXDW', 'B00JFHKEOG']
1 users have 16 predictions
1 users have 13 predictions
2 users have 9 predictions
1 users have 8 predictions
3 users have 7 predictions
10 users have 6 predictions
22 users have 5 predictions
62 users have 4 predictions
187 users have 3 predictions
985 users have 2 predictions
8668 users have 1 predictions


In [20]:
prediction_SVDpp = SVDpp_recommender_system(trainset, testset, data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 83.2min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 85.4min
[Parallel(n_jobs=-1)]: Done 1215 out of 1215 | elapsed: 87.3min finished


Best RMSE: 0.9190647121254729
Best Parameters: {'n_factors': 30, 'n_epochs': 20, 'lr_all': 0.025, 'reg_all': 0.32, 'random_state': 0}
Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9235  0.9320  0.9203  0.9074  0.9180  0.9202  0.0080  
MAE (testset)     0.6915  0.7005  0.6933  0.6896  0.6955  0.6941  0.0037  
Fit time          0.85    0.82    0.68    0.72    0.83    0.78    0.07    
Test time         0.12    0.13    0.16    0.17    0.12    0.14    0.02    
SVD++: Test Set
RMSE: 0.9507
MAE:  0.7065
Precision@10: 0.8372
Recall@10: 0.9888
F1-score@10: 0.8312
Precision: 0.8217
Recall: 0.9714
F1-score: 0.8903


In [52]:
top_SVDpp = get_top_predictions(prediction_SVDpp, 'A143XY6COA69A')
count_users_by_predictions(prediction_SVDpp)

User A143XY6COA69A: Top 10 Recommended Books - ['B00NNEIL0Q', 'B00OAGWXDW', 'B00JFHKEOG']
1 users have 16 predictions
1 users have 13 predictions
2 users have 9 predictions
1 users have 8 predictions
3 users have 7 predictions
10 users have 6 predictions
22 users have 5 predictions
62 users have 4 predictions
187 users have 3 predictions
985 users have 2 predictions
8668 users have 1 predictions


## Error Analysis - Cold Start Problem

In [54]:
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
import numpy as np

# Assuming you already have the data_preprocessed DataFrame and the Reader instance
reader = Reader(rating_scale=(1, 5))

# Load the dataset from DataFrame
data = Dataset.load_from_df(data_preprocessed[["reviewerID", "asin", "rating"]], reader)

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# Train an SVD model (or any other model)
algo = SVD(random_state=42)
algo.fit(trainset)

# Predict ratings for the test set
predictions = algo.test(testset)

# Get the item IDs (asin) from the training set
trainset_item_ids = set(trainset.all_items())
trainset_item_ids = set(trainset.to_raw_iid(iid) for iid in trainset_item_ids)

# Get the item IDs (asin) from the test set
testset_item_ids = set([iid for (_, iid, _) in testset])

# Find item IDs that are in the test set but not in the training set
test_only_item_ids = testset_item_ids - trainset_item_ids

# Debugging information
print(f"Total items in training set: {len(trainset_item_ids)}")
print(f"Total items in test set: {len(testset_item_ids)}")
print(f"Total cold start items: {len(test_only_item_ids)}")
print(f"Cold start item IDs: {test_only_item_ids}")

# Filter predictions for cold start items
cold_start_predictions = [pred for pred in predictions if pred.iid in test_only_item_ids]

# Ensure we have predictions for cold start items
if not cold_start_predictions:
    print("No cold start predictions found.")
else:
    # Compute error metrics for cold start items
    cold_start_rmse = accuracy.rmse(cold_start_predictions, verbose=True)
    cold_start_mae = accuracy.mae(cold_start_predictions, verbose=True)

    # Compute overall error metrics for comparison
    overall_rmse = accuracy.rmse(predictions, verbose=True)
    overall_mae = accuracy.mae(predictions, verbose=True)

    # Print the results
    print(f"Cold Start RMSE: {cold_start_rmse}")
    print(f"Cold Start MAE: {cold_start_mae}")
    print(f"Overall RMSE: {overall_rmse}")
    print(f"Overall MAE: {overall_mae}")


Total items in training set: 1011
Total items in test set: 1011
Total cold start items: 0
Cold start item IDs: set()
No cold start predictions found.


# Hybrid Approach

### User-Based Collaborative Filtering + Content-Based Filtering
Run after these predictions are calculated

In [22]:
# Create a dummy algorithm class that does nothing for the DummyAlgorithm class 
# that initializes the trainset attribute with a dummy value. Here's how you can modify the code:

class DummyAlgorithm(AlgoBase):
    def __init__(self):
        pass

    def fit(self, trainset):
        self.trainset = trainset  # Initialize trainset attribute with a dummy value
        pass

    def estimate(self, u, i):
        return 5  # Return a dummy rating of 5 for all predictions


def hybrid_recommender_system(prediction, data_contentBased, data):
    # Convert prediction to a dictionary with user IDs as keys
    user_item_recs = {}
    for pred in prediction:
        user_id = pred.uid
        item_id = pred.iid
        if user_id not in user_item_recs:
            user_item_recs[user_id] = []
        user_item_recs[user_id].append(item_id)

    # Initialize dictionary for hybrid recommendations
    hybrid_recs = {}

    # For each user, generate recommendations
    for user_id, items in user_item_recs.items():
        # Initialize list to store recommendations for the user
        user_recommendations = items.copy()  # Store original recommendations

        # For each item recommended to the user
        for item in items:
            # Get similar item recommendations from TF-IDF
            similar_items = data_contentBased.get(item)

            # Check if similar_items is not None before iterating
            if similar_items is not None:
                # Extend user_recommendations with similar items (excluding already recommended items)
                user_recommendations.extend([item for item in similar_items if item not in items])

        # Count occurrences of each item ID (ASIN) in user_recommendations
        item_counts = {}
        for item in user_recommendations:
            if item in item_counts:
                item_counts[item] += 1
            else:
                item_counts[item] = 1

        # Sort items by count (descending order) and prioritize original recommendations if counts are equal
        sorted_items = sorted(item_counts.items(), key=lambda x: (-x[1], x[0]))

        # Take the top ten ASINs from sorted_items
        top_ten_asins = [item[0] for item in sorted_items[:10]]

        # Add user recommendations to the hybrid recommendations
        hybrid_recs[user_id] = top_ten_asins

    # Cross-validate the hybrid recommender system
    dummy_algo = DummyAlgorithm()
    cross_val_results = cross_validate(dummy_algo, data, measures=['mae', 'rmse'], cv=5, verbose=True)
    avg_mae = np.mean(cross_val_results['test_mae'])
    avg_rmse = np.mean(cross_val_results['test_rmse'])

    # Calculate MAE and RMSE on the test data
    actual_ratings = []
    predicted_ratings = []
    for user_id, items in hybrid_recs.items():
        for item in items:
            actual_rating = [rating for (uid, iid, rating, _) in data.raw_ratings if uid == user_id and iid == item]
            if actual_rating:  # Check if actual rating exists
                actual_ratings.append(actual_rating[0])
                predicted_ratings.append(5)  # Assuming all predicted ratings are 5 (can be replaced with actual predictions)

    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

    # Print MAE and RMSE
    print("MAE:", mae)
    print("RMSE:", rmse)

    return hybrid_recs, avg_mae, avg_rmse, mae, rmse

# Call the hybrid recommender system function
prediction_hybrid, avg_mae, avg_rmse, mae, rmse = hybrid_recommender_system(prediction_user_based_KNNWithMeans, pred_content_based_recommender_system, data)


NameError: name 'pred_content_based_recommender_system' is not defined