# Collaborative Filtering
Consisting of: 
- Memory-based Collaborative Filtering
- Model-based Collaborative Filtering

### Import

In [1]:
# Install the surprise package
!pip install -q -U scikit-surprise
from surprise import Dataset, Reader
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import arff
from surprise import KNNWithMeans
from surprise.dataset import DatasetAutoFolds
from surprise.model_selection import GridSearchCV
from surprise import KNNBasic
from surprise import SVD
from surprise import SVDpp
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split
import random
import re
import numpy as np
from surprise import AlgoBase
from collections import defaultdict
from surprise import Prediction
import json


### Method definition 
Here are evaluation methods defined for later use

In [2]:
def precision_recall_f1_at_k(predictions, k=5, threshold=3.5):
    """
    Return precision, recall, and F1-score at k metrics for each user.
    
    Args:
    - predictions: List of tuples (uid, iid, true_r, est, _), where:
        - uid: User ID
        - iid: Item ID
        - true_r: True rating
        - est: Estimated rating
        - _: Not used (can be any value)
    - k: Number of top items to consider
    - threshold: Threshold rating for considering an item as relevant
    
    Returns:
    - precisions: Dictionary mapping user IDs to precision@k scores
    - recalls: Dictionary mapping user IDs to recall@k scores
    - f1_scores: Dictionary mapping user IDs to F1-score@k scores
    """
    
    # Map predictions to each user
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    # Initialize precision, recall, and F1-score dictionaries
    precisions = dict()
    recalls = dict()
    f1_scores = dict()
    
    # Calculate precision, recall, and F1-score for each user
    for uid, user_ratings in user_est_true.items():
        
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        
        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        
        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        
        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold)) for (est, true_r) in user_ratings[:k])
        
        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        
        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
        
        # F1-score@K: Harmonic mean of precision and recall
        precision = precisions[uid]
        recall = recalls[uid]
        f1_scores[uid] = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
    
    # Calculate mean metrics
    mean_precision = np.mean(list(precisions.values()))
    mean_recall = np.mean(list(recalls.values()))
    mean_f1_score = np.mean(list(f1_scores.values()))
    
    # Print the metrics
    print("Precision@{}: {:.4f}".format(k, mean_precision))
    print("Recall@{}: {:.4f}".format(k, mean_recall))
    print("F1-score@{}: {:.4f}".format(k, mean_f1_score))
    
    return precisions, recalls, f1_scores


In [3]:
def precision_recall_f1(predictions, threshold=3.5):
    """
    Return precision, recall, and F1-score metrics for all recommendations.
    
    Args:
    - predictions: List of tuples (uid, iid, true_r, est, _), where:
        - uid: User ID
        - iid: Item ID
        - true_r: True rating
        - est: Estimated rating
        - _: Not used (can be any value)
    - threshold: Threshold rating for considering an item as relevant
    
    Returns:
    - precision: Precision score
    - recall: Recall score
    - f1_score: F1-score
    """
    
    # Map predictions to each user
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    # Initialize lists to store relevant and recommended items
    relevant_items = []
    recommended_items = []
    
    # Calculate precision, recall, and F1-score for each user
    for uid, user_ratings in user_est_true.items():
        
        # Extract true ratings for the user
        true_ratings = [true_r for (_, true_r) in user_ratings]
        
        # Extract estimated ratings for the user
        estimated_ratings = [est for (est, _) in user_ratings]
        
        # Identify relevant items
        relevant_items.extend([true_r >= threshold for true_r in true_ratings])
        
        # Identify recommended items
        recommended_items.extend([est >= threshold for est in estimated_ratings])
    
    # Calculate precision, recall, and F1-score
    precision = np.mean(relevant_items) if relevant_items else 0
    recall = np.mean(recommended_items) if recommended_items else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    # Print the metrics
    print("Precision: {:.4f}".format(precision))
    print("Recall: {:.4f}".format(recall))
    print("F1-score: {:.4f}".format(f1_score))
    
    return precision, recall, f1_score


In [4]:
def average_precision_at_k(recommended_items, relevant_items, k):
    """
    Calculate average precision at K
    """
    if not relevant_items:
        return 0.0
    
    relevant_set = set(relevant_items)
    score = 0.0
    num_hits = 0.0
    
    for i, item in enumerate(recommended_items[:k]):
        if item in relevant_set:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    
    return score / min(len(relevant_set), k)

def map_at_k(predictions, k=5, threshold=3.5):
    """
    Calculate mean average precision at K (MAP@K)
    
    Args:
    - predictions: List of tuples (uid, iid, true_r, est, _), where:
        - uid: User ID
        - iid: Item ID
        - true_r: True rating
        - est: Estimated rating
        - _: Not used (can be any value)
    - k: Number of top items to consider
    - threshold: Threshold rating for considering an item as relevant
    
    Returns:
    - map_k: Mean average precision at K
    """
    
    # Map predictions to each user
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((iid, est, true_r))
    
    average_precisions = []
    
    # Calculate average precision at K for each user
    for uid, user_ratings in user_est_true.items():
        
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        
        # Separate recommended and relevant items
        recommended_items = [iid for (iid, est, true_r) in user_ratings]
        relevant_items = [iid for (iid, est, true_r) in user_ratings if true_r >= threshold]
        
        # Calculate average precision at K
        avg_precision = average_precision_at_k(recommended_items, relevant_items, k)
        average_precisions.append(avg_precision)
    
    # Calculate MAP@K
    map_k = np.mean(average_precisions)
    
    print("MAP@{}: {:.4f}".format(k, map_k))
    
    return map_k

In [5]:
def serialize_predictions(predictions, filename):
    """
    Serialize a list of Prediction objects to JSON and save it to a file.

    Args:
    - predictions (list): List of Prediction objects.
    - filename (str): Name of the file to save the serialized predictions.

    Returns:
    - None
    """
    serialized_predictions = []
    for pred in predictions:
        serialized_prediction = {
            "uid": pred.uid,
            "iid": pred.iid,
            "r_ui": pred.r_ui,
            "est": pred.est,
            "details": pred.details
        }
        serialized_predictions.append(serialized_prediction)

    with open(filename, 'w') as json_file:
        json.dump(serialized_predictions, json_file)

In [6]:
from collections import defaultdict
from surprise import Prediction

def get_top_predictions(predictions, user_id, top_n=5):
    """
    Generates predictions using Surprise and prints the top n most recommended books for a specific user.

    Parameters:
    - predictions (list of Prediction objects): List of predictions generated by Surprise.
    - user_id (str): ID of the user for whom recommendations are generated.
    - top_n (int): Number of top recommendations to generate. Default is 10.

    Returns:
    - recommendations (list): List of the top N most recommended books for the user.
    """
    # Filter predictions for the specified user
    user_predictions = [pred for pred in predictions if pred.uid == user_id]

    if not user_predictions:
        print(f"No predictions found for user {user_id}")
        return []

    # Sort predictions for the user by estimated rating (descending order)
    user_predictions.sort(key=lambda x: x.est, reverse=True)

    # Extract top N recommended books for the user
    top_books = [pred.iid for pred in user_predictions[:top_n]]
    print(f"User {user_id}: Top {top_n} Recommended Books - {top_books}")

    return top_books

In [7]:
def count_users_by_predictions(predictions):
    """
    Count the number of users with a specific number of predictions.

    Parameters:
    - predictions (list of Prediction objects): List of prediction objects.

    Returns:
    - user_prediction_count (dict): A dictionary where the keys are the number of predictions
      and the values are the count of users with that many predictions.
    """
    # Initialize a defaultdict to count occurrences of each uid
    uid_count = defaultdict(int)

    # Loop through the predictions and count uids
    for prediction in predictions:
        uid_count[prediction.uid] += 1

    # Initialize a defaultdict to count the number of users with a specific number of predictions
    user_prediction_count = defaultdict(int)

    # Loop through uid_count to populate user_prediction_count
    for count in uid_count.values():
        user_prediction_count[count] += 1

    # Convert to a regular dictionary for cleaner output
    user_prediction_count = dict(user_prediction_count)

    # Sort the user_prediction_count dictionary by the number of predictions in descending order
    sorted_user_prediction_count = sorted(user_prediction_count.items(), key=lambda x: x[0], reverse=True)

    # Print the sorted user prediction counts
    for num_predictions, num_users in sorted_user_prediction_count:
        print(f"{num_users} users have {num_predictions} predictions")

    #return user_prediction_count

### Data Preparation

In [8]:
path = os.path.expanduser('../data/data_kindle_preprocessed.xlsx')
data_preprocessed = pd.read_excel(path, index_col=[0], dtype={'publication_year': str})

In [9]:
reader = Reader(rating_scale=(1,5)) # rating scale range

# Load the dataset from DataFrame
data = Dataset.load_from_df(data_preprocessed[["reviewerID", "asin", "rating"]], reader)

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)


## Collaborative Filtering Recommender System (Memory-Based)

### 1. Item-based collaborative filtering with KNNWithMeans

In [10]:
def item_based_KNNWithMeans_recommender_system(trainset, testset, data):

    # Define the parameter grid with an increased range of k and more CV folds
    param_grid = {'k': [1, 3, 10, 15, 20, 25, 30, 35, 40],
                  'sim_options': {'name': ['cosine', 'pearson'],
                                  'user_based': [False]}
                  }
    
    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)
    
    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = KNNWithMeans(k=gs.best_params['rmse']['k'], sim_options=gs.best_params['rmse']['sim_options'])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)
    serialize_predictions(test_pred, '../data/item_based_KNNWithMeans_recommender_system.json')

    # Calculate evaluation metrics
    print("Item-based Model with KNNWithMeans: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    precision_k, recall_k, f1_score_k = precision_recall_f1_at_k(test_pred)
    precision, recall, f1_score = precision_recall_f1(test_pred)
    
    return test_pred

### 2. Item-based collaborative filtering with KNNBasic

In [11]:
def item_based_KNNBasic_recommender_system(trainset, testset, data):

    # Define the parameter grid with an increased range of k and more CV folds
    param_grid = {'k': [1, 3, 10, 15, 20, 25, 30, 35, 40],
                  'sim_options': {'name': ['cosine', 'pearson'],
                                  'user_based': [False]}
                  }
    
    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)
    
    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = KNNBasic(k=gs.best_params['rmse']['k'], sim_options=gs.best_params['rmse']['sim_options'])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)
    serialize_predictions(test_pred, '../data/item_based_KNNBasic_recommender_system.json')

    # Calculate evaluation metrics
    print("Item-based Model with KNNBasic: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    precision_k, recall_k, f1_score_k = precision_recall_f1_at_k(test_pred)
    precision, recall, f1_score = precision_recall_f1(test_pred)
    
    return test_pred

### 3. User-Based Collaborative Filtering with KNNWithMeans

In [12]:
def user_based_KNNWithMeans_recommender_system(trainset, testset, data):
    
    param_grid = {'k': [1, 3, 10, 15, 20, 25, 30, 35, 40],
                  'sim_options': {'name': ['pearson', 'cosine'],
                                  'user_based': [True]}
                 }


    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)
    
    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = KNNWithMeans(k=gs.best_params['rmse']['k'], sim_options=gs.best_params['rmse']['sim_options'])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)
    serialize_predictions(test_pred, '../data/user_based_KNNWithMeans_recommender_system.json')

    # Calculate evaluation metrics
    print("User-based Model with KNNWithMeans: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    precision_k, recall_k, f1_score_k = precision_recall_f1_at_k(test_pred)
    precision, recall, f1_score = precision_recall_f1(test_pred)
    
    return test_pred

### 4. User-Based Collaborative Filtering with KNNBasic

In [13]:
def user_based_KNNBasic_recommender_system(trainset, testset, data):
    
    param_grid = {'k': [1, 3, 10, 15, 20, 25, 30, 35, 40],
              'sim_options': {'name': ['pearson', 'cosine'],
                              'user_based': [True]}
              }

    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)

    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = KNNBasic(k=gs.best_params['rmse']['k'], sim_options=gs.best_params['rmse']['sim_options'])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)
    serialize_predictions(test_pred, '../data/user_based_KNNBasic_recommender_system.json')


    # Calculate evaluation metrics
    print("User-based Model with KNNBasic: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    precision_k, recall_k, f1_score_k = precision_recall_f1_at_k(test_pred)
    precision, recall, f1_score = precision_recall_f1(test_pred)
    
    return test_pred

## Model-Based Recommender System

### Singular Value Decomposition (SVD)
 SVD is a matrix factorization technique commonly used in collaborative filtering-based recommender systems. 

In [14]:
def SVD_recommender_system(trainset, testset, data):
    """
    Trains an SVD-based recommender system using grid search for hyperparameter tuning,
    and evaluates it on the provided test set.
    
    Parameters:
    - trainset (surprise.Trainset): Training set for the recommender system.
    - testset (list of tuples): Test set for evaluation.
    - data (surprise.Dataset): Full dataset for grid search.
    
    Returns:
    - test_pred (list of Prediction objects): Predictions on the test set.
    """
    
    # Define the parameter grid for grid search
    param_grid = {
        "n_factors": [25, 30, 40, 55, 60, 75, 90, 100, 110],  # Number of latent factors in SVD model
        "n_epochs": [10, 20, 30], # Number of training epochs
        "lr_all": [0.005, 0.025, 0.125], # Learning rate for all parameters
        "reg_all": [0.08, 0.16, 0.32], # Regularization term for all parameters
        "random_state": [0], # Seed for reproducibility
    }

    # Perform grid search
    gs = GridSearchCV(SVD, 
                      param_grid, 
                      measures=["rmse", "mae"], 
                      cv=5, 
                      refit=True, # Refit on the entire dataset using best params
                      n_jobs=-1,  # Use all available CPUs for parallel processing
                      joblib_verbose=2 # Verbosity level for parallel processing
                     )
    gs.fit(data)

    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    params = gs.best_params['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = SVD(n_epochs = params['n_epochs'], 
               lr_all = params['lr_all'], 
               n_factors = params['n_factors'],
               reg_all = params['reg_all'],
               random_state = params["random_state"])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)
    serialize_predictions(test_pred, '../data/SVD_recommender_system.json')


    # Calculate evaluation metrics
    print("SVD: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    precision_k, recall_k, f1_score_k = precision_recall_f1_at_k(test_pred)
    precision, recall, f1_score = precision_recall_f1(test_pred)
    
    return test_pred


### Singular Value Decomposition Plus Plus (SVD++)
SVD++ extends the basic SVD approach to include implicit feedback and user biases.

In [15]:
def SVDpp_recommender_system(trainset, testset, data):
    """
    Trains an SVD++-based recommender system using grid search for hyperparameter tuning,
    and evaluates it on the provided test set.

    Parameters:
    - trainset (surprise.Trainset): Training set for the recommender system.
    - testset (list of tuples): Test set for evaluation.
    - data (surprise.Dataset): Full dataset for grid search.

    Returns:
    - test_pred (list of Prediction objects): Predictions on the test set.
    """
    
    # Define the parameter grid for grid search
    param_grid = {
        "n_factors": [25, 30, 40, 55, 60, 75, 90, 100, 110],  # Number of latent factors in SVD model
        "n_epochs": [10, 20, 30], # Number of training epochs
        "lr_all": [0.005, 0.025, 0.125], # Learning rate for all parameters
        "reg_all": [0.08, 0.16, 0.32], # Regularization term for all parameters
        "random_state": [0], # Seed for reproducibility
    }

    # Perform grid search
    gs = GridSearchCV(SVDpp, 
                      param_grid, 
                      measures=["rmse", "mae"], 
                      cv=5, 
                      refit=True, # Refit on the entire dataset using best params
                      n_jobs=-1,  # Use all available CPUs for parallel processing
                      joblib_verbose=2 # Verbosity level for parallel processing
                     )
    gs.fit(data)

    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    params = gs.best_params['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = SVDpp(n_epochs = params['n_epochs'], 
               lr_all = params['lr_all'], 
               n_factors = params['n_factors'],
               reg_all = params['reg_all'],
               random_state = params["random_state"])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)
    serialize_predictions(test_pred, '../data/SVDpp_recommender_system.json')


    # Calculate evaluation metrics
    print("SVD++: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    precision_k, recall_k, f1_score_k = precision_recall_f1_at_k(test_pred)
    precision, recall, f1_score = precision_recall_f1(test_pred)
    
    return test_pred


## Run predictions

### 1. Item-based collaborative filtering with KNNWithMeans

In [16]:
prediction_item_based_KNNWithMeans = item_based_KNNWithMeans_recommender_system(trainset, testset, data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computi

In [17]:
top_item_based_with_KNNWithMeans = get_top_predictions(prediction_item_based_KNNWithMeans, 'A143XY6COA69A')
#count_users_by_predictions(prediction_item_based_KNNWithMeans)

User A143XY6COA69A: Top 5 Recommended Books - ['B00MS9DVPM', 'B00FA2B9S0', 'B0096VSP60', 'B00HFXK4W4', 'B00ENTIAJM']


In [19]:
map_k = map_at_k(prediction_item_based_KNNWithMeans)

MAP@5: 0.8587


### 2. Item-based collaborative filtering with KNNBasic

In [20]:
prediction_item_based_KNNBasic = item_based_KNNBasic_recommender_system(trainset, testset, data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computi

In [21]:
top_item_based_with_KNNBasic = get_top_predictions(prediction_item_based_KNNBasic, 'A143XY6COA69A')
#count_users_by_predictions(prediction_item_based_KNNBasic)

User A143XY6COA69A: Top 5 Recommended Books - ['B00UXYQ9LM', 'B00FA2B9S0', 'B00ENTIAJM', 'B00MS9DVPM', 'B00RY2IAMM']


In [22]:
map_k = map_at_k(prediction_item_based_KNNBasic)

MAP@5: 0.8386


### 3. User-Based Collaborative Filtering with KNNWithMeans

In [23]:
prediction_user_based_KNNWithMeans = user_based_KNNWithMeans_recommender_system(trainset, testset, data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Comp

In [24]:
top_user_based_with_KNNWithMeans = get_top_predictions(prediction_user_based_KNNWithMeans, 'A143XY6COA69A')
#count_users_by_predictions(prediction_user_based_KNNWithMeans)

User A143XY6COA69A: Top 5 Recommended Books - ['B0096VSP60', 'B00FA2B9S0', 'B00MS9DVPM', 'B00DGI5VDE', 'B00HFXK4W4']


In [25]:
map_k = map_at_k(prediction_user_based_KNNWithMeans)

MAP@5: 0.8546


### 4. User-Based Collaborative Filtering with KNNBasic

In [26]:
prediction_user_based_KNNBasic = user_based_KNNBasic_recommender_system(trainset, testset, data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Comp

In [27]:
top_user_based_with_KNNBasic = get_top_predictions(prediction_user_based_KNNBasic, 'A143XY6COA69A')
#count_users_by_predictions(prediction_user_based_KNNBasic)

User A143XY6COA69A: Top 5 Recommended Books - ['B0096VSP60', 'B00MS9DVPM', 'B00FA2B9S0', 'B00DGI5VDE', 'B00HFXK4W4']


In [28]:
map_k = map_at_k(prediction_user_based_KNNBasic)

MAP@5: 0.8436


### Singular Value Decomposition (SVD)

In [29]:
prediction_SVD = SVD_recommender_system(trainset, testset, data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   39.8s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1215 out of 1215 | elapsed:  2.9min finished


Best RMSE: 0.7254860294806246
Best Parameters: {'n_factors': 30, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.08, 'random_state': 0}
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7177  0.7293  0.7396  0.7064  0.7248  0.7236  0.0112  
MAE (testset)     0.5209  0.5186  0.5264  0.5041  0.5180  0.5176  0.0074  
Fit time          0.89    0.77    0.50    0.62    0.69    0.69    0.13    
Test time         0.25    0.11    0.07    0.10    0.08    0.12    0.07    
SVD: Test Set
RMSE: 0.7323
MAE:  0.5224
Precision@5: 0.8857
Recall@5: 0.6264
F1-score@5: 0.6826
Precision: 0.8302
Recall: 0.9116
F1-score: 0.8690


In [30]:
top_SVD = get_top_predictions(prediction_SVD, 'A143XY6COA69A')
#count_users_by_predictions(prediction_SVD)

User A143XY6COA69A: Top 5 Recommended Books - ['B00MS9DVPM', 'B00FA2B9S0', 'B0096VSP60', 'B00DGI5VDE', 'B00RY2IAMM']


In [31]:
map_k = map_at_k(prediction_SVD)

MAP@5: 0.8608


### Singular Value Decomposition Plus Plus (SVD++)

In [32]:
prediction_SVDpp = SVDpp_recommender_system(trainset, testset, data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 32.0min
[Parallel(n_jobs=-1)]: Done 1215 out of 1215 | elapsed: 45.6min finished


Best RMSE: 0.7245743998745633
Best Parameters: {'n_factors': 25, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.08, 'random_state': 0}
Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7326  0.7203  0.7181  0.7259  0.7267  0.7247  0.0051  
MAE (testset)     0.5218  0.5165  0.5126  0.5200  0.5191  0.5180  0.0032  
Fit time          4.82    5.01    4.14    4.91    6.53    5.08    0.79    
Test time         0.63    0.56    0.51    0.84    0.67    0.64    0.11    
SVD++: Test Set
RMSE: 0.7323
MAE:  0.5221
Precision@5: 0.8843
Recall@5: 0.6266
F1-score@5: 0.6819
Precision: 0.8302
Recall: 0.9121
F1-score: 0.8692


In [33]:
top_SVDpp = get_top_predictions(prediction_SVDpp, 'A143XY6COA69A')
#count_users_by_predictions(prediction_SVDpp)

User A143XY6COA69A: Top 5 Recommended Books - ['B00FA2B9S0', 'B00MS9DVPM', 'B0096VSP60', 'B00HFXK4W4', 'B00DGI5VDE']


In [34]:
map_k = map_at_k(prediction_SVDpp)

MAP@5: 0.8615
