# Evaluator Module
The Evaluator module creates evaluation reports.

Each report contains the evaluation metrics computed for the models specified in the evaluation config.

In [None]:
# reloads modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2

# third parties imports
import numpy as np 
import pandas as pd
# -- add new imports here --

# local imports
from configs import EvalConfig
from constants import Constant as C
from loaders import export_evaluation_report
from loaders import load_ratings
from surprise import model_selection, accuracy
from models import ModelBaseline1, ModelBaseline2, ModelBaseline3, ModelBaseline4, get_top_n
# -- add new imports here --

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 1. Model validation functions
Validation functions provide a way to perform cross-validation on recommender system models.

In [None]:
def generate_split_predictions(algo, ratings_dataset, eval_config):
    """Generate predictions on a random test set specified in eval_config.

    Parameters:
    - algo: model exposing ``fit(trainset)`` and ``test(testset)``
    - ratings_dataset: ratings dataset handed to ``model_selection.train_test_split``
    - eval_config: evaluation configuration; only ``test_size`` is read here

    Returns:
    - predictions: whatever ``algo.test`` returns for the held-out test set
    """
    trainset, testset = model_selection.train_test_split(
        ratings_dataset, test_size=eval_config.test_size
    )
    algo.fit(trainset)
    return algo.test(testset)


def generate_loo_top_n(algo, ratings_dataset, eval_config):
    """Generate top-n recommendations for each user on a random Leave-one-out split (LOO).

    Parameters:
    - algo: model exposing ``fit(trainset)`` and ``test(testset)``
    - ratings_dataset: ratings dataset handed to ``LeaveOneOut.split``
    - eval_config: evaluation configuration; only ``top_n_value`` is read here

    Returns:
    - anti_testset_top_n: result of ``get_top_n`` applied to the predictions
      made on the anti-testset of the LOO training set
    - testset: the test part of the single LOO split
    """
    n = eval_config.top_n_value
    loo = model_selection.LeaveOneOut(n_splits=1)
    # n_splits=1, so only the first (and only) split is consumed.
    trainset, testset = next(loo.split(ratings_dataset))
    anti_testset = trainset.build_anti_testset()

    algo.fit(trainset)
    predictions = algo.test(anti_testset)
    return get_top_n(predictions, n), testset


def generate_full_top_n(algo, ratings_dataset, eval_config):
    """Generate top-n recommendations for each user with the full training set.

    Parameters:
    - algo: model exposing ``fit(trainset)`` and ``test(testset)``
    - ratings_dataset: dataset exposing ``build_full_trainset()``
    - eval_config: evaluation configuration; only ``top_n_value`` is read here

    Returns:
    - anti_testset_top_n: result of ``get_top_n`` applied to the predictions
      made on the anti-testset of the full training set
    """
    n = eval_config.top_n_value
    trainset = ratings_dataset.build_full_trainset()
    anti_testset = trainset.build_anti_testset()

    algo.fit(trainset)
    predictions = algo.test(anti_testset)
    return get_top_n(predictions, n)


def precompute_information():
    """Return a dictionary that precomputes relevant information for evaluating in full mode.

    Dictionary keys:
    - precomputed_dict["item_to_rank"] : contains a dictionary mapping movie ids to rankings
    - (-- for your project, add other relevant information here -- )
    """
    ratings_dataset = load_ratings(False)

    # Count the number of ratings received by each item.
    item_counts = ratings_dataset[C.ITEM_ID_COL].value_counts()

    # Rank items by popularity (1 = most rated); tied items share the lowest rank of their group.
    item_to_rank = item_counts.rank(ascending=False, method="min").to_dict()

    return {"item_to_rank": item_to_rank}


def create_evaluation_report(eval_config, sp_ratings, precomputed_dict, available_metrics):
    """Build a DataFrame scoring every configured model on the configured metrics.

    Parameters:
    - eval_config: object exposing ``models`` (iterable of (name, factory, kwargs)),
      ``split_metrics``, ``loo_metrics`` and ``full_metrics``
    - sp_ratings: ratings dataset forwarded to the three generate_* functions
    - precomputed_dict: keyword arguments forwarded to every full metric
    - available_metrics: mapping with the keys 'split', 'loo' and 'full'; each
      maps a metric name to a (function, keyword arguments) pair

    Returns:
    - a DataFrame with one row per model and one column per metric
    """
    results_by_model = {}
    for model_name, model_factory, model_kwargs in eval_config.models:
        print(f'Handling model {model_name}')
        algo = model_factory(**model_kwargs)
        scores = {}
        results_by_model[model_name] = scores

        # Type 1: metrics computed on the predictions of a random train/test split
        if len(eval_config.split_metrics) > 0:
            print('Training split predictions')
            predictions = generate_split_predictions(algo, sp_ratings, eval_config)
            for metric in eval_config.split_metrics:
                print(f'- computing metric {metric}')
                registry = available_metrics['split']
                assert metric in registry
                metric_fn, metric_kwargs = registry[metric]
                scores[metric] = metric_fn(predictions, **metric_kwargs)

        # Type 2: metrics computed on the top-n of a leave-one-out split
        if len(eval_config.loo_metrics) > 0:
            print('Training loo predictions')
            loo_top_n, loo_testset = generate_loo_top_n(algo, sp_ratings, eval_config)
            for metric in eval_config.loo_metrics:
                registry = available_metrics['loo']
                assert metric in registry
                metric_fn, metric_kwargs = registry[metric]
                scores[metric] = metric_fn(loo_top_n, loo_testset, **metric_kwargs)

        # Type 3: metrics computed on the top-n obtained from the full training set
        if len(eval_config.full_metrics) > 0:
            print('Training full predictions')
            full_top_n = generate_full_top_n(algo, sp_ratings, eval_config)
            for metric in eval_config.full_metrics:
                registry = available_metrics['full']
                assert metric in registry
                metric_fn, metric_kwargs = registry[metric]
                scores[metric] = metric_fn(full_top_n, **precomputed_dict, **metric_kwargs)

    # from_dict puts models in columns; transpose so that each model is a row.
    return pd.DataFrame.from_dict(results_by_model).transpose()

# 2. Evaluation metrics
Implement evaluation metrics either for rating predictions (split metrics) or for top-n recommendations (LOO metrics and full metrics).

In [None]:
def get_hit_rate(anti_testset_top_n, testset):
    """
    Compute the average hit rate over the users (LOO metric).

    A hit (1) happens when the movie in the testset has been picked by the top-n recommender.
    A fail (0) happens when the movie in the testset has not been picked by the top-n recommender.

    Parameters:
    - anti_testset_top_n: dict, top-n recommendations for each user (user_id -> list of (movie_id, score))
    - testset: list of tuples, each tuple is (user_id, movie_id, rating_value)

    Returns:
    - hit_rate: float, the proportion of testset entries whose movie is in the
      top-n recommendations of the corresponding user (0.0 for an empty testset).
    """
    # One set of recommended movie ids per user gives constant-time membership tests
    # and works when there are no recommendations at all.
    recommended_by_user = {
        user_id: {movie_id for movie_id, _score in recommendations}
        for user_id, recommendations in anti_testset_top_n.items()
    }

    testset = list(testset)
    if not testset:
        return 0.0

    # Each testset entry counts at most once, so the rate can never exceed 1.
    hits = sum(
        1
        for user_id, movie_id, _rating in testset
        if movie_id in recommended_by_user.get(user_id, ())
    )
    return hits / len(testset)


def get_novelty(anti_testset_top_n, item_to_rank):
    """Compute the average novelty of the top-n recommendation over the users (full metric)

    The novelty is defined as the average ranking of the movies recommended.
    The mean is taken over all recommended (user, movie) pairs; movies that are
    missing from item_to_rank are ignored.

    **Limitations**
    This metric is sensitive to the distribution of the data: a rank does not
    capture how large the popularity gap between two movies is.

    Parameters:
    - anti_testset_top_n: dict, top-n recommendations for each user (user_id -> list of (movie_id, score))
    - item_to_rank: dict mapping movie ids to rankings

    Returns:
    - the mean rank of the recommended movies, or 0 when nothing is recommended.
    """
    recommended_ids = [
        movie_id
        for recommendations in anti_testset_top_n.values()
        for movie_id, _score in recommendations
    ]
    # Guard first: without any recommendation there is nothing to look up.
    if not recommended_ids:
        return 0

    # Movies absent from item_to_rank map to NaN, which mean() skips.
    ranks = pd.Series(recommended_ids).map(item_to_rank)
    return ranks.mean()

# 3. Evaluation workflow
Load data, evaluate models and save the experimental outcomes

In [None]:

# Registry of the metrics that can be requested in an evaluation config.
# Layout: evaluation type -> metric name -> (metric function, extra keyword arguments).
AVAILABLE_METRICS = {
    "split": {
        "mae": (accuracy.mae, {'verbose': False}),
        # -- add new split metrics here --
        "rmse":(accuracy.rmse, {'verbose': False}),
    },
    "loo" : {
        "hitrate":(get_hit_rate, {})
    },
    "full" : {
        "novelty":(get_novelty, {})
    }
    
    # -- add new types of metrics here --
}

# NOTE(review): ratingsdata is not used again in this cell, and sp_ratings below
# appears to load the same data a second time — confirm whether this call is needed.
ratingsdata = load_ratings(True)

# Load the ratings, precompute the information needed by the full metrics,
# evaluate every configured model, then display and export the report.
sp_ratings = load_ratings(surprise_format=True)
precomputed_dict = precompute_information()
evaluation_report = create_evaluation_report(EvalConfig, sp_ratings, precomputed_dict, AVAILABLE_METRICS)
display(evaluation_report)

export_evaluation_report(evaluation_report)

# The string below records the author's observations (written in French).
# English summary: the algorithm with the lowest novelty is number 4 (SVD), which
# therefore recommends more popular movies on average; it also has a lower MAE and
# RMSE than the other algorithms, which the author reads as it being more reliable.
'''
Observations :

L'algorithme qui présente la novelty la plus basse est le 4 (SVD), 
qui recommande donc des films plus populaires en moyenne.

Elle se démarque également par une erreur moyenne absolue et une erreur quadratique moyenne plus basses que les autres, 
la rendant plus fiable que les autres algorithmes.
'''






Handling model baseline_1
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
Handling model baseline_2
Training split predictions
- computing metric mae
- computing metric rmse
Training loo predictions
Training full predictions
