In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import ndcg_score

from collections import defaultdict

from tqdm import tqdm

In [142]:
root = '../KuaiRec 2.0/'

# joined_train_data = pd.read_csv(root + 'data_exports/joined_train_data.csv')

# Get predicted validation scores and actual validation scores
prediction_scores = pd.read_csv(root + 'results/w_clustering_batch_size512_num_epochs30_lr0.001_embedding_dim64_dropout0.3_decay0.01.csv')
# joined_val_data = pd.read_csv(root + 'data_exports/joined_val_data.csv')

# video_data = pd.read_csv(root + 'data/kuairec_caption_category_translated.csv', index_col=0)

## Get user watch history

We want to be able to filter out videos that the user has already watched. This is so that we recommend new videos instead.

In [3]:
def get_user_watch_history(data):
    """
    Args: 
        data: DataFrame of user watch history. Must contain columns 'user_id' and 'video_id'.
        
    Returns:
        A dictionary with user_id as key and a set of video_ids that the user has watched as value.
    """
    watch_history_dict = defaultdict(set)
    for user in data['user_id'].unique():
        watch_history_dict[user] = set(data[data['user_id'] == user]['video_id'])
    return watch_history_dict

In [4]:
user_watch_history = get_user_watch_history(joined_train_data)

## Getting ground truth videos for each user

In [5]:
def get_ground_truth(ground_truth_df, valid_users, valid_videos, user_watch_history):
    """
    Args:
        ground_truth_df: DataFrame with the ground truth watch ratios.
        videos_in_train_data: List of video_ids that are present in the training data.
        user_watch_history: Dictionary with user_id as key and a list of video_ids that the user has watched as value.
    
    Returns:
        DataFrame with the ground truth watch ratios. It only contains videos that are present in training data and that the user has not watched before.
        Users that are not in the training data are filtered out as well, as we cannot make recommendations for them.
        The dataframe is sorted by user in ascending order and watch_ratio in descending order.
    """
    ground_truth_new = pd.DataFrame(columns=['user_id', 'video_id', 'watch_ratio'])

    for user in ground_truth_df['user_id'].unique():
        if user not in valid_users:
            continue
        user_ground_truth = ground_truth_df[ground_truth_df['user_id'] == user].copy()
        user_ground_truth = user_ground_truth[~user_ground_truth['video_id'].isin(user_watch_history[user])]
        user_ground_truth = user_ground_truth[user_ground_truth['video_id'].isin(valid_videos)]

        ground_truth_new = pd.concat([ground_truth_new, user_ground_truth])

    # Sort by watch_ratio in descending order
    ground_truth_new = ground_truth_new.sort_values(by=['user_id', 'watch_ratio'], ascending=[True, False])
    return ground_truth_new

In [6]:
users_in_train_data = set(joined_train_data['user_id'])
videos_in_train_data = set(joined_train_data['video_id'])

ground_truth = get_ground_truth(joined_val_data[['user_id', 'video_id', 'watch_ratio']], users_in_train_data, videos_in_train_data, user_watch_history)

In [7]:
# Ground truth scores for user 14
ground_truth[ground_truth['user_id'] == 14]

Unnamed: 0,user_id,video_id,watch_ratio
11,14,8766,3.318871
702,14,8799,3.185954
607,14,2735,2.598506
602,14,4201,2.478148
573,14,4015,2.319912
...,...,...,...
131,14,7297,0.032396
991,14,4021,0.032293
180,14,4141,0.032250
61,14,7461,0.029277


## Getting recommendations for each user

In [8]:
def get_user_recommendations(prediction_scores, user_watch_history):
    """
    Args:
        prediction_scores: DataFrame with the predicted watch_ratios.
        user_watch_history: Dictionary with user_id as key and a list of video_ids that the user has watched as value.
    
    Returns:
        DataFrame with the recommendations for a specific user. It only contains videos that the user has not watched before.
        The dataframe is sorted by user in ascending order and watch_ratio in descending order.
    """
    prediction_scores = prediction_scores.rename(columns={'watch_ratio': 'predicted_watch_ratio'})

    recommendations_new = pd.DataFrame(columns=['user_id', 'video_id', 'predicted_watch_ratio'])

    for user in prediction_scores['user_id'].unique():
        user_recommendations = prediction_scores[prediction_scores['user_id'] == user]
        user_recommendations = user_recommendations[~user_recommendations['video_id'].isin(user_watch_history[user])]
        
        recommendations_new = pd.concat([recommendations_new, user_recommendations])

    # Sort predicted_watch_ratio in descending order
    recommendations_new = recommendations_new.sort_values(by=['user_id', 'predicted_watch_ratio'], ascending=[True, False])
    return recommendations_new

In [143]:
# videos_in_val_data = set(joined_val_data['video_id'])

recommendations = get_user_recommendations(prediction_scores, user_watch_history)

In [144]:
# Recommendations for user 14
recommendations[recommendations['user_id'] == 14]

Unnamed: 0,user_id,video_id,predicted_watch_ratio,cluster
444,14,8799,1.254961e+00,0.0
188,14,8814,1.218694e+00,0.0
359,14,5952,1.109251e+00,0.0
69,14,10095,1.067484e+00,0.0
13,14,8819,1.048624e+00,0.0
...,...,...,...,...
197,14,3978,9.120815e-07,0.0
468,14,9908,7.757050e-07,0.0
114,14,2662,5.595499e-07,0.0
495,14,4021,3.743697e-07,0.0


In [11]:
def get_top_k_for_user(k, user_id, df):
    """
    Args:
        k: The number of recommendations to return.
        user_id: The user for which to get recommendations.
        df: DataFrame containing the scores for all users, sorted by score in descending order.
    
    Returns:
        DataFrame with the top k scores.
    """
    return df[df['user_id'] == user_id].head(k)

In [145]:
k = 50

# Get top 50 ground truth and recommendations for user 14
top_50_ground_truth_user_14 = get_top_k_for_user(k, 14, ground_truth)
top_50_recommendations_user_14 = get_top_k_for_user(k, 14, recommendations)

# top_50_ground_truth_user_14
top_50_recommendations_user_14

Unnamed: 0,user_id,video_id,predicted_watch_ratio,cluster
444,14,8799,1.254961,0.0
188,14,8814,1.218694,0.0
359,14,5952,1.109251,0.0
69,14,10095,1.067484,0.0
13,14,8819,1.048624,0.0
141,14,7343,1.047345,0.0
272,14,7336,0.995488,0.0
374,14,996,0.990358,0.0
39,14,8834,0.988275,0.0
94,14,4282,0.952604,0.0


## Evaluation Metrics

### Category-Aware NDCG@k

In [13]:
def get_category_tally_at_k(recommendations, video_info):
    """
    Args:
        recommendations: DataFrame with the top k recommendations for a specific user.
        video_info: DataFrame with information about the videos.

    Returns:
        Dictionary with the category as key and the number of videos in each category as value.
    """
    tally = defaultdict(int)

    for video_id in recommendations['video_id']:
        category = video_info.loc[str(video_id)]['english_first_level_category_name']
        tally[category] += 1
    
    return tally

def get_category_ndcg_at_k(recommendations, ground_truth, video_info):
    """
    Args:
        recommendations: DataFrame with the top k video recommendations for a specific user.
        ground_truth: DataFrame with the ground truth videos for a specific user.
        video_info: DataFrame with information about the videos.

    Returns:
        NDCG score for the categories of the top k recommendations.
    """
    cat_tally_reco = get_category_tally_at_k(recommendations, video_info)
    cat_tally_gt = get_category_tally_at_k(ground_truth, video_info)

    cat_tally_reco_adjusted = {}
    for category in cat_tally_gt:
        cat_tally_reco_adjusted[category] = cat_tally_reco.get(category, 0)

    return ndcg_score([list(cat_tally_gt.values())], [list(cat_tally_reco_adjusted.values())])

In [146]:
# Get the category-aware NDCG@50 for user 14
get_category_ndcg_at_k(top_50_recommendations_user_14, top_50_ground_truth_user_14, video_data)

0.8157117834834173

In [15]:
def get_average_ndcg_at_k(k, ground_truth, recommendations, video_info, by_cluster):
    """
    Args:
        k: The number of recommendations to return.
        ground_truth: DataFrame with the ground truth watch ratios, sorted by descending watch_ratio.
        recommendations: DataFrame with all video recommendations, sorted by descending predicted_watch_ratio.
        video_info: DataFrame with information about the videos.
        by_cluster: Boolean indicating whether to calculate the average ndcg@k per cluster.

    Returns:
        The average category-aware NDCG@k for all users, and a dictionary with the NDCG@k per cluster.
    """
    all_ndcg_scores = []

    if by_cluster:
        cluster_scores = {}
        ground_truth_users = set(ground_truth['user_id'].unique())
        
        for cluster in sorted(recommendations['cluster'].unique()):
            cluster_recommendations = recommendations[recommendations['cluster'] == cluster]
            users_in_cluster = set(cluster_recommendations['user_id'].unique())
            
            # Filter users in both ground truth and the current cluster's recommendations
            valid_users = ground_truth_users.intersection(users_in_cluster)
            cluster_ndcg_scores = []

            for user_id in tqdm(valid_users):
                user_recommendations_top_k = get_top_k_for_user(k, user_id, cluster_recommendations)
                user_ground_truth_top_k = get_top_k_for_user(k, user_id, ground_truth)

                user_ndcg_score = get_category_ndcg_at_k(user_recommendations_top_k, user_ground_truth_top_k, video_info)

                cluster_ndcg_scores.append(user_ndcg_score)
                all_ndcg_scores.append(user_ndcg_score)

            # Store the mean NDCG score per cluster
            cluster_scores[cluster] = np.mean(cluster_ndcg_scores) if cluster_ndcg_scores else 0

    else:
        for user_id in tqdm(ground_truth['user_id'].unique()):
            user_recommendations_top_k = get_top_k_for_user(k, user_id, recommendations)
            user_ground_truth_top_k = get_top_k_for_user(k, user_id, ground_truth)

            user_ndcg_score = get_category_ndcg_at_k(user_recommendations_top_k, user_ground_truth_top_k, video_info)

            all_ndcg_scores.append(user_ndcg_score)
        cluster_scores = None

    return np.mean(all_ndcg_scores), cluster_scores

### Distinct Categories @ k

In [16]:
def get_user_distinct_categories_at_k(k, user_id, recommendations):
    """
    Args:
        k: The number of recommendations to return.
        user_id: The user for which to get recommendations.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        
    Returns:
        The number of distinct categories in the top k recommendations.
    """
    top_k = get_top_k_for_user(k, user_id, recommendations)
    categories = set()

    for video_id in top_k['video_id']:
        category = video_data.loc[str(video_id)]['english_first_level_category_name']
        categories.add(category)
    
    return len(categories)

In [147]:
# Get Distinct Categories @ 50 for user 14
get_user_distinct_categories_at_k(50, 14, recommendations)

21

In [18]:
def get_average_distinct_categories_at_k(k, recommendations, by_cluster):
    """
    Args:
        k: The number of recommendations to return.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        by_cluster: Boolean indicating whether to calculate the average distinct categories per cluster.
    
    Returns:
        The overall average number of distinct categories in the top k recommendations, 
        and a dictionary with the average number of distinct categories per cluster (if by_cluster is True).
    """
    all_distinct_categories = []

    if by_cluster:
        cluster_distinct_categories = {}
        users_in_val = set(joined_val_data['user_id'])

        for cluster in sorted(recommendations['cluster'].unique()):
            cluster_distinct_categories_list = []
            
            # Get users in the current cluster
            users_in_cluster = set(recommendations[recommendations['cluster'] == cluster]['user_id'])
            valid_users = users_in_val.intersection(users_in_cluster)

            for user_id in tqdm(valid_users):
                user_distinct_categories = get_user_distinct_categories_at_k(k, user_id, recommendations)
                cluster_distinct_categories_list.append(user_distinct_categories)
                all_distinct_categories.append(user_distinct_categories)

            cluster_distinct_categories[cluster] = np.mean(cluster_distinct_categories_list) if cluster_distinct_categories_list else 0
    else:
        for user_id in tqdm(recommendations['user_id'].unique()):
            user_distinct_categories = get_user_distinct_categories_at_k(k, user_id, recommendations)
            all_distinct_categories.append(user_distinct_categories)
        cluster_distinct_categories = None

    return np.mean(all_distinct_categories), cluster_distinct_categories

### Average watch ratio @ k

In [19]:
def get_user_avg_watch_ratio_at_k(k, user_id, recommendations, watch_ratio_column):
    """
    Args:
        k: The number of recommendations to return.
        user_id: The user for which to get recommendations.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        
    Returns:
        The average watch_ratio in the top k recommendations.
    """
    top_k = get_top_k_for_user(k, user_id, recommendations)
    return np.mean(top_k[watch_ratio_column])

In [148]:
# Get avg watch ratio @ 50 for user 14
get_user_avg_watch_ratio_at_k(50, 14, recommendations, 'predicted_watch_ratio')

0.8851038762000001

In [21]:
def get_avg_watch_ratio_at_k(k, recommendations, by_cluster):
    """
    Args:
        k: The number of recommendations to return.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        by_cluster: Boolean indicating whether to calculate the average watch_ratio per cluster.
        
    Returns:
        The overall average watch_ratio in the top k ground truth videos, a dictionary with the average watch_ratio per cluster,
        the overall average predicted_watch_ratio in the top k recommendations, and a dictionary with the average predicted_watch_ratio per cluster (if by_cluster is True).
    """
    all_avg_watch_ratios_list = []
    all_avg_predicted_watch_ratios_list = []
    
    if by_cluster:
        cluster_avg_watch_ratios = {}
        cluster_avg_predicted_watch_ratios = {}
        users_in_val = set(joined_val_data['user_id'])

        for cluster in sorted(recommendations['cluster'].unique()):
            cluster_avg_watch_ratios_list = []
            cluster_avg_predicted_watch_ratios_list = []

            # Get users in the current cluster and intersect with validation users
            users_in_cluster = set(recommendations[recommendations['cluster'] == cluster]['user_id'])
            valid_users = users_in_val.intersection(users_in_cluster)

            for user_id in tqdm(valid_users):
                user_avg_watch_ratio = get_user_avg_watch_ratio_at_k(k, user_id, joined_val_data, 'watch_ratio')
                user_avg_predicted_watch_ratio = get_user_avg_watch_ratio_at_k(k, user_id, recommendations, 'predicted_watch_ratio')

                cluster_avg_watch_ratios_list.append(user_avg_watch_ratio)
                cluster_avg_predicted_watch_ratios_list.append(user_avg_predicted_watch_ratio)
                
                all_avg_watch_ratios_list.append(user_avg_watch_ratio)
                all_avg_predicted_watch_ratios_list.append(user_avg_predicted_watch_ratio)
            
            # Calculate and store cluster-specific averages
            cluster_avg_watch_ratios[cluster] = np.mean(cluster_avg_watch_ratios_list) if cluster_avg_watch_ratios_list else 0
            cluster_avg_predicted_watch_ratios[cluster] = np.mean(cluster_avg_predicted_watch_ratios_list) if cluster_avg_predicted_watch_ratios_list else 0
    else:
        # Process all users without clustering
        for user_id in tqdm(recommendations['user_id'].unique()):
            user_avg_watch_ratio = get_user_avg_watch_ratio_at_k(k, user_id, joined_val_data, 'watch_ratio')
            user_avg_predicted_watch_ratio = get_user_avg_watch_ratio_at_k(k, user_id, recommendations, 'predicted_watch_ratio')

            all_avg_watch_ratios_list.append(user_avg_watch_ratio)
            all_avg_predicted_watch_ratios_list.append(user_avg_predicted_watch_ratio)

        cluster_avg_watch_ratios = None
        cluster_avg_predicted_watch_ratios = None
    
    # Return overall averages and cluster-specific dictionaries if `by_cluster` is True
    return (
        np.mean(all_avg_watch_ratios_list),
        cluster_avg_watch_ratios,
        np.mean(all_avg_predicted_watch_ratios_list),
        cluster_avg_predicted_watch_ratios
    )

### Precision@k, Recall@k, F1Score@k

In [None]:
# def get_user_precision_recall_f1_at_k(k, user_id, recommendations, ground_truth, threshold):
#     """
#     Args:
#         k: The number of recommendations to return.
#         user_id: The user for which to get recommendations.
#         recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
#         ground_truth: DataFrame with the ground truth watch ratios.
#         threshold: The threshold for the watch ratio.
    
#     Returns:
#         Precision, recall, and F1 score at k for a specific user.
#     """
#     top_k_recommendations = get_top_k_for_user(k, user_id, recommendations)
#     top_k_ground_truth = get_top_k_for_user(k, user_id, ground_truth)

#     tp = 0
#     fp = 0
#     fn = 0

#     for video_id in top_k_recommendations['video_id']:
#         if video_id in top_k_ground_truth['video_id'].values:
#             if top_k_ground_truth[top_k_ground_truth['video_id'] == video_id]['watch_ratio'].values[0] >= threshold:
#                 # If the video is in top_k_ground_truth and watch ratio is above the threshold, it is a true positive
#                 tp += 1
#             else:
#                 # If the video is in top_k_ground_truth but watch ratio is below the threshold, it is a false positive
#                 fp += 1
#         else:
#             # If video is not in top_k_ground_truth, it is a false positive
#             fp += 1

#     for video_id in top_k_ground_truth['video_id']:
#         if video_id not in top_k_recommendations['video_id'].values:
#             if top_k_ground_truth[top_k_ground_truth['video_id'] == video_id]['watch_ratio'].values[0] >= threshold:
#                 # If the video is in top_k_ground_truth but not in top_k_recommendations, it is a false negative
#                 fn += 1

#     precision = tp / (tp + fp) if tp + fp > 0 else 0
#     recall = tp / (tp + fn) if tp + fn > 0 else 0
#     f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
#     print(f'True Positives: {tp}, False Positives: {fp}, False Negatives: {fn}')
#     return precision, recall, f1

In [181]:
def get_user_precision_recall_f1_at_k(k, user_id, recommendations_with_actual_scores, threshold):
    """
    Args:
        k: The number of recommendations to return.
        user_id: The user for which to get recommendations.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        ground_truth: DataFrame with the ground truth watch ratios.
        threshold: The threshold for the watch ratio.
    
    Returns:
        Precision, recall, and F1 score at k for a specific user.
    """

    # Get top k recommendations for the user 
    top_k_recommendations = get_top_k_for_user(k, user_id, recommendations_with_actual_scores)
     
    # Get number of relevant videos in recommendations
    is_relevant_predicted = top_k_recommendations['predicted_watch_ratio'] >= threshold
    is_relevant_actual = top_k_recommendations['watch_ratio'] >= threshold

    tp = 0
    fp = 0
    fn = 0
    for i in range(len(top_k_recommendations)):
        # if is_relevant_predicted[i]:
        if is_relevant_predicted.iloc[i]:
            if is_relevant_actual.iloc[i]:
                tp += 1
            else:
                fp += 1
        else:
            if is_relevant_actual.iloc[i]:
                fn += 1

    # Calculate Precision, Recall, F1
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    # print(f'True Positives: {tp}, False Positives: {fp}, False Negatives: {fn}')
     
    return precision, recall, f1

In [162]:
recommendations_with_actual_scores = recommendations.merge(joined_val_data[['user_id', 'video_id', 'watch_ratio']], on=['user_id', 'video_id'])

get_top_k_for_user(50, 14, recommendations_with_actual_scores) 

Unnamed: 0,user_id,video_id,predicted_watch_ratio,cluster,watch_ratio
0,14,8799,1.254961,0.0,3.185954
1,14,8814,1.218694,0.0,1.251556
2,14,5952,1.109251,0.0,1.39762
3,14,10095,1.067484,0.0,1.443898
4,14,8819,1.048624,0.0,1.409508
5,14,7343,1.047345,0.0,1.206901
6,14,7336,0.995488,0.0,0.563166
7,14,996,0.990358,0.0,0.690826
8,14,8834,0.988275,0.0,0.738996
9,14,4282,0.952604,0.0,1.634527


In [184]:
val_data_users = set(joined_val_data['user_id'])
val_data_users

{14,
 19,
 21,
 23,
 24,
 36,
 37,
 41,
 51,
 55,
 64,
 73,
 75,
 97,
 98,
 102,
 120,
 127,
 129,
 131,
 135,
 136,
 137,
 140,
 155,
 157,
 165,
 166,
 169,
 172,
 174,
 176,
 185,
 193,
 203,
 221,
 223,
 224,
 226,
 229,
 234,
 235,
 240,
 241,
 242,
 249,
 261,
 262,
 276,
 279,
 294,
 297,
 298,
 320,
 322,
 323,
 328,
 332,
 335,
 346,
 352,
 357,
 364,
 366,
 368,
 370,
 373,
 385,
 386,
 396,
 407,
 412,
 434,
 438,
 442,
 452,
 458,
 472,
 475,
 477,
 488,
 493,
 506,
 509,
 522,
 531,
 534,
 536,
 537,
 538,
 543,
 545,
 547,
 555,
 558,
 565,
 570,
 572,
 578,
 579,
 582,
 597,
 598,
 602,
 612,
 617,
 623,
 626,
 632,
 634,
 638,
 639,
 642,
 655,
 659,
 663,
 682,
 683,
 699,
 709,
 723,
 724,
 726,
 728,
 735,
 757,
 763,
 764,
 765,
 767,
 771,
 781,
 785,
 791,
 795,
 798,
 800,
 808,
 814,
 815,
 827,
 833,
 838,
 839,
 840,
 843,
 845,
 846,
 847,
 849,
 857,
 860,
 865,
 867,
 877,
 885,
 890,
 891,
 893,
 896,
 901,
 909,
 925,
 928,
 932,
 942,
 944,
 946,
 951,
 

In [189]:
# Precision, recall, and F1 @ 50 for user 14
precision, recall, f1 = get_user_precision_recall_f1_at_k(100, 55, recommendations_with_actual_scores, 0.8)
precision, recall, f1

# no false negatives because all our recommendations are above the threshold -> relevant

(0.625, 0.15625, 0.25)

In [179]:
def get_precision_recall_f1_at_k(k, recommendations_with_actual_scores, threshold, by_cluster=True):
    """
    Args:
        k: The number of recommendations to return.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        ground_truth: DataFrame with the ground truth watch ratios.
        threshold: The threshold for the watch ratio.
        by_cluster: Boolean indicating whether to calculate the precision, recall, and F1 score per cluster.
    
    Returns:
        The overall average precision, recall, and F1 score at k, and a dictionary with the average precision, recall, and F1 score per cluster if by_cluster is True.
    """
    all_precision_list = []
    all_recall_list = []
    all_f1_list = []

    if by_cluster:
        cluster_precision = {}
        cluster_recall = {}
        cluster_f1 = {}
        users_in_val = set(ground_truth['user_id'])

        for cluster in sorted(recommendations['cluster'].unique()):
            cluster_precision_list = []
            cluster_recall_list = []
            cluster_f1_list = []

            # Get users in the current cluster and intersect with validation users
            users_in_cluster = set(recommendations[recommendations['cluster'] == cluster]['user_id'])
            valid_users = users_in_val.intersection(users_in_cluster)

            for user_id in tqdm(valid_users):
                user_precision, user_recall, user_f1 = get_user_precision_recall_f1_at_k(
                    k, user_id, recommendations_with_actual_scores, threshold
                )

                cluster_precision_list.append(user_precision)
                cluster_recall_list.append(user_recall)
                cluster_f1_list.append(user_f1)

                all_precision_list.append(user_precision)
                all_recall_list.append(user_recall)
                all_f1_list.append(user_f1)
            
            # Calculate cluster-specific averages
            cluster_precision[cluster] = np.mean(cluster_precision_list) if cluster_precision_list else 0
            cluster_recall[cluster] = np.mean(cluster_recall_list) if cluster_recall_list else 0
            cluster_f1[cluster] = np.mean(cluster_f1_list) if cluster_f1_list else 0
    else:
        for user_id in tqdm(ground_truth['user_id'].unique()):
            user_precision, user_recall, user_f1 = get_user_precision_recall_f1_at_k(
                k, user_id, recommendations_with_actual_scores, threshold
            )

            all_precision_list.append(user_precision)
            all_recall_list.append(user_recall)
            all_f1_list.append(user_f1)

        cluster_precision = None
        cluster_recall = None
        cluster_f1 = None
    
    return (
        np.mean(all_precision_list),
        np.mean(all_recall_list),
        np.mean(all_f1_list),
        cluster_precision,
        cluster_recall,
        cluster_f1
    )

In [187]:
avg_precision, avg_recall, avg_f1, cluster_precision, cluster_recall, cluster_f1 =  get_precision_recall_f1_at_k(100, recommendations_with_actual_scores, 0.7, by_cluster=True)

100%|██████████| 269/269 [00:07<00:00, 35.38it/s]
100%|██████████| 419/419 [00:11<00:00, 35.16it/s]
100%|██████████| 345/345 [00:09<00:00, 35.63it/s]
100%|██████████| 378/378 [00:11<00:00, 33.07it/s]


In [188]:
# Put into dataframe
cluster_precision_df = pd.DataFrame(cluster_precision.items(), columns=['cluster', 'precision'])
cluster_recall_df = pd.DataFrame(cluster_recall.items(), columns=['cluster', 'recall'])
cluster_f1_df = pd.DataFrame(cluster_f1.items(), columns=['cluster', 'f1'])

cluster_comparison = cluster_precision_df.merge(cluster_recall_df, on='cluster').merge(cluster_f1_df, on='cluster')

# Add overall averages
cluster_comparison = cluster_comparison.append({'cluster': 'Overall', 'precision': avg_precision, 'recall': avg_recall, 'f1': avg_f1}, ignore_index=True)

cluster_comparison

  cluster_comparison = cluster_comparison.append({'cluster': 'Overall', 'precision': avg_precision, 'recall': avg_recall, 'f1': avg_f1}, ignore_index=True)


Unnamed: 0,cluster,precision,recall,f1
0,0.0,0.558818,0.18895,0.216912
1,1.0,0.650688,0.112526,0.16593
2,2.0,0.615409,0.171563,0.224187
3,3.0,0.645451,0.146906,0.182074
4,Overall,0.623144,0.150741,0.194218


## Calculation of Evaluation Metrics

In [159]:
def get_all_metrics(k1, k2, ground_truth, recommendations, video_info, threshold, by_cluster):
    """
    Args:
        k1: The number of recommendations to return for NDCG@k, distinct categories, and avg watch ratio.
        k2: The number of recommendations to return for precision, recall, and F1 score.
        ground_truth: DataFrame with the ground truth watch ratios, sorted by descending watch_ratio.
        recommendations: DataFrame with all video recommendations, sorted by descending predicted_watch_ratio.
        video_info: DataFrame with information about the videos.
        threshold: The threshold for the watch ratio to calculate binary labels.
        by_cluster: Boolean indicating whether to calculate the metrics per cluster.

    Returns:
        Dataframe of all evaluation metrics.
    """
    overall_ndcg, cluster_ndcg = get_average_ndcg_at_k(k1, ground_truth, recommendations, video_info, by_cluster)
    overall_distinct_categories, cluster_distinct_categories = get_average_distinct_categories_at_k(k1, recommendations, by_cluster)
    overall_avg_watch_ratio, cluster_avg_watch_ratio, overall_avg_predicted_watch_ratio, cluster_avg_predicted_watch_ratio = get_avg_watch_ratio_at_k(k1, recommendations, by_cluster)
    avg_precision, avg_recall, avg_f1, cluster_precision, cluster_recall, cluster_f1 = get_precision_recall_f1_at_k(k2, recommendations, ground_truth, threshold, by_cluster)

    metrics_df = pd.DataFrame(columns=['cluster', f'NDCG@{k1}', f'Distinct Categories @ {k1}', f'Avg Watch Ratio @ {k1}', f'Avg Predicted Watch Ratio @ {k1}', 
                                       f'Avg Precision@{k2}', f'Avg Recall@{k2}', f'Avg F1@{k2}'])
    if by_cluster:
        for cluster in recommendations['cluster'].unique():
            metrics_df = pd.concat([
                metrics_df,
                pd.DataFrame({
                    'cluster': cluster,
                    f'NDCG@{k1}': cluster_ndcg[cluster],
                    f'Distinct Categories @ {k1}': cluster_distinct_categories[cluster],
                    f'Avg Watch Ratio @ {k1}': cluster_avg_watch_ratio[cluster],
                    f'Avg Predicted Watch Ratio @ {k1}': cluster_avg_predicted_watch_ratio[cluster],
                    f'Avg Precision@{k2}': cluster_precision[cluster],
                    f'Avg Recall@{k2}': cluster_recall[cluster],
                    f'Avg F1@{k2}': cluster_f1[cluster]
                }, index=[0])
            ])
    metrics_df['cluster'] = metrics_df['cluster'].astype(int)
    metrics_df = metrics_df.sort_values(by='cluster')
    
    metrics_df = pd.concat([
        metrics_df,
        pd.DataFrame({
            'cluster': 'Overall',
            f'NDCG@{k1}': overall_ndcg,
            f'Distinct Categories @ {k1}': overall_distinct_categories,
            f'Avg Watch Ratio @ {k1}': overall_avg_watch_ratio,
            f'Avg Predicted Watch Ratio @ {k1}': overall_avg_predicted_watch_ratio,
            f'Avg Precision@{k2}': avg_precision,
            f'Avg Recall@{k2}': avg_recall,
            f'Avg F1@{k2}': avg_f1
        }, index=[0])
    ])

    return metrics_df

### With Segmentation

In [28]:
import itertools

In [None]:
k1 = 50
k2 = 50
threshold = 0.8

In [157]:
metrics_df = get_all_metrics(k1, k2, ground_truth, recommendations, video_data, threshold, by_cluster=True)

NameError: name 'get_all_metrics' is not defined

In [31]:
metrics_df

Unnamed: 0,cluster,NDCG@50,Distinct Categories @ 50,Avg Watch Ratio @ 50,Avg Predicted Watch Ratio @ 50,Avg Precision@500,Avg Recall@500,Avg F1@500
0,0,0.888035,20.431227,0.85177,0.964339,0.42504,0.921066,0.571381
0,1,0.900216,22.594272,0.843245,0.962078,0.463439,0.920827,0.608127
0,2,0.886396,21.35942,0.827713,1.026723,0.431132,0.929328,0.580943
0,3,0.906808,21.71164,0.843225,0.957878,0.44488,0.943618,0.596111
0,Overall,0.896281,21.643515,0.841067,0.97719,0.443247,0.929057,0.591256


In [37]:
# Put into dataframe
precision_recall_f1_df = pd.DataFrame(columns=['cluster', 'Precision', 'Recall', 'F1'])
for cluster in recommendations['cluster'].unique():
    precision_recall_f1_df = pd.concat([
        precision_recall_f1_df,
        pd.DataFrame({
            'cluster': cluster,
            'Precision': cluster_precision[cluster],
            'Recall': cluster_recall[cluster],
            'F1': cluster_f1[cluster]
        }, index=[0])
    ])

precision_recall_f1_df = pd.concat([
    precision_recall_f1_df,
    pd.DataFrame({
        'cluster': 'Overall',
        'Precision': avg_precision,
        'Recall': avg_recall,
        'F1': avg_f1
    }, index=[0])
])

precision_recall_f1_df

Unnamed: 0,cluster,Precision,Recall,F1
0,0.0,0.133234,0.133234,0.133234
0,2.0,0.177333,0.177333,0.177333
0,1.0,0.185919,0.185919,0.185919
0,3.0,0.202804,0.202804,0.202804
0,Overall,0.178299,0.178299,0.178299


### Without Segmentation

In [None]:
files = ['batch_size512_num_epochs10_lr0.001_embedding_dim64_dropout0.3_decay0.01']

metrics_dict_unsegmented = {}

for file in files:
    prediction_scores = pd.read_csv(root + f'results/wo_clustering_{file}.csv')

    recommendations = get_user_recommendations(prediction_scores, videos_in_val_data, user_watch_history)

    metrics_df = get_all_metrics(k1, k2, ground_truth, recommendations, video_data, threshold, by_cluster=False)
    
    print(metrics_df)
    metrics_df.to_csv(root + f'results/metrics_wo_clustering_{file}.csv', index=False)

    metrics_dict_unsegmented[param_str] = metrics_df

Getting metrics for model with hyperparameters: {'batch_size': 128, 'num_epochs': 10, 'lr': 0.001, 'embedding_dim': 64, 'dropout': 0.3, 'alpha': 0.01, 'beta': 0.05}


100%|██████████| 1411/1411 [01:57<00:00, 11.97it/s]
100%|██████████| 1411/1411 [01:04<00:00, 21.88it/s]
100%|██████████| 1411/1411 [00:50<00:00, 27.88it/s]
100%|██████████| 1411/1411 [03:29<00:00,  6.73it/s]

   cluster   NDCG@50  Distinct Categories @ 50  Avg Watch Ratio @ 50  \
0  Overall  0.879138                 20.328136              0.841067   

   Avg Precision@500  Avg Recall@500  Avg F1@500  \
0           0.423718        0.898895     0.56732   

   Avg Predicted Watch Ratio @ 50  
0                        8.166317  





In [51]:
metrics_dict_unsegmented['batch_size128_num_epochs10_lr0.001_embedding_dim64_dropout0.3_alpha0.01_beta0.01']

Unnamed: 0,cluster,NDCG@50,Distinct Categories @ 50,Avg Watch Ratio @ 50,Avg Precision@500,Avg Recall@500,Avg F1@500,Avg Predicted Watch Ratio @ 50
0,Overall,0.879138,20.328136,0.841067,0.423718,0.898895,0.56732,8.166317
