In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import ndcg_score

from collections import defaultdict

from tqdm import tqdm

In [2]:
root = '../KuaiRec 2.0/'

prediction_scores = pd.read_csv(root + 'results/ncf_predictions_combined.csv')

joined_train_data = pd.read_csv(root + 'data_exports/joined_train_data.csv')
joined_val_data = pd.read_csv(root + 'data_exports/joined_val_data.csv')

video_data = pd.read_csv(root + 'data/kuairec_caption_category_translated.csv', index_col=0)

In [3]:
# Rename
prediction_scores = prediction_scores.rename(columns={'watch_ratio': 'predicted_watch_ratio'})

# Sort predictions
prediction_scores = prediction_scores.sort_values(by=['cluster', 'user_id', 'video_id'])
prediction_scores

Unnamed: 0,user_id,video_id,predicted_watch_ratio,cluster
2693116,14,103,1.222314,0
2693117,14,109,1.531232,0
2693118,14,120,0.881308,0
2693119,14,122,0.810715,0
2693120,14,128,0.634935,0
...,...,...,...,...
2693111,7159,10099,0.176296,3
2693112,7159,10105,0.770872,3
2693113,7159,10120,0.864906,3
2693114,7159,10122,0.629939,3


## Get user watch history

We want to be able to filter out videos that the user has already watched. This is so that we recommend new videos instead.

In [4]:
def get_user_watch_history(data):
    """
    Args: 
        data: DataFrame of user watch history. Must contain columns 'user_id' and 'video_id'.
        
    Returns:
        A dictionary with user_id as key and a set of video_ids that the user has watched as value.
    """
    watch_history_dict = defaultdict(set)
    for user in data['user_id'].unique():
        watch_history_dict[user] = set(data[data['user_id'] == user]['video_id'])
    return watch_history_dict

In [5]:
user_watch_history = get_user_watch_history(joined_train_data)

## Getting ground truth videos for each user

In [7]:
def get_ground_truth(ground_truth_df, valid_users, valid_videos, user_watch_history):
    """
    Args:
        ground_truth_df: DataFrame with the ground truth watch ratios.
        videos_in_train_data: List of video_ids that are present in the training data.
        user_watch_history: Dictionary with user_id as key and a list of video_ids that the user has watched as value.
    
    Returns:
        DataFrame with the ground truth watch ratios. It only contains videos that are present in training data and that the user has not watched before.
        Users that are not in the training data are filtered out as well, as we cannot make recommendations for them.
        The dataframe is sorted by user in ascending order and watch_ratio in descending order.
    """
    ground_truth_new = pd.DataFrame(columns=['user_id', 'video_id', 'watch_ratio'])

    for user in ground_truth_df['user_id'].unique():
        if user not in valid_users:
            continue
        user_ground_truth = ground_truth_df[ground_truth_df['user_id'] == user].copy()
        user_ground_truth = user_ground_truth[~user_ground_truth['video_id'].isin(user_watch_history[user])]
        user_ground_truth = user_ground_truth[user_ground_truth['video_id'].isin(valid_videos)]

        ground_truth_new = pd.concat([ground_truth_new, user_ground_truth])

    # Sort by watch_ratio in descending order
    ground_truth_new = ground_truth_new.sort_values(by=['user_id', 'watch_ratio'], ascending=[True, False])
    return ground_truth_new

In [8]:
users_in_train_data = set(joined_train_data['user_id'])
videos_in_train_data = set(joined_train_data['video_id'])

ground_truth = get_ground_truth(joined_val_data[['user_id', 'video_id', 'watch_ratio']], users_in_train_data, videos_in_train_data, user_watch_history)

In [9]:
# Ground truth scores for user 14
ground_truth[ground_truth['user_id'] == 14]

Unnamed: 0,user_id,video_id,watch_ratio
11,14,8766,3.318871
702,14,8799,3.185954
607,14,2735,2.598506
602,14,4201,2.478148
573,14,4015,2.319912
...,...,...,...
131,14,7297,0.032396
991,14,4021,0.032293
180,14,4141,0.032250
61,14,7461,0.029277


## Getting recommendations for each user

In [None]:
def get_user_recommendations(prediction_scores, valid_videos, user_watch_history):
    """
    Args:
        prediction_scores: DataFrame with the predicted watch_ratios.
        user_watch_history: Dictionary with user_id as key and a list of video_ids that the user has watched as value.
    
    Returns:
        DataFrame with the recommendations for a specific user. It only contains videos that the user has not watched before.
        The dataframe is sorted by user in ascending order and watch_ratio in descending order.
    """
    recommendations_new = pd.DataFrame(columns=['user_id', 'video_id', 'predicted_watch_ratio'])
    
    # Only consider videos that are in validation set
    prediction_scores = prediction_scores[prediction_scores['video_id'].isin(valid_videos)]

    for user in prediction_scores['user_id'].unique():
        user_recommendations = prediction_scores[prediction_scores['user_id'] == user].copy()
        user_recommendations = user_recommendations[~user_recommendations['video_id'].isin(user_watch_history[user])]
        
        recommendations_new = pd.concat([recommendations_new, user_recommendations])

    # Sort by prediction in descending order
    recommendations_new = recommendations_new.sort_values(by=['user_id', 'predicted_watch_ratio'], ascending=[True, False])
    return recommendations_new

In [11]:
videos_in_val_data = set(joined_val_data['video_id'])

recommendations = get_user_recommendations(prediction_scores, videos_in_val_data, user_watch_history)

In [13]:
# Recommendations for user 14
recommendations[recommendations['user_id'] == 14]

Unnamed: 0,user_id,video_id,predicted_watch_ratio,cluster
2693489,14,816,2.459153,0.0
2693942,14,2672,1.921901,0.0
2693493,14,825,1.845788,0.0
2694262,14,4186,1.832490,0.0
2694119,14,3992,1.806678,0.0
...,...,...,...,...
2694874,14,7251,0.000000,0.0
2694924,14,7335,0.000000,0.0
2694951,14,7461,0.000000,0.0
2695077,14,8545,0.000000,0.0


In [14]:
def get_top_k_for_user(k, user_id, df):
    """
    Args:
        k: The number of recommendations to return.
        user_id: The user for which to get recommendations.
        df: DataFrame containing the scores for all users, sorted by score in descending order.
    
    Returns:
        DataFrame with the top k scores.
    """
    return df[df['user_id'] == user_id].head(k)

In [15]:
k = 50

# Get top 50 ground truth and recommendations for user 14
top_50_ground_truth_user_14 = get_top_k_for_user(k, 14, ground_truth)
top_50_recommendations_user_14 = get_top_k_for_user(k, 14, recommendations)

# top_50_ground_truth_user_14
top_50_recommendations_user_14

Unnamed: 0,user_id,video_id,predicted_watch_ratio,cluster
2693489,14,816,2.459153,0.0
2693942,14,2672,1.921901,0.0
2693493,14,825,1.845788,0.0
2694262,14,4186,1.83249,0.0
2694119,14,3992,1.806678,0.0
2693201,14,372,1.77552,0.0
2695203,14,8748,1.716819,0.0
2694871,14,7246,1.712584,0.0
2695346,14,9854,1.696038,0.0
2695136,14,8624,1.652539,0.0


## Evaluation Metrics

### Category-Aware NDCG@k

In [16]:
def get_category_tally_at_k(recommendations, video_info):
    """
    Args:
        recommendations: DataFrame with the top k recommendations for a specific user.
        video_info: DataFrame with information about the videos.

    Returns:
        Dictionary with the category as key and the number of videos in each category as value.
    """
    tally = defaultdict(int)

    for video_id in recommendations['video_id']:
        category = video_info.loc[str(video_id)]['english_first_level_category_name']
        tally[category] += 1
    
    return tally

def get_category_ndcg_at_k(recommendations, ground_truth, video_info):
    """
    Args:
        recommendations: DataFrame with the top k video recommendations for a specific user.
        ground_truth: DataFrame with the ground truth videos for a specific user.
        video_info: DataFrame with information about the videos.

    Returns:
        NDCG score for the categories of the top k recommendations.
    """
    cat_tally_reco = get_category_tally_at_k(recommendations, video_info)
    cat_tally_gt = get_category_tally_at_k(ground_truth, video_info)

    cat_tally_reco_adjusted = {}
    for category in cat_tally_gt:
        cat_tally_reco_adjusted[category] = cat_tally_reco.get(category, 0)

    return ndcg_score([list(cat_tally_gt.values())], [list(cat_tally_reco_adjusted.values())])

In [17]:
# Get the category-aware NDCG@50 for user 14
get_category_ndcg_at_k(top_50_recommendations_user_14, top_50_ground_truth_user_14, video_data)

0.7618081067801695

In [None]:
def get_average_ndcg_at_k(k, ground_truth, recommendations, video_info, by_cluster):
    """
    Args:
        k: The number of recommendations to return.
        ground_truth: DataFrame with the ground truth watch ratios, sorted by descending watch_ratio.
        recommendations: DataFrame with all video recommendations, sorted by descending predicted_watch_ratio.
        video_info: DataFrame with information about the videos.
        by_cluster: Boolean indicating whether to calculate the average ndcg@k per cluster.

    Returns:
        The average category-aware NDCG@k for all users, and a dictionary with the NDCG@k per cluster.
    """
    all_ndcg_scores = []

    if by_cluster:
        cluster_scores = {}
        ground_truth_users = set(ground_truth['user_id'].unique())
        
        for cluster in sorted(recommendations['cluster'].unique()):
            cluster_recommendations = recommendations[recommendations['cluster'] == cluster]
            users_in_cluster = set(cluster_recommendations['user_id'].unique())
            
            # Filter users in both ground truth and the current cluster's recommendations
            valid_users = ground_truth_users.intersection(users_in_cluster)
            cluster_ndcg_scores = []

            for user_id in tqdm(valid_users):
                user_recommendations_top_k = get_top_k_for_user(k, user_id, cluster_recommendations)
                user_ground_truth_top_k = get_top_k_for_user(k, user_id, ground_truth)

                user_ndcg_score = get_category_ndcg_at_k(user_recommendations_top_k, user_ground_truth_top_k, video_info)

                cluster_ndcg_scores.append(user_ndcg_score)
                all_ndcg_scores.append(user_ndcg_score)

            # Store the mean NDCG score per cluster
            cluster_scores[cluster] = np.mean(cluster_ndcg_scores) if cluster_ndcg_scores else 0

    else:
        for user_id in tqdm(ground_truth['user_id'].unique()):
            user_recommendations_top_k = get_top_k_for_user(k, user_id, recommendations)
            user_ground_truth_top_k = get_top_k_for_user(k, user_id, ground_truth)

            user_ndcg_score = get_category_ndcg_at_k(user_recommendations_top_k, user_ground_truth_top_k, video_info)

            all_ndcg_scores.append(user_ndcg_score)
        cluster_scores = None

    return np.mean(all_ndcg_scores), cluster_scores

### Distinct Categories @ k

In [24]:
def get_user_distinct_categories_at_k(k, user_id, recommendations):
    """
    Args:
        k: The number of recommendations to return.
        user_id: The user for which to get recommendations.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        
    Returns:
        The number of distinct categories in the top k recommendations.
    """
    top_k = get_top_k_for_user(k, user_id, recommendations)
    categories = set()

    for video_id in top_k['video_id']:
        category = video_data.loc[str(video_id)]['english_first_level_category_name']
        categories.add(category)
    
    return len(categories)

In [25]:
# Get Distinct Categories @ 50 for user 14
get_user_distinct_categories_at_k(50, 14, recommendations)

19

In [None]:
def get_average_distinct_categories_at_k(k, recommendations, by_cluster):
    """
    Args:
        k: The number of recommendations to return.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        by_cluster: Boolean indicating whether to calculate the average distinct categories per cluster.
    
    Returns:
        The overall average number of distinct categories in the top k recommendations, 
        and a dictionary with the average number of distinct categories per cluster (if by_cluster is True).
    """
    all_distinct_categories = []

    if by_cluster:
        cluster_distinct_categories = {}
        users_in_val = set(joined_val_data['user_id'])

        for cluster in sorted(recommendations['cluster'].unique()):
            cluster_distinct_categories_list = []
            
            # Get users in the current cluster
            users_in_cluster = set(recommendations[recommendations['cluster'] == cluster]['user_id'])
            valid_users = users_in_val.intersection(users_in_cluster)

            for user_id in tqdm(valid_users):
                user_distinct_categories = get_user_distinct_categories_at_k(k, user_id, recommendations)
                cluster_distinct_categories_list.append(user_distinct_categories)
                all_distinct_categories.append(user_distinct_categories)

            cluster_distinct_categories[cluster] = np.mean(cluster_distinct_categories_list) if cluster_distinct_categories_list else 0
    else:
        for user_id in tqdm(recommendations['user_id'].unique()):
            user_distinct_categories = get_user_distinct_categories_at_k(k, user_id, recommendations)
            all_distinct_categories.append(user_distinct_categories)
        cluster_distinct_categories = None

    return np.mean(all_distinct_categories), cluster_distinct_categories

### Average watch ratio @ k

In [33]:
def get_user_avg_watch_ratio_at_k(k, user_id, recommendations, watch_ratio_column):
    """
    Args:
        k: The number of recommendations to return.
        user_id: The user for which to get recommendations.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        
    Returns:
        The average watch_ratio in the top k recommendations.
    """
    top_k = get_top_k_for_user(k, user_id, recommendations)
    return np.mean(top_k[watch_ratio_column])

In [34]:
# Get avg watch ratio @ 50 for user 14
get_user_avg_watch_ratio_at_k(50, 14, recommendations, 'predicted_watch_ratio')

1.566932475566864

In [None]:
# def get_avg_watch_ratio_at_k(k, recommendations, by_cluster):
#     """
#     Args:
#         k: The number of recommendations to return.
#         recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
#         by_cluster: Boolean indicating whether to calculate the average watch_ratio per cluster.
        
#     Returns:
#         The overall average watch_ratio in the top k ground truth videos, a dictionary with the average watch_ratio per cluster,
#         the overall average predicted_watch_ratio in the top k recommendations, and a dictionary with the average predicted_watch_ratio per cluster (if by_cluster is True).
#     """
#     all_avg_watch_ratios_list = []
#     all_avg_predicted_watch_ratios_list = []
    
#     if by_cluster:
#         cluster_avg_watch_ratios = {}
#         cluster_avg_predicted_watch_ratios = {}
#         users_in_val = set(joined_val_data['user_id'])

#         for cluster in sorted(recommendations['cluster'].unique()):
#             cluster_avg_watch_ratios_list = []
#             cluster_avg_predicted_watch_ratios_list = []

#             # Get users in the current cluster and intersect with validation users
#             users_in_cluster = set(recommendations[recommendations['cluster'] == cluster]['user_id'])
#             valid_users = users_in_val.intersection(users_in_cluster)

#             for user_id in tqdm(valid_users):
#                 user_avg_watch_ratio = get_user_avg_watch_ratio_at_k(k, user_id, joined_val_data, 'watch_ratio')
#                 user_avg_predicted_watch_ratio = get_user_avg_watch_ratio_at_k(k, user_id, recommendations, 'predicted_watch_ratio')

#                 cluster_avg_watch_ratios_list.append(user_avg_watch_ratio)
#                 cluster_avg_predicted_watch_ratios_list.append(user_avg_predicted_watch_ratio)
                
#                 all_avg_watch_ratios_list.append(user_avg_watch_ratio)
#                 all_avg_predicted_watch_ratios_list.append(user_avg_predicted_watch_ratio)
            
#             # Calculate and store cluster-specific averages
#             cluster_avg_watch_ratios[cluster] = np.mean(cluster_avg_watch_ratios_list) if cluster_avg_watch_ratios_list else 0
#             cluster_avg_predicted_watch_ratios[cluster] = np.mean(cluster_avg_predicted_watch_ratios_list) if cluster_avg_predicted_watch_ratios_list else 0
#     else:
#         # Process all users without clustering
#         for user_id in tqdm(recommendations['user_id'].unique()):
#             user_avg_watch_ratio = get_user_avg_watch_ratio_at_k(k, user_id, joined_val_data, 'watch_ratio')
#             user_avg_predicted_watch_ratio = get_user_avg_watch_ratio_at_k(k, user_id, recommendations, 'predicted_watch_ratio')

#             all_avg_watch_ratios_list.append(user_avg_watch_ratio)
#             all_avg_predicted_watch_ratios_list.append(user_avg_predicted_watch_ratio)

#         cluster_avg_watch_ratios = None
#         cluster_avg_predicted_watch_ratios = None
    
#     # Return overall averages and cluster-specific dictionaries if `by_cluster` is True
#     return (
#         np.mean(all_avg_watch_ratios_list),
#         cluster_avg_watch_ratios,
#         np.mean(all_avg_predicted_watch_ratios_list),
#         cluster_avg_predicted_watch_ratios
#     )

In [35]:
def get_avg_watch_ratio_at_k(k, recommendations, by_cluster):
    """
    Args:
        k: The number of recommendations to return.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        by_cluster: Boolean indicating whether to calculate the average watch_ratio per cluster.
        
    Returns:
        The overall average watch_ratio in the top k recommendations, and a dictionary with the average predicted_watch_ratio per cluster (if by_cluster is True).
    """
    all_avg_watch_ratios_list = []
    
    if by_cluster:
        cluster_avg_watch_ratios = {}
        users_in_val = set(joined_val_data['user_id'])

        for cluster in sorted(recommendations['cluster'].unique()):
            cluster_avg_watch_ratios_list = []

            # Get users in the current cluster and intersect with validation users
            users_in_cluster = set(recommendations[recommendations['cluster'] == cluster]['user_id'])
            valid_users = users_in_val.intersection(users_in_cluster)

            for user_id in tqdm(valid_users):
                user_avg_watch_ratio = get_user_avg_watch_ratio_at_k(k, user_id, recommendations, 'predicted_watch_ratio')

                cluster_avg_watch_ratios_list.append(user_avg_watch_ratio)
                all_avg_watch_ratios_list.append(user_avg_watch_ratio)
            
            cluster_avg_watch_ratios[cluster] = np.mean(cluster_avg_watch_ratios_list) if cluster_avg_watch_ratios_list else 0
    else:
        for user_id in tqdm(recommendations['user_id'].unique()):
            user_avg_watch_ratio = get_user_avg_watch_ratio_at_k(k, user_id, recommendations, 'predicted_watch_ratio')

            all_avg_watch_ratios_list.append(user_avg_watch_ratio)
        cluster_avg_watch_ratios = None
    
    return np.mean(all_avg_watch_ratios_list), cluster_avg_watch_ratios

### Precision@k, Recall@k, F1Score@k

In [41]:
def get_user_precision_recall_f1_at_k(k, user_id, recommendations, ground_truth, threshold):
    """
    Args:
        k: The number of recommendations to return.
        user_id: The user for which to get recommendations.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        ground_truth: DataFrame with the ground truth watch ratios.
        threshold: The threshold for the watch ratio.
    
    Returns:
        Precision, recall, and F1 score at k for a specific user.
    """
    top_k_recommendations = get_top_k_for_user(k, user_id, recommendations)
    top_k_ground_truth = get_top_k_for_user(k, user_id, ground_truth)

    tp = 0
    fp = 0
    fn = 0

    for video_id in top_k_recommendations['video_id']:
        if video_id in top_k_ground_truth['video_id'].values:
            if top_k_ground_truth[top_k_ground_truth['video_id'] == video_id]['watch_ratio'].values[0] >= threshold:
                # If the video is in top_k_ground_truth and watch ratio is above the threshold, it is a true positive
                tp += 1
            else:
                # If the video is in top_k_ground_truth but watch ratio is below the threshold, it is a false positive
                fp += 1
        else:
            # If video is not in top_k_ground_truth, it is a false positive
            fp += 1

    for video_id in top_k_ground_truth['video_id']:
        if video_id not in top_k_recommendations['video_id'].values:
            if top_k_ground_truth[top_k_ground_truth['video_id'] == video_id]['watch_ratio'].values[0] >= threshold:
                # If the video is in top_k_ground_truth but not in top_k_recommendations, it is a false negative
                fn += 1

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    return precision, recall, f1

In [43]:
# Precision, recall, and F1 @ 50 for user 14
precision, recall, f1 = get_user_precision_recall_f1_at_k(500, 14, recommendations, ground_truth, 0.5)
precision, recall, f1

(0.634, 0.8386243386243386, 0.7220956719817768)

In [44]:
def get_precision_recall_f1_at_k(k, recommendations, ground_truth, threshold, by_cluster=True):
    """
    Args:
        k: The number of recommendations to return.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        ground_truth: DataFrame with the ground truth watch ratios.
        threshold: The threshold for the watch ratio.
        by_cluster: Boolean indicating whether to calculate the precision, recall, and F1 score per cluster.
    
    Returns:
        The overall average precision, recall, and F1 score at k, and a dictionary with the average precision, recall, and F1 score per cluster if by_cluster is True.
    """
    all_precision_list = []
    all_recall_list = []
    all_f1_list = []

    if by_cluster:
        cluster_precision = {}
        cluster_recall = {}
        cluster_f1 = {}
        users_in_val = set(ground_truth['user_id'])

        for cluster in sorted(recommendations['cluster'].unique()):
            cluster_precision_list = []
            cluster_recall_list = []
            cluster_f1_list = []

            # Get users in the current cluster and intersect with validation users
            users_in_cluster = set(recommendations[recommendations['cluster'] == cluster]['user_id'])
            valid_users = users_in_val.intersection(users_in_cluster)

            for user_id in tqdm(valid_users):
                user_precision, user_recall, user_f1 = get_user_precision_recall_f1_at_k(
                    k, user_id, recommendations, ground_truth, threshold
                )

                cluster_precision_list.append(user_precision)
                cluster_recall_list.append(user_recall)
                cluster_f1_list.append(user_f1)

                all_precision_list.append(user_precision)
                all_recall_list.append(user_recall)
                all_f1_list.append(user_f1)
            
            # Calculate cluster-specific averages
            cluster_precision[cluster] = np.mean(cluster_precision_list) if cluster_precision_list else 0
            cluster_recall[cluster] = np.mean(cluster_recall_list) if cluster_recall_list else 0
            cluster_f1[cluster] = np.mean(cluster_f1_list) if cluster_f1_list else 0
    else:
        for user_id in tqdm(ground_truth['user_id'].unique()):
            user_precision, user_recall, user_f1 = get_user_precision_recall_f1_at_k(
                k, user_id, recommendations, ground_truth, threshold
            )

            all_precision_list.append(user_precision)
            all_recall_list.append(user_recall)
            all_f1_list.append(user_f1)

        cluster_precision = None
        cluster_recall = None
        cluster_f1 = None
    
    return (
        np.mean(all_precision_list),
        np.mean(all_recall_list),
        np.mean(all_f1_list),
        cluster_precision,
        cluster_recall,
        cluster_f1
    )

## Calculation of Evaluation Metrics

In [55]:
def get_all_metrics(k1, k2, ground_truth, recommendations, video_info, threshold, by_cluster):
    """
    Args:
        k1: The number of recommendations to return for NDCG@k, distinct categories, and avg watch ratio.
        k2: The number of recommendations to return for precision, recall, and F1 score.
        ground_truth: DataFrame with the ground truth watch ratios, sorted by descending watch_ratio.
        recommendations: DataFrame with all video recommendations, sorted by descending predicted_watch_ratio.
        video_info: DataFrame with information about the videos.
        threshold: The threshold for the watch ratio to calculate binary labels.
        by_cluster: Boolean indicating whether to calculate the metrics per cluster.

    Returns:
        Dataframe of all evaluation metrics.
    """
    overall_ndcg, cluster_ndcg = get_average_ndcg_at_k(k1, ground_truth, recommendations, video_info, by_cluster)
    overall_distinct_categories, cluster_distinct_categories = get_average_distinct_categories_at_k(k1, recommendations, by_cluster)
    overall_avg_watch_ratio, cluster_avg_watch_ratio = get_avg_watch_ratio_at_k(k1, recommendations, by_cluster)
    avg_precision, avg_recall, avg_f1, cluster_precision, cluster_recall, cluster_f1 = get_precision_recall_f1_at_k(k2, recommendations, ground_truth, threshold, by_cluster)

    metrics_df = pd.DataFrame(columns=['cluster', f'NDCG@{k1}', f'Distinct Categories @ {k1}', f'Avg Watch Ratio @ {k1}', f'Avg Precision@{k2}', f'Avg Recall@{k2}', f'Avg F1@{k2}'])
    if by_cluster:
        for cluster in recommendations['cluster'].unique():
            metrics_df = pd.concat([
                metrics_df,
                pd.DataFrame({
                    'cluster': cluster,
                    f'NDCG@{k1}': cluster_ndcg[cluster],
                    f'Distinct Categories @ {k1}': cluster_distinct_categories[cluster],
                    f'Avg Watch Ratio @ {k1}': cluster_avg_watch_ratio[cluster],
                    f'Avg Precision@{k2}': cluster_precision[cluster],
                    f'Avg Recall@{k2}': cluster_recall[cluster],
                    f'Avg F1@{k2}': cluster_f1[cluster]
                }, index=[0])
            ])
    metrics_df['cluster'] = metrics_df['cluster'].astype(int)
    metrics_df = metrics_df.sort_values(by='cluster')
    
    metrics_df = pd.concat([
        metrics_df,
        pd.DataFrame({
            'cluster': 'Overall',
            f'NDCG@{k1}': overall_ndcg,
            f'Distinct Categories @ {k1}': overall_distinct_categories,
            f'Avg Watch Ratio @ {k1}': overall_avg_watch_ratio,
            f'Avg Precision@{k2}': avg_precision,
            f'Avg Recall@{k2}': avg_recall,
            f'Avg F1@{k2}': avg_f1
        }, index=[0])
    ])

    return metrics_df

In [56]:
k1 = 50
k2 = 500
threshold = 0.7

metrics_df = get_all_metrics(k1, k2, ground_truth, recommendations, video_data, threshold, by_cluster=True)

100%|██████████| 269/269 [00:14<00:00, 18.52it/s]
100%|██████████| 419/419 [00:24<00:00, 17.34it/s]
100%|██████████| 345/345 [00:20<00:00, 17.09it/s]
100%|██████████| 378/378 [00:21<00:00, 17.64it/s]
100%|██████████| 269/269 [00:11<00:00, 22.73it/s]
100%|██████████| 419/419 [00:18<00:00, 22.47it/s]
100%|██████████| 345/345 [00:15<00:00, 22.20it/s]
100%|██████████| 378/378 [00:16<00:00, 22.33it/s]
100%|██████████| 269/269 [00:09<00:00, 28.73it/s]
100%|██████████| 419/419 [00:14<00:00, 28.94it/s]
100%|██████████| 345/345 [00:11<00:00, 28.80it/s]
100%|██████████| 378/378 [00:13<00:00, 29.01it/s]
100%|██████████| 269/269 [00:37<00:00,  7.21it/s]
100%|██████████| 419/419 [01:00<00:00,  6.96it/s]
100%|██████████| 345/345 [00:55<00:00,  6.20it/s]
100%|██████████| 378/378 [00:53<00:00,  7.10it/s]


In [58]:
metrics_df

Unnamed: 0,cluster,NDCG@50,Distinct Categories @ 50,Avg Watch Ratio @ 50,Avg Precision@500,Avg Recall@500,Avg F1@500
0,0,0.881977,20.349442,1.34331,0.419613,0.91125,0.564507
0,1,0.874977,22.393795,1.793264,0.457387,0.910071,0.60055
0,2,0.866198,21.136232,1.525789,0.424859,0.916716,0.572677
0,3,0.873572,21.600529,1.605655,0.439272,0.932517,0.588796
0,Overall,0.873789,21.484054,1.591824,0.437379,0.917934,0.583715


In [57]:
metrics_df_overall = get_all_metrics(k1, k2, ground_truth, recommendations, video_data, threshold, by_cluster=False)

100%|██████████| 1411/1411 [01:57<00:00, 11.97it/s]
100%|██████████| 1411/1411 [01:04<00:00, 21.76it/s]
100%|██████████| 1411/1411 [00:49<00:00, 28.61it/s]
100%|██████████| 1411/1411 [03:15<00:00,  7.21it/s]


In [59]:
metrics_df_overall

Unnamed: 0,cluster,NDCG@50,Distinct Categories @ 50,Avg Watch Ratio @ 50,Avg Precision@500,Avg Recall@500,Avg F1@500
0,Overall,0.873789,21.484054,1.591824,0.437379,0.917934,0.583715
