In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import ndcg_score

from collections import defaultdict

from tqdm import tqdm

In [2]:
root = '../KuaiRec 2.0/'

prediction_scores = pd.read_csv(root + 'results/ncf_predictions_combined.csv')
# recommendations_caption = pd.read_csv(root + 'recommendations/recommendations_caption.csv', index_col=0)
# recommendations_random = pd.read_csv(root + 'recommendations/recommendations_random.csv', index_col=0)
joined_train_data = pd.read_csv(root + 'data_exports/joined_train_data.csv')
joined_val_data = pd.read_csv(root + 'data_exports/joined_val_data.csv')

video_data = pd.read_csv(root + 'data/kuairec_caption_category_translated.csv', index_col=0)

In [34]:
# Rename
prediction_scores = prediction_scores.rename(columns={'watch_ratio': 'predicted_watch_ratio'})

# Sort predictions
prediction_scores = prediction_scores.sort_values(by=['cluster', 'user_id', 'video_id'])
prediction_scores

Unnamed: 0,user_id,video_id,predicted_watch_ratio,cluster
2693116,14,103,1.222314,0
2693117,14,109,1.531232,0
2693118,14,120,0.881308,0
2693119,14,122,0.810715,0
2693120,14,128,0.634935,0
...,...,...,...,...
2693111,7159,10099,0.176296,3
2693112,7159,10105,0.770872,3
2693113,7159,10120,0.864906,3
2693114,7159,10122,0.629939,3


## Get user watch history

We want to be able to filter out videos that the user has already watched. This is so that we recommend new videos instead.

In [4]:
def get_user_watch_history(data):
    """
    Args: 
        data: DataFrame of user watch history. Must contain columns 'user_id' and 'video_id'.
        
    Returns:
        A dictionary with user_id as key and a set of video_ids that the user has watched as value.
    """
    watch_history_dict = defaultdict(set)
    for user in data['user_id'].unique():
        watch_history_dict[user] = set(data[data['user_id'] == user]['video_id'])
    return watch_history_dict

In [5]:
user_watch_history = get_user_watch_history(joined_train_data)

## Getting ground truth videos for each user

In [None]:
def get_ground_truth(ground_truth_df, valid_users, valid_videos, user_watch_history):
    """
    Args:
        ground_truth_df: DataFrame with the ground truth watch ratios.
        videos_in_train_data: List of video_ids that are present in the training data.
        user_watch_history: Dictionary with user_id as key and a list of video_ids that the user has watched as value.
    
    Returns:
        DataFrame with the ground truth watch ratios. It only contains videos that the user has not watched before, and videos that are present in training data.
        Users that are not in the training data are filtered out as well, as we cannot make recommendations for them.
        The dataframe is sorted by user in ascending order and watch_ratio in descending order.
    """
    ground_truth_new = pd.DataFrame(columns=['user_id', 'video_id', 'watch_ratio'])

    for user in ground_truth_df['user_id'].unique():
        if user not in valid_users:
            continue
        user_ground_truth = ground_truth_df[ground_truth_df['user_id'] == user].copy()
        user_ground_truth = user_ground_truth[~user_ground_truth['video_id'].isin(user_watch_history[user])]
        user_ground_truth = user_ground_truth[user_ground_truth['video_id'].isin(valid_videos)]

        ground_truth_new = pd.concat([ground_truth_new, user_ground_truth])

    # Sort by watch_ratio in descending order
    ground_truth_new = ground_truth_new.sort_values(by=['user_id', 'watch_ratio'], ascending=[True, False])
    return ground_truth_new

In [None]:
users_in_train_data = set(joined_train_data['user_id'])
videos_in_train_data = set(joined_train_data['video_id'])

ground_truth = get_ground_truth(joined_val_data[['user_id', 'video_id', 'watch_ratio']], users_in_train_data, videos_in_train_data, user_watch_history)

In [30]:
# Ground truth scores for user 14
ground_truth[ground_truth['user_id'] == 14]

Unnamed: 0,user_id,video_id,watch_ratio
11,14,8766,3.318871
702,14,8799,3.185954
607,14,2735,2.598506
602,14,4201,2.478148
573,14,4015,2.319912
...,...,...,...
131,14,7297,0.032396
991,14,4021,0.032293
180,14,4141,0.032250
61,14,7461,0.029277


## Getting recommendations for each user

In [46]:
def get_user_recommendations(prediction_scores, user_watch_history):
    """
    Args:
        prediction_scores: DataFrame with the predicted watch_ratios.
        user_watch_history: Dictionary with user_id as key and a list of video_ids that the user has watched as value.
    
    Returns:
        DataFrame with the recommendations for a specific user. It only contains videos that the user has not watched before.
        The dataframe is sorted by user in ascending order and watch_ratio in descending order.
    """
    recommendations_new = pd.DataFrame(columns=['user_id', 'video_id', 'predicted_watch_ratio'])
    for user in prediction_scores['user_id'].unique():
        user_recommendations = prediction_scores[prediction_scores['user_id'] == user].copy()
        user_recommendations = user_recommendations[~user_recommendations['video_id'].isin(user_watch_history[user])]
        
        recommendations_new = pd.concat([recommendations_new, user_recommendations])

    # Sort by prediction in descending order
    recommendations_new = recommendations_new.sort_values(by=['user_id', 'predicted_watch_ratio'], ascending=[True, False])
    return recommendations_new

In [47]:
recommendations = get_user_recommendations(prediction_scores, user_watch_history)

In [48]:
# Recommendations for user 14
recommendations[recommendations['user_id'] == 14]

Unnamed: 0,user_id,video_id,predicted_watch_ratio,cluster
2693489,14,816,2.459153,0.0
2693942,14,2672,1.921901,0.0
2693493,14,825,1.845788,0.0
2694262,14,4186,1.832490,0.0
2694119,14,3992,1.806678,0.0
...,...,...,...,...
2694874,14,7251,0.000000,0.0
2694924,14,7335,0.000000,0.0
2694951,14,7461,0.000000,0.0
2695077,14,8545,0.000000,0.0


In [49]:
def get_top_k_for_user(k, user_id, df):
    """
    Args:
        k: The number of recommendations to return.
        user_id: The user for which to get recommendations.
        df: DataFrame containing the scores for all users, sorted by score in descending order.
    
    Returns:
        DataFrame with the top k scores.
    """
    return df[df['user_id'] == user_id].head(k)

In [51]:
k = 50

# Get top 50 ground truth and recommendations for user 14
top_50_ground_truth_user_14 = get_top_k_for_user(k, 14, ground_truth)
top_50_recommendations_user_14 = get_top_k_for_user(k, 14, recommendations)

# top_50_ground_truth_user_14
top_50_recommendations_user_14

Unnamed: 0,user_id,video_id,predicted_watch_ratio,cluster
2693489,14,816,2.459153,0.0
2693942,14,2672,1.921901,0.0
2693493,14,825,1.845788,0.0
2694262,14,4186,1.83249,0.0
2694119,14,3992,1.806678,0.0
2693201,14,372,1.77552,0.0
2695203,14,8748,1.716819,0.0
2694871,14,7246,1.712584,0.0
2695346,14,9854,1.696038,0.0
2695136,14,8624,1.652539,0.0


## Calcuation of Evaluation Metrics

### Category-Aware NDCG@k

In [None]:
def get_category_tally_at_k(recommendations, video_info):
    """
    Args:
        recommendations: DataFrame with the top k recommendations for a specific user.
        video_info: DataFrame with information about the videos.

    Returns:
        Dictionary with the category as key and the number of videos in each category as value.
    """
    tally = defaultdict(int)

    for video_id in recommendations['video_id']:
        category = video_info.loc[str(video_id)]['english_first_level_category_name']
        tally[category] += 1
    
    return tally

def get_category_ndcg_at_k(recommendations, ground_truth, video_info):
    """
    Args:
        recommendations: DataFrame with the top k video recommendations for a specific user.
        ground_truth: DataFrame with the ground truth videos for a specific user.
        video_info: DataFrame with information about the videos.

    Returns:
        NDCG score for the categories of the top k recommendations.
    """
    cat_tally_reco = get_category_tally_at_k(recommendations, video_info)
    cat_tally_gt = get_category_tally_at_k(ground_truth, video_info)

    cat_tally_reco_adjusted = {}
    for category in cat_tally_gt:
        cat_tally_reco_adjusted[category] = cat_tally_reco.get(category, 0)

    return ndcg_score([list(cat_tally_gt.values())], [list(cat_tally_reco_adjusted.values())])

In [53]:
# Get the category-aware NDCG@50 for user 14
get_category_ndcg_at_k(top_50_recommendations_user_14, top_50_ground_truth_user_14, video_data)

0.7919654817258412

In [None]:
# Get average category-aware NDCG@50 for all users, and per cluster
def get_average_ndcg_at_k(k, ground_truth, recommendations, video_info):
    """
    Args:
        k: The number of recommendations to return.
        ground_truth: DataFrame with the ground truth watch ratios, sorted by descending watch_ratio.
        recommendations: DataFrame with all video recommendations, sorted by descending predicted_watch_ratio.
        video_info: DataFrame with information about the videos.

    Returns:
        The average category-aware NDCG@50 for all users, and a dictionary with the NDCG@50 per cluster.
    """
    all_ndcg_scores = []
    cluster_scores = {}

    for cluster in sorted(recommendations['cluster'].unique()):
        cluster_ndcg_scores = []

        users_in_train = set(recommendations[recommendations['cluster'] == cluster]['user_id'])
        users_in_val = set(ground_truth['user_id'])

        # Filter out users found in the validation set and not found in the training set. We cannot generate recommendations for these users since they do not exist in the training data
        users_in_val = users_in_val.intersection(users_in_train)
        for user_id in tqdm(users_in_val):
            user_recommendations_top_k = get_top_k_for_user(k, user_id, recommendations)
            user_ground_truth_top_k = get_top_k_for_user(k, user_id, ground_truth)

            user_ndcg_score = get_category_ndcg_at_k(user_recommendations_top_k, user_ground_truth_top_k, video_info)

            cluster_ndcg_scores.append(user_ndcg_score)
            all_ndcg_scores.append(user_ndcg_score)
        cluster_scores[cluster] = np.mean(cluster_ndcg_scores)
    
    return np.mean(all_ndcg_scores), cluster_scores

In [56]:
overall_ndcg, cluster_ndcg = get_average_ndcg_at_k(50, ground_truth, recommendations, video_data)

100%|██████████| 269/269 [00:21<00:00, 12.24it/s]
100%|██████████| 345/345 [00:27<00:00, 12.45it/s]
100%|██████████| 419/419 [00:35<00:00, 11.90it/s]
100%|██████████| 378/378 [00:32<00:00, 11.65it/s]


In [162]:
# Show in dataframe
metrics_df = pd.DataFrame(cluster_ndcg.items(), columns=['cluster', f'NDCG@{k}'])
metrics_df['cluster'] = metrics_df['cluster'].astype(int)
metrics_df = metrics_df.sort_values(by='cluster')

metrics_df = pd.concat([metrics_df, pd.DataFrame([['Overall', overall_ndcg]], columns=['cluster', 'NDCG@50'])])

metrics_df

Unnamed: 0,cluster,NDCG@50
0,0,0.883611
2,1,0.873609
1,2,0.871041
3,3,0.876206
0,Overall,0.875584


### Distinct Categories @ k

In [69]:
def get_user_distinct_categories_at_k(k, user_id, recommendations):
    """
    Args:
        k: The number of recommendations to return.
        user_id: The user for which to get recommendations.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        
    Returns:
        The number of distinct categories in the top k recommendations.
    """
    top_k = get_top_k_for_user(k, user_id, recommendations)
    categories = set()

    for video_id in top_k['video_id']:
        category = video_data.loc[str(video_id)]['english_first_level_category_name']
        categories.add(category)
    
    return len(categories)

In [71]:
# Get Distinct Categories @ 50 for user 14
get_user_distinct_categories_at_k(50, 14, recommendations)

19

In [None]:
def get_average_distinct_categories_at_k(k, recommendations):
    """
    Args:
        k: The number of recommendations to return.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
    
    Returns:
        The overall average number of distinct categories in the top k recommendations, and a dictionary with the average number of distinct categories per cluster.
    """
    all_distinct_categories = []
    cluster_distinct_categories = {}

    for cluster in sorted(recommendations['cluster'].unique()):
        cluster_distinct_categories_list = []

        users_in_train = set(recommendations[recommendations['cluster'] == cluster]['user_id'])
        users_in_val = set(joined_val_data['user_id'])

        # Filter out users found in the validation set and not found in the training set. We cannot generate recommendations for these users since they do not exist in the training data
        users_in_val = users_in_val.intersection(users_in_train)
        for user_id in tqdm(users_in_val):
            user_distinct_categories = get_user_distinct_categories_at_k(k, user_id, recommendations)

            cluster_distinct_categories_list.append(user_distinct_categories)
            all_distinct_categories.append(user_distinct_categories)
        cluster_distinct_categories[cluster] = np.mean(cluster_distinct_categories_list)

    return np.mean(all_distinct_categories), cluster_distinct_categories

In [73]:
overall_distinct_categories, cluster_distinct_categories = get_average_distinct_categories_at_k(50, recommendations)

100%|██████████| 269/269 [00:11<00:00, 22.79it/s]
100%|██████████| 345/345 [00:15<00:00, 22.76it/s]
100%|██████████| 419/419 [00:19<00:00, 21.46it/s]
100%|██████████| 378/378 [00:17<00:00, 21.06it/s]


In [163]:
cluster_distinct_categories_df = pd.DataFrame(cluster_distinct_categories.items(), columns=['cluster', f'Distinct Categories @ {k}'])
cluster_distinct_categories_df['cluster'] = cluster_distinct_categories_df['cluster'].astype(int)

cluster_distinct_categories_df = pd.concat([cluster_distinct_categories_df, pd.DataFrame([['Overall', overall_distinct_categories]], columns=['cluster', f'Distinct Categories @ {k}'])])

# Merge with metrics_df
metrics_df2 = metrics_df.merge(cluster_distinct_categories_df, on='cluster', how='left')
metrics_df2

Unnamed: 0,cluster,NDCG@50,Distinct Categories @ 50
0,0,0.883611,20.30855
1,1,0.873609,22.360382
2,2,0.871041,21.075362
3,3,0.876206,21.507937
4,Overall,0.875584,21.426648


### Average watch ratio @ k

In [87]:
def get_user_avg_watch_ratio_at_k(k, user_id, recommendations, watch_ratio_column):
    """
    Args:
        k: The number of recommendations to return.
        user_id: The user for which to get recommendations.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        
    Returns:
        The average watch_ratio in the top k recommendations.
    """
    top_k = get_top_k_for_user(k, user_id, recommendations)
    return np.mean(top_k[watch_ratio_column])

In [88]:
# Get avg watch ratio @ 50 for user 14
get_user_avg_watch_ratio_at_k(50, 14, recommendations, 'predicted_watch_ratio')

1.569566261768341

In [94]:
def get_avg_watch_ratio_at_k(k, recommendations):
    """
    Args:
        k: The number of recommendations to return.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        
    Returns:
        The overall average watch_ratio in the top k ground truth videos, a dictionary with the average watch_ratio per cluster, 
        the overall average predicted_watch_ratio in the top k recommendations, and a dictionary with the average predicted_watch_ratio per cluster.
    """
    all_avg_watch_ratios_list = []
    cluster_avg_watch_ratios = {}

    all_avg_predicted_watch_ratios_list = []
    cluster_avg_predicted_watch_ratios = {}

    for cluster in sorted(recommendations['cluster'].unique()):
        cluster_avg_watch_ratios_list = []
        cluster_avg_predicted_watch_ratios_list = []

        users_in_train = set(recommendations[recommendations['cluster'] == cluster]['user_id'])
        users_in_val = set(joined_val_data['user_id'])

        # Filter out users found in the validation set and not found in the training set. We cannot generate recommendations for these users since they do not exist in the training data
        users_in_val = users_in_val.intersection(users_in_train)
        for user_id in tqdm(users_in_val):
            user_avg_watch_ratio = get_user_avg_watch_ratio_at_k(k, user_id, joined_val_data, 'watch_ratio')
            user_avg_predicted_watch_ratio = get_user_avg_watch_ratio_at_k(k, user_id, recommendations, 'predicted_watch_ratio')

            cluster_avg_watch_ratios_list.append(user_avg_watch_ratio)
            cluster_avg_predicted_watch_ratios_list.append(user_avg_predicted_watch_ratio)
            
            all_avg_watch_ratios_list.append(user_avg_watch_ratio)
            all_avg_predicted_watch_ratios_list.append(user_avg_predicted_watch_ratio)
        cluster_avg_watch_ratios[cluster] = np.mean(cluster_avg_watch_ratios_list)
        cluster_avg_predicted_watch_ratios[cluster] = np.mean(cluster_avg_predicted_watch_ratios_list)

    return np.mean(all_avg_watch_ratios_list), cluster_avg_watch_ratios, np.mean(all_avg_predicted_watch_ratios_list), cluster_avg_predicted_watch_ratios

In [95]:
overall_avg_watch_ratio, cluster_avg_watch_ratio, overall_avg_predicted_watch_ratio, cluster_avg_predicted_watch_ratio = get_avg_watch_ratio_at_k(50, recommendations)

100%|██████████| 269/269 [00:09<00:00, 27.84it/s]
100%|██████████| 419/419 [00:16<00:00, 26.19it/s]
100%|██████████| 345/345 [00:13<00:00, 26.06it/s]
100%|██████████| 378/378 [00:15<00:00, 25.13it/s]


In [165]:
# Avg watch ratio in the top 50 ground truth videos
avg_watch_ratio_df = pd.DataFrame(cluster_avg_watch_ratio.items(), columns=['cluster', f'Avg Watch Ratio @ {k}'])
avg_watch_ratio_df['cluster'] = avg_watch_ratio_df['cluster'].astype(int)
avg_watch_ratio_df = pd.concat([avg_watch_ratio_df, pd.DataFrame([['Overall', overall_avg_watch_ratio]], columns=['cluster', f'Avg Watch Ratio @ {k}'])])

# Avg predicted watch ratio in the top 50 recommendations
avg_predicted_watch_ratio_df = pd.DataFrame(cluster_avg_predicted_watch_ratio.items(), columns=['cluster', f'Avg Predicted Watch Ratio @ {k}'])
avg_predicted_watch_ratio_df['cluster'] = avg_predicted_watch_ratio_df['cluster'].astype(int)
avg_predicted_watch_ratio_df = pd.concat([avg_predicted_watch_ratio_df, pd.DataFrame([['Overall', overall_avg_predicted_watch_ratio]], columns=['cluster', f'Avg Predicted Watch Ratio @ {k}'])])

# Merge with metrics_df
metrics_df3 = metrics_df2.merge(avg_watch_ratio_df, on='cluster', how='left')
metrics_df3 = metrics_df3.merge(avg_predicted_watch_ratio_df, on='cluster', how='left')

metrics_df3

Unnamed: 0,cluster,NDCG@50,Distinct Categories @ 50,Avg Watch Ratio @ 50,Avg Predicted Watch Ratio @ 50
0,0,0.883611,20.30855,0.85177,1.350928
1,1,0.873609,22.360382,0.843245,1.800831
2,2,0.871041,21.075362,0.827713,1.532607
3,3,0.876206,21.507937,0.843225,1.612592
4,Overall,0.875584,21.426648,0.841067,1.599048


### Precision@k, Recall@k, F1Score@k

In [None]:
def get_user_precision_recall_f1_at_k(k, user_id, recommendations, ground_truth, threshold):
    """
    Args:
        k: The number of recommendations to return.
        user_id: The user for which to get recommendations.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        ground_truth: DataFrame with the ground truth watch ratios.
        threshold: The threshold for the watch ratio.
    
    Returns:
        Precision, recall, and F1 score at k for a specific user.
    """
    top_k_recommendations = get_top_k_for_user(k, user_id, recommendations)
    top_k_ground_truth = get_top_k_for_user(k, user_id, ground_truth)

    tp = 0
    fp = 0
    fn = 0

    for video_id in top_k_recommendations['video_id']:
        if video_id in top_k_ground_truth['video_id'].values:
            if top_k_ground_truth[top_k_ground_truth['video_id'] == video_id]['watch_ratio'].values[0] >= threshold:
                # If the video is in top_k_ground_truth and watch ratio is above the threshold, it is a true positive
                tp += 1
            else:
                # If the video is in top_k_ground_truth but watch ratio is below the threshold, it is a false positive
                fp += 1
        else:
            # If video is not in top_k_ground_truth, it is a false positive
            fp += 1

    for video_id in top_k_ground_truth['video_id']:
        if video_id not in top_k_recommendations['video_id'].values:
            if top_k_ground_truth[top_k_ground_truth['video_id'] == video_id]['watch_ratio'].values[0] >= threshold:
                # If the video is in top_k_ground_truth but not in top_k_recommendations, it is a false negative
                fn += 1

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    print(tp, fp, fn)
    return precision, recall, f1

In [177]:
threshold = 0.5

# Precision, recall, and F1 @ 50 for user 14
precision, recall, f1 = get_user_precision_recall_f1_at_k(50, 14, recommendations, ground_truth, threshold)
precision, recall, f1

0.12
6 44 44


(0.12, 0.12, 0.12)

In [159]:
def get_precision_recall_f1_at_k(k, recommendations, ground_truth, threshold):
    """
    Args:
        k: The number of recommendations to return.
        user_id: The user for which to get recommendations.
        recommendations: DataFrame containing the scores for all users, sorted by score in descending order.
        ground_truth: DataFrame with the ground truth watch ratios.
        threshold: The threshold for the watch ratio.
    
    Returns:
        The overall average precision, recall, and F1 score at k, and a dictionary with the average precision, recall, and F1 score per cluster.
    """
    all_precision_list = []
    all_recall_list = []
    all_f1_list = []

    cluster_precision = {}
    cluster_recall = {}
    cluster_f1 = {}

    for cluster in sorted(recommendations['cluster'].unique()):
        cluster_precision_list = []
        cluster_recall_list = []
        cluster_f1_list = []

        users_in_train = set(recommendations[recommendations['cluster'] == cluster]['user_id'])
        users_in_val = set(joined_val_data['user_id'])

        # Filter out users found in the validation set and not found in the training set. We cannot generate recommendations for these users since they do not exist in the training data
        users_in_val = users_in_val.intersection(users_in_train)
        for user_id in tqdm(users_in_val):
            user_precision, user_recall, user_f1 = get_user_precision_recall_f1_at_k(k, user_id, recommendations, ground_truth, threshold)

            cluster_precision_list.append(user_precision)
            cluster_recall_list.append(user_recall)
            cluster_f1_list.append(user_f1)

            all_precision_list.append(user_precision)
            all_recall_list.append(user_recall)
            all_f1_list.append(user_f1)
        cluster_precision[cluster] = np.mean(cluster_precision_list)
        cluster_recall[cluster] = np.mean(cluster_recall_list)
        cluster_f1[cluster] = np.mean(cluster_f1_list)

    return np.mean(all_precision_list), np.mean(all_recall_list), np.mean(all_f1_list), cluster_precision, cluster_recall, cluster_f1

In [170]:
k = 400
avg_precision, avg_recall, avg_f1, cluster_precision, cluster_recall, cluster_f1 = get_precision_recall_f1_at_k(k, recommendations, ground_truth, threshold)

100%|██████████| 269/269 [00:36<00:00,  7.47it/s]
100%|██████████| 419/419 [00:59<00:00,  7.08it/s]
100%|██████████| 345/345 [00:50<00:00,  6.87it/s]
100%|██████████| 378/378 [00:56<00:00,  6.68it/s]


In [172]:
avg_precision_df = pd.DataFrame(cluster_precision.items(), columns=['cluster', f'Avg Precision@{k}'])
avg_precision_df['cluster'] = avg_precision_df['cluster'].astype(int)
avg_precision_df = pd.concat([avg_precision_df, pd.DataFrame([['Overall', avg_precision]], columns=['cluster', f'Avg Precision@{k}'])])

avg_recall_df = pd.DataFrame(cluster_recall.items(), columns=['cluster', f'Avg Recall@{k}'])
avg_recall_df['cluster'] = avg_recall_df['cluster'].astype(int)
avg_recall_df = pd.concat([avg_recall_df, pd.DataFrame([['Overall', avg_recall]], columns=['cluster', f'Avg Recall@{k}'])])

avg_f1_df = pd.DataFrame(cluster_f1.items(), columns=['cluster', f'Avg F1@{k}'])
avg_f1_df['cluster'] = avg_f1_df['cluster'].astype(int)
avg_f1_df = pd.concat([avg_f1_df, pd.DataFrame([['Overall', avg_f1]], columns=['cluster', f'Avg F1@{k}'])])

avg_precision_recall_f1_df = avg_precision_df.merge(avg_recall_df, on='cluster', how='left')
avg_precision_recall_f1_df = avg_precision_recall_f1_df.merge(avg_f1_df, on='cluster', how='left')
avg_precision_recall_f1_df

Unnamed: 0,cluster,Avg Precision@400,Avg Recall@400,Avg F1@400
0,0,0.520191,0.750933,0.603915
1,1,0.58104,0.761183,0.651489
2,2,0.54609,0.772937,0.630305
3,3,0.566247,0.792889,0.651574
4,Overall,0.556931,0.770597,0.637262


In [166]:
# Merge with metrics_df
metrics_df4 = metrics_df3.merge(avg_precision_recall_f1_df, on='cluster', how='left')
metrics_df4

Unnamed: 0,cluster,NDCG@50,Distinct Categories @ 50,Avg Watch Ratio @ 50,Avg Predicted Watch Ratio @ 50,Avg Precision@50,Avg Recall@50,Avg F1@50
0,0,0.883611,20.30855,0.85177,1.350928,0.095093,0.095093,0.095093
1,1,0.873609,22.360382,0.843245,1.800831,0.104678,0.104678,0.104678
2,2,0.871041,21.075362,0.827713,1.532607,0.090319,0.090319,0.090319
3,3,0.876206,21.507937,0.843225,1.612592,0.111217,0.111217,0.111217
4,Overall,0.875584,21.426648,0.841067,1.599048,0.101091,0.101091,0.101091
