In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import ndcg_score

from collections import defaultdict

from tqdm import tqdm

In [176]:
# Base directory, relative to this notebook; every path below is built by prefixing it.
root = '../'

# Predicted scores (caption-based file and random-baseline file); the first CSV column becomes the index.
prediction_scores = pd.read_csv(root + 'recommendations/recommendations_caption_2.csv', index_col=0)
prediction_scores_random = pd.read_csv(root + 'recommendations/recommendations_random.csv', index_col=0)
# Train / validation interaction exports, read with the default integer index.
joined_train_data = pd.read_csv(root + 'data_exports/joined_train_data.csv')
joined_val_data = pd.read_csv(root + 'data_exports/joined_val_data.csv')

# Video metadata (captions / categories); later cells look rows up by the string form of video_id.
video_data = pd.read_csv(root + 'data/kuairec_caption_category_translated.csv', index_col=0)

In [177]:
# Name the score column explicitly, then order the rows by user and video.
prediction_scores = (
    prediction_scores
    .rename(columns={'watch_ratio': 'predicted_watch_ratio'})
    .sort_values(by=['user_id', 'video_id'])
)
prediction_scores_random = (
    prediction_scores_random
    .rename(columns={'watch_ratio': 'predicted_watch_ratio'})
    .sort_values(by=['user_id', 'video_id'])
)

# Preview the caption-based predictions
prediction_scores

Unnamed: 0,user_id,video_id,predicted_watch_ratio
0,14,0,4.739207
1,14,1,4.587039
2,14,2,4.758113
3,14,3,4.664595
4,14,4,4.527781
...,...,...,...
12585121,7162,10723,4.786916
12585122,7162,10724,4.605312
12585123,7162,10725,4.621752
12585124,7162,10726,4.626483


## Get user watch history

We want to be able to filter out videos that the user has already watched. This is so that we recommend new videos instead.

In [10]:
def get_user_watch_history(data):
    """
    Build a lookup of the videos each user has already watched.

    Args: 
        data: DataFrame of user watch history. Must contain columns 'user_id' and 'video_id'.
        
    Returns:
        A defaultdict(set) with user_id as key and the set of video_ids that the user has watched as value.
        Looking up a user that is not in `data` yields an empty set.
    """
    watch_history_dict = defaultdict(set)
    # A single groupby pass replaces filtering the whole frame once per user.
    # sort=False keeps the users in order of first appearance.
    for user, video_ids in data.groupby('user_id', sort=False)['video_id']:
        watch_history_dict[user] = set(video_ids)
    return watch_history_dict

In [11]:
# Per-user set of videos seen in the training interactions; used below to exclude already-watched videos.
user_watch_history = get_user_watch_history(joined_train_data)

## Getting ground truth videos for each user

In [12]:
def get_ground_truth(ground_truth_df, valid_videos, user_watch_history):
    """
    Filter the ground truth down to unseen, known videos and rank it per user.

    Args:
        ground_truth_df: DataFrame with the ground truth watch ratios. Must contain columns
            'user_id', 'video_id' and 'watch_ratio'.
        valid_videos: Collection of video_ids that are allowed in the result (e.g. the videos present in the training data).
        user_watch_history: Mapping with user_id as key and a collection of video_ids that the user has watched as value.
            Users missing from the mapping are treated as having watched nothing; the mapping is never modified.
    
    Returns:
        DataFrame with the ground truth watch ratios. It only contains videos that the user has not watched before, and videos that are in valid_videos.
        The dataframe is sorted by user in ascending order and watch_ratio in descending order. The original index labels are kept.
    """
    valid_videos = set(valid_videos)

    kept_per_user = []
    # sort=False keeps users in order of first appearance, so ties are resolved on the same row order as before.
    for user, user_ground_truth in ground_truth_df.groupby('user_id', sort=False):
        # .get() avoids a KeyError on a plain dict and avoids inserting keys into a defaultdict.
        watched = user_watch_history.get(user, set())
        video_ids = user_ground_truth['video_id']
        keep = ~video_ids.isin(watched) & video_ids.isin(valid_videos)
        kept_per_user.append(user_ground_truth[keep])

    if not kept_per_user:
        # pd.concat refuses an empty list; return an empty frame with the expected columns instead.
        return pd.DataFrame(columns=['user_id', 'video_id', 'watch_ratio'])

    # Concatenate once (linear) instead of growing a DataFrame inside the loop (quadratic).
    ground_truth_new = pd.concat(kept_per_user)

    # Sort by watch_ratio in descending order within each user
    ground_truth_new = ground_truth_new.sort_values(by=['user_id', 'watch_ratio'], ascending=[True, False])
    return ground_truth_new

In [13]:
# Videos that occur in the training data; passed as the set of valid videos for the ground truth.
videos_in_train_data = set(joined_train_data['video_id'])

# Validation interactions restricted to the user, video and watch_ratio columns.
ground_truth = get_ground_truth(joined_val_data[['user_id', 'video_id', 'watch_ratio']], videos_in_train_data, user_watch_history)

In [14]:
# Ground truth rows for user 14 (get_ground_truth returns them sorted by descending watch_ratio)
ground_truth[ground_truth['user_id'] == 14]

Unnamed: 0,user_id,video_id,watch_ratio
11,14,8766,3.318871
702,14,8799,3.185954
607,14,2735,2.598506
602,14,4201,2.478148
573,14,4015,2.319912
...,...,...,...
131,14,7297,0.032396
991,14,4021,0.032293
180,14,4141,0.032250
61,14,7461,0.029277


## Getting recommendations for each user

In [None]:
def get_user_recommendations(prediction_scores, user_watch_history):
    """
    Drop already-watched videos from the predictions and rank what is left per user.

    Args:
        prediction_scores: DataFrame with the predicted watch_ratios. Must contain columns
            'user_id', 'video_id' and 'predicted_watch_ratio'.
        user_watch_history: Mapping with user_id as key and a collection of video_ids that the user has watched as value.
            Users missing from the mapping are treated as having watched nothing; the mapping is never modified.
    
    Returns:
        DataFrame with the recommendations for all users. It only contains videos that the user has not watched before.
        The dataframe is sorted by user in ascending order and predicted_watch_ratio in descending order.
        An empty input yields an empty DataFrame with the same columns.
    """
    # Grouping once avoids building a boolean mask over the whole frame for every user.
    grouped = prediction_scores.groupby('user_id', sort=False)

    recommendations_list = []
    for user, user_scores in tqdm(grouped, total=grouped.ngroups):
        # .get() avoids a KeyError on a plain dict and avoids inserting keys into a defaultdict.
        watched = user_watch_history.get(user, set())
        recommendations_list.append(user_scores[~user_scores['video_id'].isin(watched)])

    if not recommendations_list:
        # pd.concat refuses an empty list; keep the input's columns for the empty result.
        return prediction_scores.iloc[0:0].copy()

    # Concatenate all at once
    recommendations_new = pd.concat(recommendations_list)

    # Sort by prediction in descending order within each user
    recommendations_new = recommendations_new.sort_values(by=['user_id', 'predicted_watch_ratio'], ascending=[True, False])
    return recommendations_new

In [235]:
# Per-user rankings of unwatched videos, for the caption-based scores and for the random-baseline scores.
recommendations = get_user_recommendations(prediction_scores, user_watch_history)
recommendations_random = get_user_recommendations(prediction_scores_random, user_watch_history)

100%|██████████| 1411/1411 [00:10<00:00, 140.76it/s]
100%|██████████| 1411/1411 [00:10<00:00, 139.52it/s]


In [190]:
# Recommendations for user 14, ordered by descending predicted_watch_ratio
recommendations[recommendations['user_id'] == 14]

Unnamed: 0,user_id,video_id,predicted_watch_ratio
1881,14,2449,4.822654
2327,14,2966,4.821256
4076,14,4989,4.821235
368,14,488,4.815921
2506,14,3145,4.814930
...,...,...,...
7569,14,9140,4.753068
3113,14,3822,4.753046
7421,14,8992,4.753025
1737,14,2160,4.753019


In [238]:
# Group each frame by user once; the metric helpers below use get_group(user_id) on these
# instead of filtering the full DataFrame with a boolean mask for every user.
reco_grp = recommendations.groupby('user_id')
reco_grp_random = recommendations_random.groupby('user_id')
ground_truth_grp = ground_truth.groupby('user_id')

In [66]:
def get_top_k_for_user(k, user_id, df):
    """
    Return the first k rows belonging to one user.

    Args:
        k: The number of rows to return.
        user_id: The user whose rows are selected.
        df: Either a DataFrame with a 'user_id' column or a DataFrameGroupBy whose group keys are user ids.
            Rows are expected to be sorted by score in descending order already; no sorting happens here.
    
    Returns:
        DataFrame with the first k rows for the user (fewer if the user has fewer rows).
    """
    is_grouped = isinstance(df, pd.core.groupby.generic.DataFrameGroupBy)

    if is_grouped:
        # Grouped input: fetch the user's group directly.
        user_rows = df.get_group(user_id)
    else:
        # Plain DataFrame: select the user's rows with a boolean mask.
        user_rows = df.loc[df['user_id'] == user_id]

    return user_rows.head(k)

In [194]:
# Cut-off for the top-k selections in this cell and for the NDCG computation further down;
# k is reassigned to 500 in the precision/recall section.
k = 50

# Get top 50 ground truth and recommendations for user 14
# (ground truth is passed as a plain DataFrame, the recommendations as groupby objects)
top_50_ground_truth_user_14 = get_top_k_for_user(k, 14, ground_truth)
top_50_recommendations_user_14 = get_top_k_for_user(k, 14, reco_grp)
top_50_recommendations_user_14_random = get_top_k_for_user(k, 14, reco_grp_random)

# top_50_ground_truth_user_14
top_50_recommendations_user_14

Unnamed: 0,user_id,video_id,predicted_watch_ratio
1881,14,2449,4.822654
2327,14,2966,4.821256
4076,14,4989,4.821235
368,14,488,4.815921
2506,14,3145,4.81493
6113,14,7485,4.812589
3199,14,3980,4.811253
3407,14,4318,4.810647
835,14,1177,4.810339
5505,14,6664,4.810316


## Calculation of Evaluation Metrics

### Category-Aware NDCG@k

In [40]:
def get_category_tally_at_k(recommendations, video_info):
    """
    Count how many of the given videos fall into each first-level category.

    Args:
        recommendations: DataFrame with a 'video_id' column (e.g. the top k rows for one user).
        video_info: DataFrame with information about the videos. It is looked up by the string form of
            each video_id and must have an 'english_first_level_category_name' column.

    Returns:
        defaultdict(int) with the category as key and the number of videos in that category as value.
        Keys appear in the order in which the categories are first encountered.
    """
    category_counts = defaultdict(int)

    # The metadata index is addressed with string labels, so convert every id first.
    for label in map(str, recommendations['video_id']):
        category_counts[video_info.loc[label, 'english_first_level_category_name']] += 1

    return category_counts

def get_category_ndcg_at_k(recommendations, ground_truth, video_info):
    """
    Compare the category mix of the recommendations with that of the ground truth via NDCG.

    The per-category video counts of the ground truth serve as the true relevance, and the
    per-category counts of the recommendations serve as the predicted scores. Only categories
    that occur in the ground truth are considered; categories that appear solely in the
    recommendations are ignored.

    Args:
        recommendations: DataFrame with the top k video recommendations for a specific user.
        ground_truth: DataFrame with the ground truth videos for a specific user.
        video_info: DataFrame with information about the videos.

    Returns:
        NDCG score for the categories of the top k recommendations.
    """
    reco_counts = get_category_tally_at_k(recommendations, video_info)
    gt_counts = get_category_tally_at_k(ground_truth, video_info)

    # Fix one category order so both vectors are aligned position by position.
    categories = list(gt_counts)
    true_relevance = [gt_counts[category] for category in categories]
    predicted_scores = [reco_counts.get(category, 0) for category in categories]

    return ndcg_score([true_relevance], [predicted_scores])

In [198]:
# Get the category-aware NDCG@50 for user 14:
# first line uses the caption-based recommendations, second line the random baseline
print(get_category_ndcg_at_k(top_50_recommendations_user_14, top_50_ground_truth_user_14, video_data))
print(get_category_ndcg_at_k(top_50_recommendations_user_14_random, top_50_ground_truth_user_14, video_data))

0.7071373787534083
0.9331335159393579


In [None]:
# Average category-aware NDCG@k over all users
def get_average_ndcg_at_k(k, ground_truth, recommendations, video_info):
    """
    Average the per-user category-aware NDCG@k.

    Args:
        k: The number of top rows taken from both the recommendations and the ground truth.
        ground_truth: DataFrame with the ground truth watch ratios, sorted by descending watch_ratio.
        recommendations: DataFrame or user-grouped DataFrameGroupBy with all video recommendations,
            sorted by descending predicted_watch_ratio.
        video_info: DataFrame with information about the videos.

    Returns:
        The mean category-aware NDCG@k over the users that appear in both inputs.
    """
    if isinstance(recommendations, pd.core.groupby.generic.DataFrameGroupBy):
        users_in_train = set(recommendations.groups.keys())
    else:
        users_in_train = set(recommendations['user_id'])

    # Only users with recommendations can be scored, so keep the validation users that also have some.
    users_in_val = set(ground_truth['user_id']).intersection(users_in_train)

    ndcg_scores = [
        get_category_ndcg_at_k(
            get_top_k_for_user(k, user_id, recommendations),
            get_top_k_for_user(k, user_id, ground_truth),
            video_info,
        )
        for user_id in tqdm(users_in_val)
    ]

    return np.mean(ndcg_scores)

In [199]:
# Average category-aware NDCG at the current value of k, for the caption-based and the random recommendations.
overall_ndcg = get_average_ndcg_at_k(k, ground_truth, reco_grp, video_data)
overall_ndcg_random = get_average_ndcg_at_k(k, ground_truth, reco_grp_random, video_data)

print(f'Overall NDCG@{k}: {overall_ndcg}')
print(f'Overall NDCG@{k} random: {overall_ndcg_random}')

100%|██████████| 1411/1411 [01:34<00:00, 14.91it/s]
100%|██████████| 1411/1411 [01:35<00:00, 14.82it/s]

Overall NDCG@50: 0.8224443169370017
Overall NDCG@50 random: 0.8076224731772924





### Distinct Categories @ k

In [200]:
def get_user_distinct_categories_at_k(k, user_id, recommendations, video_info=None):
    """
    Count the distinct first-level categories among a user's top k recommendations.

    Args:
        k: The number of recommendations to consider.
        user_id: The user for which to get recommendations.
        recommendations: DataFrame (or user-grouped DataFrameGroupBy) containing the scores for all users,
            sorted by score in descending order.
        video_info: Optional DataFrame with information about the videos, looked up by the string form of
            video_id and containing an 'english_first_level_category_name' column.
            Defaults to the notebook-level `video_data` when omitted.
        
    Returns:
        The number of distinct categories in the top k recommendations.
    """
    if video_info is None:
        # Backward-compatible fallback: earlier callers relied on the global metadata frame.
        video_info = video_data

    top_k = get_top_k_for_user(k, user_id, recommendations)

    categories = {
        video_info.loc[str(video_id), 'english_first_level_category_name']
        for video_id in top_k['video_id']
    }

    return len(categories)

In [201]:
# Get Distinct Categories @ 50 for user 14:
# first line uses the caption-based recommendations, second line the random baseline
print(get_user_distinct_categories_at_k(50, 14, recommendations))
print(get_user_distinct_categories_at_k(50, 14, recommendations_random))

15
22


In [None]:
def get_average_distinct_categories_at_k(k, recommendations):
    """
    Average the number of distinct categories in each user's top k recommendations.

    Note: the set of users is taken from the notebook-level `ground_truth` DataFrame, which
    must be defined before this function is called.

    Args:
        k: The number of recommendations to consider per user.
        recommendations: DataFrame or user-grouped DataFrameGroupBy containing the scores for all users,
            sorted by score in descending order.
    
    Returns:
        The mean number of distinct categories in the top k recommendations, over the users that
        appear both in `ground_truth` and in `recommendations`.
    """
    if isinstance(recommendations, pd.core.groupby.generic.DataFrameGroupBy):
        users_in_train = set(recommendations.groups.keys())
    else:
        users_in_train = set(recommendations['user_id'])

    # Only users with recommendations can be scored, so keep the validation users that also have some.
    users_in_val = set(ground_truth['user_id']).intersection(users_in_train)

    all_distinct_categories = [
        get_user_distinct_categories_at_k(k, user_id, recommendations)
        for user_id in tqdm(users_in_val)
    ]

    return np.mean(all_distinct_categories)

In [202]:
# The cut-off is written out as 50 here rather than taken from the variable k.
overall_distinct_categories = get_average_distinct_categories_at_k(50, reco_grp)
overall_distinct_categories_random = get_average_distinct_categories_at_k(50, reco_grp_random)

print(f'Overall Distinct Categories @50: {overall_distinct_categories}')
print(f'Overall Distinct Categories @50 random: {overall_distinct_categories_random}')

100%|██████████| 1411/1411 [00:19<00:00, 71.09it/s]
100%|██████████| 1411/1411 [00:20<00:00, 69.10it/s]

Overall Distinct Categories @50: 13.360737065910701
Overall Distinct Categories @50 random: 22.00850460666194





### Average watch ratio @ k

In [None]:
def get_user_avg_watch_ratio_at_k(k, user_id, recommendations, watch_ratio_column, ground_truth):
    """
    Average observed watch ratio of the user's k best-ranked videos that have a ground truth entry.

    Recommended videos without a ground truth row for this user are skipped before the
    cut-off is applied, so the k videos are the first k recommendations that can be evaluated.

    Args:
        k: The number of evaluable recommendations to consider.
        user_id: The user for which to compute the average.
        recommendations: DataFrame or user-grouped DataFrameGroupBy containing the scores for all users,
            sorted by score in descending order.
        watch_ratio_column: Name of the ground truth column to average.
        ground_truth: DataFrame or user-grouped DataFrameGroupBy with the ground truth rows.
        
    Returns:
        The mean of `watch_ratio_column` over the ground truth rows of the selected videos.
    """
    grouped_type = pd.core.groupby.generic.DataFrameGroupBy

    if isinstance(recommendations, grouped_type):
        user_recos = recommendations.get_group(user_id)
    else:
        user_recos = recommendations[recommendations['user_id'] == user_id]

    if isinstance(ground_truth, grouped_type):
        user_truth = ground_truth.get_group(user_id)
    else:
        user_truth = ground_truth[ground_truth['user_id'] == user_id]

    # Videos for which an observed watch ratio exists
    evaluable_ids = set(user_truth['video_id'])

    # First k recommended videos among the evaluable ones
    evaluable_recos = user_recos[user_recos['video_id'].isin(evaluable_ids)]
    selected_ids = set(evaluable_recos.head(k)['video_id'].tolist())

    selected_truth = user_truth[user_truth['video_id'].isin(selected_ids)]
    return np.mean(selected_truth[watch_ratio_column])

In [221]:
# Get avg watch ratio @ 50 for user 14:
# first line uses the caption-based recommendations, second line the random baseline
print(get_user_avg_watch_ratio_at_k(50, 14, reco_grp, 'watch_ratio', ground_truth_grp))
print(get_user_avg_watch_ratio_at_k(50, 14, reco_grp_random, 'watch_ratio', ground_truth_grp))

1.041527408845175
0.99515572098966


In [225]:
def get_avg_watch_ratio_at_k(k, recommendations):
    """
    Average the per-user mean watch ratio at k over all users.

    Note: this function reads two notebook-level objects that must already exist:
    `ground_truth` (to determine the users) and `ground_truth_grp` (passed on to the per-user helper).

    Args:
        k: The number of evaluable recommendations to consider per user.
        recommendations: DataFrame or user-grouped DataFrameGroupBy containing the scores for all users,
            sorted by score in descending order.
        
    Returns:
        The mean, over the users that appear both in `ground_truth` and in `recommendations`,
        of each user's average ground truth watch_ratio at k.
    """
    if isinstance(recommendations, pd.core.groupby.generic.DataFrameGroupBy):
        users_in_train = set(recommendations.groups.keys())
    else:
        users_in_train = set(recommendations['user_id'])

    # Only users with recommendations can be scored, so keep the validation users that also have some.
    users_in_val = set(ground_truth['user_id']).intersection(users_in_train)

    all_avg_watch_ratios_list = [
        get_user_avg_watch_ratio_at_k(k, user_id, recommendations, 'watch_ratio', ground_truth_grp)
        for user_id in tqdm(users_in_val)
    ]

    return np.mean(all_avg_watch_ratios_list)

In [226]:
# The cut-off is written out as 50 here rather than taken from the variable k.
overall_avg_watch_ratio = get_avg_watch_ratio_at_k(50, reco_grp)
overall_avg_watch_ratio_random = get_avg_watch_ratio_at_k(50, reco_grp_random)

print(f'Overall Avg Watch Ratio @50: {overall_avg_watch_ratio}')
print(f'Overall Avg Watch Ratio @50 random: {overall_avg_watch_ratio_random}')

100%|██████████| 1411/1411 [00:01<00:00, 1193.46it/s]
100%|██████████| 1411/1411 [00:01<00:00, 1153.58it/s]

Overall Avg Watch Ratio @50: 0.8787895045090198
Overall Avg Watch Ratio @50 random: 0.8428517616827994





In [None]:
# Metrics of the random baseline
metrics_random = {
    'ndcg': overall_ndcg_random,
    'distinct_categories': overall_distinct_categories_random,
    'avg_watch_ratio': overall_avg_watch_ratio_random
}

# One entry per recommender; each entry becomes a column of the summary table
metrics = {
    'caption': {
        'ndcg': overall_ndcg,
        'distinct_categories': overall_distinct_categories,
        'avg_watch_ratio': overall_avg_watch_ratio
    },
    'random': metrics_random
}

# Show the comparison as a table (rows: metrics, columns: recommenders)
print(pd.DataFrame(metrics))


                       caption     random
ndcg                  0.822444   0.807622
distinct_categories  13.360737  22.008505
avg_watch_ratio       0.878790   0.842852


### Precision@k, Recall@k, F1Score@k

In [157]:
def get_user_precision_recall_f1_at_k(k, user_id, recommendations, ground_truth, threshold):
    """
    Precision, recall and F1 at k for one user, treating a watch ratio >= threshold as relevant.

    Only recommended videos that have a ground truth row for this user can be judged, so the
    recommendations are first restricted to those videos and then cut off at k.

    Args:
        k: The number of evaluable recommendations to consider.
        user_id: The user for which to compute the metrics.
        recommendations: DataFrame or user-grouped DataFrameGroupBy containing the scores for all users,
            sorted by score in descending order.
        ground_truth: DataFrame or user-grouped DataFrameGroupBy with the ground truth watch ratios.
        threshold: Minimum watch ratio for a video to count as relevant.
    
    Returns:
        Tuple (precision, recall, f1) at k for the user. Each value is 0.0 when its denominator
        would be zero (no evaluable recommendation, or no relevant ground truth video).
    """
    grouped_type = pd.core.groupby.generic.DataFrameGroupBy
    reco_subset = recommendations.get_group(user_id) if isinstance(recommendations, grouped_type) else recommendations[recommendations['user_id'] == user_id]
    ground_truth_subset = ground_truth.get_group(user_id) if isinstance(ground_truth, grouped_type) else ground_truth[ground_truth['user_id'] == user_id]

    # Watch ratio per video; if a video occurs more than once, its first row decides.
    watch_ratio_by_video = (
        ground_truth_subset
        .drop_duplicates('video_id')
        .set_index('video_id')['watch_ratio']
    )

    # First k recommended videos that can be judged against the ground truth
    is_evaluable = reco_subset['video_id'].isin(watch_ratio_by_video.index)
    top_k_ids = reco_subset.loc[is_evaluable, 'video_id'].head(k)

    # One vectorized lookup replaces a linear scan of the ground truth per recommended video.
    is_relevant = watch_ratio_by_video.reindex(top_k_ids.to_numpy()) >= threshold
    tp = int(is_relevant.sum())
    fp = len(is_relevant) - tp

    # All relevant ground truth rows of the user (recall denominator)
    n_relevant = int((ground_truth_subset['watch_ratio'] >= threshold).sum())

    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    # Guard against users without any relevant video: dividing by zero here would yield NaN
    # and turn the average over all users into NaN as well.
    recall = tp / n_relevant if n_relevant > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0.0

    return precision, recall, f1

In [239]:
# Watch ratio at or above which a video counts as relevant
threshold = 0.5

# Precision, recall, and F1 @ 500 for user 14
# (first result: caption-based recommendations, second result: random baseline)
precision, recall, f1 = get_user_precision_recall_f1_at_k(500, 14, reco_grp, ground_truth, threshold)
precision_random, recall_random, f1_random = get_user_precision_recall_f1_at_k(500, 14, reco_grp_random, ground_truth, threshold)
print(precision, recall, f1)
print(precision_random, recall_random, f1_random)

0.726 0.9603174603174603 0.8268792710706149
0.728 0.9629629629629629 0.8291571753986332


In [240]:
def get_precision_recall_f1_at_k(k, recommendations, ground_truth, threshold):
    """
    Average precision, recall and F1 at k over all users.

    Args:
        k: The number of evaluable recommendations to consider per user.
        recommendations: DataFrame or user-grouped DataFrameGroupBy containing the scores for all users,
            sorted by score in descending order.
        ground_truth: DataFrame or user-grouped DataFrameGroupBy with the ground truth watch ratios.
        threshold: The threshold for the watch ratio.
    
    Returns:
        Tuple (mean precision, mean recall, mean F1) at k over the users that appear in both inputs.
    """
    grouped_type = pd.core.groupby.generic.DataFrameGroupBy

    if isinstance(recommendations, grouped_type):
        users_in_train = set(recommendations.groups.keys())
    else:
        users_in_train = set(recommendations['user_id'])

    if isinstance(ground_truth, grouped_type):
        users_in_val = set(ground_truth.groups.keys())
    else:
        users_in_val = set(ground_truth['user_id'])

    # Only users with recommendations can be scored, so keep the validation users that also have some.
    users_in_val = users_in_val.intersection(users_in_train)

    per_user_scores = [
        get_user_precision_recall_f1_at_k(k, user_id, recommendations, ground_truth, threshold)
        for user_id in tqdm(users_in_val)
    ]

    all_precision_list = [user_precision for user_precision, _, _ in per_user_scores]
    all_recall_list = [user_recall for _, user_recall, _ in per_user_scores]
    all_f1_list = [user_f1 for _, _, user_f1 in per_user_scores]

    return np.mean(all_precision_list), np.mean(all_recall_list), np.mean(all_f1_list)

In [241]:
# k is reassigned here: precision, recall and F1 are computed at 500,
# whereas the NDCG, distinct-category and watch-ratio results above were computed at 50.
k = 500
avg_precision, avg_recall, avg_f1 = get_precision_recall_f1_at_k(k, reco_grp, ground_truth_grp, threshold)
avg_precision_random, avg_recall_random, avg_f1_random = get_precision_recall_f1_at_k(k, reco_grp_random, ground_truth_grp, threshold)

100%|██████████| 1411/1411 [01:48<00:00, 13.02it/s]
100%|██████████| 1411/1411 [01:47<00:00, 13.08it/s]


In [242]:
# Metrics of the random baseline.
# Note: the first three metrics were computed at k=50, precision/recall/F1 at k=500.
metrics_random = {
    'ndcg': overall_ndcg_random,
    'distinct_categories': overall_distinct_categories_random,
    'avg_watch_ratio': overall_avg_watch_ratio_random,
    'precision': avg_precision_random,
    'recall': avg_recall_random,
    'f1': avg_f1_random
}

# One entry per recommender; each entry becomes a column of the summary table
metrics = {
    'caption': {
        'ndcg': overall_ndcg,
        'distinct_categories': overall_distinct_categories,
        'avg_watch_ratio': overall_avg_watch_ratio,
        'precision': avg_precision,
        'recall': avg_recall,
        'f1': avg_f1
    },
    'random': metrics_random
}

# Show the comparison as a table (rows: metrics, columns: recommenders)
print(pd.DataFrame(metrics))

                       caption     random
ndcg                  0.822444   0.807622
distinct_categories  13.360737  22.008505
avg_watch_ratio       0.878790   0.842852
precision             0.723177   0.723086
recall                0.984918   0.984885
f1                    0.827564   0.827500
