In [3]:
import pandas as pd
import numpy as np

from sklearn.metrics import ndcg_score

from collections import defaultdict

from tqdm import tqdm

In [4]:
# Recommendation score tables; the first CSV column becomes the row index (rows are looked up by user_id later).
recommendations_caption = pd.read_csv('recommendations/recommendations_caption.csv', index_col=0)
recommendations_random = pd.read_csv('recommendations/recommendations_random.csv', index_col=0)
# Interaction tables; later cells read 'user_id' / 'video_id' from both and 'watch_ratio' from the validation one.
joined_train_data = pd.read_csv('data_exports/joined_train_data.csv')
joined_val_data = pd.read_csv('data_exports/joined_val_data.csv')

In [5]:
# Video metadata; later cells look up 'english_first_level_category_name' through this frame's index.
video_data = pd.read_csv('data/kuairec_caption_category_translated.csv', index_col=0)

In [6]:
# Number of video columns in the recommendation table; sizes each user's ground-truth vector below.
# Read shape[1] directly instead of unpacking into `_`, which IPython reserves for the last cell output.
n_videos = recommendations_caption.shape[1]

In [8]:
# Map each user_id to the set of video_ids that user has in the training interactions.
# Grouping the 'video_id' column by 'user_id' replaces the row-by-row `iterrows()` loop, which
# builds a Series per row. `sort=False` keeps users in first-appearance order, as the loop did.
# Rows whose user_id is missing (NaN) are left out by groupby.
videos_watched_before = {
    user_id: set(video_ids)
    for user_id, video_ids in joined_train_data.groupby('user_id', sort=False)['video_id']
}

In [9]:
# user_id -> per-video relevance vector; populated in the next cell.
ground_truth = {}

In [10]:
# Start every training user with an all-zero relevance vector, one slot per video column.
for user in videos_watched_before:
    ground_truth[user] = np.zeros(n_videos)

# Record validation watch ratios, but only for videos the user did not already have in training.
# Iterating the three columns directly avoids building a Series per row as `iterrows()` does.
for user_id, video_id, watch_ratio in zip(
    joined_val_data['user_id'],
    joined_val_data['video_id'],
    joined_val_data['watch_ratio'],
):
    seen_in_training = videos_watched_before.get(user_id)
    if seen_in_training is None:
        # Users absent from training have no ground-truth vector; skip them instead of raising KeyError.
        continue
    if video_id not in seen_in_training:
        # video_id is used as a position in the vector, so it must be an integer below n_videos.
        # If a (user, video) pair repeats, the last watch_ratio wins.
        ground_truth[user_id][video_id] = watch_ratio

In [11]:
# One row per user (the dict keys become the index), one column per position in the relevance vector.
ground_truth_df = pd.DataFrame.from_dict(ground_truth, orient='index')

In [12]:
# Persist the ground-truth matrix to CSV; the next cell reloads it from this file.
ground_truth_df.to_csv('recommendations/ground_truth_val.csv')

In [7]:
# Reload the ground-truth matrix from CSV, using the first column (user ids) as the index.
# NOTE(review): after the CSV round-trip the column labels are presumably strings rather than ints —
# confirm they match the column labels of the recommendation frames, since metrics below index by them.
# NOTE(review): execution counts restart at this cell, which suggests the cells above ran in an earlier
# session — confirm the notebook still works under Restart & Run All.
ground_truth_df = pd.read_csv('recommendations/ground_truth_val.csv', index_col=0)

In [8]:
# Cut-off for every *_at_k metric below; the metric functions read it as a notebook-level global.
k = 50

In [11]:
def get_top_k_for_user(recommendations, user_id):
    """Return the column labels of the user's `k` highest-valued entries.

    Args:
        recommendations: DataFrame with one row per user, looked up by `user_id`.
        user_id: Row label of the user.

    Returns:
        pandas Index of at most `k` column labels, highest value first. `k` is the
        notebook-level cut-off.
    """
    user_scores = recommendations.loc[user_id]
    ranked = user_scores.sort_values(ascending=False)
    return ranked.iloc[:k].index

def get_category_tally_at_k(recommendations, user_id):
    """Count how many of the user's top-k videos fall into each first-level category.

    Args:
        recommendations: DataFrame with one row per user and one column per video label.
        user_id: Row label of the user.

    Returns:
        defaultdict(int) mapping category name to the number of top-k videos in it,
        with keys in order of first appearance in the top-k ranking.

    Raises:
        KeyError: If a top-k video label is not present in `video_data`'s index.
    """
    top_k = get_top_k_for_user(recommendations, user_id)
    # One label-based lookup for all k videos, instead of materialising a full metadata row per video.
    categories = video_data.loc[top_k, 'english_first_level_category_name']

    tally = defaultdict(int)
    for category in categories:
        tally[category] += 1

    return tally

def get_category_ndcg_at_k(recommendations, user_id):
    """Score one user's recommended category mix against their ground-truth category mix.

    The per-category counts of the ground-truth top-k are passed to `ndcg_score` as the true
    relevance, and the per-category counts of the recommended top-k as the predicted scores.
    Only categories present in the ground-truth tally are scored; categories that appear
    solely in the recommendations are ignored.

    NOTE(review): the ground-truth tally is taken over the top-k of `ground_truth_df`, which
    includes zero-valued entries whenever the user has fewer than k non-zero ones — confirm
    this is intended.
    NOTE(review): `ndcg_score` may reject a user whose ground-truth tally holds a single
    category — confirm against the installed scikit-learn version.

    Args:
        recommendations: DataFrame with one row per user and one column per video label.
        user_id: Row label of the user.

    Returns:
        The value returned by `ndcg_score` for this user.
    """
    recommended_counts = get_category_tally_at_k(recommendations, user_id)
    truth_counts = get_category_tally_at_k(ground_truth_df, user_id)

    # Both lists follow the key order of the ground-truth tally so positions line up.
    true_relevance = [list(truth_counts.values())]
    predicted_scores = [[recommended_counts.get(category, 0) for category in truth_counts]]

    return ndcg_score(true_relevance, predicted_scores)

def get_average_ndcg_at_k(recommendations):
    """Average the per-user category NDCG over every user in `ground_truth_df`.

    Args:
        recommendations: DataFrame with one row per user and one column per video label.

    Returns:
        Mean of the per-user scores.
    """
    per_user_scores = [
        get_category_ndcg_at_k(recommendations, uid)
        for uid in tqdm(ground_truth_df.index)
    ]
    return np.mean(per_user_scores)

In [21]:
# Category NDCG@k: caption-based recommender first, then the random baseline.
for recs in (recommendations_caption, recommendations_random):
    print(get_average_ndcg_at_k(recs))

100%|██████████| 1411/1411 [00:34<00:00, 40.35it/s]


0.7656175572142835


100%|██████████| 1411/1411 [00:35<00:00, 40.27it/s]

0.7778655954786724





In [12]:
def get_distinct_categories_at_k(recommendations, user_id):
    """Count the distinct first-level categories among the user's top-k videos.

    Args:
        recommendations: DataFrame with one row per user and one column per video label.
        user_id: Row label of the user.

    Returns:
        Number of distinct category names among the top-k videos.

    Raises:
        KeyError: If a top-k video label is not present in `video_data`'s index.
    """
    top_k = get_top_k_for_user(recommendations, user_id)
    # One label-based lookup for all k videos, instead of materialising a full metadata row per video.
    categories = video_data.loc[top_k, 'english_first_level_category_name']
    return len(set(categories))

def get_average_distinct_categories_at_k(recommendations):
    """Average the number of distinct top-k categories over every user in `ground_truth_df`.

    Args:
        recommendations: DataFrame with one row per user and one column per video label.

    Returns:
        Mean of the per-user distinct-category counts.
    """
    per_user_counts = [
        get_distinct_categories_at_k(recommendations, uid)
        for uid in tqdm(ground_truth_df.index)
    ]
    return np.mean(per_user_counts)

In [13]:
# Distinct categories@k: caption-based recommender first, then the random baseline.
for recs in (recommendations_caption, recommendations_random):
    print(get_average_distinct_categories_at_k(recs))

100%|██████████| 1411/1411 [00:17<00:00, 81.35it/s]


14.472005669737774


100%|██████████| 1411/1411 [00:17<00:00, 81.37it/s]

22.08433734939759





In [14]:
def get_avg_watch_ratio_at_k(recommendations, user_id):
    """Mean ground-truth watch ratio of the user's top-k recommended videos.

    Args:
        recommendations: DataFrame with one row per user and one column per video label.
        user_id: Row label of the user (must exist in both frames).

    Returns:
        Mean of the `ground_truth_df` values at the top-k video labels.

    Raises:
        KeyError: If a top-k video label is not a column of `ground_truth_df`.
    """
    top_k = get_top_k_for_user(recommendations, user_id)
    # Fetch the user's ground-truth row once rather than once per recommended video.
    user_truth = ground_truth_df.loc[user_id]
    return np.mean(user_truth.loc[top_k].to_numpy())

def get_average_watch_ratio_at_k(recommendations):
    """Average the per-user mean watch ratio at k over every user in `ground_truth_df`.

    Args:
        recommendations: DataFrame with one row per user and one column per video label.

    Returns:
        Mean of the per-user averages.
    """
    per_user_means = [
        get_avg_watch_ratio_at_k(recommendations, uid)
        for uid in tqdm(ground_truth_df.index)
    ]
    return np.mean(per_user_means)

In [15]:
# Mean watch ratio@k: caption-based recommender first, then the random baseline.
for recs in (recommendations_caption, recommendations_random):
    print(get_average_watch_ratio_at_k(recs))

100%|██████████| 1411/1411 [00:03<00:00, 446.95it/s]


0.12574325081395435


100%|██████████| 1411/1411 [00:03<00:00, 451.69it/s]

0.08868784814552783





In [16]:
# A ground-truth value strictly above this counts as a relevant video in precision/recall/F1 below.
threshold = 0.7

In [19]:
def precision_recall_f1_at_k(recommendations, user_id):
    """Precision, recall and F1 of the user's top-k recommendations.

    A video is relevant when its `ground_truth_df` value is strictly greater than `threshold`.

    Args:
        recommendations: DataFrame with one row per user and one column per video label.
        user_id: Row label of the user (must exist in both frames).

    Returns:
        Tuple `(precision, recall, f1)`. Precision divides by the notebook-level `k`.
        Recall is 0.0 when the user has no relevant videos, and F1 is 0 when precision
        and recall are both zero.

    Raises:
        KeyError: If a top-k video label is not a column of `ground_truth_df`.
    """
    top_k = get_top_k_for_user(recommendations, user_id)
    # Fetch the user's ground-truth row once rather than once per recommended video.
    relevant = ground_truth_df.loc[user_id] > threshold

    tp = int(relevant.loc[top_k].sum())
    n_relevant = int(relevant.sum())

    precision = tp / k

    # Without this guard a user with no relevant videos divides by zero, and the resulting
    # NaN/inf propagates into the dataset-wide averages.
    recall = tp / n_relevant if n_relevant > 0 else 0.0

    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

    return precision, recall, f1

def get_average_precision_recall_f1_at_k(recommendations):
    """Average precision, recall and F1 at k over every user in `ground_truth_df`.

    Args:
        recommendations: DataFrame with one row per user and one column per video label.

    Returns:
        Tuple `(mean_precision, mean_recall, mean_f1)`.
    """
    per_user = [
        precision_recall_f1_at_k(recommendations, uid)
        for uid in tqdm(ground_truth_df.index)
    ]

    mean_precision = np.mean([p for p, _r, _f in per_user])
    mean_recall = np.mean([r for _p, r, _f in per_user])
    mean_f1 = np.mean([f for _p, _r, f in per_user])

    return mean_precision, mean_recall, mean_f1

In [20]:
# (precision, recall, F1)@k: caption-based recommender first, then the random baseline.
for recs in (recommendations_caption, recommendations_random):
    print(get_average_precision_recall_f1_at_k(recs))

100%|██████████| 1411/1411 [00:03<00:00, 407.04it/s]


(0.07980155917788802, 0.007808810297744872, 0.014148214169573906)


100%|██████████| 1411/1411 [00:03<00:00, 418.57it/s]

(0.05705173635719348, 0.005584977462673443, 0.010117640070494816)





In [34]:
# joined_val_data['watch_ratio'].mean()
# joined_val_data['watch_ratio'].median()

# Scratch inspection: pull the ground-truth row for user 14 into `x`.
x = ground_truth_df.loc[14]

In [None]:
# Count users whose median positive ground-truth watch ratio is below, or at/above, `threshold`.
below_threshold = 0
above_threshold = 0

for _, user_ratios in ground_truth_df.iterrows():
    # Keep only the strictly positive entries of this user's row.
    positive_ratios = user_ratios[user_ratios > 0.0]

    if positive_ratios.empty:
        # The median of an empty selection is NaN, and `NaN < threshold` is False,
        # so such a user would be miscounted as "above". Leave them out of both counts.
        continue

    # Use the shared relevance threshold rather than a second hard-coded 0.7.
    if positive_ratios.median() < threshold:
        below_threshold += 1
    else:
        above_threshold += 1

print(below_threshold)
print(above_threshold)

557
854


In [44]:
# Display the caption-based recommendation table (rows: user ids, columns: video labels).
recommendations_caption

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10718,10719,10720,10721,10722,10723,10724,10725,10726,10727
14,4.729524,4.584302,4.756215,4.663043,4.538526,4.487528,4.770037,4.710154,4.346988,4.625941,...,4.579754,4.667784,4.751176,4.318532,4.695892,4.791679,4.605528,4.630749,4.624801,4.688878
19,4.725050,4.580030,4.752950,4.664137,4.541591,4.490345,4.769165,4.710895,4.352001,4.621956,...,4.585726,4.672139,4.750876,4.322028,4.698574,4.792092,4.610422,4.626287,4.627770,4.692447
21,4.728531,4.581941,4.753937,4.665498,4.545474,4.493424,4.769598,4.711306,4.353521,4.623811,...,4.586579,4.672764,4.749549,4.323714,4.701213,4.790397,4.609851,4.629597,4.629031,4.694502
23,4.721708,4.578430,4.751675,4.661883,4.545264,4.492078,4.766811,4.710204,4.359156,4.620590,...,4.589119,4.674706,4.747270,4.327264,4.700509,4.788663,4.610827,4.625542,4.630556,4.694591
24,4.721366,4.582058,4.751916,4.659609,4.540278,4.492656,4.767325,4.710432,4.352440,4.623292,...,4.583630,4.671717,4.749989,4.321323,4.695777,4.789942,4.611082,4.626125,4.626337,4.689455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7142,4.727670,4.584960,4.753845,4.661369,4.536234,4.489650,4.768251,4.710023,4.345253,4.625865,...,4.577229,4.665365,4.751517,4.319022,4.691820,4.791328,4.606975,4.628138,4.622860,4.685365
7147,4.730193,4.586308,4.753086,4.660717,4.538314,4.492768,4.770465,4.714970,4.347666,4.627142,...,4.579386,4.670721,4.753055,4.318561,4.696995,4.793811,4.612500,4.627772,4.622376,4.690160
7153,4.728416,4.582655,4.753454,4.660882,4.544476,4.496172,4.768706,4.712557,4.354093,4.623998,...,4.586230,4.673282,4.749466,4.323827,4.701179,4.791043,4.612863,4.628379,4.627998,4.694418
7159,4.723734,4.582644,4.752858,4.658630,4.538820,4.492357,4.767504,4.710072,4.351875,4.623766,...,4.581780,4.670050,4.749933,4.323838,4.694909,4.790268,4.610851,4.626533,4.625298,4.688614
