In [2]:
from timeit import default_timer as timer
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

In [3]:
# Toy recommendation lists: five users, four ranked items each.
# Rows must be listed in ascending rank order within each user for the
# following steps to work.
reco_data = {
    'user': [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5],
    'item': [1, 2, 3, 4, 4, 6, 5, 3, 2, 4, 5, 1, 2, 5, 7, 2, 1, 2, 3, 4],
    'rank': [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4],
}
reco = pd.DataFrame(reco_data)
reco

Unnamed: 0,user,item,rank
0,1,1,1
1,1,2,2
2,1,3,3
3,1,4,4
4,2,4,1
5,2,6,2
6,2,5,3
7,2,3,4
8,3,2,1
9,3,4,2


In [4]:
# Ground-truth (user, item) interactions; user 4 has none.
interactions_data = {
    'user': [1, 1, 2, 2, 2, 2, 3, 5],
    'item': [1, 4, 6, 5, 2, 3, 1, 6],
}
interactions = pd.DataFrame(interactions_data)
interactions

Unnamed: 0,user,item
0,1,1
1,1,4
2,2,6
3,2,5
4,2,2
5,2,3
6,3,1
7,5,6


In [5]:
# Cut-off: only recommendations with rank <= k are evaluated.
k = 3
reco_k_first_ranks = reco.loc[reco['rank'].le(k)]
reco_k_first_ranks

Unnamed: 0,user,item,rank
0,1,1,1
1,1,2,2
2,1,3,3
4,2,4,1
5,2,6,2
6,2,5,3
8,3,2,1
9,3,4,2
10,3,5,3
12,4,2,1


In [6]:
# Users that appear at least once in the interactions table.
interacted_users = pd.unique(interactions['user'])
# Keep top-k recommendations only for those users.
reco_for_interacted_users = reco_k_first_ranks.loc[
    reco_k_first_ranks['user'].isin(interacted_users)
]
reco_for_interacted_users

Unnamed: 0,user,item,rank
0,1,1,1
1,1,2,2
2,1,3,3
4,2,4,1
5,2,6,2
6,2,5,3
8,3,2,1
9,3,4,2
10,3,5,3
16,5,1,1


In [7]:
# Hits: recommended (user, item) pairs that also occur in the interactions.
reco_true_interactions = pd.merge(
    reco_for_interacted_users,
    interactions,
    on=['user', 'item'],
    how='inner',
)
reco_true_interactions

Unnamed: 0,user,item,rank
0,1,1,1
1,2,6,2
2,2,5,3


In [8]:
# Collect, for every user with at least one hit, the ranks of the hits as one array.
# Recipe: https://stackoverflow.com/questions/55125680/pandas-get-all-groupby-values-in-an-array
# merge and groupby should preserve row order
users_true_interactions = (
    reco_true_interactions
    .groupby('user')['rank']
    .apply(np.array)
)
users_true_interactions

user
1       [1]
2    [2, 3]
Name: rank, dtype: object

In [9]:
# https://stackoverflow.com/questions/36985659/numpy-replace-values-and-return-new-array
def _put_copy(arr, ind, v):
    """Return a copy of ``arr`` with ``v`` written at the flat indices ``ind``.

    Non-mutating counterpart of ``np.put``: the input array is left untouched
    and the modified copy is returned.
    """
    updated = arr.copy()
    np.put(updated, ind, v)
    return updated

# The per-user arrays of hit ranks have different lengths, so they cannot be
# stacked as they are. Each one is first turned into a vector of length k + 1
# holding a 1 at every rank that was a hit; those vectors are then stacked into
# a (users x (k + 1)) matrix.
users_tps_series = users_true_interactions.apply(
    lambda hit_ranks: _put_copy(np.zeros(k + 1), hit_ranks, 1)
)
users_tps = np.stack(users_tps_series.to_numpy())
users_tps

array([[0., 1., 0., 0.],
       [0., 0., 1., 1.]])

In [10]:
# Running hit counts: column j holds the number of hits among the first j
# recommendations (column 0 corresponds to an empty list).
# The matrix is rebuilt from users_tps_series instead of from users_tps itself,
# so re-running this cell does not apply the cumulative sum a second time.
users_tps = np.cumsum(np.stack(users_tps_series.values), axis=1)
# After j recommendations, every recommendation that is not a hit is a false positive.
users_fps = np.arange(k + 1) - users_tps
print(users_tps)
print(users_fps)

[[0. 1. 1. 1.]
 [0. 0. 1. 2.]]
[[0. 0. 1. 2.]
 [0. 1. 1. 1.]]


In [11]:
# Item catalogue (ids 1..9); only its size is used below.
catalog = np.arange(1, 10)
# The last column of the cumulative matrices holds the totals over the top-k list.
tp, fp = users_tps[:, -1], users_fps[:, -1]

# Number of interactions per user.
interactions_series = interactions.groupby('user')['item'].count().rename('interactions_count')
tp_series = pd.Series(tp, index=users_true_interactions.index, name='tp')
# Users without any hit are missing from tp_series; their TP count is 0.
fn_all = pd.concat([interactions_series, tp_series], axis=1).fillna({'tp': 0})
# False negatives = interacted items that were not hit within the top k.
fn_all = fn_all['interactions_count'] - fn_all['tp']
# Keep FN only for the users that have a row in users_tps / users_fps.
fn = fn_all.drop(labels=fn_all.index.difference(users_true_interactions.index)).to_numpy()
# TP + FP = k for every user, so TN = catalogue size - FN - k.
tn = len(catalog) - fn - k
print(tp, fp, fn_all, fn, tn, sep='\n')

[1. 2.]
[2. 1.]
user
1    1.0
2    2.0
3    1.0
5    1.0
dtype: float64
[1. 2.]
[5. 4.]


In [12]:
# Per-user ROC points: divide the running TP / FP counts by the number of
# positives (TP + FN) and negatives (FP + TN), then append the closing
# point (1, 1) as an extra column of ones.
users_tpr = np.pad(
    users_tps / (tp + fn)[:, np.newaxis],
    ((0, 0), (0, 1)),
    constant_values=1,
)
users_fpr = np.pad(
    users_fps / (fp + tn)[:, np.newaxis],
    ((0, 0), (0, 1)),
    constant_values=1,
)
print(users_tpr, users_fpr, sep='\n')

[[0.   0.5  0.5  0.5  1.  ]
 [0.   0.   0.25 0.5  1.  ]]
[[0.         0.         0.14285714 0.28571429 1.        ]
 [0.         0.2        0.2        0.2        1.        ]]


In [13]:
# Area under each user's ROC curve by the trapezoidal rule (y = TPR, x = FPR),
# integrated along the last axis, i.e. one value per row.
# NumPy 2.0 introduced np.trapezoid and deprecated np.trapz, so prefer the new
# name when it exists and fall back to the old one on earlier NumPy versions.
_trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz
aucs = _trapezoid(users_tpr, users_fpr)
aucs

array([0.67857143, 0.6       ])

In [14]:
# Attach user ids to the AUC values (one entry per user with at least one hit).
aucs_series = pd.Series(aucs, index=users_true_interactions.index)
aucs_series

user
1    0.678571
2    0.600000
dtype: float64

In [15]:
# Users who have interactions but no hit in the top k: all k recommendations
# are false positives. Their ROC curve stays at TPR = 0 up to
# FPR = k / (catalogue size - FN) and then goes straight to (1, 1), so the
# area under it is (1 - k / (catalogue size - FN)) / 2.
fn_for_users_with_all_fps = fn_all.drop(index=users_true_interactions.index)
aucs_for_users_with_all_fps = 0.5 * (1 - k / (len(catalog) - fn_for_users_with_all_fps))
aucs_with_users_with_all_fps = pd.concat(
    (aucs_series, aucs_for_users_with_all_fps),
    axis=0,
)
aucs_with_users_with_all_fps

user
1    0.678571
2    0.600000
3    0.312500
5    0.312500
dtype: float64

In [16]:
# Users that received recommendations but have no recorded interactions:
# their AUC is left as NaN. sorted() gives the index a deterministic order.
aucs_for_users_without_interactions = pd.Series(
    index=sorted(set(reco['user'].unique()) - set(interacted_users)),
    data=np.nan,
)
# pd.concat's `sort` flag only orders the non-concatenation axis, which does
# not exist when concatenating Series, so it never sorted the result.
# Order the final Series by user id explicitly instead.
auc_per_user = pd.concat(
    [aucs_with_users_with_all_fps, aucs_for_users_without_interactions]
).sort_index()
auc_per_user

1    0.678571
2    0.600000
3    0.312500
5    0.312500
4         NaN
dtype: float64