In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_multilabel_classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from src.names import Columns

# Seed reused by the data generator, the train/test split and the classifier.
RANDOM_STATE = 12
# Number of products, i.e. number of label columns in the synthetic dataset.
N_PRODUCTS = 10

# Product identifiers: used as column names and as keys of the value lookup.
products_names = ["product_" + str(idx) for idx in range(N_PRODUCTS)]
# Value assigned to each product: 10000 for product_0, +500 for every following index.
products_npv = {name: 10000 + idx * 500 for idx, name in enumerate(products_names)}

In [2]:
# Synthetic multilabel dataset: one label column per product, and
# (allow_unlabeled=False) every sample carries at least one label.
dataset_options = dict(
    n_samples=100,
    n_classes=N_PRODUCTS,
    n_labels=2,
    allow_unlabeled=False,
    random_state=RANDOM_STATE,
)
X, y = make_multilabel_classification(**dataset_options)

In [3]:
# Hold out part of the samples for evaluation; the fixed seed keeps the split reproducible.
split_arrays = train_test_split(X, y, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = split_arrays

In [4]:
# One random forest is fitted per product label (one-vs-rest).
base_forest = RandomForestClassifier(random_state=RANDOM_STATE)
classif = OneVsRestClassifier(base_forest)
# The trailing semicolon suppresses the estimator repr in the cell output.
classif.fit(X_train, y_train);



In [5]:
# Per-product predicted probabilities for every test sample; they are used
# below as the recommendation scores.
predictions_test = classif.predict_proba(X_test)

In [6]:
# Wide frames: one row per test sample (the row number becomes the query id),
# one column per product.
recommendations_wide = pd.DataFrame(predictions_test, columns=products_names).reset_index(names=Columns.Query)
interactions_wide = pd.DataFrame(y_test, columns=products_names).reset_index(names=Columns.Query)

# Long format: one row per (query, product) pair with its predicted score.
df_recommendations = (
    recommendations_wide
    .melt(id_vars=Columns.Query, var_name=Columns.Item, value_name=Columns.Score)
    .sort_values(by=Columns.Query)
)
# Long format ground truth: keep only the (query, product) pairs labelled 1.
df_interactions = (
    interactions_wide
    .melt(id_vars=Columns.Query, var_name=Columns.Item, value_name=Columns.Weight)
    .query(f"{Columns.Weight} == 1")
    .drop(columns=Columns.Weight)
)

Let $rel^Y_{u,j}$ be a ground-truth relevance variable that indicates whether the item recommended at position $j$ of the ordered ranking $Y_{u,k}$ is relevant to user $u$, and let $p_j$ be the value of the item at position $j$.

# Value at k

$$Value@k=\sum_{u\in U}\sum^{k}_{j=1}rel^{Y}_{u,j}\cdot p_{j}$$

In [7]:
def get_top_k_recommendations(recommendations: pd.DataFrame, k: int) -> pd.DataFrame:
    """Keep the ``k`` highest-scored recommendations of every query.

    Parameters
    ----------
    recommendations
        Long-format frame containing at least the ``Columns.Query`` and
        ``Columns.Score`` columns.
    k
        Maximum number of rows to keep per query.

    Returns
    -------
    pd.DataFrame
        The rows of ``recommendations`` whose within-query rank is ``<= k``,
        in their original order and with their original index labels.

    Notes
    -----
    Score ties are broken by order of appearance (``method="first"``), so at
    most ``k`` rows are returned per query. Rows with a missing score receive
    no rank and are dropped.
    """
    rank_in_query = (
        recommendations
        .groupby(Columns.Query)[Columns.Score]
        .rank(method="first", ascending=False)
    )
    # Select by position rather than by index label: matching on labels
    # (``index.isin``) returns extra rows whenever the index contains
    # duplicated labels, e.g. after concatenating several frames.
    keep_row = (rank_in_query <= k).to_numpy()
    return recommendations[keep_row]


def merge_recommendations_interactions(
    recommendations: pd.DataFrame,
    interactions: pd.DataFrame
) -> pd.DataFrame:
    """Left-join the recommendations with the observed interactions.

    Every recommendation row is kept. The added ``relevant`` column is ``True``
    where the row's key also appears in ``interactions`` and missing (NaN)
    otherwise. The join key is ``Columns.QueryItem`` (presumably the query and
    item columns; confirm in ``src.names``). ``interactions`` itself is not
    modified.
    """
    flagged_interactions = interactions.assign(relevant=True)
    return recommendations.merge(flagged_interactions, how="left", on=Columns.QueryItem)

In [8]:
# Keep the 3 best-scored products per query; k=3 is used for every metric below.
top_recommendations = get_top_k_recommendations(recommendations=df_recommendations, k=3)

In [9]:
# Flag which of the top-k recommendations are actual interactions in the test labels.
merged_recommendations_interactions = merge_recommendations_interactions(
    recommendations=top_recommendations,
    interactions=df_interactions,
)

In [10]:
# Value of each recommendation: the product's value for hits; rows that are not
# hits have a missing `relevant` flag and therefore a missing value.
recommended_item_npv = merged_recommendations_interactions[Columns.Item].map(products_npv)
merged_recommendations_interactions["value"] = (
    merged_recommendations_interactions["relevant"] * recommended_item_npv
)

In [11]:
# Total value of the hits within each query's top-k list (missing values are skipped).
value_grouped_by_query = merged_recommendations_interactions.groupby(Columns.Query)["value"]
revenue_by_query = value_grouped_by_query.sum()

In [12]:
# Value@k (k=3): value of the relevant top-k recommendations summed over all queries.
revenue_by_query.sum()

539000

In [13]:
from src.metrics import ValueAtK

# Library implementation of the same metric; it should reproduce the manual total above.
value_at_k_metric = ValueAtK(k=3)
value_at_k_metric.calc(df_recommendations, df_interactions, products_npv)

539000

# Profit-At-Hit

PAH@k is the overall profit generated by the top-$k$ recommendations divided by the number of items sold (the number of true positives at $k$), averaged over the number of users $|U|$:

$$PAH@k=\frac{1}{|U|}\cdot\frac{Profit@k}{Volume@k}=\frac{1}{|U|}\cdot\frac{Profit(true\ positive\ at\ k)}{|true\ positive\ at\ k|}$$

In [14]:
# Follows the formula above: Profit@k / Volume@k, then divided by the number of queries.
total_profit = revenue_by_query.sum()
n_hits = merged_recommendations_interactions["relevant"].sum()
n_queries = merged_recommendations_interactions[Columns.Query].nunique()

profit_at_hit = total_profit / n_hits / n_queries
profit_at_hit

501.3953488372093

In [15]:
from src.metrics import PAHAtK

# Library implementation; it should reproduce the manual `profit_at_hit` above.
pah_at_k_metric = PAHAtK(k=3)
pah_at_k_metric.calc(df_recommendations, df_interactions, products_npv)

501.3953488372093

# Expected profit at k

$$EP@k=\sum_{u\in U}\sum^{k}_{j=1}\hat{x}_{u,j}(\Theta)\cdot v_{j}$$

In [17]:
# Rescale the scores so that the top-k scores of each query sum to 1.
scores_grouped_by_query = merged_recommendations_interactions.groupby(Columns.Query)[Columns.Score]
score_scaled = scores_grouped_by_query.transform(lambda scores: scores / scores.sum())

In [32]:
# Inspect the merged frame: `relevant` and `value` are missing for recommendations
# that were not actual interactions.
merged_recommendations_interactions

Unnamed: 0,query_column,item_column,score,relevant,value
0,0,product_5,0.71,True,12500
1,0,product_8,0.55,True,14000
2,0,product_6,0.46,True,13000
3,1,product_5,0.61,True,12500
4,1,product_6,0.77,True,13000
...,...,...,...,...,...
70,23,product_5,0.49,True,12500
71,23,product_6,0.56,,
72,24,product_6,0.52,,
73,24,product_2,0.46,True,11000


In [35]:
# Expected profit: normalised score times product value, summed over every
# top-k recommendation (relevant or not).
npv_of_recommended = merged_recommendations_interactions[Columns.Item].map(products_npv)
ep = (score_scaled * npv_of_recommended).sum()
ep

314762.3445513947

In [36]:
from src.metrics import EPAtK

# Library implementation; it takes no interactions and should reproduce `ep` above.
ep_at_k_metric = EPAtK(k=3)
ep_at_k_metric.calc(df_recommendations, products_npv)

314762.3445513947

# P-NDCG@k

$$P\text{-}NDCG@k=\frac{1}{|U|}\sum_{u\in U}\frac{\sum^{k}_{j=1}\frac{rel^{Y}_{u,j}\cdot p_j}{\log_2(j+1)}}{P\text{-}IDCG_u@k}$$

In [20]:
# Position j of every recommendation inside its query's list
# (1 = highest score; ties are broken by row order).
score_groups = merged_recommendations_interactions.groupby(Columns.Query)[Columns.Score]
rnk = score_groups.rank(method="first", ascending=False)

In [21]:
# Discounted cumulative value per query: sum_j value_j / log2(j + 1).
# Only the ranks of the current query's rows are looked up. Dividing by the
# full `rnk` Series would align every group against all rows of the frame and
# rely on NaN-skipping to discard the other queries' rows.
dcg_u = merged_recommendations_interactions.groupby(Columns.Query)["value"].apply(
    lambda value: (value / np.log2(rnk.loc[value.index] + 1)).sum()
)

In [22]:
# Keep only the recommendations that were actual interactions
# (rows with a missing `relevant` flag compare unequal to True and are dropped).
is_hit = merged_recommendations_interactions["relevant"] == True  # noqa: E712
relevant_recs = merged_recommendations_interactions[is_hit]

In [23]:
# Ideal ordering: rank each query's relevant recommendations by decreasing value.
relevant_value_groups = relevant_recs.groupby(Columns.Query)["value"]
rnk_ideal = relevant_value_groups.rank(method="first", ascending=False)

In [24]:
# Ideal DCG per query, computed over the relevant top-k recommendations in
# their ideal (value-descending) order. Only the ideal ranks of the current
# query's rows are looked up, instead of aligning each group against the
# full `rnk_ideal` Series.
idcg_u = relevant_recs.groupby(Columns.Query)["value"].apply(
    lambda value: (value / np.log2(rnk_ideal.loc[value.index] + 1)).sum()
)

In [25]:
# Queries without any relevant recommendation are absent from `idcg_u`; their
# ratio is NaN and is skipped by mean().
pndcg_by_query = dcg_u / idcg_u
pndcg_by_query.mean()

0.9252702633315137

In [27]:
from src.metrics import PNDCGAtK

# Library implementation; it should reproduce the manual P-NDCG@3 above.
pndcg_at_k_metric = PNDCGAtK(k=3)
pndcg_at_k_metric.calc(df_recommendations, df_interactions, products_npv)

0.9252702633315137