<a href="https://colab.research.google.com/github/lapshinaaa/recsys-tasks/blob/main/RecSys2_Metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1><center>Recommender Systems — Notebook #2</center></h1>

<center>
<img src="https://avatars.mds.yandex.net/get-grocery-goods/2783132/ab847ff6-95e3-4c4e-831a-0576d1949a9e/orig" width="300" />
</center>

**In this notebook, we will work through the following:**

- Explore the dataset of user interaction events from the **Yandex Lavka** application.
- Review the course competition (Kaggle contest):  
  https://www.kaggle.com/t/eb7d5a01648e4e7cb0dfa404d29497ea
- Implement a **baseline recommender model**.
- Train more advanced models (e.g., **CatBoost** for ranking).
- Implement several **new ranking quality metrics**.

In [None]:
# !pip install catboost

# !pip install numpy==1.23.5

In [None]:
import zipfile
import requests

import numpy as np
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt

from PIL import Image
from io import BytesIO
from textwrap import wrap
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import roc_auc_score, log_loss, ndcg_score

# 🗄 Dataset:

In [None]:
def download_and_extract(url: str, filename: str, chunk_size: int = 1024):
    """Download a zip archive and unpack it into the current directory.

    Args:
        url: Address of the archive to download.
        filename: Local path the archive is written to before extraction.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    # Stream the body so the archive is never held in memory as a whole.
    # The context manager releases the connection even if writing fails, and
    # the (connect, read) timeout keeps a stalled server from hanging the cell.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        response.raise_for_status()

        # Falls back to 0 when the server does not send Content-Length.
        total_size = int(response.headers.get('content-length', 0))

        # Write the archive to disk while advancing the progress bar.
        with open(filename, "wb") as f:
            with tqdm(
                total=total_size,
                unit='B',
                unit_scale=True,
                desc=filename,
                bar_format='{l_bar}{bar:50}{r_bar}{bar:-50b}'  # cosmetic bar layout
            ) as pbar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))

    # Unzip the archive next to the notebook.
    with zipfile.ZipFile(filename, "r") as zip_ref:
        print(f"\nРаспаковываем {filename}...")
        zip_ref.extractall(".")
        print(f"Файлы из {filename} успешно извлечены\n")

In [None]:
# Fetch the dataset archive from the Kaggle download endpoint, store it as
# lavka.zip and unpack it into the working directory.
download_and_extract(
    url="https://www.kaggle.com/api/v1/datasets/download/thekabeton/ysda-recsys-2025-lavka-dataset",
    filename="lavka.zip"
)

In [None]:
# Load both splits from the working directory.
# NOTE(review): presumably these files come out of the archive extracted above - confirm.
train = pl.read_parquet('train.parquet')
test = pl.read_parquet('test.parquet')

# for kaggle: train = train.sample(200000, shuffle=True)

# Preview the first 100 training rows.
train.head(100)

In [None]:
# Preview the first 5 rows of the test split.
test.head(5)

# 👀 Taking a look at the dataset

In [None]:
# Number of logged events for every action type.
action_type_counts = train.group_by("action_type").agg(total_actions=pl.len())

action_type_counts

In [None]:
# Count logged events per city, busiest city first.
city_analysis = train.group_by("city_name").agg(
    pl.len().alias("actions_count")
).sort("actions_count", descending=True)

plt.figure(figsize=(10, 6))
sns.barplot(
    x="city_name",
    y="actions_count",
    data=city_analysis.to_pandas()
)
# Labels are kept in Russian but stored as readable Cyrillic: the previous
# literals were mis-encoded and rendered as garbage on the figure.
# NOTE(review): the title says "views", yet the counts above cover every
# action type - confirm which one is intended.
plt.title("Распределение просмотров по городам")
plt.xlabel("Город")
plt.ylabel("Количество просмотров")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### Посмотрим на самые популярные покупки:

In [None]:
# Keep purchase events only, count them per product name and remember one
# image URL per product so the posters can be rendered later.
purchase_events = train.filter(pl.col('action_type') == 'AT_Purchase')
purchases_per_product = purchase_events.group_by('product_name').agg(
    total_purchase=pl.len(),
    product_image=pl.col('product_image').first(),
)
top_10 = purchases_per_product.sort('total_purchase', descending=True).head(10)

top_10

In [None]:
def load_poster(row):
    """Download the image referenced by a row and return it with the row title.

    ``row`` is indexed positionally: element 0 is used as the title and
    element 2 as the image URL. On any failure the error is printed and
    ``(None, None)`` is returned instead of raising.
    """
    title = row[0]
    poster_url = row[2]
    try:
        reply = requests.get(poster_url, timeout=20)
        reply.raise_for_status()
        poster = Image.open(BytesIO(reply.content))
    except Exception as e:
        # Best effort: a broken poster must not abort the whole gallery.
        print(f"Error loading poster for '{title}': {e}")
        return None, None
    return poster, title

def show_posters(data):
    """Render the posters of up to ten rows of ``data`` on a 2x5 grid.

    Rows are read with ``data.iter_rows()`` and passed to ``load_poster``,
    which runs concurrently for all of them. Rows beyond the grid capacity
    are ignored, and grid cells without a row are left blank.
    """
    n_rows, n_cols = 2, 5
    _, axes = plt.subplots(n_rows, n_cols, figsize=(20, 10))
    plt.subplots_adjust(hspace=0.5, wspace=0.3)

    # The grid is fixed, so extra rows would index past the last axis.
    rows = list(data.iter_rows())[:n_rows * n_cols]

    # Downloads are I/O-bound, so fetch them in parallel threads.
    with ThreadPoolExecutor(max_workers=10) as executor:
        results = list(executor.map(load_poster, rows))

    # Hide every frame up front so cells that receive no row stay empty.
    for ax in axes.flat:
        ax.axis('off')

    for ax, (img, title) in zip(axes.flat, results):
        if img is not None and title:
            ax.imshow(img)
            wrapped_title = "\n".join(wrap(title, width=40))
            ax.set_title(wrapped_title)
        else:
            ax.set_title("Image not available", fontsize=10)

    plt.tight_layout()
    plt.show()

In [None]:
# Draw the posters of the ten most purchased products.
show_posters(top_10)

# 🎲 Рандомный сабмит:

In [None]:
# Random baseline: the submission is the test rows in a shuffled order.
submission_columns = test.select('index', 'request_id')
random_submit = submission_columns.sample(fraction=1.0, shuffle=True)

# random_submit.write_csv('random_submit.csv')

random_submit

# 📈 Бейзлайн:

In [None]:
# How many times each user bought each product in the training log.
# The count column keeps the default name "len", which the baseline relies on.
train_purchases = train.filter(pl.col('action_type') == "AT_Purchase")
count_purchase_in_train = train_purchases.group_by('user_id', 'product_id').agg(pl.len())

count_purchase_in_train

In [None]:
# Attach the historical purchase count to every test row; pairs never seen
# in train get a count of 0.
test_with_counts = test.join(
    count_purchase_in_train,
    on=["user_id", "product_id"],
    how="left",
)
test_with_counts = test_with_counts.with_columns(pl.col("len").fill_null(0))

# Most frequently re-bought pairs go first in the submission.
baseline_submit = test_with_counts.sort('len', descending=True).select('index', 'request_id')

# baseline_submit.write_csv('baseline_submit.csv')

baseline_submit

# 🦾 CatBoost

<center><img src="Timesplit1.svg" width="1100" /></center>


Давайте соберём какие-то фичи из данных и обучим на них градиентный бустинг. Нужно не забывать про временные лики. Нельзя давать модели видеть данные из будущего, поэтому фичи для каждого семпла должны быть посчитаны на данных из прошлого. В простейшей схеме предлагается разделить размеченные данные на 3 части:
- Вторая часть - train
- Третья часть - validation
- Первую часть используем для расчёта статистик для трейна
- Для валидации считаем статистики, используя первую и вторую части вместе

#### Делим train на 3 части:

In [None]:
# Order events chronologically, then cut the log into three consecutive
# parts; the last part absorbs the remainder rows.
train = train.sort('timestamp')

train_len_div3 = train.height // 3

train_part1 = train.slice(0, train_len_div3)
train_part2 = train.slice(train_len_div3, train_len_div3)
train_part3 = train.slice(2 * train_len_div3)

Посчитаем количество покупок каждого товара для каждого пользователя на первой части трейна:

In [None]:
def calculate_count_purchase_by_user_and_product(dataset: pl.DataFrame) -> pl.DataFrame:
    """Count ``AT_Purchase`` events for every (user_id, product_id) pair.

    Returns a frame with the columns ``user_id``, ``product_id`` and
    ``count_purchase_by_user_and_product``; pairs without a purchase in
    ``dataset`` are absent.
    """
    is_purchase = pl.col('action_type') == "AT_Purchase"
    purchases = dataset.filter(is_purchase)
    return purchases.group_by('user_id', 'product_id').agg(
        count_purchase_by_user_and_product=pl.len()
    )

# Statistics for the training pool come from the first (oldest) part only.
count_purchase_by_user_and_product_for_train = calculate_count_purchase_by_user_and_product(train_part1)

assert set(count_purchase_by_user_and_product_for_train.columns) == {
    'user_id', 'product_id', 'count_purchase_by_user_and_product'
}

count_purchase_by_user_and_product_for_train.head(5)

Теперь посчитаем CTR товаров по всем юзерам.

CTR (Click-Through Rate) — коэффициент кликабельности, отношение количества кликов к количеству показов.

В нашем случае - отношение AT_Click к AT_View.

Посчитаем CTR для каждого товара на первой части трейна:

In [None]:
def calculate_ctr(dataset: pl.DataFrame) -> pl.DataFrame:
    """Compute the click-through rate of every product in ``dataset``.

    CTR is the number of ``AT_Click`` events of a product divided by the
    number of its ``AT_View`` events. Products that lack either clicks or
    views are dropped by the inner join below.

    Returns a frame with the columns ``product_id`` and ``ctr``.
    """
    # Aggregate the frame that was passed in. Reading a fixed global here
    # would hand the same statistics to the train, validation and Kaggle pools.
    data = dataset.group_by(
        'action_type',
        'product_id'
    ).agg(
        pl.len()
    )

    clicks = data.filter(
         pl.col('action_type') == "AT_Click"
    )

    views = data.filter(
         pl.col('action_type') == "AT_View"
    )

    # After the join "len" holds the clicks and "len_right" the views.
    ctr = clicks.join(
        views,
        on='product_id'
    ).with_columns(
        ctr=pl.col('len') / pl.col('len_right')
    ).select(
        'product_id',
        'ctr'
    )

    return ctr

ctr_for_train = calculate_ctr(train_part1)

assert set(ctr_for_train.columns) == set(['product_id', 'ctr'])

ctr_for_train.head(5)

Создаём тренировочный пул для катбуста. Берём события из второй части датасета и клеим к ним созданные фичи:

In [None]:
def join_features_to_dataset(
    dataset: pl.DataFrame,
    count_purchase_by_user_and_product: pl.DataFrame,
    ctr: pl.DataFrame
) -> pl.DataFrame:
    """Build a labelled sample per (product_id, request_id) and attach features.

    Only ``AT_View`` and ``AT_CartUpdate`` events are used. ``target`` is 0
    for a view and 1 otherwise; taking the per-group maximum makes the pair
    positive as soon as it has one cart update. Both feature frames are
    left-joined, so pairs without statistics get nulls.
    """
    columns_to_discard = [
        'source_type',
        'store_id',
        'timestamp',
        'product_image',
        'product_name',
        'city_name',
        'position_in_request',
        'product_category',
        'action_type',
    ]

    labelled_events = dataset.filter(
        pl.col('action_type').is_in(["AT_View", "AT_CartUpdate"])
    ).with_columns(
        target=pl.when(pl.col('action_type') == "AT_View").then(0).otherwise(1)
    )

    # One row per shown product within a request.
    samples = labelled_events.group_by(['product_id', 'request_id']).max()
    samples = samples.drop(columns_to_discard)

    with_ctr = samples.join(ctr, on='product_id', how='left')
    catboost_pool = with_ctr.join(
        count_purchase_by_user_and_product,
        on=['user_id', 'product_id'],
        how='left',
    )
    return catboost_pool

# Samples come from part 2; their features only see the older part 1.
train_catboost = join_features_to_dataset(
    train_part2,
    calculate_count_purchase_by_user_and_product(train_part1),
    calculate_ctr(train_part1)
)

assert set(train_catboost.columns) == {
    'ctr', 'count_purchase_by_user_and_product', 'target', 'request_id', 'product_id', 'user_id'
}

train_catboost.head(5)

Проделываем то же самое для валидации. Фичи считаем по событиям из 1 и 2 части датасета. Затем клеим их к 3 части:

In [None]:
# Validation samples come from part 3; their features may use everything
# that happened earlier, i.e. parts 1 and 2 together.
train_parts_1_2 = pl.concat([train_part1, train_part2])

val_purchase_counts = calculate_count_purchase_by_user_and_product(train_parts_1_2)
val_ctr = calculate_ctr(train_parts_1_2)

val_catboost = join_features_to_dataset(train_part3, val_purchase_counts, val_ctr)

val_catboost.head(5)

#### Обучаем катбуст:

In [None]:
from catboost import CatBoostClassifier, Pool

# Label and identifier columns are kept out of the feature matrix.
non_feature_columns = ['target', 'request_id', 'product_id', 'user_id']


def _build_pool(frame: pl.DataFrame) -> Pool:
    """Wrap the feature columns and the ``target`` labels of ``frame`` in a Pool."""
    return Pool(
        data=frame.drop(non_feature_columns).to_pandas(),
        label=frame['target'].to_list(),
    )


train_data = _build_pool(train_catboost)
val_data = _build_pool(val_catboost)

In [None]:
# Binary classifier trained with log-loss and monitored with AUC: at most
# 300 shallow (depth 2) trees with a small learning rate.
# early_stopping_rounds=50 is the CatBoost overfitting-detector patience.
model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.01,
    depth=2,
    loss_function="Logloss",
    eval_metric="AUC",
    early_stopping_rounds=50,
)

In [None]:
# Fit on the training pool; the validation pool is passed as the eval set
# used for the eval metric and early stopping.
model.fit(
    train_data,
    eval_set=val_data,
    # plot=True
)

In [None]:
# Score the validation samples with exactly the columns, in the same order,
# that the model was fitted on. Dropping only `target` would also feed the
# request/product/user identifiers, which were never training features.
val_features = val_catboost.select(model.feature_names_).to_pandas()
val_target = val_catboost['target'].to_list()

y_pred_proba = model.predict_proba(val_features)[:, 1]

roc_auc = roc_auc_score(val_target, y_pred_proba)
print(f"ROC AUC: {roc_auc:.4f}")

logloss = log_loss(val_target, y_pred_proba)
print(f"LogLoss: {logloss:.4f}")

#### Важности фичей:

In [None]:
# Print every feature name next to the importance the fitted model reports for it.
for name, fstr in zip(model.feature_names_, model.feature_importances_):
    print(name, ':', fstr)

Переделаем функцию джойна для тестового датасета:

In [None]:
def join_features_to_val_dataset(
    dataset: pl.DataFrame,
    count_purchase_by_user_and_product: pl.DataFrame,
    ctr: pl.DataFrame
) -> pl.DataFrame:
    """Attach the CTR and purchase-count features to an unlabelled dataset.

    Unlike ``join_features_to_dataset`` no filtering, labelling or grouping
    is done: rows are kept as they are, metadata columns are removed, both
    feature frames are left-joined and the identifier columns are dropped
    at the end. Any other column of ``dataset`` is passed through untouched.
    """
    metadata_columns = [
        'source_type',
        'store_id',
        'timestamp',
        'city_name',
        'product_name',
        'product_category',
        'product_image',
    ]
    identifier_columns = ['user_id', 'product_id', 'request_id']

    enriched = dataset.drop(metadata_columns)
    enriched = enriched.join(ctr, on='product_id', how='left')
    enriched = enriched.join(
        count_purchase_by_user_and_product,
        on=['user_id', 'product_id'],
        how='left',
    )

    # The identifiers were only needed as join keys.
    return enriched.drop(identifier_columns)

In [None]:
# Features for the competition test set are computed on the whole training log.
kaggle_catboost = join_features_to_val_dataset(
    test,
    calculate_count_purchase_by_user_and_product(train),
    calculate_ctr(train)
)

kaggle_catboost.head(5)

In [None]:
test_data = test['index', 'request_id']

# Feed the model only the columns it was fitted on, in the fitted order:
# the joined test frame still carries columns that are not features
# (`index` is never dropped by the join helper).
kaggle_features = kaggle_catboost.select(model.feature_names_).to_pandas()

# NOTE(review): predictions are attached by row position, which assumes the
# left joins that built `kaggle_catboost` kept the row order of `test` - confirm.
test_data.with_columns(
    predict=model.predict_proba(kaggle_features)[:, 1]
).sort(
    'predict',
    descending=True
).select(
    'index',
    'request_id'
).write_csv('cb_submit.csv')

In [None]:
# Display the validation pool that the ranking metrics below are computed on.
val_catboost

# 🎯 Метрики качества ранжирования

In [None]:
import sklearn

# Predict with the fitted feature columns only. Dropping just `target` would
# pass the identifier columns to the model as if they were features.
catboost_predicts = val_catboost.with_columns(
    predict=model.predict_proba(
        val_catboost.select(model.feature_names_).to_pandas()
    )[:, 1]
)

# Per-request label and score lists, each exactly 10 items long.
true = []
pred = []

for _, request_rows in catboost_predicts.group_by('request_id'):
    # Keep at most 10 items per request, positives first.
    # NOTE(review): the cut is made on the label, not on the model score,
    # so long requests keep all their positives - confirm this is intended.
    value = request_rows.sort('target', descending=True)[:10]
    # Ranking metrics are undefined for requests without a positive item.
    if value['target'].sum() == 0:
        continue
    # Pad short requests with zero-label / zero-score dummies.
    padding = [0] * (10 - len(value))
    true.append(value['target'].to_list() + padding)
    pred.append(value['predict'].to_list() + padding)

# Формулы для MAP@K (Mean Average Precision at K)

## 1. **Precision@K**
Доля релевантных документов среди первых `K` результатов:
$$
\text{Precision}@K = \frac{\text{Количество релевантных документов в топ-}K}{K}
$$

---

## 2. **Average Precision@K (AP@K)**
Средняя точность для одного запроса, учитывающая позиции релевантных документов в топ-`K`:
$$
\text{AP}@K = \frac{\sum_{k=1}^{K} \text{Precision}@k \cdot \text{rel}(k)}{\text{Количество релевантных документов в топ-}K}
$$
- `rel(k)` = 1, если документ на позиции `k` релевантен, иначе 0.
- Если в топ-`K` нет релевантных документов, то `AP@K = 0`.

---

## 3. **MAP@K (Mean Average Precision at K)**
Среднее значение AP@K по всем запросам:
$$
\text{MAP}@K = \frac{1}{Q} \sum_{q=1}^{Q} \text{AP}@K^{(q)}
$$
- `Q` — общее количество запросов.
- $AP@K^{(q)}$ — Average Precision@K для запроса `q`.

In [None]:
def ap_at_k(y_true, y_pred, k):
    """Return Average Precision at ``k`` for a single query.

    Args:
        y_true: Relevance labels of the items; non-zero means relevant.
        y_pred: Predicted scores, aligned with ``y_true``.
        k: Number of top-scored items to evaluate.

    Returns:
        The mean of Precision@i over the relevant positions ``i`` inside the
        top ``k``, or 0.0 when the top ``k`` contains no relevant item.
    """
    # Accept plain lists as well as arrays (fancy indexing needs an array).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if np.sum(y_true) == 0:
        return 0.0

    top_k_indices = np.argsort(y_pred)[::-1][:k]
    hits = y_true[top_k_indices] != 0

    relevant_seen = int(hits.sum())
    # Relevant items exist but none reached the top k: AP@K is 0 by
    # definition, and dividing by the hit count would raise.
    if relevant_seen == 0:
        return 0.0

    # Precision@i at every rank i; only ranks holding a relevant item count.
    ranks = np.arange(1, len(hits) + 1)
    precision_at_rank = np.cumsum(hits) / ranks
    return float(precision_at_rank[hits].sum() / relevant_seen)

def map_at_k(true_relevance, predicted_scores, k):
    """Return Mean Average Precision at ``k`` over all queries.

    Args:
        true_relevance: One sequence of relevance labels per query.
        predicted_scores: One sequence of scores per query, aligned with
            ``true_relevance``.
        k: Cut-off passed to ``ap_at_k``.

    Returns:
        The average AP@k, or 0.0 when there are no queries.
    """
    n_queries = len(true_relevance)
    if n_queries == 0:
        return 0.0

    total_ap = sum(
        ap_at_k(y_true, y_pred, k)
        for y_true, y_pred in zip(true_relevance, predicted_scores)
    )
    return total_ap / n_queries

# MAP@10 over the per-request label/score lists built above.
custom_map = map_at_k(true, pred, 10)

print(f"MAP@10: {custom_map:.4f}")

# Формулы для NDCG (Normalized Discounted Cumulative Gain)

## 1. **CG (Cumulative Gain)**
Простая сумма релевантностей первых `p` документов в результатах ранжирования:
$$
\text{CG}_p = \sum_{i=1}^{p} \text{rel}_i
$$
- `rel_i` — релевантность документа на позиции `i`.

---

## 2. **DCG (Discounted Cumulative Gain)**
Учитывает порядок документов, дисконтируя релевантность на более низких позициях:
$$
\text{DCG}_p = \sum_{i=1}^{p} \frac{\text{rel}_i}{\log_2(i + 1)}
$$

---

## 3. **IDCG (Ideal DCG)**
Максимально возможный DCG при идеальном порядке документов:
$$
\text{IDCG}_p = \sum_{i=1}^{p} \frac{\text{rel}_i^{\text{(ideal)}}}{\log_2(i + 1)}
$$
где $rel_i^{(ideal)}$ — релевантности документов, отсортированные по убыванию.

---

## 4. **NDCG (Normalized DCG)**
Нормализованная версия DCG в диапазоне [0, 1]:
$$
\text{NDCG}_p = \frac{\text{DCG}_p}{\text{IDCG}_p}
$$

In [None]:
def ndcg_at_10(true_relevance, predicted_scores, k=10):
    """Return the mean NDCG@k over all queries (k defaults to 10).

    Args:
        true_relevance: One sequence of relevance values per query.
        predicted_scores: One sequence of scores per query, aligned with
            ``true_relevance``.
        k: Number of top positions that contribute to DCG and IDCG.

    Returns:
        The average of DCG@k / IDCG@k. A query without any relevant item
        contributes 0, and an empty query list yields 0.0.
    """
    n_queries = len(true_relevance)
    if n_queries == 0:
        return 0.0

    ndcg = 0.0
    for true, pred in zip(true_relevance, predicted_scores):
        true = np.asarray(true, dtype=float)
        pred = np.asarray(pred, dtype=float)

        # Position i (1-based) is discounted by 1 / log2(i + 1).
        depth = min(k, len(true))
        discounts = 1.0 / np.log2(np.arange(2, depth + 2))

        # Only the k best-scored items count; without the cut the value is
        # NDCG over the whole list whenever a query has more than k items.
        top_k_indices = np.argsort(pred)[::-1][:k]
        dcg = float(np.sum(true[top_k_indices] * discounts))

        ideal_rels = np.sort(true)[::-1][:k]
        idcg = float(np.sum(ideal_rels * discounts))

        # No relevant item means IDCG is 0; count the query as 0 instead of 0/0.
        if idcg > 0:
            ndcg += dcg / idcg

    return ndcg / n_queries

# Compare the hand-written NDCG@10 with the scikit-learn reference on the
# same per-request lists.
custom_ndcg = ndcg_at_10(true, pred)
sklearn_ndcg = ndcg_score(true, pred, k=10, ignore_ties=True)

print(f"Custom NDCG@10: {custom_ndcg:.4f}")
print(f"Sklearn NDCG@10: {sklearn_ndcg:.4f}")

# Sanity check: both implementations must agree up to rounding noise.
assert abs(custom_ndcg - sklearn_ndcg) < 1e-4

# Метрика Novelty в рекомендательных системах

**Novelty** (новизна) отражает способность системы рекомендовать элементы, которые **новы** или **неизвестны** пользователю.  
Novelty не требует, чтобы рекомендации были полезными — только **непривычными**. Основной подход к расчету:

---

## **На основе популярности элементов**
Чем менее популярен элемент, тем выше его новизна:
$$
\text{Novelty}(i) = 1 - \text{Popularity}(i)
$$
- `Popularity(i)` — нормированная популярность элемента (например, доля пользователей, взаимодействовавших с `i`).

**Средняя Novelty для списка рекомендаций**:
$$
\text{Novelty}@K = \frac{1}{K} \sum_{i=1}^{K} \left(1 - \text{Popularity}(i)\right)
$$

In [None]:
# Popularity is estimated on the feature window (parts 1 and 2).
purchases_1_2 = train_parts_1_2.filter(pl.col('action_type') == "AT_Purchase")

# Number of distinct users that bought anything at all.
total_purchasing_users = purchases_1_2['user_id'].n_unique()

# novelty = 1 - share of purchasing users that bought the product.
product_novelty_df = (
    purchases_1_2
    .group_by('product_id')
    .agg(
        unique_buyers_count=pl.col('user_id').n_unique()
    )
    .with_columns(
        novelty_score=1 - (pl.col('unique_buyers_count') / total_purchasing_users)
    )
    .drop('unique_buyers_count')
)

total_novelty_score = 0.0
processed_requests_count = 0

# A product nobody bought is maximally novel. Only the novelty column is
# filled, so nulls in the other columns are not overwritten.
predicts_with_novelty = catboost_predicts.join(
    product_novelty_df,
    on='product_id',
    how='left'
).with_columns(
    pl.col('novelty_score').fill_null(1.0)
).group_by('request_id')

for request_id, recommendations in predicts_with_novelty:

    # The recommendation list is the 10 items the model scores highest.
    # Ranking by the label would make the metric independent of the model.
    top10_recommendations = recommendations.sort('predict', descending=True).head(10)
    average_novelty = top10_recommendations['novelty_score'].mean()

    if average_novelty is not None:
        processed_requests_count += 1
        total_novelty_score += average_novelty

# Without any request the metric is undefined; report NaN instead of raising.
final_novelty_metric = (
    total_novelty_score / processed_requests_count
    if processed_requests_count
    else float('nan')
)

print(f"Novelty@10: {final_novelty_metric:.4f}")

# Метрики для оценки Serendipity в рекомендательных системах

**Serendipity** отражает способность системы рекомендовать неожиданные, но полезные элементы, выходящие за рамки очевидных предпочтений пользователя.  
Измерение сложное, так как требует учета **релевантности** и **неожиданности**. Приведем основные подходы:

---

## 1. **Классическая формула (на основе ожиданий)**
Серендипность = Релевантность × Неожиданность:
$$
\text{Serendipity}(i) = \text{Rel}(i) \times \left(1 - \text{Prob}_{\text{user}}(i)\right)
$$
- `Rel(i)` — релевантность элемента `i` для пользователя (например, оценка или клик).
- `Prob_user(i)` — вероятность того, что пользователь **ожидал** элемент `i` (например, на основе его истории).

---

## 2. **Метрика на основе популярности**
Учитывает редкость рекомендации в общем контексте:
$$
\text{Serendipity}(i) = \text{Rel}(i) \times \left(1 - \text{Popularity}(i)\right)
$$
- `Popularity(i)` — нормированная популярность элемента `i` (например, доля пользователей, взаимодействовавших с ним).

In [None]:
# Every (user, product) pair bought in the feature window, flagged with 1.
# The flag is added after de-duplication so it is a plain integer column.
user_product_purchase_history = (
    train_parts_1_2.filter(pl.col('action_type') == "AT_Purchase")
    .select('user_id', 'product_id')
    .unique()
    .with_columns(
        has_purchased=pl.lit(1)
    )
)

total_serendipity_score = 0.0
processed_recommendation_requests = 0

# Pairs without a purchase history get has_purchased = 0.
predicts_with_history = catboost_predicts.join(
    user_product_purchase_history,
    on=['user_id', 'product_id'],
    how='left'
).with_columns(
    pl.col('has_purchased').fill_null(0)
).group_by('request_id')

for request_id, recommendations in predicts_with_history:
    # The recommendation list is the 10 items the model scores highest,
    # not the 10 items with the highest ground-truth label.
    top10_recommendations = recommendations.sort('predict', descending=True).head(10)

    # Unexpectedness (never bought before) weighted by the model score.
    # NOTE(review): `predict` stands in for Rel(i); the formula in the text
    # suggests an observed signal such as `target` - confirm the intent.
    serendipity_values = (1 - top10_recommendations['has_purchased']) * top10_recommendations['predict']
    average_serendipity = serendipity_values.mean()

    if average_serendipity is not None:
        processed_recommendation_requests += 1
        total_serendipity_score += average_serendipity

# Without any request the metric is undefined; report NaN instead of raising.
final_serendipity_metric = (
    total_serendipity_score / processed_recommendation_requests
    if processed_recommendation_requests
    else float('nan')
)
print(f"Serendipity@10: {final_serendipity_metric:.4f}")

### Как можно улучшить скор:

- Чистим датасет
- Больше фичей
- Варим фичи более умным способом:
<center><img src="Timesplit2.svg" width="1100" /></center>