# day5: Ranking Models(LightGBM/XGBoost)

## 1. Data preparation


**1.1 Read the original data**

In [25]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from tqdm import tqdm

DATA_PATH = "/kaggle/input/otto-recsys-dataset/train.parquet"

df = pl.read_parquet(
    DATA_PATH,
    columns=["session", "aid", "ts", "type"]
)

df



session,aid,ts,type
u32,u32,u64,u8
0,1517085,1659304800025,0
0,1563459,1659304904511,0
0,1309446,1659367439426,0
0,16246,1659367719997,0
0,1781822,1659367871344,0
…,…,…,…
12899776,1737908,1661723987073,0
12899777,384045,1661723976974,0
12899777,384045,1661723986800,0
12899778,561560,1661723983611,0


**1.2 Construct label**

In [26]:
def build_labels(df: pl.DataFrame):
    return (
        df.sort("ts")
          .group_by("session")
          .tail(1)
          .select(["session", pl.col("aid").alias("label_aid")])
    )

label_train = build_labels(df_train)
label_valid = build_labels(df_valid)

 **1.3 Construct a candidate pool**

In [27]:
def build_candidates(df: pl.DataFrame, max_items=20):
    return (
        df.sort("ts")
          .group_by("session")
          .agg(
              pl.col("aid")
                .unique()
                .tail(max_items)
                .alias("candidate_aid")
          )
          .explode("candidate_aid")
    )

cand_train = build_candidates(df_train)
cand_valid = build_candidates(df_valid)

cand_train.head()

session,candidate_aid
u32,u32
4565830,205488
4565830,966771
8122791,206526
8122791,543659
8122791,1434743


**1.4 Merge Label**

In [28]:
train_df = (
    cand_train
    .join(label_train, on="session", how="left")
    .with_columns(
        (pl.col("candidate_aid") == pl.col("label_aid"))
        .cast(pl.Int8)
        .alias("label")
    )
)

valid_df = (
    cand_valid
    .join(label_valid, on="session", how="left")
    .with_columns(
        (pl.col("candidate_aid") == pl.col("label_aid"))
        .cast(pl.Int8)
        .alias("label")
    )
)

train_df.select("label").sum(), valid_df.select("label").sum()

(shape: (1, 1)
 ┌─────────┐
 │ label   │
 │ ---     │
 │ i64     │
 ╞═════════╡
 │ 9993701 │
 └─────────┘,
 shape: (1, 1)
 ┌─────────┐
 │ label   │
 │ ---     │
 │ i64     │
 ╞═════════╡
 │ 2231350 │
 └─────────┘)

## 2.Feature Engineering

**2.1 Item-level Features**

In [29]:
item_feats = (
    df_train
    .group_by("aid")
    .agg([
        pl.len().alias("item_cnt"),
        pl.max("ts").alias("item_last_ts")
    ])
)

train_df = train_df.join(
    item_feats,
    left_on="candidate_aid",
    right_on="aid",
    how="left"
).fill_null(0)

valid_df = valid_df.join(
    item_feats,
    left_on="candidate_aid",
    right_on="aid",
    how="left"
).fill_null(0)

**2.2 Turn to Pandas**

In [30]:
FEATURES = ["item_cnt", "item_last_ts"]

train_pd = train_df.select(
    ["session", "label"] + FEATURES
).to_pandas()

valid_pd = valid_df.select(
    ["session", "label"] + FEATURES
).to_pandas()

## 3.Time-based Train/Validation Split

**3.1 Session time**

In [31]:
# Use the last time of the session to cut
session_ts = (
    df.group_by("session")
      .agg(pl.max("ts").alias("max_ts"))
)

cut_ts = session_ts["max_ts"].quantile(0.8)

train_sessions = session_ts.filter(pl.col("max_ts") <= cut_ts)["session"]
valid_sessions = session_ts.filter(pl.col("max_ts") > cut_ts)["session"]

df_train = df.filter(pl.col("session").is_in(train_sessions))
df_valid = df.filter(pl.col("session").is_in(valid_sessions))

print(len(train_sessions), len(valid_sessions))

10319823 2579956


## 4.LightGBM Ranking Model

**4.1 Construct group**

In [32]:
def make_group(df):
    return df.groupby("session").size().values

X_train = train_pd[FEATURES]
y_train = train_pd["label"]
group_train = make_group(train_pd)

X_valid = valid_pd[FEATURES]
y_valid = valid_pd["label"]
group_valid = make_group(valid_pd)

**4.2 Training LightGBM Ranker**

In [33]:
model = lgb.LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    n_estimators=80,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42
)

model.fit(
    X_train,
    y_train,
    group=group_train,
    eval_set=[(X_valid, y_valid)],
    eval_group=[group_valid],
    eval_at=[20],
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.241986 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 63702922, number of used features: 2


## 5. Recall@20 Evaluation

**5.1 Recall@20 Realisation**

In [34]:
def recall_at_k(df, scores, k=20):
    df = df.copy()
    df["score"] = scores

    hits = 0
    total = df["session"].nunique()

    for _, g in df.groupby("session"):
        topk = g.sort_values("score", ascending=False).head(k)
        hits += topk["label"].sum() > 0

    return hits / total

**5.2 Evaluate**

In [35]:
valid_pd["score"] = model.predict(X_valid)

recall20 = recall_at_k(valid_pd, valid_pd["score"], k=20)
print("Recall@20:", recall20)

Recall@20: 0.8648790909612412
