In [1]:
import polars as pl
import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import ndcg_score
import os
import glob
from tqdm import tqdm
from datetime import timedelta
import kagglehub
path = kagglehub.model_download("thomasstrahl/events/other/default")

In [2]:
# Time-based split boundaries (polars datetime expressions, compared against
# the `timestamp` column in the filters below).
#
# For each split, events in (START, CUTOFF] form the history that features are
# built from, and events in (CUTOFF, END] are used to derive the labels.
TRAIN_START = pl.datetime(2025, 5, 20)
TRAIN_CUTOFF = pl.datetime(2025, 6, 15)
TRAIN_END = pl.datetime(2025, 7, 1)

VAL_START = pl.datetime(2025, 6, 10)
VAL_CUTOFF = pl.datetime(2025, 7, 1)
VAL_END = pl.datetime(2025, 7, 16)

# Earliest event kept when the saved event log is loaded.
TEST_START = pl.datetime(2025, 5, 20)

In [4]:
# Users that recommendations must be produced for. The ids are kept as a
# DataFrame, a list and a set.
test_user_ids = pd.read_parquet(
    '/kaggle/input/testststs/ml_ozon_recsys_test.snappy.parquet'
)
test_user_ids_list = test_user_ids.loc[:, 'user_id'].to_list()
test_user_ids_set = {*test_user_ids_list}

In [3]:
# Download (or reuse the cached copy of) the competition dataset; `path` is
# the local root directory that the file discovery below is relative to.
path = kagglehub.dataset_download("thunderpede/testststs")
print(f"Path to dataset files: {path}")

In [4]:
# Both "extra" folders live under the same (URL-encoded) directory prefix.
_extra_root = os.path.join(path, "extra_ml_ozon_recsys_train%202", "extra_ml_ozon_recsys_train")

# --- order files: extra archive (searched recursively) + main dataset (flat) ---
folder1_path = os.path.join(_extra_root, "archive_extra_orders_data", "final_apparel_orders_data_07")
folder2_path = os.path.join(
    path,
    "ml_ozon_recsys_train_final_apparel_orders_data",
    "ml_ozon_recsys_train_final_apparel_orders_data",
)
order_files_1 = glob.glob(os.path.join(folder1_path, "**", "*.parquet"), recursive=True)
order_files_2 = glob.glob(os.path.join(folder2_path, "*.parquet"))
order_files = [*order_files_1, *order_files_2]

# --- tracker files: extra archive (searched recursively) + main dataset (flat) ---
tr_folder1_path = os.path.join(_extra_root, "final_apparel_tracker_data_08_action_widget")
tr_folder2_path = os.path.join(
    path,
    "ml_ozon_recsys_train_final_apparel_tracker_data",
    "ml_ozon_recsys_train_final_apparel_tracker_data",
)
tracker_files_1 = glob.glob(os.path.join(tr_folder1_path, "**", "*.parquet"), recursive=True)
tracker_files_2 = glob.glob(os.path.join(tr_folder2_path, "*.parquet"))
tracker_files = [*tracker_files_1, *tracker_files_2]

In [5]:
# Interaction weight per event / order-status type. Built once at module level
# because get_weight is invoked once per row through `map_elements`.
# NOTE(review): "proccesed_orders" is spelled as in the original code;
# presumably it matches the raw status value in the data — verify before "fixing".
_ACTION_WEIGHTS = {
    "delivered_orders": 6.0,
    "proccesed_orders": 4.0,
    "to_cart": 3.0,
    "favorite": 2.0,
    "view_description": 1.5,
    "review_view": 1.5,
    "page_view": 1.0,
    "unfavorite": -0.5,
    "remove": -1.0,
    "canceled_orders": -1.0,
}


def get_weight(action_type):
    """Return the interaction weight for an event or order-status type.

    Args:
        action_type: Event type / order status string.

    Returns:
        The configured weight as a float, or 0.0 for any value that is not in
        the weight table (including None).
    """
    return _ACTION_WEIGHTS.get(action_type, 0.0)

In [6]:
def read_trackers(tracker_files, user_set, ts_from):
    """Load tracker events for the given users that happened after `ts_from`.

    Each parquet file is read with only the needed columns, filtered to rows
    with `timestamp > ts_from` and `user_id` in `user_set`, and given a
    `weight` column derived from `action_type` via `get_weight`.

    Args:
        tracker_files: Paths of the tracker parquet files.
        user_set: Collection of user ids to keep.
        ts_from: Exclusive lower bound for `timestamp`.

    Returns:
        One DataFrame with all files concatenated, or an empty DataFrame with
        the expected columns when `tracker_files` is empty.
    """
    wanted_columns = ['user_id', 'item_id', 'action_type', 'timestamp']
    keep_row = (pl.col('timestamp') > ts_from) & (pl.col('user_id').is_in(user_set))
    weight_expr = pl.col("action_type").map_elements(get_weight, return_dtype=pl.Float32)

    frames = []
    for file_path in tqdm(tracker_files, desc='Reading trackers'):
        raw = pl.read_parquet(file_path, columns=wanted_columns)
        frames.append(raw.filter(keep_row).with_columns(weight=weight_expr))

    if not frames:
        return pl.DataFrame(schema={'user_id':pl.Int64, 'item_id':pl.Int64, 'action_type':pl.Utf8, 'timestamp':pl.Datetime, 'weight':pl.Float32})
    return pl.concat(frames)

def read_orders(order_files, user_set, ts_from):
    """Load order rows for the given users, shaped like tracker events.

    Each parquet file is read with only the needed columns, filtered to rows
    with `created_timestamp > ts_from` and `user_id` in `user_set`, and given a
    `weight` column derived from `last_status` via `get_weight`. The columns
    `created_timestamp` / `last_status` are renamed to `timestamp` /
    `action_type`.

    The result always has the columns
    `user_id, item_id, timestamp, action_type, weight` in exactly this order,
    for both the populated and the empty case, so it can be vertically
    concatenated with tracker events selected in the same order.

    Args:
        order_files: Paths of the order parquet files.
        user_set: Collection of user ids to keep.
        ts_from: Exclusive lower bound for `created_timestamp`.

    Returns:
        One DataFrame with all files concatenated, or an empty DataFrame with
        the same column layout when `order_files` is empty.
    """
    out_columns = ["user_id", "item_id", "timestamp", "action_type", "weight"]
    keep_row = (pl.col('created_timestamp') > ts_from) & (pl.col('user_id').is_in(user_set))
    weight_expr = pl.col("last_status").map_elements(get_weight, return_dtype=pl.Float32)

    frames = []
    for file_path in tqdm(order_files, desc='Reading orders'):
        frame = (
            pl.read_parquet(
                file_path,
                columns=['user_id', 'item_id', 'created_timestamp', 'last_status']
            )
            .filter(keep_row)
            .with_columns(weight=weight_expr)
            .rename({"created_timestamp": "timestamp", "last_status": "action_type"})
            # Pin the column order instead of relying on the file's own layout.
            .select(out_columns)
        )
        frames.append(frame)

    if not frames:
        # Same column order as the populated path above.
        # NOTE(review): the dtypes here are placeholders and may differ from
        # the dtypes stored in the files — confirm if the empty case matters.
        return pl.DataFrame(schema={
            'user_id': pl.Int64,
            'item_id': pl.Int64,
            'timestamp': pl.Datetime,
            'action_type': pl.Utf8,
            'weight': pl.Float32,
        })
    return pl.concat(frames)

In [7]:
# Load tracker and order events of the test users that are newer than TRAIN_START.
events_tr = read_trackers(
    tracker_files=tracker_files,
    user_set=test_user_ids_set,
    ts_from=TRAIN_START,
)
events_or = read_orders(
    order_files=order_files,
    user_set=test_user_ids_set,
    ts_from=TRAIN_START,
)

In [8]:
# Stack tracker and order events into one log; the tracker frame is reordered
# to the column layout shared with the order frame first.
_event_columns = ["user_id", "item_id", "timestamp", "action_type", "weight"]
events = pl.concat([events_tr.select(_event_columns), events_or])

In [9]:
# Drop plain page views and the negative-signal event types from the log.
_dropped_actions = ['page_view', "unfavorite", "remove", "canceled_orders"]
events = events.filter(~pl.col('action_type').is_in(_dropped_actions))

In [10]:
# Cache the filtered event log as a parquet file (note: the file is written
# without a ".parquet" extension).
events.write_parquet('events')

In [3]:
# Reload the cached event log and keep only events at or after TEST_START.
df = pl.read_parquet('/kaggle/input/events1/other/default/1/events')
df = df.filter(pl.col('timestamp') >= TEST_START)

In [5]:
_ts = pl.col('timestamp')
_is_delivered = pl.col('action_type') == 'delivered_orders'

# History window: (TRAIN_START, TRAIN_CUTOFF].
train_df = df.filter((_ts > TRAIN_START) & (_ts <= TRAIN_CUTOFF))

# Label window: (TRAIN_CUTOFF, TRAIN_END]. A (user, item) pair is positive when
# at least one of its events in this window is a delivered order.
train_target = (
    df
    .filter((_ts > TRAIN_CUTOFF) & (_ts <= TRAIN_END))
    .group_by(['user_id', 'item_id'])
    .agg(_is_delivered.any().cast(pl.Int8).alias('target'))
)

# Pre-filter the history to users with at least one positive pair in the label window.
users_with_positive = train_target.filter(pl.col("target") == 1)["user_id"].unique()
train_filtered = train_df.filter(pl.col("user_id").is_in(users_with_positive))

# Attach labels to the history events; pairs absent from the label window get 0.
train_df = (
    train_filtered
    .join(train_target, on=['user_id', 'item_id'], how='left')
    .with_columns(pl.col('target').fill_null(0))
)

# Keep only users for whom a positive pair is also present in the history,
# then drop exact duplicate rows.
user_max_target = train_df.group_by("user_id").agg(
    pl.col("target").max().alias("has_positive")
)
positive_users = (
    user_max_target
    .filter(pl.col("has_positive") == 1)
    .select("user_id")
    .to_series()
)
train_df = train_df.filter(pl.col("user_id").is_in(positive_users)).unique()

In [13]:
# Preview of the labelled training events (event rows with the pair-level `target` attached).
train_df

user_id,item_id,timestamp,action_type,weight,target
i32,i32,datetime[ns],str,f32,i8
2754841,14341153,2025-05-24 16:36:09,"""view_description""",1.5,0
590500,141044077,2025-05-20 09:16:19,"""favorite""",2.0,0
2487320,233035153,2025-06-13 18:58:32,"""review_view""",1.5,0
2967770,54113583,2025-06-07 04:48:21,"""view_description""",1.5,0
734030,145300866,2025-06-12 21:19:28,"""view_description""",1.5,0
…,…,…,…,…,…
4023581,71256804,2025-06-03 12:45:08,"""review_view""",1.5,0
4994841,171811251,2025-06-10 12:02:53,"""view_description""",1.5,0
4703290,250609358,2025-05-26 17:46:23,"""view_description""",1.5,0
2107550,55467897,2025-06-09 07:36:23,"""view_description""",1.5,0


In [6]:
# Label window for validation: (VAL_CUTOFF, VAL_END]. A (user, item) pair is
# positive when at least one of its events in this window is a delivered order.
val_target_all = (
    df
    .filter((pl.col('timestamp') > VAL_CUTOFF) & (pl.col('timestamp') <= VAL_END))
    .group_by(['user_id', 'item_id'])
    .agg(
        (pl.col('action_type') == 'delivered_orders').any().cast(pl.Int8).alias('target')
    )
)

users_with_positive = (
    val_target_all.filter(pl.col('target') == 1)
                  .select('user_id')
                  .unique()
                  .to_series()
)

# Validate on at most 10,000 users with a positive. The sample size is clamped
# to the number of available users, because sampling more rows than exist
# (without replacement) raises.
# NOTE: the variable name is historical; it holds up to 10,000 users, not 1,000.
val_users_1000 = users_with_positive.sample(
    min(10000, len(users_with_positive)), seed=42
)

# History window for the sampled users: (VAL_START, VAL_CUTOFF].
val_df = (
    df
    .filter(pl.col('user_id').is_in(val_users_1000))
    .filter((pl.col('timestamp') <= VAL_CUTOFF) & (pl.col('timestamp') > VAL_START))
)

# Attach labels to the history events; pairs absent from the label window get 0.
val_df = (
    val_df
    .join(val_target_all, on=['user_id', 'item_id'], how='left')
    .with_columns(pl.col('target').fill_null(0))
)

# Keep only users for whom a positive pair is also present in the history,
# then drop exact duplicate rows.
user_max_target_val = val_df.group_by("user_id").agg(
    pl.col("target").max().alias("has_positive")
)
positive_users_val = user_max_target_val.filter(pl.col("has_positive") == 1).select("user_id").to_series()
val_df = val_df.filter(pl.col("user_id").is_in(positive_users_val)).unique()

In [15]:
# Preview of the labelled validation events (event rows with the pair-level `target` attached).
val_df

user_id,item_id,timestamp,action_type,weight,target
i32,i32,datetime[ns],str,f32,i8
930491,196431086,2025-06-21 07:17:52,"""view_description""",1.5,0
1791751,71520302,2025-06-29 09:17:34,"""view_description""",1.5,0
1444050,220437285,2025-06-13 06:57:07,"""favorite""",2.0,0
3856540,256345073,2025-06-10 19:12:45,"""to_cart""",3.0,0
4548790,108595829,2025-06-29 15:28:40,"""view_description""",1.5,0
…,…,…,…,…,…
266311,50273745,2025-06-27 21:11:16,"""favorite""",2.0,0
1703201,236262862,2025-06-10 18:25:44,"""favorite""",2.0,0
4224270,182685311,2025-06-13 16:25:08,"""favorite""",2.0,0
5024360,60334596,2025-06-21 20:38:11,"""view_description""",1.5,0


In [7]:
# Model input columns. The ORDER MATTERS: the same list is used to select the
# columns at training time and at prediction time.
#
# Currently disabled candidates (kept here for reference):
#   ui_cart_all, ui_actions_14d, item_orders_all, item_conv_rate_7d,
#   item_actions_14d, item_actions_all,
#   item_unique_users_7d, item_unique_users_14d, item_unique_users_all,
#   share_ui_orders_in_user_orders_7d, share_ui_orders_in_user_orders_14d,
#   ui_orders_7d, ui_orders_14d, ui_orders_all
FEATURE_COLS = [
    "share_ui_cart_in_user_cart_7d", "share_ui_cart_in_user_cart_14d",
    "ui_cart_7d", "ui_cart_14d",
    "ui_total_weight_7d", "ui_total_weight_14d",
    "ui_actions_7d",
    "ui_total_weight_all",
    "user_conv_rate_14d",
    "ui_actions_all",
    "item_orders_7d", "item_orders_14d",
    "item_conv_rate_14d",
    "user_conv_rate_7d",
    "item_actions_7d",
]

In [8]:
def build_features(base: pl.DataFrame) -> pl.DataFrame:
    """Aggregate an event log into one feature row per (user_id, item_id) pair.

    User-level, item-level and pair-level statistics are computed over three
    windows: the whole frame ("all") and the last 7 / 14 days, where "last" is
    measured from the newest `timestamp` present in `base`. Ratio features are
    then derived from the 7d / 14d counts.

    Args:
        base: Event rows with `user_id`, `item_id`, `timestamp`, `action_type`
            and `weight` columns. A `target` column is optional: when present
            it is carried through to the output (labelled training data); when
            absent the function returns features only (unlabelled data).

    Returns:
        A DataFrame with the columns of `FEATURE_COLS` followed by `target`
        (if it was present in `base`), `user_id` and `item_id`. Statistics that
        are missing for a window are filled with 0.
    """
    base = base.unique()
    has_target = "target" in base.columns

    max_date = base["timestamp"].max()

    # Time windows, anchored at the newest event in the frame.
    if max_date is None:
        # No usable timestamp (e.g. empty input): both recent windows are empty.
        recent7 = recent14 = base.head(0)
    else:
        recent7 = base.filter(pl.col("timestamp") >= max_date - timedelta(days=7))
        recent14 = base.filter(pl.col("timestamp") >= max_date - timedelta(days=14))

    def make_window_stats(df: pl.DataFrame, suffix: str) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
        """Collect user / item / user-item statistics for one window."""
        is_order = pl.col("action_type") == "delivered_orders"
        is_cart = pl.col("action_type") == "to_cart"

        user_stats = df.group_by("user_id").agg([
            pl.len().alias(f"user_actions_{suffix}"),
            is_order.sum().alias(f"user_orders_{suffix}"),
            is_cart.sum().alias(f"user_cart_{suffix}"),
            pl.col("weight").sum().alias(f"user_total_weight_{suffix}"),
        ])

        item_stats = df.group_by("item_id").agg([
            pl.len().alias(f"item_actions_{suffix}"),
            is_order.sum().alias(f"item_orders_{suffix}"),
            pl.col("user_id").n_unique().alias(f"item_unique_users_{suffix}"),
        ])

        ui_stats = df.group_by(["user_id", "item_id"]).agg([
            pl.len().alias(f"ui_actions_{suffix}"),
            is_order.sum().alias(f"ui_orders_{suffix}"),
            is_cart.sum().alias(f"ui_cart_{suffix}"),
            pl.col("weight").sum().alias(f"ui_total_weight_{suffix}"),
        ])
        return user_stats, item_stats, ui_stats

    # The lifetime pair table is the spine; every other table is left-joined
    # onto it, so the output has exactly one row per pair seen in `base`.
    feats = None
    for suffix, window_df in (("all", base), ("7d", recent7), ("14d", recent14)):
        user_stats, item_stats, ui_stats = make_window_stats(window_df, suffix)
        if feats is None:
            feats = ui_stats
        else:
            feats = feats.join(ui_stats, on=["user_id", "item_id"], how="left")
        feats = (
            feats
            .join(user_stats, on="user_id", how="left")
            .join(item_stats, on="item_id", how="left")
        )

    if has_target:
        target_df = base.select(["user_id", "item_id", "target"]).unique()
        feats = feats.join(target_df, on=["user_id", "item_id"], how="left")

    feats = feats.fill_null(0)

    # Ratio features; the +1 in the denominator avoids division by zero.
    ratio_exprs = []
    for suffix in ("14d", "7d"):
        ratio_exprs += [
            (pl.col(f"ui_cart_{suffix}") / (pl.col(f"user_cart_{suffix}") + 1))
            .alias(f"share_ui_cart_in_user_cart_{suffix}"),
            (pl.col(f"ui_orders_{suffix}") / (pl.col(f"user_orders_{suffix}") + 1))
            .alias(f"share_ui_orders_in_user_orders_{suffix}"),
            (pl.col(f"item_orders_{suffix}") / (pl.col(f"item_actions_{suffix}") + 1))
            .alias(f"item_conv_rate_{suffix}"),
            (pl.col(f"user_orders_{suffix}") / (pl.col(f"user_actions_{suffix}") + 1))
            .alias(f"user_conv_rate_{suffix}"),
        ]
    feats = feats.with_columns(ratio_exprs)

    # Columns that are not available (e.g. `target` on unlabelled input) are skipped.
    final_cols = FEATURE_COLS + ["target", "user_id", "item_id"]
    feats = feats.select([c for c in final_cols if c in feats.columns])

    return feats

In [9]:
# Per-pair feature tables for both splits, plus the list of model input columns.
train_feats, val_feats = build_features(train_df), build_features(val_df)

_non_feature_cols = {"target", "user_id", "item_id", "timestamp"}
feats = [name for name in train_feats.columns if name not in _non_feature_cols]

In [10]:
# Preview of the per-pair training feature table.
train_feats

share_ui_cart_in_user_cart_7d,share_ui_cart_in_user_cart_14d,ui_cart_7d,ui_cart_14d,ui_total_weight_7d,ui_total_weight_14d,ui_actions_7d,ui_total_weight_all,user_conv_rate_14d,ui_actions_all,item_orders_7d,item_orders_14d,item_conv_rate_14d,user_conv_rate_7d,item_actions_7d,target,user_id,item_id
f64,f64,u32,u32,f32,f32,u32,f32,f64,u32,u32,u32,f64,f64,u32,i8,i32,i32
0.0,0.0,0,0,1.5,1.5,1,1.5,0.021739,1,0,0,0.0,0.009709,3,0,668440,297239858
0.0,0.0,0,0,1.5,1.5,1,1.5,0.0,1,0,0,0.0,0.0,14,0,2512141,157055743
0.0,0.0,0,0,11.0,11.0,4,11.0,0.055215,4,1,1,0.142857,0.063197,5,0,1050850,252593946
0.0,0.0,0,0,2.0,2.0,1,2.0,0.0,1,0,0,0.0,0.0,1,0,536421,314392224
0.0,0.0,0,0,0.0,0.0,0,6.0,0.017378,1,0,0,0.0,0.018648,0,0,4843751,249664464
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.0,0.034483,0,1,0.0,4.5,0,4.5,0.025,2,0,0,0.0,0.0,5,0,685521,328810459
0.0,0.0,0,0,0.0,0.0,0,3.0,0.028037,2,0,0,0.0,0.025,0,0,2779141,96761703
0.0,0.0,0,0,0.0,3.0,0,3.0,0.01682,2,0,0,0.0,0.003571,3,0,638680,168755074
0.0,0.0,0,0,0.0,0.0,0,4.5,0.08,2,0,0,0.0,0.0,2,0,1513990,234485161


In [11]:
# Row count and positive count per user.
user_stats = train_feats.group_by("user_id").agg([
    pl.len().alias("n_rows"),
    pl.col("target").sum().alias("n_pos"),
])

# Keep only "warm" users: at least one positive and at least 10 rows.
_has_positive = pl.col("n_pos") > 0
_has_enough_rows = pl.col("n_rows") >= 10
warm_users = user_stats.filter(_has_positive & _has_enough_rows)["user_id"]

# Restrict the feature table to the warm users.
train_feats_filtered = train_feats.filter(pl.col("user_id").is_in(warm_users))

positives = train_feats_filtered.filter(pl.col("target") == 1)


def _cap_negatives(group):
    """Return at most 50 randomly chosen rows of one user's negatives."""
    return group.sample(min(len(group), 50), seed=42)


# For every user_id take no more than 50 negatives.
negatives = (
    train_feats_filtered
    .filter(pl.col("target") == 0)
    .group_by("user_id")
    .map_groups(_cap_negatives)
)

train_feats_sampled = pl.concat([positives, negatives])

In [12]:
# Preview of the training features after negative down-sampling (positives first, then negatives).
train_feats_sampled

share_ui_cart_in_user_cart_7d,share_ui_cart_in_user_cart_14d,ui_cart_7d,ui_cart_14d,ui_total_weight_7d,ui_total_weight_14d,ui_actions_7d,ui_total_weight_all,user_conv_rate_14d,ui_actions_all,item_orders_7d,item_orders_14d,item_conv_rate_14d,user_conv_rate_7d,item_actions_7d,target,user_id,item_id
f64,f64,u32,u32,f32,f32,u32,f32,f64,u32,u32,u32,f64,f64,u32,i8,i32,i32
0.0,0.05,0,1,0.0,3.0,0,3.0,0.072727,1,0,0,0.0,0.111111,1,1,3282420,23192547
0.0,0.0,0,0,0.0,4.5,0,4.5,0.012034,3,0,0,0.0,0.005168,0,1,1484331,62888407
0.010638,0.008,1,1,7.5,7.5,4,7.5,0.027778,4,0,0,0.0,0.033445,5,1,4420690,159816757
0.0,0.0,0,0,7.5,12.0,5,24.0,0.004717,16,0,0,0.0,0.003831,7,1,1091111,59219518
0.0,0.0,0,0,0.0,0.0,0,1.5,0.0,1,0,0,0.0,0.0,5,1,1047581,286546490
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.0,0.0,0,0,3.5,3.5,2,3.5,0.053571,2,0,0,0.0,0.090909,12,0,380461,262691726
0.0,0.0,0,0,0.0,0.0,0,3.0,0.053571,2,1,1,0.052632,0.090909,7,0,380461,62639106
0.0,0.0,0,0,1.5,1.5,1,1.5,0.053571,1,0,0,0.0,0.090909,2,0,380461,310352187
0.0,0.0,0,0,2.0,2.0,1,2.0,0.053571,1,0,0,0.0,0.090909,2,0,380461,1140416


In [13]:
# Preview of the per-pair validation feature table.
val_feats

share_ui_cart_in_user_cart_7d,share_ui_cart_in_user_cart_14d,ui_cart_7d,ui_cart_14d,ui_total_weight_7d,ui_total_weight_14d,ui_actions_7d,ui_total_weight_all,user_conv_rate_14d,ui_actions_all,item_orders_7d,item_orders_14d,item_conv_rate_14d,user_conv_rate_7d,item_actions_7d,target,user_id,item_id
f64,f64,u32,u32,f32,f32,u32,f32,f64,u32,u32,u32,f64,f64,u32,i8,i32,i32
0.090909,0.052632,1,1,3.0,3.0,1,3.0,0.067797,1,0,0,0.0,0.09375,1,0,1410281,249598139
0.0,0.0,0,0,12.5,12.5,8,12.5,0.008117,8,0,0,0.0,0.002304,8,0,4878130,100822433
0.0,0.0,0,0,0.0,0.0,0,3.0,0.004464,1,0,0,0.0,0.0,0,0,2113220,147626077
0.1,0.076923,1,1,3.0,3.0,1,3.0,0.0,1,0,0,0.0,0.0,1,0,4542111,154315029
0.0,0.0,0,0,2.0,2.0,1,2.0,0.0,1,0,0,0.0,0.0,1,0,1649461,188817978
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.0,0.0,0,0,0.0,3.5,0,3.5,0.006173,2,0,0,0.0,0.0,0,0,3049751,292083288
0.0,0.0,0,0,0.0,1.5,0,1.5,0.027624,1,0,0,0.0,0.0,1,0,3734631,36106287
0.0,0.0,0,0,0.0,4.5,0,4.5,0.010054,3,0,0,0.0,0.010309,0,0,3064501,208449731
0.0,0.0,0,0,0.0,5.5,0,5.5,0.0131,2,0,0,0.0,0.004016,0,0,1396391,139513921


In [14]:
_excluded_cols = ["user_id", "item_id", "timestamp", "action_type", "target"]
cols = [name for name in train_feats.columns if name not in _excluded_cols]

# Correlation of every feature with the target, computed in one pass.
_corr_exprs = [pl.corr(pl.col(name), pl.col("target")).alias(name) for name in cols]
corrs = train_feats.select(_corr_exprs).to_dict(as_series=False)  # {col: [corr]}

# Flatten to a pandas Series sorted from the strongest positive correlation down.
corrs = pd.Series(
    {name: values[0] for name, values in corrs.items()}
).sort_values(ascending=False)

print(corrs)

share_ui_cart_in_user_cart_7d     0.197574
share_ui_cart_in_user_cart_14d    0.195281
ui_cart_7d                        0.137501
ui_cart_14d                       0.122758
ui_total_weight_7d                0.089639
ui_total_weight_14d               0.075860
ui_actions_7d                     0.074056
ui_total_weight_all               0.053586
user_conv_rate_14d                0.036883
ui_actions_all                    0.036576
item_orders_7d                    0.031470
item_orders_14d                   0.030780
item_conv_rate_14d                0.024233
user_conv_rate_7d                 0.023617
item_actions_7d                   0.022181
dtype: float64


In [15]:
def prepare_pool(df: pl.DataFrame):
    """Convert a feature table into a CatBoost `Pool` grouped by user.

    Args:
        df: Feature table containing `target` and `user_id` columns; every
            column other than the bookkeeping ones listed below is used as a
            model feature.

    Returns:
        A `Pool` whose label is `target` and whose group id is `user_id`.
    """
    bookkeeping = ("target", "timestamp", "action_type", "ui_orders", "user_id", "item_id")

    # Sort so that the candidates of one user are stored consecutively.
    ordered = df.sort("user_id")

    to_drop = [name for name in bookkeeping if name in ordered.columns]
    features = ordered.drop(to_drop).to_pandas()

    return Pool(
        data=features,
        label=ordered["target"].to_numpy(),
        group_id=ordered["user_id"].to_numpy(),
    )

In [16]:
# Training uses the down-sampled table; validation uses the full feature table.
train_pool = prepare_pool(df=train_feats_sampled)
val_pool = prepare_pool(df=val_feats)

In [17]:
params = {
    # Core boosting settings
    "iterations": 1500,
    "learning_rate": 0.01,
    "depth": 8,
    "l2_leaf_reg": 3.0,
    "random_seed": 42,

    # GPU
    "task_type": "GPU",
    "devices": "1",
    "gpu_ram_part": 0.7,

    # Learning-to-rank objective and evaluation metric
    "loss_function": "YetiRankPairwise",
    "eval_metric": "NDCG:top=10",

    # Memory optimisation
    "border_count": 64,
    "max_ctr_complexity": 1,
    "bootstrap_type": 'Bernoulli',
    "subsample": 0.8,

    # Leaf estimation
    "leaf_estimation_method": "Gradient",
    "leaf_estimation_iterations": 1,

    # Early stopping
    "early_stopping_rounds": 250,
    "od_type": "Iter",

    # Logging
    "verbose": 100,
}


In [18]:
# Train the ranker on the down-sampled training pool. The validation pool is
# passed as eval_set, so the eval metric is reported on it during training.
model = CatBoostRanker(**params)
model.fit(train_pool, eval_set=val_pool)

Default metric period is 5 because PFound, NDCG is/are not implemented for GPU
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=10;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.2854185	best: 0.2854185 (0)	total: 998ms	remaining: 24m 56s
100:	test: 0.3211421	best: 0.3221437 (74)	total: 33.9s	remaining: 7m 49s
200:	test: 0.3319242	best: 0.3321603 (197)	total: 1m 7s	remaining: 7m 15s
300:	test: 0.3367660	best: 0.3369126 (299)	total: 1m 41s	remaining: 6m 44s
400:	test: 0.3390369	best: 0.3395447 (373)	total: 2m 16s	remaining: 6m 13s
500:	test: 0.3413900	best: 0.3413900 (499)	total: 2m 51s	remaining: 5m 41s
600:	test: 0.3410351	best: 0.3417350 (529)	total: 3m 27s	remaining: 5m 9s
700:	test: 0.3416314	best: 0.3420612 (661)	total: 4m 3s	remaining: 4m 37s
800:	test: 0.3426571	best: 0.3426571 (800)	total: 4m 39s	remaining: 4m 4s
900:	test: 0.3428906	best: 0.3430473 (863)	total: 5m 16s	remaining: 3m 30s
1000:	test: 0.3449539	best: 0.3451132 (995)	total: 5m 53s	remaining: 2m 56s
1100:	test: 0.3454649	best: 0.3456030 (1094)	total: 6m 29s	remaining: 2m 21s
1200:	test: 0.3461089	best: 0.3464214 (1174)	total: 7m 6s	remaining: 1m 46s
1300:	test: 0.3463811	best: 0.3

<catboost.core.CatBoostRanker at 0x7b1cc70c0890>

In [33]:
# Persist the trained ranker to disk so it can be loaded again for inference.
model.save_model("catboost_ranker4.cbm")

In [34]:
# Print the importance of every model feature. The names are taken from the
# fitted model itself, so each label is guaranteed to match the column the
# model was trained on, independent of any separately maintained name list.
feature_importance = model.get_feature_importance(type='PredictionValuesChange')
for feat, imp in zip(model.feature_names_, feature_importance):
    print(f"{feat}: {imp:.4f}")

share_ui_cart_in_user_cart_7d: 1.5015
share_ui_cart_in_user_cart_14d: 21.0893
ui_cart_7d: 0.0000
ui_cart_14d: 0.3096
ui_total_weight_7d: 8.7312
ui_total_weight_14d: 4.3120
ui_actions_7d: 2.7549
ui_total_weight_all: 38.0166
user_conv_rate_14d: 1.9297
ui_actions_all: 12.5960
item_orders_7d: 2.3547
item_orders_14d: 1.2077
item_conv_rate_14d: 4.3929
user_conv_rate_7d: 0.1946
item_actions_7d: 0.6091


In [9]:
def build_features_submit(base: pl.DataFrame) -> pl.DataFrame:
    """Aggregate an unlabelled event log into one feature row per (user_id, item_id).

    User-level, item-level and pair-level statistics are computed over the
    whole frame ("all") and over the last 7 / 14 days, measured from the newest
    `timestamp` in `base`; ratio features are derived from the 7d / 14d counts.

    Args:
        base: Event rows with `user_id`, `item_id`, `timestamp`, `action_type`
            and `weight` columns.

    Returns:
        A DataFrame with the columns of `FEATURE_COLS` followed by `user_id`
        and `item_id`. Statistics missing for a window are filled with 0.
    """
    events = base.unique()

    # Time windows, anchored at the newest event in the frame.
    newest = events["timestamp"].max()
    since_7d = newest - timedelta(days=7)
    since_14d = newest - timedelta(days=14)

    delivered = pl.col("action_type") == "delivered_orders"
    carted = pl.col("action_type") == "to_cart"
    weight_sum = pl.col("weight").sum()

    def window_tables(frame: pl.DataFrame, tag: str) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
        """Collect user / item / user-item statistics for one window."""
        per_user = frame.group_by("user_id").agg([
            pl.len().alias(f"user_actions_{tag}"),
            delivered.sum().alias(f"user_orders_{tag}"),
            carted.sum().alias(f"user_cart_{tag}"),
            weight_sum.alias(f"user_total_weight_{tag}"),
        ])
        per_item = frame.group_by("item_id").agg([
            pl.len().alias(f"item_actions_{tag}"),
            delivered.sum().alias(f"item_orders_{tag}"),
            pl.col("user_id").n_unique().alias(f"item_unique_users_{tag}"),
        ])
        per_pair = frame.group_by(["user_id", "item_id"]).agg([
            pl.len().alias(f"ui_actions_{tag}"),
            delivered.sum().alias(f"ui_orders_{tag}"),
            carted.sum().alias(f"ui_cart_{tag}"),
            weight_sum.alias(f"ui_total_weight_{tag}"),
        ])
        return per_user, per_item, per_pair

    # Lifetime statistics (no window) form the spine of the result.
    per_user, per_item, feats = window_tables(events, "all")
    feats = feats.join(per_user, on="user_id", how="left").join(per_item, on="item_id", how="left")

    # 7-day and 14-day statistics are left-joined onto the spine.
    recent_windows = (
        ("7d", events.filter(pl.col("timestamp") >= since_7d)),
        ("14d", events.filter(pl.col("timestamp") >= since_14d)),
    )
    for tag, frame in recent_windows:
        per_user, per_item, per_pair = window_tables(frame, tag)
        feats = (
            feats
            .join(per_pair, on=["user_id", "item_id"], how="left")
            .join(per_user, on="user_id", how="left")
            .join(per_item, on="item_id", how="left")
        )

    feats = feats.fill_null(0)

    def smoothed_ratio(numerator: str, denominator: str, name: str) -> pl.Expr:
        """numerator / (denominator + 1), named `name`."""
        return (pl.col(numerator) / (pl.col(denominator) + 1)).alias(name)

    # Ratio features.
    ratios = []
    for tag in ("14d", "7d"):
        ratios.append(smoothed_ratio(f"ui_cart_{tag}", f"user_cart_{tag}", f"share_ui_cart_in_user_cart_{tag}"))
        ratios.append(smoothed_ratio(f"ui_orders_{tag}", f"user_orders_{tag}", f"share_ui_orders_in_user_orders_{tag}"))
        ratios.append(smoothed_ratio(f"item_orders_{tag}", f"item_actions_{tag}", f"item_conv_rate_{tag}"))
        ratios.append(smoothed_ratio(f"user_orders_{tag}", f"user_actions_{tag}", f"user_conv_rate_{tag}"))
    feats = feats.with_columns(ratios)

    wanted = FEATURE_COLS + ["user_id", "item_id"]
    return feats.select([name for name in wanted if name in feats.columns])

In [10]:
# Restore the trained ranker from disk. The estimator is created with default
# arguments: `load_model` replaces the model with the saved one, so this cell
# no longer needs the training-time `params` dict (or its GPU settings) to exist.
loaded_ranker = CatBoostRanker()
loaded_ranker.load_model("/kaggle/input/catboostr/other/default/1/catboost_ranker4.cbm")

<catboost.core.CatBoostRanker at 0x7a3060d97610>

In [11]:
# Pre-computed candidates to be re-ranked. Judging by the name and by how it is
# used below, this holds one row per (user_id, item_id) candidate — TODO confirm schema.
candidates_df_exploded = pl.read_parquet('/kaggle/input/cand-bef-v2/other/default/1')

In [12]:
# Event history of the test users; inference features are built from it.
# NOTE(review): `df` spans everything from TEST_START onwards, while the training
# features were built from the shorter (TRAIN_START, TRAIN_CUTOFF] window, so the
# lifetime ("*_all") features may be on a different scale here — confirm intended.
test_df = df.filter(pl.col('user_id').is_in(test_user_ids_set))

In [13]:
# One feature row per (user_id, item_id) pair present in the test users' history.
feats_test = build_features_submit(test_df)

In [14]:
# Score candidates in user chunks to bound memory, keeping each user's top 100.
users = candidates_df_exploded.select("user_id").unique().to_series().to_list()
n_chunks = 50
user_chunks = np.array_split(users, n_chunks)


def _rank_chunk(chunk_users):
    """Score the candidates of one chunk of users and keep the top 100 per user."""
    in_chunk = pl.col("user_id").is_in(chunk_users)

    # Candidates without a matching history row get 0 for every feature.
    # NOTE(review): this also zeroes the user-level and item-level features of
    # such candidates, because all features are joined on the pair key —
    # confirm this is intended.
    scored = (
        candidates_df_exploded.filter(in_chunk)
        .join(feats_test.filter(in_chunk), on=["user_id","item_id"], how="left")
        .fill_null(0)
        .to_pandas()
    )
    scored["score"] = loaded_ranker.predict(scored[FEATURE_COLS])

    ranked = scored.sort_values(["user_id","score"], ascending=[True, False])
    return (
        ranked.groupby("user_id", sort=False)
        .head(100)
        .loc[:, ["user_id","item_id","score"]]
    )


top100_list = [_rank_chunk(chunk_users) for chunk_users in user_chunks]

top100 = pd.concat(top100_list, ignore_index=True)

top100.to_parquet("top100_polars_batched.parquet", index=False)
print("Saved: top100_polars_batched.parquet, rows =", len(top100))

Saved: top100_polars_batched.parquet, rows = 47034700
