In [231]:
import zipfile
import requests

import numpy as np
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from scipy.sparse import csr_matrix

from PIL import Image
from io import BytesIO
from textwrap import wrap
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import roc_auc_score, log_loss, ndcg_score
from catboost import CatBoostClassifier, Pool
from datetime import datetime
from collections import defaultdict
from typing import Dict, List

In [232]:
train = pl.read_parquet('data/lavka/train.parquet').sort('timestamp')
test = pl.read_parquet('data/lavka/test.parquet')

train = train.sort(
    'timestamp'
).with_columns(
    pl.from_epoch(pl.col("timestamp")).dt.date().alias("date")
)
unique_dates = train["date"].unique().sort()

FILL_VALUE = -999999999.0

---
USER Features
---

In [233]:
def calculate_count_purchases_user(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        'user_id'
    ).agg(
        pl.len().alias('count_purchases_user')
    )

In [234]:
def calculate_count_views_user(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_View"
    ).group_by(
        'user_id'
    ).agg(
        pl.len().alias('count_views_user')
    )

In [235]:
def calculate_count_clicks_user(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_Click"
    ).group_by(
        'user_id'
    ).agg(
        pl.len().alias('count_clicks_user')
    )

In [236]:
def calculate_total_interactions_user(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by('user_id')
        .agg(
            pl.len().alias('total_interactions_user')
        )
    )

In [237]:
def calculate_purchase_to_views_ratio_user(dataset: pl.DataFrame) -> pl.DataFrame:
    views = dataset.filter(
        pl.col('action_type') == "AT_View"
    ).group_by(
        'user_id'
    ).agg(
        pl.len().alias('count_views')
    )
    
    purchases = dataset.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        'user_id'
    ).agg(
        pl.len().alias('count_purchases')
    )
    
    return views.join(purchases, on='user_id', how='left').with_columns(
                pl.col('count_purchases').fill_null(0),
                (pl.col('count_purchases') / pl.col('count_views')).alias('purchase_to_views_ratio_user')
            ).select([
                'user_id', 'purchase_to_views_ratio_user',
            ])

In [238]:
def calculate_unique_products_user(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type').is_in(["AT_Purchase"])
    ).group_by('user_id').agg(
        pl.col('product_id').n_unique().alias('unique_products_user')
    )

In [239]:
def calculate_hourly_user_purchases(dataset: pl.DataFrame) -> pl.DataFrame:
    dataset = dataset.with_columns(
       pl.from_epoch(pl.col('timestamp')).dt.hour().alias('hour_of_day')
    )
    
    hourly_user = dataset.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        ['user_id', 'hour_of_day']
    ).agg(
        pl.len().alias('purchases_in_hour_user')
    )
    
    return hourly_user

In [240]:
def calculate_mean_time_between_purchases(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.filter(pl.col("action_type") == "AT_Purchase")
        .sort(["user_id", "date"])
        .group_by("user_id")
        .agg(
            pl.col("date").diff().dt.total_days()
            .mean().alias("mean_purchase_interval")
        )
    )


In [241]:
def calculate_purchase_time_std(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.filter(pl.col("action_type") == "AT_Purchase")
        .sort(["user_id", "date"])
        .group_by("user_id")
        .agg(
            pl.col("date").diff().dt.total_days()
            .std().alias("purchase_interval_std")
        )
    )

In [242]:
def calculate_purchase_trend(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.filter(pl.col("action_type") == "AT_Purchase")
        .sort(["user_id", "date"])
        .with_columns(
            pl.col("date").rank().over("user_id").alias("rank")
        )
        .group_by("user_id")
        .agg(
            (pl.cov("date", "rank") / pl.var("rank"))
            .alias("purchase_trend_coef")
        )
    )

In [None]:
def calculate_response_time(data):
    return data.sort("date").group_by("user_id", "product_id").agg(
        (pl.col("date").filter(pl.col("action_type") == "AT_Purchase").first() - 
        pl.col("date").filter(pl.col("action_type") == "AT_View").first()
    ).dt.total_seconds().alias("view_to_purchase_seconds"))

---
PRODUCT Features
---

In [244]:
def calculate_count_purchases_product(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        'product_id'
    ).agg(
        pl.len().alias('count_purchases_product')
    )

In [245]:
def calculate_count_views_product(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_View"
    ).group_by(
        'product_id'
    ).agg(
        pl.len().alias('count_views_product')
    )

In [246]:
def calculate_count_clicks_product(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_Click"
    ).group_by(
        'product_id'
    ).agg(
        pl.len().alias('count_clicks_product')
    )

In [247]:
def calculate_total_interactions_product(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by('product_id')
        .agg(
            pl.len().alias('total_interactions_product')
        )
    )

In [248]:
def calculate_purchase_to_views_ratio_product(dataset: pl.DataFrame) -> pl.DataFrame:
    views = dataset.filter(
        pl.col('action_type') == "AT_View"
    ).group_by(
        'product_id'
    ).agg(
        pl.len().alias('count_views')
    )
    
    purchases = dataset.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        'product_id'
    ).agg(
        pl.len().alias('count_purchases')
    )
    
    return views.join(purchases, on='product_id', how='left').with_columns(
                pl.col('count_purchases').fill_null(0),
                (pl.col('count_purchases') / pl.col('count_views')).alias('purchase_to_views_ratio_product')
            ).select([
                'product_id', 'purchase_to_views_ratio_product',
            ])

In [249]:
def calculate_purchase_to_cart_ratio_product(dataset: pl.DataFrame) -> pl.DataFrame:
    cart = dataset.filter(
        pl.col('action_type') == "AT_CartUpdate"
    ).group_by(
        'product_id'
    ).agg(
        pl.len().alias('count_cart')
    )
    
    purchases = dataset.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        'product_id'
    ).agg(
        pl.len().alias('count_purchases')
    )
    
    return cart.join(purchases, on='product_id', how='left').with_columns(
                pl.col('count_purchases').fill_null(0),
                (pl.col('count_purchases') / pl.col('count_cart')).alias('purchase_to_cart_ratio_product')
            ).select([
                'product_id', 'purchase_to_cart_ratio_product',
            ])

In [250]:
def calculate_unique_users_product(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type').is_in(["AT_Purchase"])
    ).group_by('product_id').agg(
        pl.col('user_id').n_unique().alias('unique_users_product')
    )

In [251]:
def calculate_hourly_product_purchases(dataset: pl.DataFrame) -> pl.DataFrame:
    dataset = dataset.with_columns(
       pl.from_epoch(pl.col('timestamp')).dt.hour().alias('hour_of_day')
    )
    
    hourly_purchases = dataset.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        ['product_id', 'hour_of_day']
    ).agg(
        pl.len().alias('purchases_in_hour')
    )
    
    return hourly_purchases


---
CITY Features
---

In [252]:
def calculate_count_purchases_city(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        'city_name'
    ).agg(
        pl.len().alias('count_purchases_city')
    )

In [253]:
def calculate_count_views_city(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_View"
    ).group_by(
        'city_name'
    ).agg(
        pl.len().alias('count_views_city')
    )

In [254]:
def calculate_count_clicks_city(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_Click"
    ).group_by(
        'city_name'
    ).agg(
        pl.len().alias('count_clicks_city')
    )

In [255]:
def calculate_total_interactions_city(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by('city_name')
        .agg(
            pl.len().alias('total_interactions_city')
        )
    )

In [256]:
def calculate_purchase_to_views_ratio_city(dataset: pl.DataFrame) -> pl.DataFrame:
    views = dataset.filter(
        pl.col('action_type') == "AT_View"
    ).group_by(
        'city_name'
    ).agg(
        pl.len().alias('count_views')
    )
    
    purchases = dataset.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        'city_name'
    ).agg(
        pl.len().alias('count_purchases')
    )
    
    return views.join(purchases, on='city_name', how='left').with_columns(
                pl.col('count_purchases').fill_null(0),
                (pl.col('count_purchases') / pl.col('count_views')).alias('purchase_to_views_ratio_city')
            ).select([
                'city_name', 'purchase_to_views_ratio_city',
            ])

In [257]:
def calculate_total_interactions_to_purchases_ratio_city(dataset: pl.DataFrame) -> pl.DataFrame:
    purchases = dataset.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        'city_name'
    ).agg(
        pl.len().alias('count_purchases')
    )
    
    total = dataset.group_by('city_name').agg(
            pl.len().alias('total_interactions')
        )
    
    return purchases.join(total, on='city_name', how='left').with_columns(
                pl.col('total_interactions').fill_null(0),
                (pl.col('count_purchases') / pl.col('total_interactions')).alias('total_interactions_to_purchases_ratio_city')
            ).select([
                'city_name', 'total_interactions_to_purchases_ratio_city',
            ])

In [258]:
def calculate_unique_stores_city(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.group_by('city_name').agg(
        pl.col('store_id').n_unique().alias('unique_stores_city')
    )

---
USER_AND_PRODUCT Features
---

In [None]:
import polars as pl

def calculate_days_since_last_purchase(dataset: pl.DataFrame) -> pl.DataFrame:
    last_purchases = (
        dataset
        .filter(pl.col("action_type") == "AT_Purchase")
        .group_by("user_id", "product_id")
        .agg(
            pl.col("date").max().alias("last_purchase_date")
        )
    )
    return (
        dataset
        .join(
            last_purchases,
            on=["user_id", "product_id"],
            how="left"
        )
        .with_columns(
            (pl.col("date") - pl.col("last_purchase_date"))
            .dt.total_days()
            .alias("days_since_last_purchase")
        )
        .drop("last_purchase_date")
        .select([
            "date",
            "user_id",
            "product_id",
            "days_since_last_purchase"
        ])
    )


In [260]:
def calculate_mean_time_between_cart_updates(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.filter(pl.col("action_type") == "AT_CartUpdate")
        .sort("date")
        .group_by("user_id", "product_id")
        .agg(
            pl.col("date").diff().dt.total_days()
            .mean().alias("u2s_mean_time_between_cartupdates")
        )
    )


In [261]:
def calculate_count_views_user_and_product(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_View"
    ).group_by(
        'user_id',
        'product_id'
    ).agg(
        pl.len().alias('count_views_user_and_product')
    )

In [262]:
def calculate_count_click_user_and_product(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_Click"
    ).group_by(
        'user_id',
        'product_id'
    ).agg(
        pl.len().alias('count_click_user_and_product')
    )

In [263]:
def calculate_count_purchase_user_and_product(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        'user_id',
        'product_id'
    ).agg(
        pl.len().alias('count_purchase_user_and_product')
    )

In [264]:
def calculate_last_purchase_time_user_and_product(dataset: pl.DataFrame) -> pl.DataFrame:

    return (
        dataset
        .filter(pl.col('action_type') == "AT_Purchase")
        .sort(['user_id', 'product_id', 'timestamp'])
        .group_by(['user_id', 'product_id'])
        .agg(
            pl.col('timestamp').max().alias('last_purchase_time')
        )
    )


In [None]:
def calculate_purchase_to_views_ratio_user_and_product(dataset: pl.DataFrame) -> pl.DataFrame:
    views = dataset.filter(
        pl.col('action_type') == "AT_View"
    ).select(["user_id", "product_id", "date"])
    
    purchases = dataset.filter(
        pl.col('action_type') == "AT_Purchase"
    ).select(["user_id", "product_id", "date"])
    
    result_df = views.group_by(
        ['user_id', 'product_id']
    ).agg(
        pl.len().alias('total_views')
    ).join(
        purchases.group_by(['user_id', 'product_id']).agg(
            pl.len().alias('total_purchases')
        ),
        on=['user_id', 'product_id'],
        how='left'
    ).with_columns(
        pl.col('total_purchases').fill_null(0),
        (pl.col('total_purchases') / pl.col('total_views')).alias('purchase_to_views_ratio')
    ).select(
        ['user_id', 'product_id', 'purchase_to_views_ratio']
    )
    
    for days in range(60, 361, 60):
        # Просмотры за период
        period_views = views.group_by(['user_id', 'product_id']).agg(
            pl.col('date').filter(
                pl.col('date') > (pl.max('date') - pl.duration(days=days))
            ).count().alias(f'views_last_{days}d')
        )
        
        # Покупки за период
        period_purchases = purchases.group_by(['user_id', 'product_id']).agg(
            pl.col('date').filter(
                pl.col('date') > (pl.max('date') - pl.duration(days=days))
            ).count().alias(f'purchases_last_{days}d')
        )
        
        # Соединяем и вычисляем отношение
        period_ratio = period_views.join(
            period_purchases,
            on=['user_id', 'product_id'],
            how='left'
        ).with_columns(
            pl.col(f'purchases_last_{days}d').fill_null(0),
            (pl.col(f'purchases_last_{days}d') / pl.col(f'views_last_{days}d'))
            .fill_null(0)
            .alias(f'ratio_last_{days}d')
        ).select(
            ['user_id', 'product_id', f'ratio_last_{days}d']
        )
        
        # Добавляем к результату
        result_df = result_df.join(
            period_ratio,
            on=['user_id', 'product_id'],
            how='left'
        )
    
    return result_df

In [266]:
def calculate_total_interactions_user_and_product(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by(['user_id', 'product_id'])
        .agg(
            pl.len().alias('total_interactions_user_and_product')
        )
    )

---
OTHER
---

In [267]:
def calculate_ctr(dataset: pl.DataFrame) -> pl.DataFrame:
    data = dataset.group_by(
        'action_type',
        'product_id'
    ).agg(
        pl.len()
    )
    
    clicks = data.filter(
         pl.col('action_type') == "AT_Click"
    )
    
    views = data.filter(
         pl.col('action_type') == "AT_View"
    )
    
    ctr = clicks.join(
        views, 
        on='product_id'
    ).with_columns(
        ctr=pl.col('len') / pl.col('len_right')
    ).select(
        'product_id', 
        'ctr'
    )

    return ctr

In [268]:
def calculate_convertion(dataset: pl.DataFrame) -> pl.DataFrame:
    data = dataset.group_by(
        'action_type',
        'product_id'
    ).agg(
        pl.len()
    )
    
    purchases = data.filter(
         pl.col('action_type') == "AT_CartUpdate"
    )
    
    clicks = data.filter(
         pl.col('action_type') == "AT_Click"
    )
    
    convertion = purchases.join(
        clicks, 
        on='product_id'
    ).with_columns(
        convertion=pl.col('len') / pl.col('len_right')
    ).select(
        'product_id', 
        'convertion'
    )

    return convertion

In [269]:
def calculate_count_purchases_store(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by('store_id').agg(
        pl.len().alias('count_purchases_store'),
    )

In [272]:
def calculate_count_purchase_user_and_store(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_Purchase"
    ).group_by(
        'user_id',
        'store_id'
    ).agg(
        pl.len().alias('count_purchase_user_and_store')
    )

In [273]:
def calculate_count_purchases_user_and_category(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset
        .filter(pl.col('action_type') == "AT_Purchase")
        .group_by(['user_id', 'product_category'])
        .agg(
            pl.len().alias('user_category_purchases')
        )
    )

In [274]:
def calculate_count_views_user_and_category(dataset: pl.DataFrame) -> pl.DataFrame:
    """Подсчёт количества просмотров по пользователям и категориям"""
    return (
        dataset
        .filter(pl.col('action_type') == "AT_View")
        .group_by(['user_id', 'product_category'])
        .agg(
            pl.len().alias('user_category_views')
        )
    )

def calculate_count_clicks_user_and_category(dataset: pl.DataFrame) -> pl.DataFrame:
    """Подсчёт количества кликов по пользователям и категориям"""
    return (
        dataset
        .filter(pl.col('action_type') == "AT_Click")
        .group_by(['user_id', 'product_category'])
        .agg(
            pl.len().alias('user_category_clicks')
        )
    )

def calculate_views_to_clicks_ratio_user_and_category(dataset: pl.DataFrame) -> pl.DataFrame:
    """Отношение просмотров к кликам по пользователям и категориям"""
    views = calculate_count_views_user_and_category(dataset)
    clicks = calculate_count_clicks_user_and_category(dataset)
    
    return (
        views.join(clicks, on=['user_id', 'product_category'], how='left')
        .with_columns(
            (pl.col('user_category_clicks') / pl.col('user_category_views')).alias('user_category_views_to_clicks_ratio')
        )
        .fill_nan(0)
        .select(['user_id', 'product_category', 'user_category_views_to_clicks_ratio'])
    )

def calculate_clicks_to_purchases_ratio_user_and_category(dataset: pl.DataFrame) -> pl.DataFrame:
    """Отношение кликов к покупкам по пользователям и категориям"""
    clicks = calculate_count_clicks_user_and_category(dataset)
    purchases = calculate_count_purchases_user_and_category(dataset)
    
    return (
        clicks.join(purchases, on=['user_id', 'product_category'], how='left')
        .with_columns(
            (pl.col('user_category_purchases') / pl.col('user_category_clicks')).alias('user_category_clicks_to_purchases_ratio')
        )
        .fill_nan(0)
        .select(['user_id', 'product_category', 'user_category_clicks_to_purchases_ratio'])
    )

def calculate_total_interactions_user_and_category(dataset: pl.DataFrame) -> pl.DataFrame:
    """Общее количество взаимодействий по пользователям и категориям"""
    return (
        dataset
        .group_by(['user_id', 'product_category'])
        .agg(
            pl.len().alias('user_category_total_interactions')
        )
    )

In [275]:
def calculate_clicks_to_views_ratio_city(dataset: pl.DataFrame) -> pl.DataFrame:
    clicks = dataset.filter(
        pl.col('action_type') == "AT_Click"
    ).group_by(
        'city_name', 'product_id'
    ).agg(
        pl.len().alias('count_clicks')
    )
    
    views = dataset.filter(
        pl.col('action_type') == "AT_View"
    ).group_by(
        'city_name', 'product_id'
    ).agg(
        pl.len().alias('count_views')
    )
    
    return clicks.join(views, on=['city_name', 'product_id'], how='left').with_columns(
                pl.col('count_clicks').fill_null(0),
                (pl.col('count_clicks') / pl.col('count_views')).alias('clicks_to_views_ratio_city')
            ).select([
                'city_name', 'product_id', 'clicks_to_views_ratio_city',
            ])


def calculate_clicks_to_cart_ratio_user_product(dataset: pl.DataFrame) -> pl.DataFrame:
    clicks = dataset.filter(
        pl.col('action_type') == "AT_Click"
    ).group_by(
        'user_id', 'product_id'
    ).agg(
        pl.len().alias('count_clicks')
    )
    
    carts = dataset.filter(
        pl.col('action_type') == "AT_CartUpdate"
    ).group_by(
        'user_id', 'product_id'
    ).agg(
        pl.len().alias('count_carts')
    )
    
    return clicks.join(carts, on=['user_id', 'product_id'], how='left').with_columns(
                pl.col('count_clicks').fill_null(0),
                (pl.col('count_clicks') / pl.col('count_carts')).alias('click_to_cart_ratio_user_product')
            ).select([
                'user_id', 'product_id', 'click_to_cart_ratio_user_product',
            ])

def calculate_clicks_to_cart_ratio_product(dataset: pl.DataFrame) -> pl.DataFrame:
    clicks = dataset.filter(
        pl.col('action_type') == "AT_Click"
    ).group_by(
        'product_id'
    ).agg(
        pl.len().alias('count_clicks')
    )
    
    carts = dataset.filter(
        pl.col('action_type') == "AT_CartUpdate"
    ).group_by(
        'product_id'
    ).agg(
        pl.len().alias('count_carts')
    )
    
    return clicks.join(carts, on='product_id', how='left').with_columns(
                pl.col('count_clicks').fill_null(0),
                (pl.col('count_clicks') / pl.col('count_carts')).alias('click_to_cart_ratio_product')
            ).select([
                'product_id', 'click_to_cart_ratio_product',
            ])

def calculate_log_count_views_user(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.filter(
        pl.col('action_type') == "AT_View"
    ).group_by(
        'user_id'
    ).agg(
        pl.len().log().alias('log_count_views_user')
    )

def calculate_clicks_to_cart_ratio_user(dataset: pl.DataFrame) -> pl.DataFrame:
    clicks = dataset.filter(
        pl.col('action_type') == "AT_Click"
    ).group_by(
        'user_id'
    ).agg(
        pl.len().alias('count_clicks')
    )
    
    carts = dataset.filter(
        pl.col('action_type') == "AT_CartUpdate"
    ).group_by(
        'user_id'
    ).agg(
        pl.len().alias('count_carts')
    )
    
    return clicks.join(carts, on='user_id', how='left').with_columns(
                pl.col('count_clicks').fill_null(0),
                (pl.col('count_clicks') / pl.col('count_carts')).alias('click_to_cart_ratio_user')
            ).select([
                'user_id', 'click_to_cart_ratio_user',
            ])

---
NEW
---

In [276]:
def calculate_user_retention(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.filter(pl.col("action_type") == "AT_Purchase")
        .sort(["user_id", "date"])
        .group_by("user_id")
        .agg(
            (pl.col("product_id").n_unique() / pl.col("date").n_unique()).alias("repeat_purchase_ratio"),
            (pl.col("date").diff().dt.total_days().mean() < 5).cast(pl.Int8).alias("frequent_buyer")
        )
    )

#user

def calculate_product_position_efficiency(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by("product_id")
        .agg(
            pl.col("position_in_request").filter(
                pl.col("action_type") == "AT_Click"
            ).mean().alias("avg_purchase_position"),
            
            pl.col("position_in_request").filter(
                pl.col("action_type") == "AT_Click"
            ).min().alias("best_position")
        )
    )

#item

def calculate_behavior_uniqueness(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by("user_id")
        .agg(
            (pl.col("product_category").n_unique() / pl.col("product_id").n_unique()).alias("category_exploration_index"),
            (pl.col("source_type").n_unique() / pl.col("request_id").n_unique()).alias("source_diversity_index")
        )
    )

#user

def calculate_geo_dependency(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by(["product_id", "city_name"])
        .agg(
            (pl.col("action_type") == "AT_Purchase").sum().alias("city_purchases")
        )
        .group_by("product_id")
        .agg(
            (pl.col("city_purchases").max() / pl.col("city_purchases").sum()).alias("geo_concentration"),
            pl.col("city_name").filter(pl.col("city_purchases") == pl.col("city_purchases").max()).first().alias("top_city")
        )
    )

#item

def calculate_product_hourly_pattern(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.with_columns(hour=pl.from_epoch("timestamp").dt.hour())
        .filter(pl.col("action_type") == "AT_Purchase")
        .group_by(["product_id", "hour"])
        .agg(pl.len().alias("purchases"))
        .sort(["product_id", "hour"])
        .group_by("product_id")
        .agg(
            pl.col("hour").filter(pl.col("purchases") == pl.col("purchases").max()).first().alias("peak_hour"),
            (pl.col("purchases").max() / pl.col("purchases").sum()).alias("hour_concentration")
        )
    )

#item

def calculate_position_stability(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by("product_id")
        .agg(
            pl.col("position_in_request").std().alias("position_std"),
            (pl.col("position_in_request").max() - pl.col("position_in_request").min()).alias("position_range")
        )
    )

#item

def calculate_category_trend(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.with_columns(week=pl.from_epoch("timestamp").dt.truncate("1w"))
        .filter(pl.col("action_type") == "AT_Purchase")
        .group_by(["product_category", "week"])
        .agg(pl.len().alias("weekly_purchases"))
        .sort(["product_category", "week"])
        .group_by("product_category")
        .agg(
            ((pl.col("weekly_purchases").last() - pl.col("weekly_purchases").first()) / 
             pl.col("weekly_purchases").first()).alias("category_trend")
        )
    )

# cats

def calculate_request_uniqueness(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by("product_id")
        .agg(
            (pl.col("request_id").n_unique() / pl.len()).alias("request_uniqueness"),
            pl.col("position_in_request").std().alias("request_position_variability")
        )
    )

#item

def calculate_seasonality(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.with_columns(
            month=pl.from_epoch("timestamp").dt.month(),
            is_purchase=(pl.col("action_type") == "AT_Purchase").cast(pl.Int8)
        )
        .group_by(["product_id", "month"])
        .agg(
            pl.col("is_purchase").sum().alias("monthly_purchases")
        )
        .group_by("product_id")
        .agg(
            (pl.col("monthly_purchases").max() - pl.col("monthly_purchases").min()).alias("seasonality_amplitude"),
            pl.col("month").filter(pl.col("monthly_purchases") == pl.col("monthly_purchases").max()).first().alias("peak_month")
        )
    )

#item

def calculate_social_proof_metrics(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by(["product_id", "city_name"])
        .agg(
            pl.col("user_id").n_unique().alias("unique_buyers_city"),
            pl.len().alias("total_purchases_city")
        )
        .group_by("product_id")
        .agg(
            (pl.col("unique_buyers_city").max() / pl.col("unique_buyers_city").sum()).alias("city_concentration_index"),
            (pl.col("total_purchases_city").max() / pl.col("total_purchases_city").sum()).alias("purchase_concentration_index")
        )
    )

#item

def calculate_engagement_dynamics_user(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.with_columns(
            week=pl.from_epoch("timestamp").dt.truncate("1w")
        )
        .group_by(["user_id", "week"])
        .agg(
            pl.col("action_type").filter(pl.col("action_type") == "AT_Purchase").count().alias("weekly_purchases"),
            pl.col("action_type").filter(pl.col("action_type") == "AT_View").count().alias("weekly_views")
        )
        .sort(["user_id", "week"])
        .group_by("user_id")
        .agg(
            ((pl.col("weekly_purchases").last() - pl.col("weekly_purchases").first()) / 
             (pl.col("weekly_purchases").first() + 1)).alias("purchase_growth_rate"),
            ((pl.col("weekly_views").last() - pl.col("weekly_views").first()) / 
             (pl.col("weekly_views").first() + 1)).alias("view_growth_rate")
        )
    )

#user

def calculate_engagement_dynamics(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.with_columns(
            week=pl.from_epoch("timestamp").dt.truncate("1w")
        )
        .group_by(["user_id", "product_id", "week"])
        .agg(
            (pl.col("action_type") == "AT_Purchase").sum().alias("weekly_purchases"),
            pl.len().alias("weekly_interactions")
        )
        .sort(["user_id", "product_id", "week"])
        .group_by(["user_id", "product_id"])
        .agg(
            # Тренд вовлеченности
            ((pl.col("weekly_interactions").last() - pl.col("weekly_interactions").first()) / 
             (pl.col("weekly_interactions").first() + 1)).alias("engagement_trend"),
            # Стабильность взаимодействий
            pl.col("weekly_interactions").std().alias("engagement_stability")
        )
    )

#user item

def calculate_relative_attractiveness(dataset: pl.DataFrame) -> pl.DataFrame:
    user_product_stats = (
        dataset.group_by(["user_id", "product_id"])
        .agg(
            (pl.col("action_type") == "AT_Purchase").sum().alias("user_product_purchases"),
            pl.len().alias("user_product_interactions")
        )
    )
    
    user_stats = (
        dataset.group_by("user_id")
        .agg(
            pl.len().alias("total_user_interactions"),
            (pl.col("action_type") == "AT_Purchase").sum().alias("total_user_purchases")
        )
    )
    
    return (
        user_product_stats.join(user_stats, on="user_id")
        .select([
            "user_id",
            "product_id",
            # Относительная популярность продукта у пользователя
            (pl.col("user_product_interactions") / pl.col("total_user_interactions")).alias("relative_interaction_share"),
            # Относительная конверсия продукта
            (pl.col("user_product_purchases") / pl.col("user_product_interactions") - 
             pl.col("total_user_purchases") / pl.col("total_user_interactions")).alias("conversion_deviation")
        ])
    )

#user item

def calculate_personal_value_index(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by(["user_id", "product_id"])
        .agg(
            ((pl.col("action_type") == "AT_Purchase").sum() * 2 - 
             (pl.col("action_type") == "AT_View").sum() * 0.5 +
             (pl.col("action_type") == "AT_Cart").sum() * 1.5) / 
            pl.len().alias("personal_value_score")
        )
    )

#user item

def calculate_brand_loyalty_index(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by(["user_id", "store_id", "product_id"])
        .agg(
            (pl.col("action_type") == "AT_Purchase").sum().alias("store_product_purchases"),
            pl.len().alias("store_product_interactions")
        )
        .group_by(["user_id", "product_id"])
        .agg(
            (pl.col("store_product_purchases").max() / 
             pl.col("store_product_interactions").sum()).alias("brand_loyalty_index")
        )
    )

#user item

def calculate_seasonal_demand_coefficient(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.with_columns(month=pl.from_epoch("timestamp").dt.month())
        .group_by(["user_id", "product_id", "month"])
        .agg(pl.len().alias("monthly_interactions"))
        .group_by(["user_id", "product_id"])
        .agg(
            (pl.col("monthly_interactions").max() - 
             pl.col("monthly_interactions").min()) / 
            pl.col("monthly_interactions").mean().alias("seasonality_index")
        )
    )

#user item

def calculate_cross_channel_index(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by(["user_id", "product_id", "source_type"])
        .agg(
            (pl.col("action_type") == "AT_Purchase").sum().alias("source_purchases"),
            pl.len().alias("source_interactions")
        )
        .group_by(["user_id", "product_id"])
        .agg(
            pl.col("source_type").n_unique().alias("unique_channels_used")
        )
    )

#user item

def calculate_user_session_duration(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by(["user_id", "date"])
        .agg(
            (pl.max("timestamp") - pl.min("timestamp")).alias("session_duration")
        )
        .group_by("user_id")
        .agg(
            pl.mean("session_duration").alias("avg_session_duration")
        )
    )

#user

def calculate_user_preferred_category(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.filter(pl.col("action_type") == "AT_Purchase")
        .group_by(["user_id", "product_category"])
        .agg(pl.len().alias("purchase_count"))
        .group_by("user_id")
        .agg(
            pl.col("product_category").first().alias("preferred_category"),
            pl.max("purchase_count").alias("preferred_category_purchases")
        )
    )

#user

def calculate_user_device_preference(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by(["user_id", "source_type"])
        .agg(pl.len().alias("interaction_count"))
        .group_by("user_id")
        .agg(
            pl.col("source_type").first().alias("preferred_device"),
            pl.max("interaction_count").alias("preferred_device_interactions")
        )
    )

#user

def calculate_product_age(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by("product_id")
        .agg(
            ((pl.max("date") - pl.min("date")).dt.total_days()).alias("product_age_days")
        )
    )

#product

def calculate_product_popularity_trend(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.filter(pl.col("action_type") == "AT_View")
        .group_by(["product_id", "date"])
        .agg(pl.len().alias("daily_views"))
        .group_by("product_id")
        .agg(
            pl.cov("date", "daily_views").alias("popularity_trend")
        )
    )

#product

def calculate_city_purchase_frequency(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.filter(pl.col("action_type") == "AT_Purchase")
        .group_by(["city_name", "date"])
        .agg(pl.len().alias("daily_purchases"))
        .group_by("city_name")
        .agg(
            pl.mean("daily_purchases").alias("avg_daily_purchases")
        )
    )

#city

def calculate_repeat_interaction_index(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by(["user_id", "product_id"])
        .agg(
            ((pl.col("action_type") == "AT_View").sum() > 1).cast(pl.Int8).alias("is_repeat_view"),
            ((pl.col("action_type") == "AT_CartUpdate").sum() > 1).cast(pl.Int8).alias("is_repeat_cart"),
            (pl.col("date").n_unique() > 1).cast(pl.Int8).alias("multi_day_interaction")
        )
    )

#user item

def calculate_interaction_time_patterns(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset.group_by(["user_id", "product_id"])
        .agg(
            (pl.max("timestamp") - pl.min("timestamp")).alias("total_interaction_time"),
            (pl.col("timestamp").diff().mean()).alias("avg_time_between_interactions"),
            ((pl.max("date") - pl.min("date")).dt.total_days()).alias("interaction_period_days")
        )
    )

#user item


In [277]:
def calculate_user_activity_trend(data):
    return data.group_by_dynamic("date", every="1w", group_by="user_id").agg(
        pl.col("action_type").filter(pl.col("action_type") == "AT_Purchase").count().alias("weekly_purchases"),
        pl.col("action_type").filter(pl.col("action_type") == "AT_View").count().alias("weekly_views")
    ).group_by("user_id").agg(
        pl.col("weekly_purchases").last().alias("last_week_purchases"),
        pl.col("weekly_purchases").mean().alias("avg_weekly_purchases"),
        (pl.col("weekly_purchases").last() - pl.col("weekly_purchases").mean()).alias("purchase_trend")
    )


In [278]:
def calculate_purchase_stability(data):
    return data.group_by_dynamic("date", every="1w", group_by="user_id").agg(
        pl.col("action_type").filter(pl.col("action_type") == "AT_Purchase").count().alias("weekly_purchases")
    ).group_by("user_id").agg(
        (pl.col("weekly_purchases").std() / pl.col("weekly_purchases").mean()).alias("purchase_variability")
    )

In [None]:
def calculate_product_boom(df):
    return df.filter(pl.col("action_type") == "AT_Purchase") \
        .group_by_dynamic("date", every="1w", group_by="product_id") \
        .agg(pl.len().alias("weekly_purchases")) \
        .group_by("product_id") \
        .agg(
            (pl.col("weekly_purchases").last() / pl.col("weekly_purchases").mean()).alias("recent_popularity_boom"),
            pl.col("weekly_purchases").std().alias("popularity_volatility")
        )

def calculate_user_store_diversity(df):
    return df.group_by("user_id").agg(
        pl.col("store_id").n_unique().alias("user_store_diversity")
    )

def calculate_avg_product_position(df):
    return df.group_by("product_id").agg(
        pl.col("position_in_request").mean().alias("avg_product_position")
    )

def calculate_user_product_views_without_purchase(dataset: pl.DataFrame) -> pl.DataFrame:  
    views = dataset.filter(pl.col("action_type") == "AT_View")  
    purchases = dataset.filter(pl.col("action_type") == "AT_Purchase")  
    return views.join(  
        purchases, on=["user_id", "product_id"], how="anti"  
    ).group_by(["user_id", "product_id"]).agg(  
        pl.len().alias("user_product_views_without_purchase")  
    )  

def calculate_user_category_exploration(dataset: pl.DataFrame) -> pl.DataFrame:  
    return dataset.group_by("user_id").agg(  
        pl.col("product_category").n_unique().alias("user_unique_categories_viewed")  
    )  

def calculate_same_day_purchase(dataset: pl.DataFrame) -> pl.DataFrame:  
    first_views = dataset.filter(pl.col("action_type") == "AT_View").group_by(  
        ["user_id", "product_id"]  
    ).agg(pl.col("date").min().alias("first_view_date"))  
    
    purchases = dataset.filter(pl.col("action_type") == "AT_Purchase").group_by(  
        ["user_id", "product_id"]  
    ).agg(pl.col("date").min().alias("first_purchase_date"))  
    
    return first_views.join(  
        purchases, on=["user_id", "product_id"], how="left"  
    ).with_columns(  
        (pl.col("first_purchase_date") == pl.col("first_view_date")).cast(pl.Int8).alias("same_day_purchase")  
    ).select(["user_id", "product_id", "same_day_purchase"])  

def calculate_user_purchase_ratio(dataset: pl.DataFrame) -> pl.DataFrame:
    return dataset.group_by("user_id").agg(
        (pl.col("action_type") == "AT_Purchase").mean().alias("user_purchase_ratio")
    )

def calculate_user_category_entropy(dataset: pl.DataFrame) -> pl.DataFrame:
    category_counts = dataset.group_by(["user_id", "product_category"]).agg(
        pl.len().alias("count")
    )
    return category_counts.group_by("user_id").agg(
        (
            - (pl.col("count") / pl.col("count").sum() * 
              (pl.col("count") / pl.col("count").sum()).log()  # Исправлено: .log() вместо pl.log()
            ).sum().alias("category_entropy"))
    )

def calculate_is_preferred_category(dataset: pl.DataFrame) -> pl.DataFrame:
    user_preferred_categories = (
        dataset
        .group_by(["user_id", "product_category"])
        .agg(pl.len().alias("category_count"))
        .sort(["user_id", "category_count"], descending=[False, True])
        .group_by("user_id")
        .agg(pl.col("product_category").first().alias("preferred_category"))
    )
    
    result = (
        dataset
        .join(user_preferred_categories, on="user_id", how="left")
        .select([
            "user_id",
            "product_category",
            (pl.col("product_category") == pl.col("preferred_category"))
            .alias("is_preferred_category")
        ])
        .unique(subset=["user_id", "product_category"])
    )
    
    return result


In [None]:
def calculate_npmi(df: pl.DataFrame) -> pl.DataFrame:
    total_actions = df.height
    user_counts = df.group_by('user_id').agg(pl.len().alias('user_total_actions'))
    item_counts = df.group_by('product_id').agg(pl.len().alias('item_total_actions'))
    user_item_counts = df.group_by(['user_id', 'product_id']).agg(pl.len().alias('user_item_actions'))
    
    stats = (
        user_item_counts
        .join(user_counts, on='user_id')
        .join(item_counts, on='product_id')
    )
    
    stats = stats.with_columns(
        (pl.col('user_item_actions') / total_actions).alias('p_xy'),
        (pl.col('user_total_actions') / total_actions).alias('p_x'),
        (pl.col('item_total_actions') / total_actions).alias('p_y')
    ).with_columns(
        (pl.lit(1) + (pl.col('p_xy').log() - (pl.col('p_x') * pl.col('p_y')).log())).alias('pmi'),
        ((pl.col('p_xy').log() - (pl.col('p_x') * pl.col('p_y')).log()) / 
        (-pl.col('p_xy').log())).alias('npmi')
    ).select(
        'user_id',
        'product_id',
        'npmi',
        'pmi'
    )
    
    return stats

In [None]:
def calculate_count_views_user_and_source(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset
        .filter(pl.col('action_type') == "AT_View")
        .group_by(['user_id', 'source_type'])
        .agg(
            pl.len().alias('user_source_views')
        )
    )

def calculate_count_clicks_user_and_source(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset
        .filter(pl.col('action_type') == "AT_Click")
        .group_by(['user_id', 'source_type'])
        .agg(
            pl.len().alias('user_source_clicks')
        )
    )

def calculate_count_purchases_user_and_source(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset
        .filter(pl.col('action_type') == "AT_Purchase")
        .group_by(['user_id', 'source_type'])
        .agg(
            pl.len().alias('user_source_purchases')
        )
    )

def calculate_views_to_clicks_ratio_user_and_source(dataset: pl.DataFrame) -> pl.DataFrame:
    views = calculate_count_views_user_and_source(dataset)
    clicks = calculate_count_clicks_user_and_source(dataset)
    
    return (
        views.join(clicks, on=['user_id', 'source_type'], how='left')
        .with_columns(
            (pl.col('user_source_clicks') / pl.col('user_source_views')).alias('user_source_views_to_clicks_ratio')
        )
        .fill_nan(0)
        .select(['user_id', 'source_type', 'user_source_views_to_clicks_ratio'])
    )

def calculate_clicks_to_purchases_ratio_user_and_source(dataset: pl.DataFrame) -> pl.DataFrame:
    clicks = calculate_count_clicks_user_and_source(dataset)
    purchases = calculate_count_purchases_user_and_source(dataset)
    
    return (
        clicks.join(purchases, on=['user_id', 'source_type'], how='left')
        .with_columns(
            (pl.col('user_source_purchases') / pl.col('user_source_clicks')).alias('user_source_clicks_to_purchases_ratio')
        )
        .fill_nan(0)
        .select(['user_id', 'source_type', 'user_source_clicks_to_purchases_ratio'])
    )

def calculate_total_interactions_user_and_source(dataset: pl.DataFrame) -> pl.DataFrame:
    return (
        dataset
        .group_by(['user_id', 'source_type'])
        .agg(
            pl.len().alias('user_source_total_interactions')
        )
    )

def calculate_conversion_rate_user_and_source(dataset: pl.DataFrame) -> pl.DataFrame:
    views = calculate_count_views_user_and_source(dataset)
    purchases = calculate_count_purchases_user_and_source(dataset)
    
    return (
        views.join(purchases, on=['user_id', 'source_type'], how='left')
        .with_columns(
            (pl.col('user_source_purchases') / pl.col('user_source_views')).alias('user_source_conversion_rate')
        )
        .fill_nan(0)
        .select(['user_id', 'source_type', 'user_source_conversion_rate'])
    )

---
Solution
---

In [401]:
def join_features_to_dataset(
    dataset: pl.DataFrame,
    count_purchases_user: pl.DataFrame,
    count_views_user: pl.DataFrame,
    count_clicks_user: pl.DataFrame,
    total_interactions_user: pl.DataFrame,
    purchase_to_views_ratio_user: pl.DataFrame,
    unique_products_user: pl.DataFrame,
    hourly_user_purchases: pl.DataFrame,
    mean_time_between_purchases: pl.DataFrame,
    purchase_time_std: pl.DataFrame,
    purchase_trend: pl.DataFrame,
    behavior_uniqueness: pl.DataFrame,
    user_retention: pl.DataFrame,
    engagement_dynamics_user: pl.DataFrame,
    user_session_duration: pl.DataFrame,
    user_preferred_category: pl.DataFrame,
    user_device_preference: pl.DataFrame,
    user_activity_trend: pl.DataFrame,
    purchase_stability: pl.DataFrame,
    user_store_diversity: pl.DataFrame,
    user_category_exploration: pl.DataFrame,
    user_purchase_ratio: pl.DataFrame,
    user_category_entropy: pl.DataFrame,
    clicks_to_cart_ratio_user: pl.DataFrame,
    log_count_views_user: pl.DataFrame,

    count_purchases_product: pl.DataFrame,
    count_views_product: pl.DataFrame,
    count_clicks_product: pl.DataFrame,
    total_interactions_product: pl.DataFrame,
    purchase_to_views_ratio_product: pl.DataFrame,
    purchase_to_cart_ratio_product: pl.DataFrame,
    unique_users_product: pl.DataFrame,
    hourly_product_purchases: pl.DataFrame,
    product_position_efficiency: pl.DataFrame,
    hourly_pattern: pl.DataFrame,
    geo_dependency: pl.DataFrame,
    position_stability: pl.DataFrame,
    request_uniqueness: pl.DataFrame,
    seasonality: pl.DataFrame,
    social_proof_metrics: pl.DataFrame,
    product_age: pl.DataFrame,
    product_popularity_trend: pl.DataFrame,
    product_boom: pl.DataFrame,
    avg_product_position: pl.DataFrame,
    clicks_to_cart_ratio_product: pl.DataFrame,

    count_purchases_city: pl.DataFrame,
    count_views_city: pl.DataFrame,
    count_clicks_city: pl.DataFrame,
    total_interactions_city: pl.DataFrame,
    purchase_to_views_ratio_city: pl.DataFrame,
    total_interactions_to_purchases_ratio_city: pl.DataFrame,
    unique_stores_city: pl.DataFrame,
    city_purchase_frequency: pl.DataFrame,

    count_purchases_user_and_product: pl.DataFrame,
    count_views_user_and_product: pl.DataFrame,
    count_clicks_user_and_product: pl.DataFrame,
    last_purchase_time_user_and_product: pl.DataFrame,
    purchase_to_views_ratio_user_and_product: pl.DataFrame,
    total_interactions_user_and_product: pl.DataFrame,
    mean_time_between_cart_updates: pl.DataFrame,
    days_since_last_purchase: pl.DataFrame,
    cross_channel_index: pl.DataFrame,
    seasonal_demand_coefficient: pl.DataFrame,
    brand_loyalty_index: pl.DataFrame,
    personal_value_index: pl.DataFrame,
    relative_attractiveness: pl.DataFrame,
    engagement_dynamics: pl.DataFrame,
    repeat_interaction_index: pl.DataFrame,
    interaction_time_patterns: pl.DataFrame,
    response_time: pl.DataFrame,
    user_product_views_without_purchase: pl.DataFrame,
    same_day_purchase: pl.DataFrame,
    clicks_to_cart_ratio_user_product: pl.DataFrame,
    npmi: pl.DataFrame,
    
    ctr: pl.DataFrame,
    convertion: pl.DataFrame,
    count_purchases_store: pl.DataFrame,
    count_purchase_user_and_store: pl.DataFrame,
    count_purchases_user_and_category: pl.DataFrame,
    is_preferred_category: pl.DataFrame,
    category_trend: pl.DataFrame,
    clicks_to_views_ratio_city: pl.DataFrame,
    count_views_user_and_category: pl.DataFrame,
    count_clicks_user_and_category: pl.DataFrame,
    views_to_clicks_ratio_user_and_category: pl.DataFrame,
    clicks_to_purchases_ratio_user_and_category: pl.DataFrame,
    total_interactions_user_and_category: pl.DataFrame,
    count_views_user_and_source: pl.DataFrame,
    count_clicks_user_and_source: pl.DataFrame,
    count_purchases_user_and_source: pl.DataFrame,
    views_to_clicks_ratio_user_and_source: pl.DataFrame,
    clicks_to_purchases_ratio_user_and_source: pl.DataFrame,
    total_interactions_user_and_source: pl.DataFrame,
    conversion_rate_user_and_source: pl.DataFrame,

) -> pl.DataFrame:
    catboost_pool = dataset.with_columns(
            hour_of_day=pl.from_epoch(pl.col('timestamp')).dt.hour(),
            target=pl.when(pl.col('action_type') == "AT_View").then(0).otherwise(1),
            day_of_week=pl.from_epoch(pl.col("timestamp")).dt.weekday(),
            month=pl.from_epoch(pl.col("timestamp")).dt.month(),
            is_weekend=(pl.from_epoch(pl.col("timestamp")).dt.weekday()).is_in([6, 7]).cast(pl.Boolean)
    ).group_by(
        ['product_id', 'request_id']
    ).max().drop(
        #'source_type',
        #'store_id',
        'timestamp',
        #'date',
        'product_image',
        #'product_name',
        #'city_name',
        #'product_category',
        'position_in_request',
        'action_type'
    ).join(
        count_views_user_and_source,
        on=['user_id', 'source_type'],
        how='left'
    ).join(
        count_clicks_user_and_source,
        on=['user_id', 'source_type'],
        how='left'
    ).join(
        count_purchases_user_and_source,
        on=['user_id', 'source_type'],
        how='left'
    ).join(
        views_to_clicks_ratio_user_and_source,
        on=['user_id', 'source_type'],
        how='left'
    ).join(
        clicks_to_purchases_ratio_user_and_source,
        on=['user_id', 'source_type'],
        how='left'
    ).join(
        total_interactions_user_and_source,
        on=['user_id', 'source_type'],
        how='left'
    ).join(
        conversion_rate_user_and_source,
        on=['user_id', 'source_type'],
        how='left'
    ).join(
        user_activity_trend,
        on='user_id',
        how='left'
    ).join(
        log_count_views_user,
        on='user_id',
        how='left'
    ).join(
        clicks_to_cart_ratio_user,
        on='user_id',
        how='left'
    ).join(
        user_category_entropy,
        on='user_id',
        how='left'
    ).join(
        user_purchase_ratio,
        on='user_id',
        how='left'
    ).join(
        user_category_exploration,
        on='user_id',
        how='left'
    ).join(
        purchase_stability,
        on='user_id',
        how='left'
    ).join(
        user_store_diversity,
        on='user_id',
        how='left'
    ).join(
        user_device_preference,
        on='user_id',
        how='left'
    ).join(
        user_preferred_category,
        on='user_id',
        how='left'
    ).join(
        user_session_duration,
        on='user_id',
        how='left'
    ).join(
        count_purchases_user,
        on='user_id',
        how='left'
    ).join(
        engagement_dynamics_user,
        on='user_id',
        how='left'
    ).join(
        social_proof_metrics,
        on='product_id',
        how='left'
    ).join(
        purchase_time_std,
        on='user_id',
        how='left'
    ).join(
        purchase_trend,
        on='user_id',
        how='left'
    ).join(
        mean_time_between_purchases,
        on='user_id',
        how='left'
    ).join(
        count_views_user,
        on='user_id',
        how='left'
    ).join(
        count_clicks_user,
        on='user_id',
        how='left'
    ).join(
        total_interactions_user,
        on='user_id',
        how='left'
    ).join(
        purchase_to_views_ratio_user,
        on='user_id',
        how='left'
    ).join(
        unique_products_user,
        on='user_id',
        how='left'
    ).join(
        behavior_uniqueness,
        on='user_id',
        how='left'
    ).join(
        user_retention,
        on='user_id',
        how='left'
    ).join(
        count_purchases_product,
        on='product_id',
        how='left'
    ).join(
        product_position_efficiency,
        on='product_id',
        how='left'
    ).join(
        count_views_product,
        on='product_id',
        how='left'
    ).join(
        count_clicks_product,
        on='product_id',
        how='left'
    ).join(
        avg_product_position,
        on='product_id',
        how='left'
    ).join(
        total_interactions_product,
        on='product_id',
        how='left'
    ).join(
        product_age,
        on='product_id',
        how='left'
    ).join(
        clicks_to_cart_ratio_product,
        on='product_id',
        how='left'
    ).join(
        product_popularity_trend,
        on='product_id',
        how='left'
    ).join(
        purchase_to_views_ratio_product,
        on='product_id',
        how='left'
    ).join(
        product_boom,
        on='product_id',
        how='left'
    ).join(
        purchase_to_cart_ratio_product,
        on='product_id',
        how='left'
    ).join(
        hourly_pattern,
        on='product_id',
        how='left'
    ).join(
        geo_dependency,
        on='product_id',
        how='left'
    ).join(
        position_stability,
        on='product_id',
        how='left'
    ).join(
        request_uniqueness,
        on='product_id',
        how='left'
    ).join(
        seasonality,
        on='product_id',
        how='left'
    ).join(
        hourly_product_purchases,
        on=['product_id', 'hour_of_day'],
        how='left'
    ).join(
        same_day_purchase,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        npmi,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        hourly_user_purchases,
        on=['user_id', 'hour_of_day'],
        how='left'
    ).join(
        response_time,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        user_product_views_without_purchase,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        count_purchases_city,
        on='city_name',
        how='left'
    ).join(
        count_views_city,
        on='city_name',
        how='left'
    ).join(
        count_clicks_city,
        on='city_name',
        how='left'
    ).join(
        total_interactions_city,
        on='city_name',
        how='left'
    ).join(
        purchase_to_views_ratio_city,
        on='city_name',
        how='left'
    ).join(
        total_interactions_to_purchases_ratio_city,
        on='city_name',
        how='left'
    ).join(
        unique_stores_city,
        on='city_name',
        how='left'
    ).join(
        city_purchase_frequency,
        on='city_name',
        how='left'
    ).join(
        count_purchases_user_and_product,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        clicks_to_cart_ratio_user_product,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        cross_channel_index,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        repeat_interaction_index,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        interaction_time_patterns,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        seasonal_demand_coefficient,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        relative_attractiveness,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        engagement_dynamics,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        brand_loyalty_index,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        personal_value_index,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        count_views_user_and_product,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        last_purchase_time_user_and_product,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        total_interactions_user_and_product,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        purchase_to_views_ratio_user_and_product,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        count_clicks_user_and_product,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        mean_time_between_cart_updates,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        clicks_to_views_ratio_city,
        on=['city_name', 'product_id'],
        how='left'
    ).join(
        days_since_last_purchase,
        on=['user_id', 'product_id', 'date'],
        how='left'
    ).join(
        unique_users_product,
        on='product_id',
        how='left'
    ).join(
        ctr,
        on='product_id',
        how='left'
    ).join(
        convertion,
        on='product_id',
        how='left'
    ).join(
        count_purchases_store,
        on='store_id',
        how='left'
    ).join(
        count_purchases_user_and_category,
        on=['user_id', 'product_category'],
        how='left'
    ).join(
        is_preferred_category,
        on=['user_id', 'product_category'],
        how='left'
    ).join(
        count_views_user_and_category,
        on=['user_id', 'product_category'],
        how='left'
    ).join(
        count_clicks_user_and_category,
        on=['user_id', 'product_category'],
        how='left'
    ).join(
        views_to_clicks_ratio_user_and_category,
        on=['user_id', 'product_category'],
        how='left'
    ).join(
        clicks_to_purchases_ratio_user_and_category,
        on=['user_id', 'product_category'],
        how='left'
    ).join(
        total_interactions_user_and_category,
        on=['user_id', 'product_category'],
        how='left'
    ).join(
        category_trend,
        on= 'product_category',
        how='left'
    ).join(
        count_purchase_user_and_store,
        on=['user_id', 'store_id'],
        how='left'
    ).select(
        'source_type',
        'product_name',
        'store_id',
        'user_source_views_to_clicks_ratio',
        'preferred_category',
        'product_category',
        'request_uniqueness',
        'is_repeat_cart',
        'hour_of_day',
        'date',
        'purchase_to_views_ratio_product',
        'preferred_device',
        'ratio_last_240d',
        'category_entropy',
        'ctr',
        'day_of_week',
        'purchases_in_hour_user',
        'user_source_views',
        'brand_loyalty_index',
        'request_id',
        'user_id',
        'product_id',
        'target'
    )
    
    return catboost_pool


cat_features = ['source_type',
        'product_name',
        'store_id',
        'preferred_category',
        'product_category',
        'is_repeat_cart',
        'hour_of_day',
        'date',
        'preferred_device',
        'day_of_week']

In [402]:
def prepare_categorical_features(df: pl.DataFrame, cat_features: list, fill_value: str = "MISSING") -> pl.DataFrame:
    return df.with_columns([
        pl.col(col).fill_null(fill_value).cast(str)
        for col in cat_features 
        if col in df.columns
    ])

def prepare_numeric_features(df: pl.DataFrame, fill_value: float = FILL_VALUE) -> pl.DataFrame:
    numeric_cols = [col for col in df.columns if col not in cat_features]
    return df.with_columns([
        pl.col(col).fill_null(fill_value)
        for col in numeric_cols
        if col not in ['target', 'request_id', 'user_id', 'product_id']
    ])

In [403]:
def get_dataset(dataset: pl.DataFrame, data: pl.DataFrame):
    new_data =  join_features_to_dataset(
        dataset,
        calculate_count_purchases_user(data),
        calculate_count_views_user(data),
        calculate_count_clicks_user(data),
        calculate_total_interactions_user(data),
        calculate_purchase_to_views_ratio_user(data),
        calculate_unique_products_user(data),
        calculate_hourly_user_purchases(data),
        calculate_mean_time_between_purchases(data),
        calculate_purchase_time_std(data),
        calculate_purchase_trend(data),
        calculate_behavior_uniqueness(data),
        calculate_user_retention(data),
        calculate_engagement_dynamics_user(data),
        calculate_user_session_duration(data),
        calculate_user_preferred_category(data),
        calculate_user_device_preference(data),
        calculate_user_activity_trend(data),
        calculate_purchase_stability(data),
        calculate_user_store_diversity(data),
        calculate_user_category_exploration(data),
        calculate_user_purchase_ratio(data),
        calculate_user_category_entropy(data),
        calculate_clicks_to_cart_ratio_user(data),
        calculate_log_count_views_user(data),

        calculate_count_purchases_product(data),
        calculate_count_views_product(data),
        calculate_count_clicks_product(data),
        calculate_total_interactions_product(data),
        calculate_purchase_to_views_ratio_product(data),
        calculate_purchase_to_cart_ratio_product(data),
        calculate_unique_users_product(data),
        calculate_hourly_product_purchases(data),
        calculate_product_position_efficiency(data),
        calculate_product_hourly_pattern(data),
        calculate_geo_dependency(data),
        calculate_position_stability(data),
        calculate_request_uniqueness(data),
        calculate_seasonality(data),
        calculate_social_proof_metrics(data),
        calculate_product_age(data),
        calculate_product_popularity_trend(data),
        calculate_product_boom(data),
        calculate_avg_product_position(data),
        calculate_clicks_to_cart_ratio_product(data),

        calculate_count_purchases_city(data),
        calculate_count_views_city(data),
        calculate_count_clicks_city(data),
        calculate_total_interactions_city(data),
        calculate_purchase_to_views_ratio_city(data),
        calculate_total_interactions_to_purchases_ratio_city(data),
        calculate_unique_stores_city(data),
        calculate_city_purchase_frequency(data),

        calculate_count_purchase_user_and_product(data),
        calculate_count_views_user_and_product(data),
        calculate_count_click_user_and_product(data),
        calculate_last_purchase_time_user_and_product(data),
        calculate_purchase_to_views_ratio_user_and_product(data),
        calculate_total_interactions_user_and_product(data),
        calculate_mean_time_between_cart_updates(data),
        calculate_days_since_last_purchase(data),
        calculate_cross_channel_index(data),
        calculate_seasonal_demand_coefficient(data),
        calculate_brand_loyalty_index(data),
        calculate_personal_value_index(data),
        calculate_relative_attractiveness(data),
        calculate_engagement_dynamics(data),
        calculate_repeat_interaction_index(data),
        calculate_interaction_time_patterns(data),
        calculate_response_time(data),
        calculate_user_product_views_without_purchase(data),
        calculate_same_day_purchase(data),
        calculate_clicks_to_cart_ratio_user_product(data),
        calculate_npmi(data),
        
        calculate_ctr(data),
        calculate_convertion(data),
        calculate_count_purchases_store(data),
        calculate_count_purchase_user_and_store(data),
        calculate_count_purchases_user_and_category(data),
        calculate_is_preferred_category(data),
        calculate_category_trend(data),
        calculate_clicks_to_views_ratio_city(data),
        calculate_count_views_user_and_category(data),
        calculate_count_clicks_user_and_category(data),
        calculate_views_to_clicks_ratio_user_and_category(data),
        calculate_clicks_to_purchases_ratio_user_and_category(data),
        calculate_total_interactions_user_and_category(data),
        calculate_count_views_user_and_source(data),
        calculate_count_clicks_user_and_source(data),
        calculate_count_purchases_user_and_source(data),
        calculate_views_to_clicks_ratio_user_and_source(data),
        calculate_clicks_to_purchases_ratio_user_and_source(data),
        calculate_total_interactions_user_and_source(data),
        calculate_conversion_rate_user_and_source(data),
    )

    return new_data.with_columns(
                        pl.Series("random_noise", np.random.normal(size=len(new_data))),
                    ).pipe(prepare_categorical_features, cat_features) \
                   .pipe(prepare_numeric_features)

def prepare_pool(df: pl.DataFrame) -> Pool:
    features = df.drop(['target', 'request_id', 'product_id', 'user_id'])
    
    pool_params = {
        'data': features.to_pandas(),
        'label': df['target'].to_list(),
        'cat_features': cat_features,
    }
    return Pool(**pool_params)

In [404]:
train_data = train.filter(pl.col("date").is_in(unique_dates[-10:-5]))
train_data_1 = train.filter(pl.col("date").is_in(unique_dates[:-10]))
    
val_dates = unique_dates[-5:]
val_data = train.filter(pl.col("date").is_in(val_dates))

train_catboost = get_dataset(train_data, train_data_1)
val_catboost = get_dataset(val_data, pl.concat([train_data_1, train_data]))

train_pool = prepare_pool(train_catboost)
val_pool = prepare_pool(val_catboost)

In [405]:
model = CatBoostClassifier(
    iterations=500,                 # Дольше, но даст шанс для тонкой настройки
    learning_rate=0.03,              # Маленький шаг — меньше переобучения
    depth=7,                         # 6–8 обычно оптимальны
    loss_function="Logloss",        # Классическая для бинарной классификации
    eval_metric="AUC",              # Метрика, чувствительная к порядку вероятностей
    early_stopping_rounds=50,      # Позволяет остановиться, если модель не улучшается

    boosting_type='Ordered',        # Лучше на малых/средних датасетах, стабильнее
    bootstrap_type='Bernoulli',     # Более агрессивный бэггинг
    subsample=0.7,                  # Улучшает обобщающую способность
    colsample_bylevel=0.7,          # Уменьшает переобучение
    random_strength=1.0,            # Усиливает рандомизацию сплитов, для обобщения
    l2_leaf_reg=5.0,                # Регуляризация; можно увеличить до 10-20 при переобучении
    border_count=254,               # Более точные сплиты, но чуть медленнее

    auto_class_weights='Balanced', # Особенно важно при дисбалансе классов

    verbose=100,
    thread_count=-1,
    random_seed=42
)

In [406]:
model.fit(
    train_pool,
    eval_set=val_pool,
    plot=True
)

val_features = val_catboost.drop(['target', 'request_id', 'product_id', 'user_id'])
y_pred_proba = model.predict_proba(
    Pool(data=val_features.to_pandas(), cat_features=cat_features)
)[:, 1]
y_true = val_catboost['target'].to_list()

print(f"ROC AUC: {roc_auc_score(y_true, y_pred_proba):.4f}")
print(f"LogLoss: {log_loss(y_true, y_pred_proba):.4f}")

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.6598601	best: 0.6598601 (0)	total: 135ms	remaining: 1m 7s
100:	test: 0.7471386	best: 0.7494729 (62)	total: 9.13s	remaining: 36.1s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7494729358
bestIteration = 62

Shrink model to first 63 iterations.
ROC AUC: 0.7495
LogLoss: 0.6174


In [407]:
metrics = model.eval_metrics(val_pool, ["AUC:type=Ranking"])
for metric in metrics:
    print(metric)
    print(np.mean(metrics[metric]))

print()
for importance, name in sorted(zip(model.feature_importances_, model.feature_names_), reverse=True):
    print(f'{name}: {importance}')

AUC:type=Ranking
0.7379537370624843

source_type: 24.599947037852317
is_repeat_cart: 14.314734525454606
store_id: 10.148066235233971
product_category: 8.828020625532163
user_source_views_to_clicks_ratio: 8.752594886553476
ratio_last_240d: 8.548929264329944
preferred_category: 5.923456533999595
brand_loyalty_index: 5.5520204481922395
request_uniqueness: 3.0252397515129834
ctr: 2.7688349058116106
product_name: 2.5033066861380875
hour_of_day: 1.6989035640935655
purchase_to_views_ratio_product: 0.7857106508945935
date: 0.773909698083301
user_source_views: 0.7582191442029309
category_entropy: 0.3939247365693813
day_of_week: 0.380347555723995
preferred_device: 0.20141229477941927
random_noise: 0.04242145504182607
purchases_in_hour_user: 0.0


In [408]:
feature_importance = model.get_feature_importance()
shindler_list = []
random_importance = 0
for name, importance in sorted(zip(model.feature_names_, feature_importance), 
                             key=lambda x: x[1], reverse=True):
    if name == "random_noise":
        random_importance = importance

feature_importance = model.get_feature_importance()
shindler_list = []
for name, importance in sorted(zip(model.feature_names_, feature_importance), 
                             key=lambda x: x[1], reverse=True):
    if importance == 0:
        shindler_list.append(name)
shindler_list

['purchases_in_hour_user']

---
Посылка на kaggle
---

In [412]:
def join_features_to_val_dataset(
    dataset: pl.DataFrame,
    count_purchases_user: pl.DataFrame,
    count_views_user: pl.DataFrame,
    count_clicks_user: pl.DataFrame,
    total_interactions_user: pl.DataFrame,
    purchase_to_views_ratio_user: pl.DataFrame,
    unique_products_user: pl.DataFrame,
    hourly_user_purchases: pl.DataFrame,
    mean_time_between_purchases: pl.DataFrame,
    purchase_time_std: pl.DataFrame,
    purchase_trend: pl.DataFrame,
    behavior_uniqueness: pl.DataFrame,
    user_retention: pl.DataFrame,
    engagement_dynamics_user: pl.DataFrame,
    user_session_duration: pl.DataFrame,
    user_preferred_category: pl.DataFrame,
    user_device_preference: pl.DataFrame,
    user_activity_trend: pl.DataFrame,
    purchase_stability: pl.DataFrame,
    user_store_diversity: pl.DataFrame,
    user_category_exploration: pl.DataFrame,
    user_purchase_ratio: pl.DataFrame,
    user_category_entropy: pl.DataFrame,
    clicks_to_cart_ratio_user: pl.DataFrame,
    log_count_views_user: pl.DataFrame,

    count_purchases_product: pl.DataFrame,
    count_views_product: pl.DataFrame,
    count_clicks_product: pl.DataFrame,
    total_interactions_product: pl.DataFrame,
    purchase_to_views_ratio_product: pl.DataFrame,
    purchase_to_cart_ratio_product: pl.DataFrame,
    unique_users_product: pl.DataFrame,
    hourly_product_purchases: pl.DataFrame,
    product_position_efficiency: pl.DataFrame,
    hourly_pattern: pl.DataFrame,
    geo_dependency: pl.DataFrame,
    position_stability: pl.DataFrame,
    request_uniqueness: pl.DataFrame,
    seasonality: pl.DataFrame,
    social_proof_metrics: pl.DataFrame,
    product_age: pl.DataFrame,
    product_popularity_trend: pl.DataFrame,
    product_boom: pl.DataFrame,
    avg_product_position: pl.DataFrame,
    clicks_to_cart_ratio_product: pl.DataFrame,

    count_purchases_city: pl.DataFrame,
    count_views_city: pl.DataFrame,
    count_clicks_city: pl.DataFrame,
    total_interactions_city: pl.DataFrame,
    purchase_to_views_ratio_city: pl.DataFrame,
    total_interactions_to_purchases_ratio_city: pl.DataFrame,
    unique_stores_city: pl.DataFrame,
    city_purchase_frequency: pl.DataFrame,

    count_purchases_user_and_product: pl.DataFrame,
    count_views_user_and_product: pl.DataFrame,
    count_clicks_user_and_product: pl.DataFrame,
    last_purchase_time_user_and_product: pl.DataFrame,
    purchase_to_views_ratio_user_and_product: pl.DataFrame,
    total_interactions_user_and_product: pl.DataFrame,
    mean_time_between_cart_updates: pl.DataFrame,
    days_since_last_purchase: pl.DataFrame,
    cross_channel_index: pl.DataFrame,
    seasonal_demand_coefficient: pl.DataFrame,
    brand_loyalty_index: pl.DataFrame,
    personal_value_index: pl.DataFrame,
    relative_attractiveness: pl.DataFrame,
    engagement_dynamics: pl.DataFrame,
    repeat_interaction_index: pl.DataFrame,
    interaction_time_patterns: pl.DataFrame,
    response_time: pl.DataFrame,
    user_product_views_without_purchase: pl.DataFrame,
    same_day_purchase: pl.DataFrame,
    clicks_to_cart_ratio_user_product: pl.DataFrame,
    npmi: pl.DataFrame,
    
    ctr: pl.DataFrame,
    convertion: pl.DataFrame,
    count_purchases_store: pl.DataFrame,
    count_purchase_user_and_store: pl.DataFrame,
    count_purchases_user_and_category: pl.DataFrame,
    is_preferred_category: pl.DataFrame,
    category_trend: pl.DataFrame,
    clicks_to_views_ratio_city: pl.DataFrame,
    count_views_user_and_category: pl.DataFrame,
    count_clicks_user_and_category: pl.DataFrame,
    views_to_clicks_ratio_user_and_category: pl.DataFrame,
    clicks_to_purchases_ratio_user_and_category: pl.DataFrame,
    total_interactions_user_and_category: pl.DataFrame,
    count_views_user_and_source: pl.DataFrame,
    count_clicks_user_and_source: pl.DataFrame,
    count_purchases_user_and_source: pl.DataFrame,
    views_to_clicks_ratio_user_and_source: pl.DataFrame,
    clicks_to_purchases_ratio_user_and_source: pl.DataFrame,
    total_interactions_user_and_source: pl.DataFrame,
    conversion_rate_user_and_source: pl.DataFrame,
) -> pl.DataFrame:
    catboost_pool = dataset.with_columns([
        pl.from_epoch(pl.col("timestamp")).dt.hour().alias("hour_of_day"),
        (pl.from_epoch(pl.col("timestamp")).dt.weekday() - 1).alias("day_of_week"),
        pl.from_epoch(pl.col("timestamp")).dt.month().alias("month"),
        (pl.from_epoch(pl.col("timestamp")).dt.weekday() - 1)
            .is_in([5, 6])
            .cast(pl.Int8)
            .alias("is_weekend"),
    ]).drop(
        #'source_type',
        #'store_id',
        'timestamp',
        #'city_name',
        #'product_name',
        #'product_category',
        'product_image'
    ).join(
        count_views_user_and_source,
        on=['user_id', 'source_type'],
        how='left'
    ).join(
        count_clicks_user_and_source,
        on=['user_id', 'source_type'],
        how='left'
    ).join(
        count_purchases_user_and_source,
        on=['user_id', 'source_type'],
        how='left'
    ).join(
        views_to_clicks_ratio_user_and_source,
        on=['user_id', 'source_type'],
        how='left'
    ).join(
        clicks_to_purchases_ratio_user_and_source,
        on=['user_id', 'source_type'],
        how='left'
    ).join(
        total_interactions_user_and_source,
        on=['user_id', 'source_type'],
        how='left'
    ).join(
        conversion_rate_user_and_source,
        on=['user_id', 'source_type'],
        how='left'
    ).join(
        user_activity_trend,
        on='user_id',
        how='left'
    ).join(
        log_count_views_user,
        on='user_id',
        how='left'
    ).join(
        clicks_to_cart_ratio_user,
        on='user_id',
        how='left'
    ).join(
        user_category_entropy,
        on='user_id',
        how='left'
    ).join(
        user_purchase_ratio,
        on='user_id',
        how='left'
    ).join(
        user_category_exploration,
        on='user_id',
        how='left'
    ).join(
        purchase_stability,
        on='user_id',
        how='left'
    ).join(
        user_store_diversity,
        on='user_id',
        how='left'
    ).join(
        user_device_preference,
        on='user_id',
        how='left'
    ).join(
        user_preferred_category,
        on='user_id',
        how='left'
    ).join(
        user_session_duration,
        on='user_id',
        how='left'
    ).join(
        count_purchases_user,
        on='user_id',
        how='left'
    ).join(
        engagement_dynamics_user,
        on='user_id',
        how='left'
    ).join(
        social_proof_metrics,
        on='product_id',
        how='left'
    ).join(
        purchase_time_std,
        on='user_id',
        how='left'
    ).join(
        purchase_trend,
        on='user_id',
        how='left'
    ).join(
        mean_time_between_purchases,
        on='user_id',
        how='left'
    ).join(
        count_views_user,
        on='user_id',
        how='left'
    ).join(
        count_clicks_user,
        on='user_id',
        how='left'
    ).join(
        total_interactions_user,
        on='user_id',
        how='left'
    ).join(
        purchase_to_views_ratio_user,
        on='user_id',
        how='left'
    ).join(
        unique_products_user,
        on='user_id',
        how='left'
    ).join(
        behavior_uniqueness,
        on='user_id',
        how='left'
    ).join(
        user_retention,
        on='user_id',
        how='left'
    ).join(
        count_purchases_product,
        on='product_id',
        how='left'
    ).join(
        product_position_efficiency,
        on='product_id',
        how='left'
    ).join(
        count_views_product,
        on='product_id',
        how='left'
    ).join(
        count_clicks_product,
        on='product_id',
        how='left'
    ).join(
        avg_product_position,
        on='product_id',
        how='left'
    ).join(
        total_interactions_product,
        on='product_id',
        how='left'
    ).join(
        product_age,
        on='product_id',
        how='left'
    ).join(
        clicks_to_cart_ratio_product,
        on='product_id',
        how='left'
    ).join(
        product_popularity_trend,
        on='product_id',
        how='left'
    ).join(
        purchase_to_views_ratio_product,
        on='product_id',
        how='left'
    ).join(
        product_boom,
        on='product_id',
        how='left'
    ).join(
        purchase_to_cart_ratio_product,
        on='product_id',
        how='left'
    ).join(
        hourly_pattern,
        on='product_id',
        how='left'
    ).join(
        geo_dependency,
        on='product_id',
        how='left'
    ).join(
        position_stability,
        on='product_id',
        how='left'
    ).join(
        request_uniqueness,
        on='product_id',
        how='left'
    ).join(
        seasonality,
        on='product_id',
        how='left'
    ).join(
        hourly_product_purchases,
        on=['product_id', 'hour_of_day'],
        how='left'
    ).join(
        same_day_purchase,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        npmi,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        hourly_user_purchases,
        on=['user_id', 'hour_of_day'],
        how='left'
    ).join(
        response_time,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        user_product_views_without_purchase,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        count_purchases_city,
        on='city_name',
        how='left'
    ).join(
        count_views_city,
        on='city_name',
        how='left'
    ).join(
        count_clicks_city,
        on='city_name',
        how='left'
    ).join(
        total_interactions_city,
        on='city_name',
        how='left'
    ).join(
        purchase_to_views_ratio_city,
        on='city_name',
        how='left'
    ).join(
        total_interactions_to_purchases_ratio_city,
        on='city_name',
        how='left'
    ).join(
        unique_stores_city,
        on='city_name',
        how='left'
    ).join(
        city_purchase_frequency,
        on='city_name',
        how='left'
    ).join(
        count_purchases_user_and_product,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        clicks_to_cart_ratio_user_product,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        cross_channel_index,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        repeat_interaction_index,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        interaction_time_patterns,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        seasonal_demand_coefficient,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        relative_attractiveness,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        engagement_dynamics,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        brand_loyalty_index,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        personal_value_index,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        count_views_user_and_product,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        last_purchase_time_user_and_product,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        total_interactions_user_and_product,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        purchase_to_views_ratio_user_and_product,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        count_clicks_user_and_product,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        mean_time_between_cart_updates,
        on=['user_id', 'product_id'],
        how='left'
    ).join(
        clicks_to_views_ratio_city,
        on=['city_name', 'product_id'],
        how='left'
    ).join(
        days_since_last_purchase,
        on=['user_id', 'product_id', 'date'],
        how='left'
    ).join(
        unique_users_product,
        on='product_id',
        how='left'
    ).join(
        ctr,
        on='product_id',
        how='left'
    ).join(
        convertion,
        on='product_id',
        how='left'
    ).join(
        count_purchases_store,
        on='store_id',
        how='left'
    ).join(
        count_purchases_user_and_category,
        on=['user_id', 'product_category'],
        how='left'
    ).join(
        is_preferred_category,
        on=['user_id', 'product_category'],
        how='left'
    ).join(
        count_views_user_and_category,
        on=['user_id', 'product_category'],
        how='left'
    ).join(
        count_clicks_user_and_category,
        on=['user_id', 'product_category'],
        how='left'
    ).join(
        views_to_clicks_ratio_user_and_category,
        on=['user_id', 'product_category'],
        how='left'
    ).join(
        clicks_to_purchases_ratio_user_and_category,
        on=['user_id', 'product_category'],
        how='left'
    ).join(
        total_interactions_user_and_category,
        on=['user_id', 'product_category'],
        how='left'
    ).join(
        category_trend,
        on= 'product_category',
        how='left'
    ).join(
        count_purchase_user_and_store,
        on=['user_id', 'store_id'],
        how='left'
    ).select(
        'index',
        'source_type',
        'product_name',
        'store_id',
        'user_source_views_to_clicks_ratio',
        'preferred_category',
        'product_category',
        'request_uniqueness',
        'is_repeat_cart',
        'hour_of_day',
        'date',
        'purchase_to_views_ratio_product',
        'preferred_device',
        'ratio_last_240d',
        'category_entropy',
        'ctr',
        'day_of_week',
        'purchases_in_hour_user',
        'user_source_views',
        'brand_loyalty_index',
        'request_id',
        'user_id',
        'product_id',
    )

    catboost_pool = catboost_pool.drop(
        'user_id',
        'request_id',
        'product_id'
    )

    return catboost_pool

In [413]:
def get_dataset_test(dataset: pl.DataFrame, data: pl.DataFrame):
    new_data =  join_features_to_val_dataset(
        dataset,
        calculate_count_purchases_user(data),
        calculate_count_views_user(data),
        calculate_count_clicks_user(data),
        calculate_total_interactions_user(data),
        calculate_purchase_to_views_ratio_user(data),
        calculate_unique_products_user(data),
        calculate_hourly_user_purchases(data),
        calculate_mean_time_between_purchases(data),
        calculate_purchase_time_std(data),
        calculate_purchase_trend(data),
        calculate_behavior_uniqueness(data),
        calculate_user_retention(data),
        calculate_engagement_dynamics_user(data),
        calculate_user_session_duration(data),
        calculate_user_preferred_category(data),
        calculate_user_device_preference(data),
        calculate_user_activity_trend(data),
        calculate_purchase_stability(data),
        calculate_user_store_diversity(data),
        calculate_user_category_exploration(data),
        calculate_user_purchase_ratio(data),
        calculate_user_category_entropy(data),
        calculate_clicks_to_cart_ratio_user(data),
        calculate_log_count_views_user(data),

        calculate_count_purchases_product(data),
        calculate_count_views_product(data),
        calculate_count_clicks_product(data),
        calculate_total_interactions_product(data),
        calculate_purchase_to_views_ratio_product(data),
        calculate_purchase_to_cart_ratio_product(data),
        calculate_unique_users_product(data),
        calculate_hourly_product_purchases(data),
        calculate_product_position_efficiency(data),
        calculate_product_hourly_pattern(data),
        calculate_geo_dependency(data),
        calculate_position_stability(data),
        calculate_request_uniqueness(data),
        calculate_seasonality(data),
        calculate_social_proof_metrics(data),
        calculate_product_age(data),
        calculate_product_popularity_trend(data),
        calculate_product_boom(data),
        calculate_avg_product_position(data),
        calculate_clicks_to_cart_ratio_product(data),

        calculate_count_purchases_city(data),
        calculate_count_views_city(data),
        calculate_count_clicks_city(data),
        calculate_total_interactions_city(data),
        calculate_purchase_to_views_ratio_city(data),
        calculate_total_interactions_to_purchases_ratio_city(data),
        calculate_unique_stores_city(data),
        calculate_city_purchase_frequency(data),

        calculate_count_purchase_user_and_product(data),
        calculate_count_views_user_and_product(data),
        calculate_count_click_user_and_product(data),
        calculate_last_purchase_time_user_and_product(data),
        calculate_purchase_to_views_ratio_user_and_product(data),
        calculate_total_interactions_user_and_product(data),
        calculate_mean_time_between_cart_updates(data),
        calculate_days_since_last_purchase(data),
        calculate_cross_channel_index(data),
        calculate_seasonal_demand_coefficient(data),
        calculate_brand_loyalty_index(data),
        calculate_personal_value_index(data),
        calculate_relative_attractiveness(data),
        calculate_engagement_dynamics(data),
        calculate_repeat_interaction_index(data),
        calculate_interaction_time_patterns(data),
        calculate_response_time(data),
        calculate_user_product_views_without_purchase(data),
        calculate_same_day_purchase(data),
        calculate_clicks_to_cart_ratio_user_product(data),
        calculate_npmi(data),
        
        calculate_ctr(data),
        calculate_convertion(data),
        calculate_count_purchases_store(data),
        calculate_count_purchase_user_and_store(data),
        calculate_count_purchases_user_and_category(data),
        calculate_is_preferred_category(data),
        calculate_category_trend(data),
        calculate_clicks_to_views_ratio_city(data),
        calculate_count_views_user_and_category(data),
        calculate_count_clicks_user_and_category(data),
        calculate_views_to_clicks_ratio_user_and_category(data),
        calculate_clicks_to_purchases_ratio_user_and_category(data),
        calculate_total_interactions_user_and_category(data),
        calculate_count_views_user_and_source(data),
        calculate_count_clicks_user_and_source(data),
        calculate_count_purchases_user_and_source(data),
        calculate_views_to_clicks_ratio_user_and_source(data),
        calculate_clicks_to_purchases_ratio_user_and_source(data),
        calculate_total_interactions_user_and_source(data),
        calculate_conversion_rate_user_and_source(data),
    )

    return new_data.with_columns(
                        pl.Series("random_noise", np.random.normal(size=len(new_data))),
                    ).pipe(prepare_categorical_features, cat_features) \
                   .pipe(prepare_numeric_features)

In [414]:
kaggle_catboost = get_dataset_test(test.with_columns(
    pl.from_epoch(pl.col("timestamp")).dt.date().alias("date")
), train)

test_data = test['index', 'request_id']

test_data.with_columns(
    predict=model.predict_proba(Pool(kaggle_catboost.to_pandas(), cat_features=cat_features))[:, 1]
).sort(
    'predict',
    descending=True
).select(
    'index',
    'request_id'
).write_csv('cb_submit12345.csv')