In [50]:
# !pip install mlflow fsspec==2025.3.0 scikit-learn==1.3.2 numpy==1.26.4 matplotlib==3.8.4 rich==13.7.0 --upgrade --force-reinstall

In [9]:
!pip install implicit >> _

In [10]:
from datetime import timedelta
from scipy.sparse import csr_matrix
from catboost import CatBoostClassifier

import pandas as pd
import polars as pl
import numpy as np

import implicit

import mlflow

import itertools
import time

In [11]:
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"

from threadpoolctl import threadpool_limits
threadpool_limits(1, "blas")

<threadpoolctl.threadpool_limits at 0x7c55d8e26050>

## MLFlow Setup

In [12]:
mlflow.set_tracking_uri('http://51.250.35.156:5000/')
mlflow.set_experiment(experiment_name='homework-makochetkov.ext')

<Experiment: artifact_location='s3://mlflow/43', creation_time=1747390606149, experiment_id='43', last_update_time=1747390606149, lifecycle_stage='active', name='homework-makochetkov.ext', tags={}>

## Чтение данных

In [13]:
DATA_DIR = '/kaggle/input/avito-cup-2025-recsys/'

In [14]:
clickstream_df = pl.read_parquet(f'{DATA_DIR}clickstream.pq')
cat_features_df = pl.read_parquet(f'{DATA_DIR}cat_features.pq')
events_df = pl.read_parquet(f'{DATA_DIR}events.pq')
test_users_df = pl.read_parquet(f'{DATA_DIR}test_users.pq')
text_features_df = pl.read_parquet(f'{DATA_DIR}text_features.pq')

## Рассчет метрики

In [15]:
def recall_at(df_solution: pl.DataFrame, df_pred: pl.DataFrame, k=40):
    
    return  df_solution[['node', 'cookie']].join(
        df_pred.group_by('cookie').head(k).with_columns(value=1)[['node', 'cookie', 'value']], 
        how='left',
        on = ['cookie', 'node']
    ).select(
        [pl.col('value').fill_null(0), 'cookie']
    ).group_by(
        'cookie'
    ).agg(
        [
            pl.col('value').sum()/pl.col(
                'value'
            ).count()
        ]
    )['value'].mean()

## train eval split

In [16]:
DAYS_THR = 14

thr = clickstream_df['event_date'].max() - timedelta(days=DAYS_THR)

In [17]:
df_train = clickstream_df.filter(clickstream_df['event_date'] <= thr)
df_eval = clickstream_df.filter(clickstream_df['event_date'] > thr)[['cookie', 'node', 'event']]

In [18]:
df_eval = df_eval.join(df_train, on=['cookie', 'node'], how='anti')

In [19]:
df_eval = df_eval.filter(
    pl.col('event').is_in(
        events_df.filter(pl.col('is_contact') == 1)['event'].unique()
    )
)

In [20]:
df_eval = df_eval.filter(
    pl.col('cookie').is_in(df_train['cookie'].unique())
).filter(
    pl.col('node').is_in(df_train['node'].unique())
)

In [52]:
df_eval = df_eval.unique(['cookie', 'node'])

## БЕЙЗЛАЙН

Представлен в данном ноутбуке: https://github.com/imiderji/ml_pipelines/blob/main/baseline.ipynb

Recall@40 = 0.15041722015925818

## ALS predict

In [21]:
def make_sparse_matrix(users: pd.Series, nodes: pd.Series) -> csr_matrix:
    '''
    Готовит sparse-матрицу для обучения
    '''

    user_ids = users.unique().to_list()
    item_ids = nodes.unique().to_list()
        
    user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
    item_id_to_index = {item_id: idx for idx, item_id in enumerate(item_ids)}
    index_to_item_id = {v:k for k,v in item_id_to_index.items()}
    
    rows = users.replace_strict(user_id_to_index).to_list()
    cols = nodes.replace_strict(item_id_to_index).to_list()
    
    values = [1] * len(users)
    
    sparse_matrix = csr_matrix((values, (rows, cols)), shape=(len(user_ids), len(item_ids)))

    return sparse_matrix, user_id_to_index, index_to_item_id

In [22]:
def als_predict(
    sparse_matrix: csr_matrix,
    users_to_pred: list,
    users_indicies: dict,
    index_items_ids: dict,
    factors: int = 50,
    iters: int = 10,
    regularization: float = 0.01,
    alpha: int = 10,
    use_cg: bool = True,
    random_state: int = 42
):
    als_model = implicit.als.AlternatingLeastSquares(
        factors=factors,
        iterations=iters,
        regularization=regularization,
        alpha=alpha,
        use_cg=use_cg,
        random_state=random_state
    )

    als_model.fit((sparse_matrix * alpha).astype('float64'))

    user4pred = np.array([users_indicies[user_id] for user_id in users_to_pred])

    recommendations, scores = als_model.recommend(
        user4pred, 
        sparse_matrix[user4pred], 
        N=40, 
        filter_already_liked_items=True
    )

    df_pred = pl.DataFrame(
        {
            'node': [
                [index_items_ids[rec_id] for rec_id in rec] for rec in recommendations.tolist()
            ], 
            'cookie': list(users_to_pred),
            'scores': scores.tolist()
        }
    )

    df_pred = df_pred.explode(['node', 'scores'])

    return df_pred

In [23]:
def mlflow_als_predict_log(users_data, als_params):

    als_predicted = []

    with mlflow.start_run(run_name='als'):
        mlflow.log_param('model_type', 'ALS')
        mlflow.log_params(als_params)

        print('Обучение параметрами:', params)

        try:
            train_start_time = time.time()

            als_predicted = als_predict(
                sparse_matrix=users_data['sparse_matrix'],
                users_to_pred=users_data['eval_users'],
                users_indicies=users_data['user_id_to_index'],
                index_items_ids=users_data['index_to_item_id'],
                factors=als_params.get("factors", 50),
                iters=als_params.get("iters", 10),
                regularization=als_params.get("regularization", 0.01),
                alpha=als_params.get("alpha", 10),
                use_cg=als_params.get("use_cg", True),
                random_state=als_params.get("random_state", 42)
            )

            train_time = time.time() - train_start_time
            mlflow.log_metric('seconds_training', train_time)

            print('ALS обучилась за', train_time)

            als_recall40 = 0.0

            if als_predicted is None or als_predicted.height == 0:
                pass
            else:
                als_recall40 = recall_at(
                    df_eval,
                    als_predicted,
                    k=40
                )

            mlflow.log_metric('Recall_40', als_recall40)
            print('Recall@40 =', als_recall40)

        except KeyboardInterrupt as kie:
            mlflow.log_param("error_message", 'Прервано вручную')
            mlflow.set_tag("run_status", "FAILED")

    return als_predicted

## Перебор параметров ALS

In [24]:
users = df_train["cookie"]
nodes = df_train["node"]

In [25]:
eval_users = df_eval['cookie'].unique().to_list()

In [26]:
sparse_matrix, user_id_to_index, index_to_item_id = make_sparse_matrix(users, nodes)

In [27]:
data = {
    'sparse_matrix': sparse_matrix,
    'eval_users': eval_users,
    'user_id_to_index': user_id_to_index,
    'index_to_item_id': index_to_item_id
}

## Параметры 1

In [53]:
params = {
    'factors': 150,
    'iters': 20,
    'regularization': 0.01,
    'alpha': 5
}

In [54]:
als_predicted = mlflow_als_predict_log(data, params)

Обучение параметрами: {'factors': 150, 'iters': 20, 'regularization': 0.01, 'alpha': 5}


  0%|          | 0/20 [00:00<?, ?it/s]

ALS обучилась за 201.82533502578735
Recall@40 = 0.1499945428706099
🏃 View run als at: http://51.250.35.156:5000/#/experiments/43/runs/e41904f67473494d94c6fb51dfcffc99
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/43


## Параметры 2 Лучший результат

In [2]:
best_als_params = {
    'factors': 200,
    'iters': 10,
    'regularization': 0.1,
    'alpha': 3
}

In [23]:
als_predicted2 = mlflow_als_predict_log(data, best_als_params)

Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x7a8312702200>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 1005, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 1187, in _make_controller_from_path
    lib_controller = controller_class(
                     ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 114, in __init__
    self.dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/ctypes/__init__.py", line 376, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: dlopen() error


Обучение параметрами: {'factors': 200, 'iters': 10, 'regularization': 0.1, 'alpha': 3}


  0%|          | 0/10 [00:00<?, ?it/s]

ALS обучилась за 190.08961749076843
Recall@40 = 0.16129182635690145
🏃 View run als at: http://51.250.35.156:5000/#/experiments/43/runs/196c8f3ff0f4403483ad931e1216c0f8
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/43


## Параметры 2.1

In [39]:
params = {
    'factors': 200,
    'iters': 20,
    'regularization': 0.1,
    'alpha': 3
}

In [40]:
als_predicted2_1 = mlflow_als_predict_log(data, params)

Обучение параметрами: {'factors': 200, 'iters': 20, 'regularization': 0.1, 'alpha': 3}


  0%|          | 0/20 [00:00<?, ?it/s]

ALS обучилась за 217.85109615325928
Recall@40 = 0.158776013837593
🏃 View run als at: http://51.250.35.156:5000/#/experiments/43/runs/e5980f7e31d54840b15c61a141e34041
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/43


## Параметры 2.2

In [41]:
params = {
    'factors': 200,
    'iters': 10,
    'regularization': 0.1,
    'alpha': 3,
    'use_cg': False
}

In [42]:
als_predicted2_2 = mlflow_als_predict_log(data, params)

Обучение параметрами: {'factors': 200, 'iters': 10, 'regularization': 0.1, 'alpha': 3, 'use_cg': False}


  0%|          | 0/10 [00:00<?, ?it/s]

ALS обучилась за 1314.901330947876
Recall@40 = 0.1542670646145688
🏃 View run als at: http://51.250.35.156:5000/#/experiments/43/runs/443276c71a5c4652b2fab6f120f749cc
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/43


## Параметры 3

In [33]:
params = {
    'factors': 100,
    'iters': 10,
    'regularization': 0.01,
    'alpha': 20
}

In [34]:
als_predicted3 = mlflow_als_predict_log(data, params)

Обучение параметрами: {'factors': 100, 'iters': 10, 'regularization': 0.01, 'alpha': 20}


  0%|          | 0/10 [00:00<?, ?it/s]

ALS обучилась за 94.99921584129333
Recall@40 = 0.09391438769157508
🏃 View run als at: http://51.250.35.156:5000/#/experiments/43/runs/ae4266a618c84724a84f04d05845d601
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/43


## Параметры 4

In [35]:
params = {
    'factors': 220,
    'iters': 20,
    'regularization': 0.1,
    'alpha': 5
}

In [36]:
als_predicted4 = mlflow_als_predict_log(data, params)

Обучение параметрами: {'factors': 220, 'iters': 20, 'regularization': 0.1, 'alpha': 5}


  0%|          | 0/20 [00:00<?, ?it/s]

ALS обучилась за 253.3436722755432
Recall@40 = 0.15332318029021064
🏃 View run als at: http://51.250.35.156:5000/#/experiments/43/runs/0528162e737c47959a49e413f465a0bb
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/43


## Параметры 5

In [37]:
params = {
    'factors': 180,
    'iters': 20,
    'regularization': 0.1,
    'alpha': 1
}

In [38]:
als_predicted5 = mlflow_als_predict_log(data, params)

Обучение параметрами: {'factors': 180, 'iters': 20, 'regularization': 0.1, 'alpha': 1}


  0%|          | 0/20 [00:00<?, ?it/s]

ALS обучилась за 210.24808049201965
Recall@40 = 0.1539118432998824
🏃 View run als at: http://51.250.35.156:5000/#/experiments/43/runs/b972ef8ef15b4243b9da7ccbd8f2344e
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/43


## Подготовка данных для CatBoost

In [28]:
def split_cands_train_val(df_cands, test_size=0.2, seed=42):
    df_cands = df_cands.to_pandas()
    unique_groups = df_cands['cookie'].unique()

    rng = np.random.RandomState(seed)
    rng.shuffle(unique_groups)

    n_test = int(len(unique_groups) * test_size)
    val_groups = unique_groups[:n_test]
    train_groups = unique_groups[n_test:]

    df_cands_train = df_cands[df_cands['cookie'].isin(train_groups)]
    df_cands_val   = df_cands[df_cands['cookie'].isin(val_groups)]

    return pl.from_pandas(df_cands_train), pl.from_pandas(df_cands_val)

In [29]:
def prepare_for_catboost(als_preds):
    df_eval_positive = (
        df_eval.select(['cookie','node'])
        .with_columns(pl.lit(1).alias('label'))
    )

    candidates_eval = (
        als_preds
        .join(df_eval_positive, on=['cookie','node'], how='left')
        .with_columns(pl.col('label').fill_null(0).alias('label'))
    )

    candidates_train, candidates_val = split_cands_train_val(candidates_eval)
    catboost_features = [col for col in candidates_train.columns if col not in ('cookie', 'node', 'label')]

    return (
        candidates_train,
        candidates_val,
        catboost_features,
        candidates_eval
    )

## Обучение CatBoost

In [30]:
best_als_params = {
    'factors': 200,
    'iters': 10,
    'regularization': 0.1,
    'alpha': 3
}

In [47]:
catboost_params = {
    'iterations': 3000,
    'depth': 6,
    'l2_leaf_reg': 3.0,
    'learning_rate': 0.2,
    'loss_function': 'Logloss'
}

In [48]:
best_als_params_to_log = {f'als_{key}': value for key, value in best_als_params.items()}
catboost_params_to_log = {f'catboost_{key}': value for key, value in catboost_params.items()}

In [49]:
with mlflow.start_run(run_name='best_als + catboost'):
    mlflow.log_param('model_type', 'catboost + als')
    mlflow.log_params(best_als_params_to_log)
    mlflow.log_params(catboost_params_to_log)
    
    print('Начало обучения')
    train_start_time = time.time()

    best_als_predicted = als_predict(
        sparse_matrix=data['sparse_matrix'],
        users_to_pred=data['eval_users'],
        users_indicies=data['user_id_to_index'],
        index_items_ids=data['index_to_item_id'],
        factors=best_als_params.get("factors", 200),
        iters=best_als_params.get("iters", 10),
        regularization=best_als_params.get("regularization", 0.1),
        alpha=best_als_params.get("alpha", 3),
        use_cg=best_als_params.get("use_cg", True),
        random_state=best_als_params.get("random_state", 42)
    )

    candidates_train, candidates_val, catboost_features, candidates_eval = (
        prepare_for_catboost(best_als_predicted)
    )
    X_train = candidates_train.select(catboost_features).to_pandas()
    y_train = candidates_train['label'].to_numpy()
    X_val = candidates_val.select(catboost_features ).to_pandas()
    y_val = candidates_val['label'].to_numpy()

    catboost_ranker = CatBoostClassifier(**catboost_params)

    catboost_ranker.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        verbose=50
    )

    train_time = time.time() - train_start_time
    mlflow.log_metric('seconds_training', train_time)
    print('Два этапа обучились за', train_time)

    X = candidates_eval.select(catboost_features).to_pandas()
    probas = catboost_ranker.predict_proba(X)[:, 1]
    
    candidates_eval = candidates_eval.with_columns(pl.Series(probas).alias('ranker_score'))
    catboost_predicted = (
        candidates_eval
        .sort(['cookie', 'ranker_score'], descending=[False, True])
        .group_by('cookie')
        .head(40)
    )

    recall_40 = recall_at(df_eval, catboost_predicted, k=40)
    mlflow.log_metric('Recall_40', recall_40)

    print('Recall@40 =', recall_40)

Начало обучения


  0%|          | 0/10 [00:00<?, ?it/s]

0:	learn: 0.3884579	test: 0.3889936	best: 0.3889936 (0)	total: 178ms	remaining: 8m 52s
50:	learn: 0.0933000	test: 0.0951647	best: 0.0951496 (29)	total: 6.94s	remaining: 6m 41s
100:	learn: 0.0932051	test: 0.0952388	best: 0.0951496 (29)	total: 13.5s	remaining: 6m 26s
150:	learn: 0.0931379	test: 0.0953149	best: 0.0951496 (29)	total: 21s	remaining: 6m 36s
200:	learn: 0.0931050	test: 0.0953709	best: 0.0951496 (29)	total: 27.8s	remaining: 6m 27s
250:	learn: 0.0930891	test: 0.0954135	best: 0.0951496 (29)	total: 34.6s	remaining: 6m 18s
300:	learn: 0.0930760	test: 0.0954455	best: 0.0951496 (29)	total: 41.3s	remaining: 6m 10s
350:	learn: 0.0930670	test: 0.0954675	best: 0.0951496 (29)	total: 48.1s	remaining: 6m 2s
400:	learn: 0.0930662	test: 0.0954845	best: 0.0951496 (29)	total: 55.6s	remaining: 6m
450:	learn: 0.0930694	test: 0.0954959	best: 0.0951496 (29)	total: 1m 2s	remaining: 5m 51s
500:	learn: 0.0930699	test: 0.0955056	best: 0.0951496 (29)	total: 1m 8s	remaining: 5m 43s
550:	learn: 0.0930721

## Лучшие параметры als + catboost

In [51]:
catboost_params = {
    'iterations': 2000,
    'depth': 6,
    'l2_leaf_reg': 3.0,
    'learning_rate': 0.05,
    'loss_function': 'Logloss'
}