In [1]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.53.0-py3-none-any.whl.metadata (39 kB)
Collecting fastapi<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting packaging<25 (from mlflow-skinny==2.22.0->mlflow)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_co

# MLFLOW

In [None]:
import mlflow

mlflow.set_tracking_uri('http://51.250.35.156:5000/')

In [None]:
mlflow.set_experiment(experiment_id='45')

# Load Data

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# !pip install polars==1.25.2 >> _


In [4]:
!pip install implicit >> _


In [5]:
# !mkdir data

In [6]:
# # takes 5 minutes
# !wget https://storage.yandexcloud.net/ds-ods/files/data/docs/competitions/Avitotechcomp2025/data_competition_1/clickstream.pq -O data/clickstream.pq >> _
# !wget https://storage.yandexcloud.net/ds-ods/files/data/docs/competitions/Avitotechcomp2025/data_competition_1/test_users.pq -O data/test_users.pq >> _
# !wget https://storage.yandexcloud.net/ds-ods/files/data/docs/competitions/Avitotechcomp2025/data_competition_1/cat_features.pq -O data/cat_features.pq >> _
# !wget https://storage.yandexcloud.net/ds-ods/files/data/docs/competitions/Avitotechcomp2025/data_competition_1/text_features.pq -O data/text_features.pq >> _
# !wget https://storage.yandexcloud.net/ds-ods/files/data/docs/competitions/Avitotechcomp2025/data_competition_1/events.pq -O data/events.pq >> _


In [7]:
from datetime import timedelta
import polars as pl
import implicit

In [8]:
DATA_DIR = 'archive'

df_test_users = pl.read_parquet(f'{DATA_DIR}/test_users.pq')
df_clickstream = pl.read_parquet(f'{DATA_DIR}/clickstream.pq')

df_cat_features = pl.read_parquet(f'{DATA_DIR}/cat_features.pq')
df_text_features = pl.read_parquet(f'{DATA_DIR}/text_features.pq')
df_event = pl.read_parquet(f'{DATA_DIR}/events.pq')

# PREPARE TRAIN EVAL

In [9]:
EVAL_DAYS_TRESHOLD = 14

In [10]:
treshhold = df_clickstream['event_date'].max() - timedelta(days=EVAL_DAYS_TRESHOLD)

In [11]:
df_train = df_clickstream.filter(df_clickstream['event_date']<= treshhold)
df_eval = df_clickstream.filter(df_clickstream['event_date']> treshhold)[['cookie', 'node', 'event']]

In [12]:
df_eval = df_eval.join(df_train, on=['cookie', 'node'], how='anti')


In [13]:
df_eval = df_eval.filter(
    pl.col('event').is_in(
        df_event.filter(pl.col('is_contact')==1)['event'].unique()
    )
)

In [14]:
df_eval = df_eval.filter(
        pl.col('cookie').is_in(df_train['cookie'].unique())
    ).filter(
        pl.col('node').is_in(df_train['node'].unique())
    )

In [15]:
df_eval = df_eval.unique(['cookie', 'node'])

# ALS

In [16]:
def get_als_pred(users, nodes, user_to_pred, model):
    user_ids = users.unique().to_list()
    item_ids = nodes.unique().to_list()
        
    user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
    item_id_to_index = {item_id: idx for idx, item_id in enumerate(item_ids)}
    index_to_item_id = {v:k for k,v in item_id_to_index.items()}
    
    rows = users.replace_strict(user_id_to_index).to_list()
    cols = nodes.replace_strict(item_id_to_index).to_list()
    
    values = [1] * len(users)
    
    sparse_matrix = csr_matrix((values, (rows, cols)), shape=(len(user_ids), len(item_ids)))
    
    # model = implicit.als.AlternatingLeastSquares(iterations=10, factors=60)
    model.fit(sparse_matrix, )
    
    
    user4pred = np.array([user_id_to_index[i] for i in user_to_pred])
    
    recommendations, scores = model.recommend(user4pred, sparse_matrix[user4pred], N=40, filter_already_liked_items=True)
    
    df_pred = pl.DataFrame(
        {
            'node': [
                [index_to_item_id[i] for i in i] for i in recommendations.tolist()
            ], 
             'cookie': list(user_to_pred),
            'scores': scores.tolist()
            
        }
    )
    df_pred = df_pred.explode(['node', 'scores'])
    return df_pred

In [17]:
def get_params_als(model):
    params = {'als_factors': model.factors, 'als_regularization': model.regularization, 
              'als_alpha': model.alpha, 'als_dtype': model.dtype, 'als_use_native': model.use_native,
              'als_use_cg': model.use_cg,'als_terations': model.iterations, 'als_random_state': model.random_state}
    return params

In [22]:
from scipy.sparse import csr_matrix
import numpy as np
import implicit


users = df_train["cookie"]
nodes = df_train["node"]
eval_users = df_eval['cookie'].unique().to_list()

model = implicit.als.AlternatingLeastSquares(
    factors=180,
    regularization=0.12423506398,
    alpha=3.5323,
    dtype=np.float32,
    use_native=True,
    use_cg=True,
    iterations=15,
)

df_pred = get_als_pred(users, nodes,eval_users, model)

# recall_at(df_eval, df_pred, k=40)

  0%|          | 0/15 [00:01<?, ?it/s]

# CALC EVAL METRICS

In [23]:
def recall_at(df_true, df_pred, k=40):
    return  df_true[['node', 'cookie']].join(
        df_pred.group_by('cookie').head(k).with_columns(value=1)[['node', 'cookie', 'value']], 
        how='left',
        on = ['cookie', 'node']
    ).select(
        [pl.col('value').fill_null(0), 'cookie']
    ).group_by(
        'cookie'
    ).agg(
        [
            pl.col('value').sum()/pl.col(
                'value'
            ).count()
        ]
    )['value'].mean()


In [24]:
recall_at(df_eval, df_pred, k=40)

0.15552829894825254

# baseline

In [51]:
model_baseline = implicit.als.AlternatingLeastSquares(
    factors=60,
    iterations=10,
)

df_pred_baseline = get_als_pred(users, nodes,eval_users, model_baseline)

recall_at(df_eval, df_pred_baseline, k=40)

0.1517838724413563

# OPTUNA OPTIMIZATION

In [28]:
import optuna

In [29]:
users = df_train["cookie"]
nodes = df_train["node"]
user_to_pred = df_eval['cookie'].unique().to_list()

In [30]:
def get_als_pred_optuna(
        users, nodes, user_to_pred,
        factors,
        regularization,
        alpha,
        iterations,
        use_native,
        use_cg,
        dtype
):
    user_ids = users.unique().to_list()
    item_ids = nodes.unique().to_list()
        
    user_id_to_index = {user_id: idx for idx, user_id in enumerate(user_ids)}
    item_id_to_index = {item_id: idx for idx, item_id in enumerate(item_ids)}
    index_to_item_id = {v:k for k,v in item_id_to_index.items()}
    
    rows = users.replace_strict(user_id_to_index).to_list()
    cols = nodes.replace_strict(item_id_to_index).to_list()
    
    values = [1] * len(users)
    
    sparse_matrix = csr_matrix((values, (rows, cols)), shape=(len(user_ids), len(item_ids)))
    
    model = implicit.als.AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        alpha=alpha,
        iterations=iterations,
        use_native=use_native,
        use_cg=use_cg,
        dtype=dtype
    )
    model.fit(sparse_matrix, )
    
    
    user4pred = np.array([user_id_to_index[i] for i in user_to_pred])
    
    recommendations, scores = model.recommend(user4pred, sparse_matrix[user4pred], N=40, filter_already_liked_items=True)
    
    df_pred = pl.DataFrame(
        {
            'node': [
                [index_to_item_id[i] for i in i] for i in recommendations.tolist()
            ], 
             'cookie': list(user_to_pred),
            'scores': scores.tolist()
            
        }
    )
    df_pred = df_pred.explode(['node', 'scores'])
    return df_pred, model

In [31]:
def objective(trial):
    # Основные гиперпараметры ALS
    factors = trial.suggest_int("als_factors", 100, 120)
    regularization = trial.suggest_float("als_regularization", 1e-1, 1e1, log=True)
    alpha = trial.suggest_float("als_alpha", 5.0, 10.0, log=True)
    iterations = trial.suggest_int("als_iterations", 10, 15)

    # Реализация и прочие параметры
    dtype = trial.suggest_categorical("als_dtype", [np.float32, np.float64])
    use_native = trial.suggest_categorical("als_use_native", [True, False])
    use_cg = trial.suggest_categorical("als_use_cg", [True, False])
    # random_state = trial.suggest_int("als_random_state", 1, 9999)

    df_pred, model = get_als_pred_optuna(
        users, nodes, user_to_pred,
        factors=factors,
        regularization=regularization,
        alpha=alpha,
        iterations=iterations,
        use_native=use_native,
        use_cg=use_cg,
        dtype=dtype
    )

    score = recall_at(df_eval, df_pred, k=40)

    # сохраним параметры в trial.user_attrs
    trial.set_user_attr("params", {
        'als_factors': model.factors,
        'als_regularization': model.regularization,
        'als_alpha': alpha,
        'als_dtype': model.dtype,
        'als_use_native': model.use_native,
        'als_use_cg': model.use_cg,
        'als_iterations': model.iterations,
        'als_random_state': model.random_state
    })

    return score

In [19]:
# it takes to much cpu memmory and to much time

In [32]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

print("Best recall@40:", study.best_value)
print("Best ALS params:")

study.best_trial.user_attrs["params"], study.best_value

[I 2025-05-16 18:09:12,736] A new study created in memory with name: no-name-5579d1cf-ef5a-4d48-b697-cd5e4f11bf42


  0%|          | 0/10 [00:01<?, ?it/s]

[I 2025-05-16 18:11:54,124] Trial 0 finished with value: 0.15656242876459536 and parameters: {'als_factors': 100, 'als_regularization': 2.2547266593601476, 'als_alpha': 5.890851347827647, 'als_iterations': 10, 'als_dtype': <class 'numpy.float32'>, 'als_use_native': True, 'als_use_cg': True}. Best is trial 0 with value: 0.15656242876459536.


  0%|          | 0/14 [00:00<?, ?it/s]

[I 2025-05-16 18:16:22,287] Trial 1 finished with value: 0.15667660752896295 and parameters: {'als_factors': 108, 'als_regularization': 0.3320243092355322, 'als_alpha': 7.966318488604583, 'als_iterations': 14, 'als_dtype': <class 'numpy.float64'>, 'als_use_native': True, 'als_use_cg': True}. Best is trial 1 with value: 0.15667660752896295.


  0%|          | 0/14 [00:00<?, ?it/s]

[W 2025-05-16 18:25:25,344] Trial 2 failed with parameters: {'als_factors': 118, 'als_regularization': 0.4511716802105419, 'als_alpha': 6.393905811442937, 'als_iterations': 14, 'als_dtype': <class 'numpy.float32'>, 'als_use_native': False, 'als_use_cg': False} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_35/1932749649.py", line 14, in objective
    df_pred, model = get_als_pred_optuna(
                     ^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_35/3794598097.py", line 34, in get_als_pred_optuna
    model.fit(sparse_matrix, )
  File "/usr/local/lib/python3.11/dist-packages/implicit/cpu/als.py", line 163, in fit
    solver(
  File "/usr/local/lib/python3.11/dist-packages/implicit/cpu/als.py", line 490, in least_squares
    X[u] = user_factor(Y, YtY, Cui

KeyboardInterrupt: 

In [35]:
# optuna_best = [{'als_factors': 122,
#   'als_regularization': 0.16625063986324726,
#   'als_alpha': 6.384951229882748,
#   'als_dtype': dtype('float32'),
#   'als_use_native': True,
#   'als_use_cg': True,
#   'als_iterations': 11,
#   'als_random_state': None},
#  0.1576778771718187]

with mlflow.start_run(run_name='als_optuna2_best'):
    mlflow.log_params(study.best_trial.user_attrs["params"])
    mlflow.log_metrics({'Recall_40': study.best_value})

🏃 View run als_optuna2_best at: http://51.250.35.156:5000/#/experiments/45/runs/782f79c3119e48d1814c0658cf3641bc
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/45


# All experiments

In [39]:
model_optuna_best = implicit.als.AlternatingLeastSquares(
    factors=122,
    regularization=0.16625063986324726,
    alpha=6.384951229882748,
    dtype=np.float32,
    use_native=True,
    use_cg=True,
    iterations=11,
)

df_pred_optuna_best = get_als_pred(users, nodes,eval_users, model_optuna_best)

recall_at(df_eval, df_pred_optuna_best, k=40)

  0%|          | 0/11 [00:00<?, ?it/s]

0.15811003891937614

In [42]:
model_optuna2_best = implicit.als.AlternatingLeastSquares(
    factors=108,
    regularization=0.3320243092355322,
    alpha=7.966318488604583,
    dtype=np.float64,
    use_native=True,
    use_cg=True,
    iterations=14,
)

df_pred_optuna2_best = get_als_pred(users, nodes,eval_users, model_optuna2_best)

recall_at(df_eval, df_pred_optuna2_best, k=40)

  0%|          | 0/14 [00:00<?, ?it/s]

0.15599995726416083

In [45]:
model_als_2 = implicit.als.AlternatingLeastSquares(
    factors=105,
    regularization=0.36625063986324724,
    alpha=3.5323,
    dtype=np.float32,
    use_native=True,
    use_cg=True,
    iterations=12,
)

df_pred_als_2 = get_als_pred(users, nodes,eval_users, model_als_2)

recall_at(df_eval, df_pred_als_2, k=40)

  0%|          | 0/12 [00:00<?, ?it/s]

0.15699928212798264

In [47]:
model_als_2 = implicit.als.AlternatingLeastSquares(
    factors=125,
    regularization=0.26625063986324726,
    alpha=6.5323,
    dtype=np.float32,
    use_native=True,
    use_cg=False,
    iterations=15,
)

df_pred_als_2 = get_als_pred(users, nodes,eval_users, model_als_2)

recall_at(df_eval, df_pred_als_2, k=40)

  0%|          | 0/15 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [48]:
model_als_1 = implicit.als.AlternatingLeastSquares(
    factors=75,
    iterations=10,
)

df_pred_als_1 = get_als_pred(users, nodes,eval_users, model_als_1)

recall_at(df_eval, df_pred_als_1, k=40)

  0%|          | 0/10 [00:02<?, ?it/s]

0.15181344942673614

In [27]:
with mlflow.start_run(run_name='als_2'):
    
    model_als_2 = model_optuna2_best
    # df_pred = get_als_pred(users, nodes,eval_users, model_als_2)
    
    mlflow.log_params(get_params_als(model_als_2))
    mlflow.log_metrics({'Recall_40': recall_at(df_eval, df_pred_als_2, k=40)})

🏃 View run als_2 at: http://51.250.35.156:5000/#/experiments/45/runs/9f74e5d83c3f4528859890915771549e
🧪 View experiment at: http://51.250.35.156:5000/#/experiments/45
