# Feature engineering
Create custom features. Features categories:
- User features
- Item features
- Frequency features

- first step: load data and prepared before training/validating/testing.

In [25]:
import pandas as pd

In [26]:
postgres_uri = "postgresql+psycopg2://backend:backend@localhost:5432/app_db"

customers = pd.read_sql_table("customers", postgres_uri)
articles = pd.read_sql_table("articles", postgres_uri)
transactions = pd.read_sql_table("transactions", postgres_uri)

In [28]:
customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1995 entries, 0 to 1994
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   customer_uuid           1995 non-null   object
 1   fn                      1995 non-null   int64 
 2   active                  1995 non-null   int64 
 3   club_member_status      1995 non-null   object
 4   fashion_news_frequency  708 non-null    object
 5   age                     1995 non-null   int64 
 6   postal_code             1995 non-null   object
 7   customer_id             1995 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 124.8+ KB


In [29]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15034 entries, 0 to 15033
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   article_uuid        15034 non-null  object
 1   prod_name           15034 non-null  object
 2   product_type_no     15034 non-null  int64 
 3   product_type_name   15034 non-null  object
 4   product_group_no    15034 non-null  int64 
 5   product_group_name  15034 non-null  object
 6   department_no       15034 non-null  int64 
 7   department_name     15034 non-null  object
 8   index_code          15034 non-null  object
 9   index_name          15034 non-null  object
 10  index_group_no      15034 non-null  int64 
 11  index_group_name    15034 non-null  object
 12  section_no          15034 non-null  int64 
 13  section_name        15034 non-null  object
 14  garment_group_no    15034 non-null  int64 
 15  garment_group_name  15034 non-null  object
 16  detail_desc         15

In [30]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109816 entries, 0 to 109815
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   transaction_uuid  109816 non-null  object        
 1   t_dat             109816 non-null  datetime64[ns]
 2   price             109816 non-null  float64       
 3   sales_channel_id  109816 non-null  int64         
 4   customer_uuid     109816 non-null  object        
 5   article_uuid      109816 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 5.0+ MB


prepared uset/item features

In [31]:
article_features = articles[[
    "article_uuid", "article_id", "product_type_no", "product_group_no", "department_no", "index_code",
    "index_group_no", "section_no", "garment_group_no"
]]
customer_features = customers[[
    "customer_uuid", "customer_id", "age"
]]

create interaction matrix (user X item)

In [32]:
interactions = transactions.copy()

In [33]:
interactions = interactions.merge(customer_features, how="left", on="customer_uuid")
interactions = interactions.merge(article_features, how="left", on="article_uuid")

In [34]:
interactions = interactions[["customer_id", "article_id", "t_dat"]]

In [35]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109816 entries, 0 to 109815
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   customer_id  109816 non-null  int64         
 1   article_id   109816 non-null  int64         
 2   t_dat        109816 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2)
memory usage: 2.5 MB


# Training
Let's divide the data into three parts. The first part is the data for the first stage. The second part is the data for the second stage. The third is the test data for evaluating the overall model.  
We will use time intervals. Min date: 2020-03-22; max date: 2020-09-22  
- first stage train (2020-03-22/2020-06-22)
- first stage validate (2020-06-22/2020-08-22)
- second stage train (2020-06-22/2020-08-22)
- second stage validate (2020-08-22/2020-09-11)
- test (2020-09-11/2020-09-22)

It is necessary for users from the first training sample to be in all other samples. This code seems to guarantee this condition

In [49]:
min_date = "2020-03-22 00:00:00"
date_1 = "2020-08-01 00:00:00"
date_2 = "2020-09-17 00:00:00"

train = interactions[interactions["t_dat"] <= date_1]
validate = interactions[(interactions["t_dat"] > date_1) & (interactions["t_dat"] <= date_2)]
test = interactions[(interactions["t_dat"] > date_2)]


# assert len(set(validate["customer_id"]) - set(train["customer_id"])) == 0
# assert len(set(test["customer_id"]) - set(train["customer_id"])) == 0

print(
    f"train length: {len(train)};\n"
    f"validate length: {len(validate)};\n"
    f"test length: {len(test)};"
)

all_customers = len(set(interactions["customer_id"]))
train_customers = len(set(train["customer_id"]))
validate_customers = len(set(validate["customer_id"]))
test_customers = len(set(test["customer_id"]))

print(
    f"\nall customers: {all_customers}\n"
    f"train customers: {train_customers}\n"
    f"validate customers: {validate_customers}\n"
    f"test customers: {test_customers}\n"
)

train length: 85680;
validate length: 22242;
test length: 1894;

all customers: 1995
train customers: 1967
validate customers: 1249
test customers: 153



# Training first stage
Create als model which predicts user x items interactions

In [50]:
import os

import numpy as np
import optuna
import implicit

import mlflow
import mlflow.pyfunc

from scipy.sparse import coo_matrix, csr_matrix

from metrics import recall_at_k, precision_at_k

import plotly.express as px
import plotly.graph_objects as go

from typing import List, Any, Dict

  from .autonotebook import tqdm as notebook_tqdm


In [51]:
os.environ["MLFLOW_TRACKING_URI"] = "http://localhost:5000"
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://localhost:9000"
os.environ["AWS_ACCESS_KEY_ID"] = "admin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "password"

In [53]:
length_customers = len(customers)
length_articles = len(articles)

In [54]:
def to_coo(interactions, users_len, items_len):
    row = interactions["customer_id"].values
    col = interactions["article_id"].values
    data = np.ones(interactions.shape[0])

    return coo_matrix((data, (row, col)), shape=(users_len, items_len), dtype=np.float32)


def to_csr(interactions, users_len, items_len):
    coo_matrix = to_coo(interactions, users_len, items_len)
    csr_matrix = coo_matrix.tocsr()
    return csr_matrix


train_matrix = to_csr(train, length_customers, length_articles)
validate_matrix = to_csr(validate, length_customers, length_articles)

In [55]:
def items_vector(df, customer_id):
    return df[df["customer_id"] == customer_id]["article_id"].to_list()


def estimate_metrics(model, df, k):
    data = df.copy()
    data["candidates"] = data["customer_id"].apply(lambda x: model.recommend(x, train_matrix[x], N=k)[0])
    recall = []
    precision = []
    for _, actual, candidates in data.values:
        recall.append(recall_at_k(actual, candidates, k=k))
        precision.append(precision_at_k(actual, candidates, k=k))
    return {"recall": np.mean(recall), "precision": np.mean(precision)}


validate_matrix = pd.DataFrame({"customer_id": validate["customer_id"].unique()})
validate_matrix["actual"] = validate_matrix["customer_id"].apply(lambda x: items_vector(validate, x))

In [56]:
class ALSModel(mlflow.pyfunc.PythonModel):
    def __init__(self, model: implicit.als.AlternatingLeastSquares, user_item: csr_matrix):
        self.model = model
        self.sparse_matrix = user_item

    def predict(self, context, model_input: List[int], params: Dict[str, Any] = None) -> List[list]:
        if params is None:
            params = {}
        N = params.get("N", 10)

        return self.model.recommend(model_input, self.sparse_matrix[model_input], N=N)

    def get_raw_model(self):
        return self.model

In [57]:
def objective(trial):
    factors = trial.suggest_int("factors", 10, 300)
    regularization = trial.suggest_float("regularization", 1e-6, 1e-2, log=True)
    alpha = trial.suggest_int("alpha", 5, 50)
    iterations = trial.suggest_int("iterations", 50, 350)

    model = implicit.als.AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        alpha=alpha,
        iterations=iterations,
        random_state=42)

    model.fit(train_matrix, show_progress=False)

    return estimate_metrics(model, validate_matrix, 25)["recall"]


study = optuna.create_study(
    direction="maximize",
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=5, n_warmup_steps=30, interval_steps=10
    ),
)
study.optimize(objective, n_trials=25)

with mlflow.start_run(run_name="ALS w/ recall&precision"):
    als_model = implicit.als.AlternatingLeastSquares()
    als_model.fit(train_matrix)

    mlflow.log_params(study.best_params)

    # recall & precision @10
    metrics = estimate_metrics(als_model, validate_matrix, 10)
    mlflow.log_metric("recall_k10", metrics["recall"])
    mlflow.log_metric("precision_k10", metrics["precision"])


    xs = np.arange(100, 800, 50)
    ys = [estimate_metrics(als_model, validate_matrix, k)["recall"] for k in xs]

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=xs,
        y=ys,
        name="recall"
    ))

    fig.add_trace(go.Scatter(
        x=xs,
        y=np.gradient(ys) * 10,
        name="change rate / gradient * 10",
        line=dict(color='red', width=2, dash='dot')
    ))

    mlflow.log_figure(fig, "plot_artifacts/change_recall_rate.png")
    mlflow.pyfunc.log_model(
        artifact_path="als_model",
        python_model=ALSModel(als_model, train_matrix)
    )

[I 2025-04-07 00:21:02,626] A new study created in memory with name: no-name-e5c16bda-4912-4043-83e9-c5ca56743c33
  check_blas_config()
[I 2025-04-07 00:21:11,194] Trial 0 finished with value: 0.008700128770800992 and parameters: {'factors': 126, 'regularization': 5.7109755460160295e-05, 'alpha': 7, 'iterations': 207}. Best is trial 0 with value: 0.008700128770800992.
[I 2025-04-07 00:21:19,927] Trial 1 finished with value: 0.007183714801241945 and parameters: {'factors': 105, 'regularization': 2.6170764408529827e-05, 'alpha': 43, 'iterations': 240}. Best is trial 0 with value: 0.008700128770800992.
[I 2025-04-07 00:21:22,990] Trial 2 finished with value: 0.007887660246343159 and parameters: {'factors': 148, 'regularization': 0.0007835834838545464, 'alpha': 10, 'iterations': 67}. Best is trial 0 with value: 0.008700128770800992.
[I 2025-04-07 00:21:26,223] Trial 3 finished with value: 0.012175917463826458 and parameters: {'factors': 30, 'regularization': 2.6063051640966838e-06, 'alpha'

üèÉ View run ALS w/ recall&precision at: http://localhost:5000/#/experiments/0/runs/ea958efa07b849cb90fbb602fc6d7aad
üß™ View experiment at: http://localhost:5000/#/experiments/0


In [58]:
mlflow.register_model(
    "runs:/ea958efa07b849cb90fbb602fc6d7aad/als_model",
    "ALS_Model"
)

Successfully registered model 'ALS_Model'.
2025/04/07 00:37:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ALS_Model, version 1
Created version '1' of model 'ALS_Model'.


<ModelVersion: aliases=[], creation_timestamp=1743971839517, current_stage='None', description='', last_updated_timestamp=1743971839517, name='ALS_Model', run_id='ea958efa07b849cb90fbb602fc6d7aad', run_link='', source='s3://mlflow/0/ea958efa07b849cb90fbb602fc6d7aad/artifacts/als_model', status='READY', status_message=None, tags={}, user_id='', version='1'>

# Second stage train

prepare data for train/validate CatBoost step. 

match candidates for second step model

In [59]:
from catboost import CatBoost

In [66]:
def freq_feature(left, right, group_by, agg_col, feature_name):
    return left.merge(
        right.groupby(by=group_by)[agg_col]\
            .count()
            .rename(feature_name) / 1,
            how="left",
            on=group_by
    )


def match_candidates(df):
    candidates = pd.DataFrame({"customer_id": df["customer_id"].unique()})
    candidates["candidates"] = candidates["customer_id"].apply(lambda x: als_model.recommend(x, train_matrix[x], 300)[0])

    articles = candidates.apply(lambda x: pd.Series(x["candidates"]), axis=1).stack().reset_index(level=1, drop=True)
    articles.name = "article_id"

    return candidates.drop("candidates", axis=1).join(articles)


def merge_features(data, customer_features, article_features):
    data = data.merge(customer_features, how="left", on="customer_id")
    data = data.merge(article_features, how="left", on="article_id")
    return data


def add_freq_features(data, right):
    data = freq_feature(data, right, ["article_id"], "article_uuid", "article_freq")
    data = freq_feature(data, right, ["customer_id", "product_group_no"], "article_id", "product_group_freq")
    data = freq_feature(data, right, ["customer_id", "index_code"], "article_id", "index_freq")
    data = freq_feature(data, right, ["customer_id", "garment_group_no"], "article_id", "garment_group_freq")
    return data

In [67]:
candidates = match_candidates(train)

In [68]:
X_train = train.copy()
X_train = X_train[["customer_id", "article_id"]]

In [69]:
X_train.loc[0:, "target"] = 1

In [70]:
X_train = X_train.merge(candidates, how="outer", on=["customer_id", "article_id"])
X_train = X_train.drop_duplicates(subset=["customer_id", "article_id"])
X_train.fillna(0, inplace=True)

In [71]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 655606 entries, 0 to 675779
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   customer_id  655606 non-null  int64  
 1   article_id   655606 non-null  int64  
 2   target       655606 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 20.0 MB


In [72]:
X_train = merge_features(X_train, customer_features, article_features)

In [73]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 655606 entries, 0 to 655605
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   customer_id       655606 non-null  int64  
 1   article_id        655606 non-null  int64  
 2   target            655606 non-null  float64
 3   customer_uuid     655606 non-null  object 
 4   age               655606 non-null  int64  
 5   article_uuid      655606 non-null  object 
 6   product_type_no   655606 non-null  int64  
 7   product_group_no  655606 non-null  int64  
 8   department_no     655606 non-null  int64  
 9   index_code        655606 non-null  object 
 10  index_group_no    655606 non-null  int64  
 11  section_no        655606 non-null  int64  
 12  garment_group_no  655606 non-null  int64  
dtypes: float64(1), int64(9), object(3)
memory usage: 65.0+ MB


# Feature engineering

In [74]:
full_transactions = transactions.copy()
full_transactions = full_transactions.merge(customer_features, how="left", on="customer_uuid")
full_transactions = full_transactions.merge(article_features, how="left", on="article_uuid")

full_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109816 entries, 0 to 109815
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   transaction_uuid  109816 non-null  object        
 1   t_dat             109816 non-null  datetime64[ns]
 2   price             109816 non-null  float64       
 3   sales_channel_id  109816 non-null  int64         
 4   customer_uuid     109816 non-null  object        
 5   article_uuid      109816 non-null  object        
 6   customer_id       109816 non-null  int64         
 7   age               109816 non-null  int64         
 8   article_id        109816 non-null  int64         
 9   product_type_no   109816 non-null  int64         
 10  product_group_no  109816 non-null  int64         
 11  department_no     109816 non-null  int64         
 12  index_code        109816 non-null  object        
 13  index_group_no    109816 non-null  int64         
 14  sect

In [75]:
X_train = add_freq_features(X_train, full_transactions)

In [76]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 655606 entries, 0 to 655605
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   customer_id         655606 non-null  int64  
 1   article_id          655606 non-null  int64  
 2   target              655606 non-null  float64
 3   customer_uuid       655606 non-null  object 
 4   age                 655606 non-null  int64  
 5   article_uuid        655606 non-null  object 
 6   product_type_no     655606 non-null  int64  
 7   product_group_no    655606 non-null  int64  
 8   department_no       655606 non-null  int64  
 9   index_code          655606 non-null  object 
 10  index_group_no      655606 non-null  int64  
 11  section_no          655606 non-null  int64  
 12  garment_group_no    655606 non-null  int64  
 13  article_freq        655606 non-null  float64
 14  product_group_freq  600969 non-null  float64
 15  index_freq          618457 non-nul

In [77]:
X_train.sample(1)

Unnamed: 0,customer_id,article_id,target,customer_uuid,age,article_uuid,product_type_no,product_group_no,department_no,index_code,index_group_no,section_no,garment_group_no,article_freq,product_group_freq,index_freq,garment_group_freq
383780,1167,10622,0.0,123ff592-5f16-4016-9786-4501470b16c0,26,0abb9007-9e1b-463e-b751-3679f1f3b7c7,272,1,1722,A,1,15,1009,20.0,5.0,20.0,


In [78]:
def prepare(data, freq_columns, drop_columns):
    for col_name in freq_columns:
        data[col_name].fillna(0, inplace=True)
    data = data.drop(columns=drop_columns)
    return data


In [79]:
freq_columns = ["article_freq", "product_group_freq", "index_freq", "garment_group_freq"]
drop_columns = ["customer_uuid", "article_uuid"]

X_train = prepare(X_train, freq_columns, drop_columns)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [80]:
y_train = X_train[["target"]]
X_train = X_train.drop(columns=["target"])

In [81]:
def rerank(customer_id, df, k=5):
    return df[df["customer_id"] == customer_id].\
                sort_values("proba", ascending=False).head(k)["article_id"].tolist()

In [83]:
def match_rerank(data, rerank_model):
    data = match_candidates(validate)

    data = merge_features(data, customer_features, article_features)
    data = add_freq_features(data, full_transactions)
    data = prepare(data, freq_columns, drop_columns)

    data["proba"] = rerank_model.predict(data, prediction_type="Probability")[:, 1]

    compared = pd.DataFrame({"customer_id": validate["customer_id"].unique()})
    compared["reranked"] = compared["customer_id"].apply(lambda x: rerank(x, data, 25))
    compared["actual"] = compared["customer_id"].apply(lambda x: validate[validate["customer_id"] == x]["article_id"].to_list())

    return compared


def objective(trial):
    with mlflow.start_run():
        params = {
            "objective": "Logloss",
            "iterations": trial.suggest_int("iterations", 50, 500),
            "max_depth": trial.suggest_int("max_depth", 6, 12),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-5, 1e2),
            "task_type": "GPU"
        }

        catboost_model = CatBoost(
            params=params
        )

        catboost_model.fit(X_train, y_train, cat_features=["index_code"], verbose=False)
        candidates = match_rerank(validate, catboost_model)

        recall = []
        precision = []

        for _, reranked, actual in candidates.values:
            recall.append(recall_at_k(actual, reranked, 25))
            precision.append(precision_at_k(actual, reranked, 25))

        return np.mean(recall)


study = optuna.create_study(
    direction="maximize",
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=5, n_warmup_steps=30, interval_steps=10
    )
)

study.optimize(objective, n_trials=15)

with mlflow.start_run(run_name="catboost/match candidates + rerank"):
    catboost_model = CatBoost(params=study.best_params)
    catboost_model.fit(X_train, y_train, cat_features=["index_code"], verbose=False)

    mlflow.log_params(study.best_params)

    candidates = match_rerank(validate, catboost_model)

    recall = []
    precision = []

    for _, reranked, actual in candidates.values:
        recall.append(recall_at_k(actual, reranked, 25))
        precision.append(precision_at_k(actual, reranked, 25))

    mean_recall = np.mean(recall)
    mean_precision = np.mean(precision)

    mlflow.log_metric("recall", mean_recall)
    mlflow.log_metric("precision", mean_precision)
    
    mlflow.catboost.log_model(
        catboost_model,
        artifact_path="catboost_model"
    )

[I 2025-04-07 00:42:04,250] A new study created in memory with name: no-name-e1f7947b-3a00-4e7a-bdae-2ea3a24eaacd

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:42:27,933] Trial 0 finished with value: 0.0052428015022492995 and parameters: {'iterations': 463, 'max_depth': 6, 'learning_rate': 0.21995640385718615, 'l2_leaf_reg': 3.054204786

üèÉ View run nervous-fowl-706 at: http://localhost:5000/#/experiments/0/runs/9be4d6d200cd473294796778c7d80fc0
üß™ View experiment at: http://localhost:5000/#/experiments/0



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:43:00,233] Trial 1 finished with value: 0.005718581151906159 and parameters: {'iterations': 284, 'max_depth': 10, 'learning_rate': 0.15238726645574266, 'l2_leaf_reg': 60.82671006038214}. Best is trial 1 with value: 0.005718581151906159.


üèÉ View run gregarious-mouse-820 at: http://localhost:5000/#/experiments/0/runs/525913be2549431ba41ce98bf697daf0
üß™ View experiment at: http://localhost:5000/#/experiments/0



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:44:59,539] Trial 2 finished with value: 0.007665945464752999 and parameters: {'iterations': 500, 'max_depth': 12, 'learning_rate': 0.12778575765386604, 'l2_leaf_reg': 0.9983338815280869}. Best is trial 2 with value: 0.007665945464752999.


üèÉ View run defiant-shad-905 at: http://localhost:5000/#/experiments/0/runs/51e19f1e455d45829bc79f6cd32b212d
üß™ View experiment at: http://localhost:5000/#/experiments/0



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:45:17,011] Trial 3 finished with value: 0.005221400918189882 and parameters: {'iterations': 220, 'max_depth': 8, 'learning_rate': 0.17553366907794396, 'l2_leaf_reg': 0.9300470657963538}. Best is trial 2 with value: 0.007665945464752999.


üèÉ View run bouncy-pig-259 at: http://localhost:5000/#/experiments/0/runs/15756274a02d4c3b99623bb0953a01bf
üß™ View experiment at: http://localhost:5000/#/experiments/0



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:45:23,706] Trial 4 finished with value: 0.0038864677879165955 and parameters: {'iterations': 195, 'max_depth': 7, 'learning_rate': 0.04317373625670447, 'l2_leaf_reg': 4.6354465518347086}. Best is trial 2 with value: 0.007665945464752999.


üèÉ View run bemused-flea-102 at: http://localhost:5000/#/experiments/0/runs/abac5fe4bc09416caa04bb103e0f2526
üß™ View experiment at: http://localhost:5000/#/experiments/0



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:46:20,738] Trial 5 finished with value: 0.0074331818286600855 and parameters: {'iterations': 230, 'max_depth': 12, 'learning_rate': 0.12376099628349964, 'l2_leaf_reg': 0.0012548350443660461}. Best is trial 2 with value: 0.007665945464752999.


üèÉ View run big-asp-384 at: http://localhost:5000/#/experiments/0/runs/9beddd2b04d64f6481156140bc997fb2
üß™ View experiment at: http://localhost:5000/#/experiments/0



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:46:43,723] Trial 6 finished with value: 0.0053950863499463665 and parameters: {'iterations': 304, 'max_depth': 8, 'learning_rate': 0.21726009464398408, 'l2_leaf_reg': 0.00020046522467127959}. Best is trial 2 with value: 0.007665945464752999.


üèÉ View run gaudy-stork-437 at: http://localhost:5000/#/experiments/0/runs/0ca82912e9a54b519efbad39080dbe30
üß™ View experiment at: http://localhost:5000/#/experiments/0



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:46:49,446] Trial 7 finished with value: 0.005028896102737782 and parameters: {'iterations': 140, 'max_depth': 7, 'learning_rate': 0.22471141838665476, 'l2_leaf_reg': 0.001660578673529807}. Best is trial 2 with value: 0.007665945464752999.


üèÉ View run enchanting-calf-805 at: http://localhost:5000/#/experiments/0/runs/35009534348044ce83eb584f135e6922
üß™ View experiment at: http://localhost:5000/#/experiments/0



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:46:55,835] Trial 8 finished with value: 0.005796966825739095 and parameters: {'iterations': 104, 'max_depth': 10, 'learning_rate': 0.2142378968002199, 'l2_leaf_reg': 0.0552841544682491}. Best is trial 2 with value: 0.007665945464752999.


üèÉ View run defiant-ray-878 at: http://localhost:5000/#/experiments/0/runs/33d345141a314e70ba54da874d8bcd2e
üß™ View experiment at: http://localhost:5000/#/experiments/0



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:47:19,708] Trial 9 finished with value: 0.005547773553821373 and parameters: {'iterations': 462, 'max_depth': 6, 'learning_rate': 0.2503856529341314, 'l2_leaf_reg': 87.58368602881974}. Best is trial 2 with value: 0.007665945464752999.


üèÉ View run magnificent-flea-136 at: http://localhost:5000/#/experiments/0/runs/503bf351a0d240ec96d3d5228380ebd2
üß™ View experiment at: http://localhost:5000/#/experiments/0



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:48:51,473] Trial 10 finished with value: 0.0069668230715907295 and parameters: {'iterations': 378, 'max_depth': 12, 'learning_rate': 0.07266703981556745, 'l2_leaf_reg': 1.1180586911093245e-05}. Best is trial 2 with value: 0.007665945464752999.


üèÉ View run unique-fox-146 at: http://localhost:5000/#/experiments/0/runs/6e5c0c7a7a00413bb6e797a70a07e987
üß™ View experiment at: http://localhost:5000/#/experiments/0



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:50:20,800] Trial 11 finished with value: 0.0071632152068163224 and parameters: {'iterations': 368, 'max_depth': 12, 'learning_rate': 0.11060874634337206, 'l2_leaf_reg': 0.017168479691749127}. Best is trial 2 with value: 0.007665945464752999.


üèÉ View run marvelous-croc-549 at: http://localhost:5000/#/experiments/0/runs/f2bf321f938a49d2a8acbf77bec580b3
üß™ View experiment at: http://localhost:5000/#/experiments/0



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:51:40,952] Trial 12 finished with value: 0.006888920063176172 and parameters: {'iterations': 500, 'max_depth': 11, 'learning_rate': 0.12252110871928876, 'l2_leaf_reg': 0.06572863072102657}. Best is trial 2 with value: 0.007665945464752999.


üèÉ View run nebulous-mare-240 at: http://localhost:5000/#/experiments/0/runs/ddd362395c5e4f54b74b2ee7db4c42f6
üß™ View experiment at: http://localhost:5000/#/experiments/0



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:51:46,309] Trial 13 finished with value: 0.006085927550652655 and parameters: {'iterations': 52, 'max_depth': 11, 'learning_rate': 0.29016202830350646, 'l2_leaf_reg': 0.0016396825943400543}. Best is trial 2 with value: 0.007665945464752999.


üèÉ View run entertaining-ox-716 at: http://localhost:5000/#/experiments/0/runs/9b7819fe3cf34a5ba3fe1523d87d9900
üß™ View experiment at: http://localhost:5000/#/experiments/0



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



[I 2025-04-07 00:53:14,252] Trial 14 finished with value: 0.003966282893755058 and parameters: {'iterations': 368, 'max_depth': 12, 'learning_rate': 0.01260012847711331, 'l2_leaf_reg': 0.518346950909053}. Best is trial 2 with value: 0.007665945464752999.


üèÉ View run grandiose-shrike-631 at: http://localhost:5000/#/experiments/0/runs/29fb7accea204a24aa275df1b5b1b5f2
üß™ View experiment at: http://localhost:5000/#/experiments/0



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





üèÉ View run catboost/match candidates + rerank at: http://localhost:5000/#/experiments/0/runs/3d848c5423f246d69fd6ddd57f65d5bf
üß™ View experiment at: http://localhost:5000/#/experiments/0


In [84]:
mlflow.register_model(
    "runs:/3d848c5423f246d69fd6ddd57f65d5bf/catboost_model",
    "CatBoost_Model"
)

Successfully registered model 'CatBoost_Model'.
2025/04/07 00:56:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CatBoost_Model, version 1
Created version '1' of model 'CatBoost_Model'.


<ModelVersion: aliases=[], creation_timestamp=1743972983050, current_stage='None', description='', last_updated_timestamp=1743972983050, name='CatBoost_Model', run_id='3d848c5423f246d69fd6ddd57f65d5bf', run_link='', source='s3://mlflow/0/3d848c5423f246d69fd6ddd57f65d5bf/artifacts/catboost_model', status='READY', status_message=None, tags={}, user_id='', version='1'>