In [93]:
import mlflow
import os
from dotenv import load_dotenv
from sklearn.metrics import roc_auc_score, f1_score, log_loss, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
import optuna
from optuna.integration.mlflow import MLflowCallback
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
import numpy as np

In [94]:
# %# =====================================================
#                 Optuna - TPE - Bayesian search
# ========================================================
TABLE_NAME = "users_churn" # ваш код здесь #
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = 'churn_kruglikovAlex' # ваш код здесь #
RUN_NAME = 'model_bayesian_search'
REGISTRY_MODEL_NAME = 'churn_model_kruglikovAlex_b2c' # ваш код здесь #
FS_ASSETS = "../fs_assets" 

In [95]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [96]:
# подгружаем .env
load_dotenv()

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" # ваш код здесь
os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('AWS_ACCESS_KEY_ID')
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv('AWS_SECRET_ACCESS_KEY')

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [97]:
STUDY_DB_NAME = "sqlite:///local.study.db"
STUDY_NAME = "churn_model"

In [98]:
df = pd.read_csv('/home/mle-user/mle_projects/mle-dvc/data/initial_data.csv')
df.head()

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,7020,2020-01-01,,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,...,No,No,No,No,Female,0,Yes,No,No,0
1,7021,2017-04-01,,One year,No,Mailed check,56.95,1889.5,DSL,Yes,...,Yes,No,No,No,Male,0,No,No,No,0
2,7022,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,...,No,No,No,No,Male,0,No,No,No,1
3,7023,2016-05-01,,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,...,Yes,Yes,No,No,Male,0,No,No,No,0
4,7024,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,...,No,No,No,No,Female,0,No,No,No,1


In [99]:
df = df.drop(columns=['end_date'])
df.head()

Unnamed: 0,id,begin_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,7020,2020-01-01,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,Yes,No,No,No,No,Female,0,Yes,No,No,0
1,7021,2017-04-01,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0
2,7022,2019-10-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,Yes,No,No,No,No,Male,0,No,No,No,1
3,7023,2016-05-01,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,No,Yes,Yes,No,No,Male,0,No,No,No,0
4,7024,2019-09-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,No,No,No,No,No,Female,0,No,No,No,1


In [100]:
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

In [101]:
split_column = "begin_date" # ваш код здесь
stratify_column = df[target] # ваш код здесь
test_size = 0.2 # ваш код здесь

In [102]:
df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    stratify=stratify_column,
    shuffle=True,
) 

In [103]:
print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (5615, 3)
Размер выборки для теста: (1404, 3)


In [104]:
def objective(trial: optuna.Trial) -> float:
    param = {"learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
             "depth": trial.suggest_int("depth", 1, 12),
             "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
             "random_strength": trial.suggest_float("random_strength", 0.1, 5),
             "loss_function": "Logloss",
             "task_type": "CPU",
             "random_seed": 0,
             "iterations": 300,
             "verbose": False,
             }# ваш код здесь #
    
    model = CatBoostClassifier(**param)

    skf = StratifiedKFold(n_splits=2)

    metrics = defaultdict(list)
    for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        # ваш код здесь #
        train_x = X_train.iloc[train_index]
        train_y = y_train.iloc[train_index]
        val_x = X_train.iloc[val_index]
        val_y = y_train.iloc[val_index]
        
        model.fit(train_x, train_y)
        prediction = model.predict(val_x)
        probas = model.predict_proba(val_x)[:, 1]

        _, err1, _, err2 = confusion_matrix(val_y, prediction, normalize='all').ravel()
        auc = roc_auc_score(val_y, probas)
        precision = precision_score(val_y, prediction)
        recall = recall_score(val_y, prediction)
        f1 = f1_score(val_y, prediction)
        logloss = log_loss(val_y, prediction)
        
        metrics["err1"].append(err1)
        metrics["err2"].append(err2)
        metrics["auc"].append(auc)
        metrics["precision"].append(precision)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["logloss"].append(logloss)


    # ваш код здесь #
    err_1 = median(array(metrics["err1"]))
    err_2 = median(array(metrics["err2"]))
    auc = median(array(metrics["auc"]))
    precision = median(array(metrics["precision"]))
    recall = median(array(metrics["recall"]))
    f1 = median(array(metrics["f1"]))
    logloss = median(array(metrics["logloss"]))
		

    return auc

In [105]:
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

In [106]:
experiment_id

'1'

In [107]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [108]:
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

In [109]:
run_id

'2979f28bd3fe44259d8043a29295a847'

In [110]:
mlflc = MLflowCallback(
    tracking_uri=f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}',
    metric_name="AUC",
    create_experiment=False,
    mlflow_kwargs={'experiment_id': experiment_id, 'tags': {MLFLOW_PARENT_RUN_ID: run_id}}
)

  mlflc = MLflowCallback(


In [111]:
mlflc

<optuna.integration.mlflow.MLflowCallback at 0x7f457edaa830>

In [112]:
study = optuna.create_study(sampler=optuna.samplers.TPESampler(), direction='maximize', study_name=STUDY_NAME, storage=STUDY_DB_NAME, load_if_exists=True) # ваш код здесь #

[I 2025-07-27 21:05:38,225] Using an existing study with name 'churn_model' instead of creating a new one.


In [113]:
from collections import defaultdict

In [114]:
from statistics import median
from numpy import array

In [115]:
study.optimize(objective, n_trials=10, callbacks=[mlflc])

[I 2025-07-27 21:05:48,381] Trial 56 finished with value: 0.8238263825839862 and parameters: {'learning_rate': 0.05892243429917434, 'depth': 3, 'l2_leaf_reg': 1.2979742431624373, 'random_strength': 2.9085730093056053}. Best is trial 12 with value: 0.8247575289141135.
[I 2025-07-27 21:05:49,320] Trial 57 finished with value: 0.8231424458722122 and parameters: {'learning_rate': 0.07710321428119458, 'depth': 2, 'l2_leaf_reg': 1.6646233030134399, 'random_strength': 3.107194587974692}. Best is trial 12 with value: 0.8247575289141135.
[I 2025-07-27 21:05:50,373] Trial 58 finished with value: 0.8227802509805275 and parameters: {'learning_rate': 0.05093906006777791, 'depth': 4, 'l2_leaf_reg': 1.6514302084852661, 'random_strength': 2.34684182255129}. Best is trial 12 with value: 0.8247575289141135.
[I 2025-07-27 21:05:51,171] Trial 59 finished with value: 0.8232735260192938 and parameters: {'learning_rate': 0.07805756325961735, 'depth': 2, 'l2_leaf_reg': 1.379450884243, 'random_strength': 3.090

In [116]:
best_params = study.best_params

In [117]:
best_params

{'learning_rate': 0.035907322101501306,
 'depth': 4,
 'l2_leaf_reg': 1.3180051947332347,
 'random_strength': 2.552251829748683}

In [118]:
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best params: {best_params}")

Number of finished trials: 66
Best params: {'learning_rate': 0.035907322101501306, 'depth': 4, 'l2_leaf_reg': 1.3180051947332347, 'random_strength': 2.552251829748683}


In [119]:
model = CatBoostClassifier(**best_params)
model

<catboost.core.CatBoostClassifier at 0x7f457e7dff40>

In [120]:
model.fit(X_train, y_train)

0:	learn: 0.6837261	total: 1.47ms	remaining: 1.47s
1:	learn: 0.6694664	total: 3.12ms	remaining: 1.56s
2:	learn: 0.6560993	total: 4.53ms	remaining: 1.51s
3:	learn: 0.6418792	total: 5.94ms	remaining: 1.48s
4:	learn: 0.6273064	total: 7.44ms	remaining: 1.48s
5:	learn: 0.6150288	total: 8.98ms	remaining: 1.49s
6:	learn: 0.6028784	total: 10.5ms	remaining: 1.49s
7:	learn: 0.5935782	total: 12ms	remaining: 1.48s
8:	learn: 0.5837130	total: 13.5ms	remaining: 1.49s
9:	learn: 0.5758562	total: 15.1ms	remaining: 1.49s
10:	learn: 0.5676902	total: 16.6ms	remaining: 1.49s
11:	learn: 0.5642213	total: 17.9ms	remaining: 1.47s
12:	learn: 0.5576666	total: 19.3ms	remaining: 1.47s
13:	learn: 0.5547332	total: 20.5ms	remaining: 1.45s
14:	learn: 0.5492571	total: 22.1ms	remaining: 1.45s
15:	learn: 0.5417479	total: 23.6ms	remaining: 1.45s
16:	learn: 0.5355474	total: 26.4ms	remaining: 1.53s
17:	learn: 0.5300208	total: 49.5ms	remaining: 2.7s
18:	learn: 0.5247004	total: 51.4ms	remaining: 2.65s
19:	learn: 0.5199841	tota

<catboost.core.CatBoostClassifier at 0x7f457e7dff40>

In [121]:
# оцениваем модель на тестовом наборе
test_score = model.score(X_test, y_test)
print("Точность на тестовой выборке:", test_score) 

Точность на тестовой выборке: 0.7799145299145299


In [122]:
prediction = model.predict(X_test)
probas = model.predict_proba(X_test)[:, 1]

In [123]:
_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
err2

0.11752136752136752

In [124]:
probas

array([0.40863717, 0.07137338, 0.16824318, ..., 0.04597038, 0.51082142,
       0.08952383])

In [125]:
# заведите словарь со всеми метриками
metrics = {}

# посчитайте метрики из модуля sklearn.metrics
# err_1 — ошибка первого рода
# err_2 — ошибка второго рода
_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel() # ваш код здесь #
auc = roc_auc_score(y_test, probas) # ваш код здесь #
precision = precision_score(y_test, prediction) # ваш код здесь #
recall = recall_score(y_test, prediction) # ваш код здесь #
f1 = f1_score(y_test, prediction) # ваш код здесь #
logloss = log_loss(y_test, prediction) # ваш код здесь #

# запишите значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

In [126]:
# настройки для логирования в MLFlow
pip_requirements = "../requirements.txt" 
signature = mlflow.models.infer_signature(
    X_test,
    prediction
)

input_example = X_test[:10] 
metadata = {'model_type': 'monthly'} 

  inputs = _infer_schema(model_input) if model_input is not None else None


In [127]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [128]:
class CatboostModelProba(mlflow.pyfunc.PythonModel):
    def __init__(self, model):
        super().__init__()
        self._model = model

    def predict(self, context, model_input):
        import numpy as np
        predictions = np.sqrt(self._model.predict(model_input))

        return predictions

In [129]:
custom_model = CatboostModelProba(model) 

In [138]:
run = mlflow.active_run()

In [139]:
run

In [136]:
run.info.run_id

AttributeError: 'NoneType' object has no attribute 'info'

In [140]:
run_id

'2979f28bd3fe44259d8043a29295a847'

In [None]:
artifact_path = "vc"

In [None]:
#with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
with mlflow.start_run(experiment_id=experiment_id,
                 run_name=RUN_NAME,
                 run_id=run_id # pass None to start a new run
                 ) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id # ваш код здесь
    
    model_info = mlflow.pyfunc.log_model( 
        # ваш код здесь #
        python_model=custom_model,
        #cb_model=model,
        signature=signature,
        pip_requirements=pip_requirements,
        metadata = metadata,
        input_example = input_example,
        artifact_path=artifact_path,
        await_registration_for=60,
        registered_model_name=REGISTRY_MODEL_NAME,
        )

    # логируем метрики эксперимента
    # где ключи — это названия метрик, а значения — числовые значения метрик
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)

    # логируем файл как артефакт эксперимента — 'users_churn.csv'
    mlflow.log_artifact("/home/mle-user/mle_projects/mle-dvc/data/initial_data.csv", "dataframe")

Registered model 'churn_model_kruglikovAlex_b2c' already exists. Creating a new version of this model...
2025/07/27 21:18:55 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_kruglikovAlex_b2c, version 14
Created version '14' of model 'churn_model_kruglikovAlex_b2c'.


In [142]:
run_id

'2979f28bd3fe44259d8043a29295a847'

In [143]:
mlflow.end_run()

In [42]:
import joblib
# сохранение результата шага
os.makedirs('../models', exist_ok=True) # создание директории, если её ещё нет
with open('../models/fitted_model_Bayesian_Optuna.pkl', 'wb') as fd:
    joblib.dump(model, fd)