In [119]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import pandas as pd
import mlflow
import os
from dotenv import load_dotenv
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, f1_score, log_loss, recall_score, precision_score, confusion_matrix

In [120]:
TABLE_NAME = "users_churn" # ваш код здесь #
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = 'churn_kruglikovAlex' # ваш код здесь #
RUN_NAME = 'model_grid_search'
REGISTRY_MODEL_NAME = 'churn_model_kruglikovAlex_b2c' # ваш код здесь #
FS_ASSETS = "../fs_assets" 

In [121]:
df = pd.read_csv('/home/mle-user/mle_projects/mle-dvc/data/initial_data.csv')
df.head()

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,7020,2020-01-01,,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,...,No,No,No,No,Female,0,Yes,No,No,0
1,7021,2017-04-01,,One year,No,Mailed check,56.95,1889.5,DSL,Yes,...,Yes,No,No,No,Male,0,No,No,No,0
2,7022,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,...,No,No,No,No,Male,0,No,No,No,1
3,7023,2016-05-01,,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,...,Yes,Yes,No,No,Male,0,No,No,No,0
4,7024,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,...,No,No,No,No,Female,0,No,No,No,1


In [122]:
df = df.drop(columns=['end_date'])
df.head()

Unnamed: 0,id,begin_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,7020,2020-01-01,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,Yes,No,No,No,No,Female,0,Yes,No,No,0
1,7021,2017-04-01,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0
2,7022,2019-10-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,Yes,No,No,No,No,Male,0,No,No,No,1
3,7023,2016-05-01,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,No,Yes,Yes,No,No,Male,0,No,No,No,0
4,7024,2019-09-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,No,No,No,No,No,Female,0,No,No,No,1


In [123]:
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

In [125]:
split_column = "begin_date" # ваш код здесь
stratify_column = df[target] # ваш код здесь
test_size = 0.2 # ваш код здесь

In [126]:
df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    stratify=stratify_column,
    shuffle=True,
) 

In [127]:
print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (5615, 3)
Размер выборки для теста: (1404, 3)


In [128]:
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

In [129]:
# задайте сетку гиперпараметров для GridSearchCV
params = {'depth' : [3, 4, 5, 6, 7],
          'learning_rate' : [0.01, 0.1, 0.9],
          'iterations'    : [1, 2, 3],
          'l2_leaf_reg': [1, 5, 10, 15, 20]
          }

In [130]:
# создание модели 
from catboost import CatBoostClassifier
model = CatBoostClassifier(iterations=iterations,
                           random_seed=random_seed, 
                           verbose=verbose, 
                           loss_function=loss_function, 
                           task_type=task_type)
model

<catboost.core.CatBoostClassifier at 0x7f73f88b5e10>

In [131]:
# создайте и выполните случайный поиск
cv = GridSearchCV(estimator=model, param_grid = params, cv = 2, n_jobs=-1)
cv

In [132]:
# выполняем поиск
clf = cv.fit(X_train, y_train)
clf

In [133]:
# выведите лучшие параметры и оценку точности
print("Лучшие гиперпараметры:", clf.best_params_)
print("Лучший счет:", clf.best_score_)

Лучшие гиперпараметры: {'depth': 5, 'iterations': 3, 'l2_leaf_reg': 10, 'learning_rate': 0.9}
Лучший счет: 0.7959033785093635


In [134]:
cv_results = pd.DataFrame(clf.cv_results_)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_iterations,param_l2_leaf_reg,param_learning_rate,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,0.124234,0.006954,0.06163,0.000283,3,1,1,0.01,"{'depth': 3, 'iterations': 1, 'l2_leaf_reg': 1...",0.740385,0.751336,0.74586,0.005476,211
1,0.040862,0.002631,0.002253,0.000257,3,1,1,0.1,"{'depth': 3, 'iterations': 1, 'l2_leaf_reg': 1...",0.740385,0.751336,0.74586,0.005476,211
2,0.045488,0.002303,0.00221,0.00012,3,1,1,0.9,"{'depth': 3, 'iterations': 1, 'l2_leaf_reg': 1...",0.740385,0.751336,0.74586,0.005476,211
3,0.049407,0.001427,0.002278,5e-05,3,1,5,0.01,"{'depth': 3, 'iterations': 1, 'l2_leaf_reg': 5...",0.740385,0.751336,0.74586,0.005476,211
4,0.044616,0.002347,0.002294,0.000291,3,1,5,0.1,"{'depth': 3, 'iterations': 1, 'l2_leaf_reg': 5...",0.740385,0.751336,0.74586,0.005476,211


In [135]:
best_params = clf.best_params_

In [136]:
# обучите модель с лучшими параметрами на всем обучающем наборе
best_model = CatBoostClassifier(**best_params, 
                                random_seed=random_seed, 
                                verbose=verbose, 
                                loss_function=loss_function, 
                                task_type=task_type) # или clf.best_estimator_
best_model

<catboost.core.CatBoostClassifier at 0x7f73f88b7520>

In [137]:
best_model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f73f88b7520>

In [138]:
# оцениваем модель на тестовом наборе
test_score = best_model.score(X_test, y_test)
print("Точность на тестовой выборке:", test_score) 

Точность на тестовой выборке: 0.7706552706552706


In [139]:
prediction = best_model.predict(X_test)
probas = best_model.predict_proba(X_test)[:, 1]

In [140]:
_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
err2

0.12321937321937322

In [141]:
probas

array([0.79847494, 0.36140979, 0.29580487, ..., 0.10088989, 0.07937295,
       0.1357307 ])

In [142]:
# заведите словарь со всеми метриками
metrics = {}

# посчитайте метрики из модуля sklearn.metrics
# err_1 — ошибка первого рода
# err_2 — ошибка второго рода
_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel() # ваш код здесь #
auc = roc_auc_score(y_test, probas) # ваш код здесь #
precision = precision_score(y_test, prediction) # ваш код здесь #
recall = recall_score(y_test, prediction) # ваш код здесь #
f1 = f1_score(y_test, prediction) # ваш код здесь #
logloss = log_loss(y_test, prediction) # ваш код здесь #

# запишите значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

In [143]:
metrics

{'err1': 0.0633903133903134,
 'err2': 0.12321937321937322,
 'auc': 0.8192110822630483,
 'precision': 0.6603053435114504,
 'recall': 0.42610837438423643,
 'f1': 0.5179640718562875,
 'logloss': 8.266421931122311}

In [144]:
# дополнительные метрики из результатов кросс-валидации
metrics["mean_fit_time"] = clf.cv_results_['mean_fit_time'][clf.best_index_] # среднее время обучения
metrics["std_fit_time"] = clf.cv_results_['std_fit_time'][clf.best_index_] # стандартное отклонение времени обучения
metrics["mean_test_score"] = clf.cv_results_['mean_test_score'][clf.best_index_] # средний результат на тесте
metrics["std_test_score"] = clf.cv_results_['std_test_score'][clf.best_index_] # стандартное отклонение результата на тесте
metrics["best_score"] = clf.best_score_ # или best_model.score(X_test, y_test) # лучший результат кросс-валидации

In [145]:
metrics

{'err1': 0.0633903133903134,
 'err2': 0.12321937321937322,
 'auc': 0.8192110822630483,
 'precision': 0.6603053435114504,
 'recall': 0.42610837438423643,
 'f1': 0.5179640718562875,
 'logloss': 8.266421931122311,
 'mean_fit_time': 0.06778419017791748,
 'std_fit_time': 0.006188035011291504,
 'mean_test_score': 0.7959033785093635,
 'std_test_score': 0.002529669923684863,
 'best_score': 0.7959033785093635}

In [146]:
# настройки для логирования в MLFlow
pip_requirements = "../requirements.txt" 
signature = mlflow.models.infer_signature(
    X_test,
    prediction.astype(int)
)

input_example = X_test[:10] 
metadata = {'model_type': 'monthly'} 

  inputs = _infer_schema(model_input) if model_input is not None else None


In [147]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [148]:
# создаём новый эксперимент в MLflow с указанным названием 
# если эксперимент с таким именем уже существует, 
# MLflow возвращает идентификатор существующего эксперимента
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
experiment_id

'1'

In [149]:
class CatboostModelProba(mlflow.pyfunc.PythonModel):
    def __init__(self, model):
        super().__init__()
        self._model = model

    def predict(self, context, model_input):
        import numpy as np
        predictions = np.sqrt(self._model.predict(model_input))

        return predictions

In [150]:
custom_model = CatboostModelProba(best_model) 

In [151]:
# подгружаем .env
load_dotenv()

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" # ваш код здесь
os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('AWS_ACCESS_KEY_ID')
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv('AWS_SECRET_ACCESS_KEY')

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [152]:
with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id # ваш код здесь
    
    model_info = mlflow.pyfunc.log_model( 
        # ваш код здесь #
        python_model=custom_model,
        #cb_model=model,
        signature=signature,
        pip_requirements=pip_requirements,
        metadata = metadata,
        input_example = input_example,
        artifact_path="models",
        await_registration_for=60,
        registered_model_name=REGISTRY_MODEL_NAME,
        )

    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    # логируем метрики эксперимента
    mlflow.log_params(best_params)
    # где ключи — это названия метрик, а значения — числовые значения метрик
    mlflow.log_metrics(metrics)

    # логируем файл как артефакт эксперимента — 'users_churn.csv'
    mlflow.log_artifact("/home/mle-user/mle_projects/mle-dvc/data/initial_data.csv", "dataframe")

Registered model 'churn_model_kruglikovAlex_b2c' already exists. Creating a new version of this model...
2025/07/24 12:46:14 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_kruglikovAlex_b2c, version 9
Created version '9' of model 'churn_model_kruglikovAlex_b2c'.


In [153]:
run_id

'3b0fa8c0eace47e09b83514ea40a10f7'

In [154]:
client = mlflow.MlflowClient()
model_metadata = client.get_latest_versions(REGISTRY_MODEL_NAME, stages=["None"])
latest_model_version = model_metadata[0].version
latest_model_version

'9'

In [155]:
import joblib
# сохранение результата шага
os.makedirs('../models', exist_ok=True) # создание директории, если её ещё нет
with open('../models/fitted_model_GridSearchCV.pkl', 'wb') as fd:
    joblib.dump(model, fd)

os.makedirs('../models/cv', exist_ok=True) # создание директории, если её ещё нет
with open('../models/cv/fitted_best_model_cv.pkl', 'wb') as fd:
    joblib.dump(model, fd)

In [156]:
# %# =====================================================
#                  RandomizedSearchCV
# ========================================================
TABLE_NAME = "users_churn" # ваш код здесь #
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = 'churn_kruglikovAlex' # ваш код здесь #
RUN_NAME = 'model_random_search'
REGISTRY_MODEL_NAME = 'churn_model_kruglikovAlex_b2c' # ваш код здесь #
FS_ASSETS = "../fs_assets" 

In [157]:
df = pd.read_csv('/home/mle-user/mle_projects/mle-dvc/data/initial_data.csv')
df.head()

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,7020,2020-01-01,,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,...,No,No,No,No,Female,0,Yes,No,No,0
1,7021,2017-04-01,,One year,No,Mailed check,56.95,1889.5,DSL,Yes,...,Yes,No,No,No,Male,0,No,No,No,0
2,7022,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,...,No,No,No,No,Male,0,No,No,No,1
3,7023,2016-05-01,,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,...,Yes,Yes,No,No,Male,0,No,No,No,0
4,7024,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,...,No,No,No,No,Female,0,No,No,No,1


In [158]:
df = df.drop(columns=['end_date'])

features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

In [159]:
split_column = "begin_date" # ваш код здесь
stratify_column = df[target] # ваш код здесь
test_size = 0.2 # ваш код здесь

In [218]:
df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    stratify=stratify_column,
    shuffle=True,
) 

In [219]:
print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (5615, 3)
Размер выборки для теста: (1404, 3)


In [220]:
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

In [221]:
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [222]:
# определение сетки гиперпараметров для RandomizedSearchCV
param_distributions = {
    'depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.1, 0.9],
    'iterations': [1, 2, 3],
    'l2_leaf_reg': [1, 5, 10, 15, 20],
} 

In [223]:
# создание модели 
from catboost import CatBoostClassifier

model = CatBoostClassifier(iterations=iterations, 
                           random_seed=random_seed, 
                           verbose=verbose, 
                           loss_function=loss_function, 
                           task_type=task_type)# ваш код здесь
model

<catboost.core.CatBoostClassifier at 0x7f73f88b4550>

In [166]:
CatBoostClassifier?

[0;31mInit signature:[0m
[0mCatBoostClassifier[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0miterations[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlearning_rate[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdepth[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0ml2_leaf_reg[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmodel_size_reg[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrsm[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mloss_function[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mborder_count[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfeature_border_type[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mper_float_feature_quantization[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minput_borders[0m[0;34m=[0m[0;3

In [224]:
cv = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=100, cv=5, random_state=42, n_jobs=-1)
cv

In [225]:
clf = cv.fit(X_train, y_train)

In [226]:
clf

In [227]:
# выведите лучшие параметры и оценку точности
print("Лучшие гиперпараметры:", clf.best_params_)
print("Лучший счет:", clf.best_score_)

Лучшие гиперпараметры: {'learning_rate': 0.9, 'l2_leaf_reg': 1, 'iterations': 3, 'depth': 6}
Лучший счет: 0.7934105075690117


In [228]:
cv_results = pd.DataFrame(clf.cv_results_)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_l2_leaf_reg,param_iterations,param_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.060098,0.020093,0.001211,0.000322,0.01,15,1,3,"{'learning_rate': 0.01, 'l2_leaf_reg': 15, 'it...",0.768477,0.780053,0.77382,0.756901,0.729297,0.76171,0.017904,95
1,0.072547,0.027918,0.001372,0.000273,0.1,5,1,7,"{'learning_rate': 0.1, 'l2_leaf_reg': 5, 'iter...",0.780053,0.788958,0.772039,0.777382,0.756901,0.775067,0.010605,73
2,0.065945,0.011578,0.001432,0.000314,0.01,1,3,5,"{'learning_rate': 0.01, 'l2_leaf_reg': 1, 'ite...",0.799644,0.805877,0.779163,0.788068,0.769368,0.788424,0.013263,7
3,0.057202,0.018803,0.001121,0.00017,0.01,20,2,7,"{'learning_rate': 0.01, 'l2_leaf_reg': 20, 'it...",0.794301,0.801425,0.779163,0.782725,0.760463,0.783615,0.014057,43
4,0.034345,0.011994,0.001166,0.00019,0.1,20,1,6,"{'learning_rate': 0.1, 'l2_leaf_reg': 20, 'ite...",0.780053,0.788958,0.772039,0.777382,0.756901,0.775067,0.010605,73


In [229]:
best_params = clf.best_params_
best_params

{'learning_rate': 0.9, 'l2_leaf_reg': 1, 'iterations': 3, 'depth': 6}

In [230]:
# обучите модель с лучшими параметрами на всем обучающем наборе
best_model = CatBoostClassifier(**best_params, 
                                random_seed=random_seed, 
                                verbose=verbose, 
                                loss_function=loss_function, 
                                task_type=task_type) # или clf.best_estimator_
best_model

<catboost.core.CatBoostClassifier at 0x7f73f6f8f310>

In [231]:
best_model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f73f6f8f310>

In [232]:
# оцениваем модель на тестовом наборе
test_score = best_model.score(X_test, y_test)
print("Точность на тестовой выборке:", test_score) 

Точность на тестовой выборке: 0.7749287749287749


In [233]:
prediction = best_model.predict(X_test)
probas = best_model.predict_proba(X_test)[:, 1]

In [234]:
probas

array([0.80765771, 0.0514267 , 0.13545913, ..., 0.39416784, 0.12411261,
       0.14370189])

In [235]:
prediction

array([1, 0, 0, ..., 0, 0, 0])

In [236]:
_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
err2

0.10826210826210826

In [237]:
# заведите словарь со всеми метриками
metrics = {}

# посчитайте метрики из модуля sklearn.metrics
# err_1 — ошибка первого рода
# err_2 — ошибка второго рода
_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel() # ваш код здесь #
auc = roc_auc_score(y_test, probas) # ваш код здесь #
precision = precision_score(y_test, prediction) # ваш код здесь #
recall = recall_score(y_test, prediction) # ваш код здесь #
f1 = f1_score(y_test, prediction) # ваш код здесь #
logloss = log_loss(y_test, prediction) # ваш код здесь #

# запишите значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

In [238]:
# дополнительные метрики из результатов кросс-валидации
metrics["mean_fit_time"] = cv_results['mean_fit_time'].mean() #clf.cv_results_['mean_fit_time'][clf.best_index_] # среднее время обучения
metrics["std_fit_time"] = cv_results['std_fit_time'].mean() # clf.cv_results_['std_fit_time'][clf.best_index_] # стандартное отклонение времени обучения
metrics["mean_test_score"] = cv_results['mean_test_score'].mean() # clf.cv_results_['mean_test_score'][clf.best_index_] # средний результат на тесте
metrics["std_test_score"] = cv_results['std_test_score'].mean() # clf.cv_results_['std_test_score'][clf.best_index_] # стандартное отклонение результата на тесте
metrics["best_score"] = clf.best_score_ # лучший результат кросс-валидации
metrics

{'err1': 0.07051282051282051,
 'err2': 0.10826210826210826,
 'auc': 0.8039262663158033,
 'precision': 0.6055776892430279,
 'recall': 0.41192411924119243,
 'f1': 0.49032258064516127,
 'logloss': 8.112389224331212,
 'mean_fit_time': 0.10591277170181275,
 'std_fit_time': 0.030126130385694467,
 'mean_test_score': 0.7803383793410507,
 'std_test_score': 0.013184274197239283,
 'best_score': 0.7934105075690117}

In [239]:
# настройки для логирования в MLFlow
pip_requirements = "../requirements.txt" 
signature = mlflow.models.infer_signature(
    X_test,
    prediction
)

input_example = X_test[:10] 
metadata = {'model_type': 'monthly'} 

  inputs = _infer_schema(model_input) if model_input is not None else None


In [240]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [241]:
# создаём новый эксперимент в MLflow с указанным названием 
# если эксперимент с таким именем уже существует, 
# MLflow возвращает идентификатор существующего эксперимента
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
experiment_id

'1'

In [242]:
class CatboostModelProba(mlflow.pyfunc.PythonModel):
    def __init__(self, model):
        super().__init__()
        self._model = model

    def predict(self, context, model_input):
        import numpy as np
        predictions = np.sqrt(self._model.predict(model_input))

        return predictions

In [243]:
custom_model = CatboostModelProba(best_model) 

In [244]:
# подгружаем .env
load_dotenv()

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" # ваш код здесь
os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('AWS_ACCESS_KEY_ID')
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv('AWS_SECRET_ACCESS_KEY')

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [245]:
with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id # ваш код здесь
    
    model_info = mlflow.pyfunc.log_model( 
        # ваш код здесь #
        python_model=custom_model,
        #cb_model=model,
        signature=signature,
        pip_requirements=pip_requirements,
        metadata = metadata,
        input_example = input_example,
        artifact_path="models",
        await_registration_for=60,
        registered_model_name=REGISTRY_MODEL_NAME,
        )

    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    # логируем метрики эксперимента
    # где ключи — это названия метрик, а значения — числовые значения метрик
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)

    # логируем файл как артефакт эксперимента — 'users_churn.csv'
    mlflow.log_artifact("/home/mle-user/mle_projects/mle-dvc/data/initial_data.csv", "dataframe")

Registered model 'churn_model_kruglikovAlex_b2c' already exists. Creating a new version of this model...
2025/07/24 12:55:46 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_kruglikovAlex_b2c, version 10
Created version '10' of model 'churn_model_kruglikovAlex_b2c'.


In [246]:
run_id

'63de0445b4af4d02a1c49c160176062e'

In [247]:
client = mlflow.MlflowClient()
model_metadata = client.get_latest_versions(REGISTRY_MODEL_NAME, stages=["None"])
latest_model_version = model_metadata[0].version
latest_model_version

'10'

In [248]:
import joblib
# сохранение результата шага
os.makedirs('../models', exist_ok=True) # создание директории, если её ещё нет
with open('../models/fitted_model_RandomizedSearchCV.pkl', 'wb') as fd:
    joblib.dump(model, fd)

os.makedirs('../models/cv', exist_ok=True) # создание директории, если её ещё нет
with open('../models/cv/fitted_best_model_cv_r.pkl', 'wb') as fd:
    joblib.dump(model, fd)