In [1]:
# делаем import необходимых библиотек
import os
import mlflow
import psycopg
import pandas as pd
from dotenv import load_dotenv
from sklearn.metrics import roc_auc_score, f1_score, log_loss, recall_score, precision_score, confusion_matrix

In [3]:
# scripts/fit.py
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from catboost import CatBoostClassifier
import yaml
import os
import joblib

In [4]:
# загружаем результат предыдущего шага: inital_data.csv
data = pd.read_csv('/home/mle-user/mle_projects/mle-dvc/data/initial_data.csv')

In [5]:
data.head()

Unnamed: 0,id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,7020,2020-01-01,,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,...,No,No,No,No,Female,0,Yes,No,No,0
1,7021,2017-04-01,,One year,No,Mailed check,56.95,1889.5,DSL,Yes,...,Yes,No,No,No,Male,0,No,No,No,0
2,7022,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,...,No,No,No,No,Male,0,No,No,No,1
3,7023,2016-05-01,,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,...,Yes,Yes,No,No,Male,0,No,No,No,0
4,7024,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,...,No,No,No,No,Female,0,No,No,No,1


In [19]:
data.drop(columns=['id', 'begin_date', 'end_date'], inplace=True) 

In [22]:
data.head()

Unnamed: 0,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,No,Yes,No,No,No,No,Female,0,Yes,No,No,0
1,One year,No,Mailed check,56.95,1889.5,DSL,Yes,No,Yes,No,No,No,Male,0,No,No,No,0
2,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,Yes,Yes,No,No,No,No,Male,0,No,No,No,1
3,One year,No,Bank transfer (automatic),42.3,1840.75,DSL,Yes,No,Yes,Yes,No,No,Male,0,No,No,No,0
4,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,No,No,No,No,No,No,Female,0,No,No,No,1


In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data['target'],
    stratify=data['target']
)

In [23]:
# обучение модели
cat_features = data.select_dtypes(include='object')
potential_binary_features = cat_features.nunique() == 2

In [24]:
binary_cat_features = cat_features[potential_binary_features[potential_binary_features].index]
other_cat_features = cat_features[potential_binary_features[~potential_binary_features].index]
num_features = data.select_dtypes(['float'])

In [31]:
binary_cols = binary_cat_features.columns.tolist()
non_binary_cat_cols = other_cat_features.columns.tolist()
num_cols = num_features.columns.tolist()

In [33]:
non_binary_cat_cols.append('senior_citizen')

In [57]:
preprocessor = ColumnTransformer(
    [
        ('binary', OneHotEncoder(drop='if_binary'), binary_cols),
        ('cat', CatBoostEncoder(return_df=False), non_binary_cat_cols),
        ('num', StandardScaler(), num_cols)
    ],
    remainder='drop',
    verbose_feature_names_out=False
)

In [58]:
model = CatBoostClassifier(auto_class_weights='Balanced')

In [59]:
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)

In [60]:
# обучите пайплайн
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)


Learning rate set to 0.020938
0:	learn: 0.6849951	total: 3.15ms	remaining: 3.15s
1:	learn: 0.6764106	total: 5.99ms	remaining: 2.99s
2:	learn: 0.6692109	total: 8.82ms	remaining: 2.93s
3:	learn: 0.6624049	total: 11.6ms	remaining: 2.88s
4:	learn: 0.6539149	total: 14.3ms	remaining: 2.85s
5:	learn: 0.6474451	total: 17ms	remaining: 2.82s
6:	learn: 0.6410311	total: 19.8ms	remaining: 2.81s
7:	learn: 0.6346707	total: 22.5ms	remaining: 2.79s
8:	learn: 0.6278481	total: 25.3ms	remaining: 2.78s
9:	learn: 0.6215417	total: 28.7ms	remaining: 2.84s
10:	learn: 0.6170435	total: 31.4ms	remaining: 2.82s
11:	learn: 0.6109816	total: 34.1ms	remaining: 2.81s
12:	learn: 0.6059944	total: 36.9ms	remaining: 2.8s
13:	learn: 0.6009671	total: 39.6ms	remaining: 2.79s
14:	learn: 0.5957320	total: 42.5ms	remaining: 2.79s
15:	learn: 0.5902442	total: 45.2ms	remaining: 2.78s
16:	learn: 0.5851594	total: 47.9ms	remaining: 2.77s
17:	learn: 0.5804251	total: 50.7ms	remaining: 2.77s
18:	learn: 0.5762851	total: 53.5ms	remaining: 2

In [61]:
# получите предсказания для тестовой выборки
y_pred_proba = pipeline.predict_proba(X_test)[:, 1] 

print('f1:', f1_score(y_test, y_pred))
print('roc_auc:', roc_auc_score(y_test, y_pred_proba))

f1: 0.6095060577819198
roc_auc: 0.8259305380289472


In [62]:
# сохранение результата шага
os.makedirs('../models', exist_ok=True) # создание директории, если её ещё нет
with open('../models/fitted_model_new.pkl', 'wb') as fd:
    joblib.dump(pipeline, fd)

In [10]:
pip install scikit-learn==1.4.0

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn==1.4.0
  Using cached scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.1
    Uninstalling scikit-learn-1.3.1:
      Successfully uninstalled scikit-learn-1.3.1
Successfully installed scikit-learn-1.4.0
Note: you may need to restart the kernel to use updated packages.


In [63]:
prediction = y_pred
proba = y_pred_proba

In [64]:
_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()

In [65]:
err2

0.18632478632478633

In [66]:
proba

array([0.15880529, 0.3430753 , 0.62772707, ..., 0.69363905, 0.93751179,
       0.53342286])

In [67]:
# импортируйте необходимые вам модули

# заведите словарь со всеми метриками
metrics = {}

# посчитайте метрики из модуля sklearn.metrics
# err_1 — ошибка первого рода
# err_2 — ошибка второго рода
_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel() # ваш код здесь #
auc = roc_auc_score(y_test, proba) # ваш код здесь #
precision = precision_score(y_test, prediction) # ваш код здесь #
recall = recall_score(y_test, prediction) # ваш код здесь #
f1 = f1_score(y_test, prediction) # ваш код здесь #
logloss = log_loss(y_test, prediction) # ваш код здесь #

# запишите значения метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

In [68]:
metrics

{'err1': 0.15954415954415954,
 'err2': 0.18632478632478633,
 'auc': 0.8259305380289472,
 'precision': 0.5387149917627677,
 'recall': 0.7017167381974249,
 'f1': 0.6095060577819198,
 'logloss': 8.605293886062727}

In [69]:
# задаём название эксперимента и имя запуска для логирования в MLflow

EXPERIMENT_NAME = "churn_kruglikovAlex" # ваш код здесь (напишите своё уникальное имя эксперимента)
RUN_NAME = "model_0_registry"
REGISTRY_MODEL_NAME = "churn_model_kruglikovAlex_2"

In [70]:
X_tr_transformed = preprocessor.fit_transform(X_train, y_train)

In [71]:
X_tr = pd.DataFrame(X_tr_transformed, columns=preprocessor.get_feature_names_out())

In [72]:
X_tr.head()

Unnamed: 0,paperless_billing_Yes,internet_service_Fiber optic,online_security_Yes,online_backup_Yes,device_protection_Yes,tech_support_Yes,streaming_tv_Yes,streaming_movies_Yes,gender_Male,partner_Yes,dependents_Yes,multiple_lines_Yes,type,payment_method,senior_citizen,monthly_charges,total_charges
0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.265388,0.265388,0.0,1.161276,1.166885
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.265388,0.265388,0.0,-1.47683,-0.594922
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.132694,0.265388,0.0,-0.676899,0.038722
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.265388,0.132694,0.0,-1.475163,-0.951512
4,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.088463,0.088463,0.0,1.712895,2.503006


In [90]:
# ваш код здесь

pip_requirements = "../requirements.txt" # ваш код здесь
signature = mlflow.models.infer_signature(
    X_tr,
    prediction.astype(int)
)
# ваш код здесь
input_example = X_test[:10] # ваш код здесь
metadata = metadata = {'model_type': 'monthly'} # ваш код здесь


In [75]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [76]:
# создаём новый эксперимент в MLflow с указанным названием 
# если эксперимент с таким именем уже существует, 
# MLflow возвращает идентификатор существующего эксперимента
#experiment_id = mlflow.create_experiment(EXPERIMENT_NAME) # ваш код здесь
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

In [77]:
experiment_id

'1'

In [78]:
class CatboostModelProba(mlflow.pyfunc.PythonModel):
    def __init__(self, model):
        super().__init__()
        self._model = model

    def predict(self, context, model_input):
        import numpy as np
        predictions = np.sqrt(self._model.predict(model_input))

        return predictions

In [81]:
custom_model = CatboostModelProba(pipeline) 

In [None]:
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" # ваш код здесь
os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('AWS_ACCESS_KEY_ID')
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv('AWS_SECRET_ACCESS_KEY')

In [85]:
with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
    # получаем уникальный идентификатор запуска эксперимента
    run_id = run.info.run_id # ваш код здесь
    
    model_info = mlflow.pyfunc.log_model( 
        # ваш код здесь #
        python_model=custom_model,
        #cb_model=model,
        signature=signature,
        pip_requirements=pip_requirements,
        metadata = metadata,
        input_example = input_example,
        artifact_path="models",
        await_registration_for=60,
        registered_model_name=REGISTRY_MODEL_NAME,
        )

    # логируем метрики эксперимента
    # где ключи — это названия метрик, а значения — числовые значения метрик
    mlflow.log_metrics(metrics)

    # логируем файл как артефакт эксперимента — 'users_churn.csv'
    mlflow.log_artifact("/home/mle-user/mle_projects/mle-dvc/data/initial_data.csv", "dataframe")


Successfully registered model 'churn_model_kruglikovAlex_2'.
2025/07/11 21:23:20 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: churn_model_kruglikovAlex_2, version 1
Created version '1' of model 'churn_model_kruglikovAlex_2'.


In [86]:
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
# получаем данные о запуске эксперимента по его уникальному идентификатору
run = mlflow.get_run(run_id) # ваш код здесь

In [87]:
# проверяем, что статус запуска эксперимента изменён на 'FINISHED'
# это утверждение (assert) можно использовать для автоматической проверки того, 
# что эксперимент был завершён успешно
assert run.info.status == "FINISHED" # ваш код здесь

In [88]:
run.info.status

'FINISHED'

In [89]:
run_id

'486b5c7a38614c6b98301c8290458a03'

In [95]:
client = mlflow.MlflowClient()

In [96]:
client

<mlflow.tracking.client.MlflowClient at 0x7f8ece0f3370>

In [103]:
REGISTRY_MODEL_NAME = "churn_model_kruglikovAlex"
models = client.search_model_versions(
    filter_string=f"name LIKE '{REGISTRY_MODEL_NAME}%'"
)

for model in models:
    print(model) 

<ModelVersion: aliases=[], creation_timestamp=1752269000770, current_stage='None', description='', last_updated_timestamp=1752269000770, name='churn_model_kruglikovAlex_2', run_id='486b5c7a38614c6b98301c8290458a03', run_link='', source='s3://s3-student-mle-20250507-60d03b0a2f-freetrack/1/486b5c7a38614c6b98301c8290458a03/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>
<ModelVersion: aliases=[], creation_timestamp=1752009634051, current_stage='Production', description='', last_updated_timestamp=1752095262716, name='churn_model_kruglikovAlex', run_id='74fd405fdfa34a609caf709078a04aed', run_link='', source='s3://s3-student-mle-20250507-60d03b0a2f-freetrack/1/74fd405fdfa34a609caf709078a04aed/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>
<ModelVersion: aliases=[], creation_timestamp=1752009191402, current_stage='Staging', description='', last_updated_timestamp=1752095252031, name='churn_model_kruglikovAlex', ru

In [107]:
model_name_1 = models[-1].name
model_version_1 = models[-1].version
model_stage_1 = models[-1].current_stage

In [108]:
model_name_1, model_version_1, model_stage_1

('churn_model_kruglikovAlex', '1', 'Staging')

In [111]:
model_name_2 = models[-2].name
model_version_2 = models[-2].version
model_stage_2 = models[-2].current_stage

In [112]:
model_name_2, model_version_2, model_stage_2

('churn_model_kruglikovAlex', '2', 'Production')

In [127]:
model_name_3 = models[-3].name
model_version_3 = models[-3].version
model_stage_3 = models[-3].current_stage

In [128]:
model_name_3, model_version_3, model_stage_3

('churn_model_kruglikovAlex_2', '1', 'None')

In [129]:
client.transition_model_version_stage(model_name_3, model_version_3, "production")
client.transition_model_version_stage(model_name_2, model_version_2, "staging") 

<ModelVersion: aliases=[], creation_timestamp=1752009634051, current_stage='Staging', description='', last_updated_timestamp=1752274380964, name='churn_model_kruglikovAlex', run_id='74fd405fdfa34a609caf709078a04aed', run_link='', source='s3://s3-student-mle-20250507-60d03b0a2f-freetrack/1/74fd405fdfa34a609caf709078a04aed/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>

In [133]:
REGISTRY_MODEL_NAME = "churn_model_kruglikovAlex_2_b2c"
REGISTRY_MODEL_NAME_new = "churn_model_kruglikovAlex"
client.rename_registered_model(
    name=REGISTRY_MODEL_NAME, 
    new_name=f"{REGISTRY_MODEL_NAME_new}_b2c"
) 