## Яндекс Практикум, курс "Инженер Машинного Обучения" (2024 г.)
## Проект 2-го спринта: "Улучшение baseline-модели"

### Этап 1. Развертывание MLflow Tracking Server и MLflow Model Registry. Регистрация существующей модели

In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from datetime import datetime

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import KFold, cross_validate
from catboost import CatBoostRegressor

import mlflow
import joblib

Загружаем параметры подключения к БД из переменных окружения

In [2]:
# Load settings via python-dotenv so that the environment lookups in the
# following cells can find the DB connection parameters
load_dotenv()

True

In [3]:
# Read the destination DB connection settings from the environment in one pass;
# any variable that is not set yields None
username, password, host, port, db = map(
    os.environ.get,
    (
        'DB_DESTINATION_USER',
        'DB_DESTINATION_PASSWORD',
        'DB_DESTINATION_HOST',
        'DB_DESTINATION_PORT',
        'DB_DESTINATION_NAME',
    ),
)

Скачиваем очищенный датасет с квартирами, подготовленный на проекте 1-го спринта

In [4]:
from urllib.parse import quote_plus

# Percent-encode the credentials before embedding them into the URL: reserved
# characters such as '@', ':' or '/' in a user name or password would otherwise
# corrupt the connection string.
conn = create_engine(
    f'postgresql://{quote_plus(str(username))}:{quote_plus(str(password))}@{host}:{port}/{db}',
    connect_args={'sslmode': 'require'},
)

# Read the whole cleaned flats table, using flat_id as the DataFrame index
data = pd.read_sql('select * from clean_flats_dataset', conn, index_col='flat_id')

Выполняем ту же предобработку, что и в проекте 1-го спринта

In [5]:
# 1. Replace the construction year with the building age (relative to the
#    current calendar year).
# 2. Drop the columns that are not needed (studio is a constant feature, see EDA).
# 3. Cast the integer features rooms and building_type_int to object so that
#    they are handled as categorical features.
data = (
    data
    .assign(building_age=(datetime.now().year - data['build_year']).astype('float'))
    .drop(columns=['id', 'build_year', 'studio'])
    .astype({'rooms': 'object', 'building_type_int': 'object'})
)

# Cast the remaining integer columns to float
num_int_cols = data.select_dtypes('int').columns
data = data.astype(dict.fromkeys(num_int_cols, 'float'))

In [6]:
# Preview the first rows of the preprocessed dataset
data.head()

Unnamed: 0_level_0,floor,kitchen_area,living_area,rooms,is_apartment,total_area,price,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator,building_age
flat_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
8348,8.0,10.6,56.0,3,False,88.599998,10990000.0,4,55.542187,37.483067,2.64,409.0,18.0,True,6.0
8350,3.0,7.0,28.0,2,False,44.700001,8999000.0,4,55.857765,37.422684,2.64,143.0,9.0,True,57.0
8351,16.0,10.9,54.799999,4,False,89.099998,24000000.0,4,55.562908,37.570431,2.7,164.0,16.0,True,28.0
8352,2.0,7.4,66.300003,4,False,93.0,17500000.0,1,55.653507,37.649426,2.7,59.0,6.0,True,59.0
8354,4.0,9.1,17.700001,1,False,34.0,7500000.0,1,55.796406,37.459873,3.0,72.0,9.0,True,60.0


Сохраняем предобработанный датасет для последующего логирования в MLflow

In [35]:
# Make sure the output directory exists, then dump the preprocessed dataset
# (index included) to CSV so that it can be logged as an MLflow artifact later
os.makedirs(name='../data', exist_ok=True)
data.to_csv(path_or_buf='../data/initial_data.csv')

Разделяем данные на признаки и целевую переменную

In [None]:
# Target: the flat price; features: every other column of the dataset
y = data['price']
X = data.drop(columns='price')

Формируем пайплайн из трансформации данных и модели

In [19]:
# Group the features by type: bool/object columns are treated as categorical
# (split into two-valued and all others), float columns as numeric
cat_features = X.select_dtypes(include=['bool', 'object'])
is_bin_cat_features = cat_features.nunique() == 2
bin_cat_features = cat_features.loc[:, is_bin_cat_features]
other_cat_features = cat_features.loc[:, ~is_bin_cat_features]
num_features = X.select_dtypes(include=['float'])

# Feature transformation: each group gets its own encoder/scaler,
# columns outside the three groups are dropped
preprocessor = ColumnTransformer(
    transformers=[
        ('bin_cat', OneHotEncoder(drop='if_binary'), list(bin_cat_features.columns)),
        ('other_cat', CatBoostEncoder(), list(other_cat_features.columns)),
        ('num', StandardScaler(), list(num_features.columns)),
    ],
    remainder='drop',
    verbose_feature_names_out=True,
)

# Model with default hyperparameters (only the loss and the seed are set)
model = CatBoostRegressor(loss_function='MAPE', random_state=42)

# Model parameters kept for later logging.
# NOTE(review): the model has not been fitted at this point; confirm that these
# fitted attributes hold real values (and not None) before they are logged.
params = dict(
    tree_count=model.tree_count_,
    learning_rate=model.learning_rate_,
)

# Chain the transformation and the model into a single pipeline
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

Запускаем пайплайн обучения на всех данных

In [21]:
# Fit the whole pipeline (feature transformation + model) on all the data
pipeline.fit(X, y)

# Re-read the parameters to log from the fitted model: CatBoost fills in
# tree_count_ and learning_rate_ only after training, so values collected
# before this point are placeholders rather than the real settings.
fitted_model = pipeline.named_steps['model']
params = {
    'tree_count': fitted_model.tree_count_,
    'learning_rate': fitted_model.learning_rate_,
}

0:	learn: 0.9936596	total: 28.1ms	remaining: 28s
1:	learn: 0.9866032	total: 51.3ms	remaining: 25.6s
2:	learn: 0.9810744	total: 71.9ms	remaining: 23.9s
3:	learn: 0.9742922	total: 94.7ms	remaining: 23.6s
4:	learn: 0.9694587	total: 119ms	remaining: 23.8s
5:	learn: 0.9643812	total: 143ms	remaining: 23.7s
6:	learn: 0.9638215	total: 166ms	remaining: 23.5s
7:	learn: 0.9628158	total: 189ms	remaining: 23.5s
8:	learn: 0.9613698	total: 213ms	remaining: 23.4s
9:	learn: 0.9598602	total: 239ms	remaining: 23.7s
10:	learn: 0.9581459	total: 263ms	remaining: 23.7s
11:	learn: 0.9568790	total: 285ms	remaining: 23.4s
12:	learn: 0.9553486	total: 307ms	remaining: 23.3s
13:	learn: 0.9535159	total: 334ms	remaining: 23.5s
14:	learn: 0.9513620	total: 355ms	remaining: 23.3s
15:	learn: 0.9533327	total: 379ms	remaining: 23.3s
16:	learn: 0.9551945	total: 402ms	remaining: 23.2s
17:	learn: 0.9554007	total: 425ms	remaining: 23.2s
18:	learn: 0.9575114	total: 450ms	remaining: 23.2s
19:	learn: 0.9559879	total: 473ms	remai

Проводим кросс-валидацию

In [22]:
# 5-fold cross-validation with shuffling and a fixed seed
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate the pipeline on every fold using all available workers; the scorer
# returns the negated MAPE
cv_res = cross_validate(
    estimator=pipeline,
    X=X,
    y=y,
    scoring='neg_mean_absolute_percentage_error',
    cv=cv_strategy,
    n_jobs=-1,
)

0:	learn: 0.9956181	total: 54.4ms	remaining: 54.3s
0:	learn: 0.9955574	total: 47.7ms	remaining: 47.7s
1:	learn: 0.9876479	total: 110ms	remaining: 54.9s
1:	learn: 0.9919563	total: 92.9ms	remaining: 46.4s
2:	learn: 0.9857652	total: 158ms	remaining: 52.5s
2:	learn: 0.9879332	total: 133ms	remaining: 44.1s
3:	learn: 0.9833805	total: 202ms	remaining: 50.4s
3:	learn: 0.9841831	total: 179ms	remaining: 44.6s
4:	learn: 0.9761795	total: 246ms	remaining: 49s
4:	learn: 0.9817233	total: 230ms	remaining: 45.8s
5:	learn: 0.9714270	total: 296ms	remaining: 49s
5:	learn: 0.9780932	total: 299ms	remaining: 49.6s
6:	learn: 0.9657264	total: 338ms	remaining: 48s
7:	learn: 0.9625422	total: 381ms	remaining: 47.3s
6:	learn: 0.9735788	total: 350ms	remaining: 49.7s
8:	learn: 0.9621320	total: 426ms	remaining: 47s
7:	learn: 0.9685384	total: 399ms	remaining: 49.5s
9:	learn: 0.9600436	total: 473ms	remaining: 46.8s
8:	learn: 0.9665864	total: 448ms	remaining: 49.3s
9:	learn: 0.9629904	total: 494ms	remaining: 48.9s
10:	l

In [23]:
# Display the raw per-fold cross-validation results
cv_res

{'fit_time': array([49.58913159, 49.69197702, 47.06737161, 47.73233366, 20.32109499]),
 'score_time': array([0.27220488, 0.34412122, 0.29786301, 0.30163956, 0.14526916]),
 'test_score': array([ -1.24758356,  -5.25807617,  -1.39696464, -32.62560499,
        -16.59356275])}

Создаем словарь с метриками

In [26]:
# Average the per-fold results, rounded to 3 decimals; the score sign is
# flipped because the scorer returns the negated MAPE
metrics = {
    'fit_time': round(cv_res['fit_time'].mean(), 3),
    'score_time': round(cv_res['score_time'].mean(), 3),
    'MAPE': round(-cv_res['test_score'].mean(), 3),
}

print(metrics)

{'fit_time': 42.88, 'score_time': 0.272, 'MAPE': 11.424}


Устанавливаем параметры MLflow Tracking Server и MLflow Model Registry, а также идентификаторы эксперимента, запуска и модели.

NB: Перед этим нужно поднять MLflow: в терминале перейти в папку проекта и выполнить команду
`sh run_mlflow_server.sh`

In [36]:
# S3 endpoint that MLflow should talk to
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# The S3 credentials are expected to be in the environment already. Check for
# them explicitly: copying os.getenv(...) back into os.environ changes nothing
# when the variable is set and fails with an opaque TypeError when it is not.
for credential_name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
    if os.getenv(credential_name) is None:
        raise RuntimeError(f"Environment variable {credential_name} is not set")

TRACKING_SERVER_HOST = '127.0.0.1'
TRACKING_SERVER_PORT = 5000

# The tracking server and the model registry share the same address
tracking_server_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(tracking_server_uri)
mlflow.set_registry_uri(tracking_server_uri)

# Names of the experiment, the run and the registered model
EXPERIMENT_NAME = 'mle-project-sprint-2'
RUN_NAME = 'step_1'
REGISTRY_MODEL_NAME = 'flats_price_model_sprint_2_step_1'

Оборачиваем пайплайн в класс CustomMlflowModel для логирования в MLflow

In [37]:
class CustomMlflowModel(mlflow.pyfunc.PythonModel):
    """pyfunc wrapper that delegates prediction to the wrapped model."""

    def __init__(self, model):
        """Keep a reference to the object whose ``predict`` will be called."""
        super().__init__()
        self._model = model

    def predict(self, context, model_input):
        """Return the wrapped model's predictions for ``model_input``.

        ``context`` is part of the method signature but is not used here.
        """
        predictions = self._model.predict(model_input)
        return predictions


# Wrap the pipeline so that it can be logged through mlflow.pyfunc
custom_model = CustomMlflowModel(model=pipeline)

Логируем модель, метрики, параметры и другие артефакты в MLflow

In [39]:
metadata = {'model_type': 'regression'}
pip_requirements = "../requirements.txt"

# Model signature: feature frame in, pipeline predictions out
signature = mlflow.models.infer_signature(
    X,
    pipeline.predict(X)
)

# The input example has to carry the same columns as the signature inputs,
# so it is taken from X: `data` still contains the target column 'price'.
input_example = X.iloc[:10]

# get_experiment_by_name returns None for an unknown name, so create the
# experiment on first use instead of failing on a missing attribute
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # Keep the run id for the checks performed after the run
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)
    mlflow.log_params(params)
    mlflow.log_artifact("../data/initial_data.csv", "artifacts")

    # Log the wrapped pipeline and register it under REGISTRY_MODEL_NAME
    model_info = mlflow.pyfunc.log_model(
        python_model=custom_model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        metadata=metadata,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements
    )

  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'flats_price_model_sprint_2_step_1'.
2024/04/18 04:35:30 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: flats_price_model_sprint_2_step_1, version 1
Created version '1' of model 'flats_price_model_sprint_2_step_1'.


In [40]:
# Print the MLflow run identifier
print(run_id)

# Sanity check: the experiment is in the "active" lifecycle stage
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
assert experiment.lifecycle_stage == "active"

# Sanity check: the run is reported as finished
run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'

64fa92a4b2f848018acec0bfa7435f1f


In [41]:
# Remove the dataset file. Re-running this cell must not fail, so a file that
# has already been deleted is treated as success.
try:
    os.remove('../data/initial_data.csv')
except FileNotFoundError:
    pass