## Яндекс Практикум, курс "Инженер Машинного Обучения" (2024 г.)
## Проект 2-го спринта: "Улучшение baseline-модели"

### Этап 1. Развертывание MLflow Tracking Server и MLflow Model Registry. Регистрация существующей модели

In [25]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from datetime import datetime
import time

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import KFold, train_test_split, cross_validate
from sklearn.metrics import mean_absolute_percentage_error
from catboost import CatBoostRegressor

import mlflow
import joblib

Загружаем параметры подключения к БД из переменных окружения

In [3]:
# Load environment variables via python-dotenv so that the settings read
# from os.environ in the following cells are available.
load_dotenv()

True

In [4]:
# Database connection settings, taken from the process environment.
# Each value is None when the corresponding variable is not set.
username = os.getenv('DB_DESTINATION_USER')
password = os.getenv('DB_DESTINATION_PASSWORD')
host = os.getenv('DB_DESTINATION_HOST')
port = os.getenv('DB_DESTINATION_PORT')
db = os.getenv('DB_DESTINATION_NAME')

Скачиваем очищенный датасет с квартирами, подготовленный на проекте 1-го спринта

In [5]:
# Build the connection URL from its components instead of interpolating them
# into a string: URL.create escapes special characters in the credentials
# (e.g. '@', ':' or '/' in the password) that would otherwise corrupt the URL.
from sqlalchemy.engine import URL

db_url = URL.create(
    drivername='postgresql',
    username=username,
    password=password,
    host=host,
    port=int(port) if port else None,  # environment values are strings
    database=db,
)

# sslmode=require is forwarded to the PostgreSQL driver.
conn = create_engine(db_url, connect_args={'sslmode': 'require'})

In [37]:
# Pull the whole cleaned flats table into a DataFrame indexed by flat_id.
query = 'select * from clean_flats_dataset'
data = pd.read_sql(sql=query, con=conn, index_col='flat_id')

In [40]:
# Preview the first rows of the loaded dataset.
data.head()

Unnamed: 0_level_0,id,floor,kitchen_area,living_area,rooms,is_apartment,studio,total_area,price,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator
flat_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
8348,23114,8,10.6,56.0,3,False,False,88.599998,10990000,2018,4,55.542187,37.483067,2.64,409,18,True
8350,23116,3,7.0,28.0,2,False,False,44.700001,8999000,1967,4,55.857765,37.422684,2.64,143,9,True
8351,23118,16,10.9,54.799999,4,False,False,89.099998,24000000,1996,4,55.562908,37.570431,2.7,164,16,True
8352,23120,2,7.4,66.300003,4,False,False,93.0,17500000,1965,1,55.653507,37.649426,2.7,59,6,True
8354,23122,4,9.1,17.700001,1,False,False,34.0,7500000,1964,1,55.796406,37.459873,3.0,72,9,True


Выполняем ту же предобработку, что и в проекте 1-го спринта

In [41]:
# Replace the construction year with the building age, computed relative to
# the year in which this cell is executed.
current_year = datetime.now().year
data['building_age'] = (current_year - data['build_year']).astype(float)

# Drop columns that are not used as features
# (studio is a constant feature, see EDA).
data = data.drop(columns=['id', 'build_year', 'studio'])

# Cast the integer-coded rooms and building_type_int columns to object
# so that they are handled as categorical features.
categorical_int_cols = ['rooms', 'building_type_int']
data[categorical_int_cols] = data[categorical_int_cols].astype(object)

# Cast the remaining integer columns to float.
num_int_cols = data.select_dtypes('int').columns
data[num_int_cols] = data[num_int_cols].astype(float)

Сохраняем предобработанный датасет для последующего логирования в MLflow

In [42]:
# Write the preprocessed dataset to disk, creating the target directory
# first if it does not exist yet.
dataset_path = '../data/initial_data.csv'
os.makedirs(os.path.dirname(dataset_path), exist_ok=True)
data.to_csv(dataset_path)

Отделяем признаки от целевой переменной и разделяем данные на обучающую и тестовую выборки

In [43]:
# Separate the features from the target column.
target_column = 'price'
X = data.drop(columns=target_column)
y = data[target_column]

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Формируем пайплайн из трансформации данных и модели

In [44]:
# Group the features by type.
cat_features = X.select_dtypes(include=['bool', 'object'])
# Mask over the categorical columns: True where a column has exactly
# two distinct values.
is_bin_cat_features = cat_features.nunique().eq(2)
bin_cat_features = cat_features.loc[:, is_bin_cat_features]
other_cat_features = cat_features.loc[:, ~is_bin_cat_features]
num_features = X.select_dtypes(include='float')

# Column names handled by each transformer.
bin_cat_columns = list(bin_cat_features.columns)
other_cat_columns = list(other_cat_features.columns)
num_columns = list(num_features.columns)

# Feature transformation: one encoder/scaler per feature group; columns
# that belong to no group are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('bin_cat', OneHotEncoder(drop='if_binary'), bin_cat_columns),
        ('other_cat', CatBoostEncoder(), other_cat_columns),
        ('num', StandardScaler(), num_columns),
    ],
    remainder='drop',
    verbose_feature_names_out=True,
)

# Model with default hyperparameters, apart from the loss and the seed.
model = CatBoostRegressor(loss_function='MAPE', random_state=42)

# Chain the transformation and the model into a single pipeline.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

Запускаем пайплайн на обучающих данных

In [45]:
# Fit the pipeline on the training split and time it.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration (end - start) is not affected by system clock adjustments.
start = time.perf_counter()
pipeline.fit(X_train, y_train)
end = time.perf_counter()

0:	learn: 0.9926024	total: 18ms	remaining: 18s
1:	learn: 0.9844538	total: 38ms	remaining: 19s
2:	learn: 0.9823041	total: 62.5ms	remaining: 20.8s
3:	learn: 0.9802411	total: 103ms	remaining: 25.7s
4:	learn: 0.9782806	total: 123ms	remaining: 24.4s
5:	learn: 0.9729185	total: 156ms	remaining: 25.9s
6:	learn: 0.9697098	total: 198ms	remaining: 28.1s
7:	learn: 0.9662280	total: 263ms	remaining: 32.6s
8:	learn: 0.9676588	total: 330ms	remaining: 36.3s
9:	learn: 0.9684221	total: 426ms	remaining: 42.1s
10:	learn: 0.9653920	total: 520ms	remaining: 46.8s
11:	learn: 0.9617821	total: 609ms	remaining: 50.2s
12:	learn: 0.9605817	total: 694ms	remaining: 52.7s
13:	learn: 0.9626742	total: 759ms	remaining: 53.4s
14:	learn: 0.9610633	total: 808ms	remaining: 53s
15:	learn: 0.9598398	total: 855ms	remaining: 52.6s
16:	learn: 0.9604260	total: 913ms	remaining: 52.8s
17:	learn: 0.9586910	total: 983ms	remaining: 53.7s
18:	learn: 0.9612646	total: 1s	remaining: 51.8s
19:	learn: 0.9652439	total: 1.02s	remaining: 50.2s


Вычисляем метрику MAPE

In [46]:
# Score the fitted pipeline on the held-out test split.
# Note: scikit-learn reports MAPE as a fraction, not as a percentage.
y_pred = pipeline.predict(X_test)
mape = mean_absolute_percentage_error(y_test, y_pred)

# Collect the values that will be logged: the test MAPE and the fit duration.
metrics = {
    'MAPE': mape,
    'fit_time': end - start,
}
print(metrics)

{'MAPE': 1.2431108061531475, 'fit_time': 22.053139209747314}


Устанавливаем параметры MLflow Tracking Server и MLflow Model Registry, а также идентификаторы эксперимента, запуска и модели.

NB: Перед этим нужно поднять MLflow: в терминале перейти в папку `mlflow_server` и выполнить команду:
`sh run_mlflow_server.sh`

In [47]:
# S3-compatible endpoint used by MLflow for artifact storage.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# The S3 credentials must already be present in the environment (e.g. loaded
# from .env earlier). Re-assigning os.environ[k] = os.getenv(k) is a no-op
# when they are set and fails with an opaque TypeError when they are not,
# so check explicitly and report which variables are missing.
missing_credentials = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if os.getenv(name) is None
]
if missing_credentials:
    raise RuntimeError(
        "Missing required environment variables: "
        + ", ".join(missing_credentials)
    )

# Address of the locally running MLflow server; the same server is used
# both for tracking and as the model registry.
TRACKING_SERVER_HOST = '127.0.0.1'
TRACKING_SERVER_PORT = 5000

tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_registry_uri(tracking_uri)

# Identifiers of the experiment, the run and the registered model.
EXPERIMENT_NAME = 'mle-project-sprint-2'
RUN_NAME = 'step_1'
REGISTRY_MODEL_NAME = 'flats_price_model_sprint_2_step_1'

Оборачиваем пайплайн в класс CustomMlflowModel для логирования в MLflow

In [48]:
class CustomMlflowModel(mlflow.pyfunc.PythonModel):
    """Adapter that exposes a wrapped model through the MLflow pyfunc interface."""

    def __init__(self, model):
        """Store the model to delegate to.

        Args:
            model: Object providing a ``predict`` method (here, the fitted
                preprocessing + CatBoost pipeline).
        """
        super().__init__()
        self._model = model
    
    def predict(self, context, model_input):
        """Return the wrapped model's predictions for ``model_input``.

        Args:
            context: Not used by this implementation.
            model_input: Input data, passed unchanged to the wrapped
                model's ``predict``.
        """
        return self._model.predict(model_input)
    
# Wrap the fitted pipeline so it can be logged with mlflow.pyfunc.
custom_model = CustomMlflowModel(pipeline)

Логируем модель, метрики, параметры и другие артефакты в MLflow

In [49]:
metadata = {'model_type': 'regression'}
pip_requirements = "../requirements.txt"

# The signature describes the model's inputs (feature columns) and outputs.
signature = mlflow.models.infer_signature(
    X,
    pipeline.predict(X)
)
# The input example must have the same columns as the model input, so it is
# taken from the feature matrix X (not from `data`, which still contains
# the target column `price`).
input_example = X[:10]

# Reuse the experiment if it already exists, otherwise create it, so the
# cell works both on the first and on subsequent executions.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # Keep the run id for the checks performed after the run is closed.
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)
    mlflow.log_artifact("../data/initial_data.csv", "artifacts")

    # Log the wrapped pipeline and register it in the Model Registry.
    model_info = mlflow.pyfunc.log_model(
        python_model=custom_model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        metadata=metadata,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements
    )

  inputs = _infer_schema(model_input) if model_input is not None else None
Successfully registered model 'flats_price_model_sprint_2_step_1'.
2024/04/18 20:30:50 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: flats_price_model_sprint_2_step_1, version 1
Created version '1' of model 'flats_price_model_sprint_2_step_1'.


In [50]:
# Show the id of the run created above.
print(run_id)

# Sanity check: the experiment must not be in a deleted state.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
assert experiment.lifecycle_stage == "active"

# Sanity check: re-fetch the run by id and make sure it completed.
run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'

49ba17f1fa08478fbf22086a447ec1f7


In [51]:
# Delete the temporary dataset file. A missing file is not an error here:
# it only means the cleanup has already happened (e.g. the cell was re-run).
try:
    os.remove('../data/initial_data.csv')
except FileNotFoundError:
    pass