## Яндекс Практикум, курс "Инженер Машинного Обучения" (2024 г.)
## Проект 2-го спринта: "Улучшение baseline-модели"

__Содержание__:

* [Этап 2: Проведение EDA](#step_2)
* [Этап 3: Генерация признаков и обучение модели](#step_3)
* [Этап 4: Отбор признаков и обучение новой версии модели](#step_4)
* [Этап 5: Подбор гиперпараметров и обучение новой версии модели](#step_5)

In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import psycopg2 as psycopg
from sqlalchemy import create_engine
from datetime import datetime
import time

from category_encoders import CatBoostEncoder
from catboost import CatBoostRegressor

import mlflow
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID

import joblib
from collections import defaultdict
from statistics import median

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import KFold, cross_validate, RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
    FunctionTransformer
)

from autofeat import AutoFeatRegressor

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

import optuna
from optuna.integration.mlflow import MLflowCallback

Загружаем переменные окружения

In [2]:
load_dotenv()

True

### Этап 2: Проведение EDA <a id="step_2"></a>

После выполнения EDA логируем его результаты, включая ноутбук с кодом EDA "notebook_eda.ipynb" и md-файл с выводами "conclusions_eda.md"

In [3]:
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" 
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID") 
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY") 

TRACKING_SERVER_HOST = '127.0.0.1' 
TRACKING_SERVER_PORT = 5000

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}") 

EXPERIMENT_NAME = 'mle-project-sprint-2'
RUN_NAME = 'step_2'

In [4]:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_artifact("notebook_eda.ipynb", "artifacts")
    mlflow.log_artifact("conclusions_eda.md", "artifacts")

2024-04-18 20:41:40,886 INFO: Found credentials in environment variables.


In [5]:
print(run_id)

run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'

f3c48bd28d9148059a110971fbb48b14


### Этап 3: Генерация признаков и обучение модели <a id="step_3"></a>

Извлекаем очищенные данные

In [3]:
username = os.environ.get('DB_DESTINATION_USER')
password = os.environ.get('DB_DESTINATION_PASSWORD')
host = os.environ.get('DB_DESTINATION_HOST')
port = os.environ.get('DB_DESTINATION_PORT')
db = os.environ.get('DB_DESTINATION_NAME')

In [4]:
conn = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{db}', connect_args={'sslmode':'require'})

In [5]:
data = pd.read_sql('select * from clean_flats_dataset', conn, index_col='flat_id')

In [6]:
data.head()

Unnamed: 0_level_0,id,floor,kitchen_area,living_area,rooms,is_apartment,studio,total_area,price,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator
flat_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
8348,23114,8,10.6,56.0,3,False,False,88.599998,10990000,2018,4,55.542187,37.483067,2.64,409,18,True
8350,23116,3,7.0,28.0,2,False,False,44.700001,8999000,1967,4,55.857765,37.422684,2.64,143,9,True
8351,23118,16,10.9,54.799999,4,False,False,89.099998,24000000,1996,4,55.562908,37.570431,2.7,164,16,True
8352,23120,2,7.4,66.300003,4,False,False,93.0,17500000,1965,1,55.653507,37.649426,2.7,59,6,True
8354,23122,4,9.1,17.700001,1,False,False,34.0,7500000,1964,1,55.796406,37.459873,3.0,72,9,True


Выполняем ту же предобработку, что и в проекте 1-го спринта

In [7]:
# Вместо года постройки добавляем возраст здания
data['building_age'] = (datetime.now().year - data['build_year']).astype('float')

# Удаляем лишние колонки (studio является константным признаком, см. EDA)
data.drop(
    columns=['id', 'build_year', 'studio'], 
    inplace=True
)

# Изменяем тип целочисленных признаков rooms и building_type_int на object, чтобы работать с ними как с категориальными
data[['rooms', 'building_type_int']] = data[['rooms', 'building_type_int']].astype('object')

# Изменяем тип целых колонок на float
num_int_cols = data.select_dtypes('int').columns
data[num_int_cols] = data[num_int_cols].astype('float') 

Отделяем признаки от целевой переменной и разделяем данные на обучающую и тестовую выборки

In [8]:
X, y = data[data.columns.drop('price')], data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Группируем признаки по типам

In [33]:
cat_features = X.select_dtypes(include=['bool', 'object'])
is_bin_cat_features = cat_features.nunique() == 2
bin_cat_features = cat_features[is_bin_cat_features[is_bin_cat_features].index]
other_cat_features = cat_features[is_bin_cat_features[~is_bin_cat_features].index]
num_features = X.select_dtypes(include=['float'])

Создаем энкодеры для кодирования существующих и генерации новых признаков

In [40]:
# Для бинарных признаков
encoder_oh = OneHotEncoder(
    categories='auto',
    handle_unknown='ignore', 
    max_categories=10,
    sparse_output=False,
    drop='first'
) 

# Для небинарных категориальных признаков
encoder_cb = CatBoostEncoder()

# Для генерации новых числовых признаков
encoder_pol = PolynomialFeatures(degree=3)

# Для генерации новых числовых признаков
encoder_kbd = KBinsDiscretizer(
    n_bins=5, 
    encode='ordinal', 
    strategy='uniform', 
    subsample=None
)

# Для автогенерации новых числовых признаков
encoder_afr = AutoFeatRegressor(
    transformations=('exp', 'abs', 'sqrt', '^2', '^3'),
    categorical_cols=cat_features.columns.tolist(),
    feateng_steps=1,
    n_jobs=-1
)

Формируем пайплайн из трансформера данных и модели

Задаем трансформацию данных

In [36]:
# Все трансформации, за исключением AutoFeatRegressor
preprocessor_wo_afr = ColumnTransformer(
    [
        ('bin_cat', encoder_oh, bin_cat_features.columns.tolist()),
        ('other_cat', encoder_cb, other_cat_features.columns.tolist()),
        ('num_sс', StandardScaler(), num_features.columns.tolist()),
        ('num_pol', encoder_pol, num_features.columns.tolist()),
        ('num_kbd', encoder_kbd, num_features.columns.tolist())
        
    ],
    remainder='passthrough',
    verbose_feature_names_out=True,
    n_jobs=-1
)

# Только AutoFeatRegressor
preprocessor_afr = ColumnTransformer(
    [
        ('num_afr', encoder_afr)
    ]
)



# Создаем модель с параметрами по умолчанию
model = CatBoostRegressor(loss_function='MAPE', random_state=42)
  
# Объединяем в один пайплайн
pipeline = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)

Запускаем пайплайн на обучающих данных

In [37]:
start = time.time()
pipeline.fit(X_train, y_train)
end = time.time()

ValueError: not enough values to unpack (expected 3, got 2)

In [25]:
encoded_X_train = preprocessor.fit_transform(X_train, y_train) 

In [28]:
preprocessor.get_feature_names_out()

AttributeError: Transformer num_afr (type AutoFeatRegressor) does not provide get_feature_names_out.

In [30]:
encoded_X = preprocessor.fit_transform(X, y) 

encoded_X = pd.DataFrame(
    encoded_X, 
    columns=preprocessor.get_feature_names_out()
) 

enriched_X = pd.concat([X, encoded_X], axis=1) 


In [33]:
encoded_X

Unnamed: 0,num__scaler__floor,num__scaler__kitchen_area,num__scaler__living_area,num__scaler__total_area,num__scaler__latitude,num__scaler__longitude,num__scaler__ceiling_height,num__scaler__flats_count,num__scaler__floors_total,num__scaler__building_age,...,num__num_kbd__latitude,num__num_kbd__longitude,num__num_kbd__ceiling_height,num__num_kbd__flats_count,num__num_kbd__floors_total,num__num_kbd__building_age,bin_cat__is_apartment_True,bin_cat__has_elevator_True,other_cat__rooms,other_cat__building_type_int
0,0.278883,0.808993,1.897083,2.116089,-1.822537,-0.814095,-0.390417,1.398782,0.964159,-1.727832,...,0.0,1.0,1.0,3.0,3.0,0.0,0.0,1.0,1.271694e+07,1.271694e+07
1,-0.829836,-0.365523,0.006168,-0.450307,1.259034,-1.223464,-0.390417,-0.591115,-0.783611,0.875652,...,3.0,1.0,1.0,1.0,1.0,2.0,0.0,1.0,1.271694e+07,1.185347e+07
2,2.052833,0.906869,1.816044,2.145319,-1.620196,-0.221806,0.050686,-0.434018,0.575765,-0.604761,...,1.0,2.0,2.0,1.0,2.0,1.0,0.0,1.0,1.271694e+07,1.090198e+07
3,-1.051580,-0.235021,2.592670,2.373313,-0.735511,0.313743,0.050686,-1.219504,-1.366200,0.977749,...,1.0,3.0,2.0,0.0,0.0,2.0,0.0,1.0,1.835847e+07,1.271694e+07
4,-0.608092,0.319611,-0.689418,-1.075830,0.659869,-0.971336,2.256200,-1.122253,-0.783611,1.028798,...,3.0,1.0,4.0,0.0,1.0,2.0,0.0,1.0,1.271694e+07,1.510847e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99735,0.278883,-0.691777,1.221756,0.677972,0.129953,1.553176,-0.390417,-0.583634,-0.783611,0.671457,...,2.0,4.0,1.0,1.0,1.0,2.0,0.0,1.0,2.086497e+07,1.182931e+07
99736,-0.829836,0.286986,1.262276,1.180728,0.091101,0.612785,0.785857,-1.062407,0.769962,-1.115248,...,2.0,3.0,3.0,0.0,2.0,0.0,0.0,1.0,1.662899e+07,1.699598e+07
99737,2.052833,0.417487,0.283052,0.093371,-0.570829,1.090480,-0.390417,1.855112,0.769962,-1.013150,...,2.0,3.0,1.0,3.0,2.0,0.0,0.0,1.0,1.287129e+07,1.182927e+07
99738,0.057139,0.645866,0.478897,0.368133,0.054931,-1.293032,-0.390417,0.388872,0.769962,-1.523637,...,2.0,1.0,1.0,2.0,2.0,0.0,0.0,1.0,1.287119e+07,1.182922e+07


In [32]:
enriched_X.dtypes

floor                           float64
kitchen_area                    float64
living_area                     float64
rooms                            object
is_apartment                     object
                                 ...   
num__num_kbd__building_age      float64
bin_cat__is_apartment_True      float64
bin_cat__has_elevator_True      float64
other_cat__rooms                float64
other_cat__building_type_int    float64
Length: 324, dtype: object

In [39]:
preprocessor.get_feature_names_out()

array(['num__scaler__floor', 'num__scaler__kitchen_area',
       'num__scaler__living_area', 'num__scaler__total_area',
       'num__scaler__latitude', 'num__scaler__longitude',
       'num__scaler__ceiling_height', 'num__scaler__flats_count',
       'num__scaler__floors_total', 'num__scaler__building_age',
       'num__num_pol__1', 'num__num_pol__floor',
       'num__num_pol__kitchen_area', 'num__num_pol__living_area',
       'num__num_pol__total_area', 'num__num_pol__latitude',
       'num__num_pol__longitude', 'num__num_pol__ceiling_height',
       'num__num_pol__flats_count', 'num__num_pol__floors_total',
       'num__num_pol__building_age', 'num__num_pol__floor^2',
       'num__num_pol__floor kitchen_area',
       'num__num_pol__floor living_area',
       'num__num_pol__floor total_area', 'num__num_pol__floor latitude',
       'num__num_pol__floor longitude',
       'num__num_pol__floor ceiling_height',
       'num__num_pol__floor flats_count',
       'num__num_pol__floor floors_t

Вычисляем метрику MAPE

In [22]:
y_pred = pipeline.predict(X_test)
mape = mean_absolute_percentage_error(y_test, y_pred)

metrics = {}
metrics['MAPE'] = mape
metrics['fit_time'] = end - start
print(metrics)

{'MAPE': 1.3258694718121424, 'fit_time': 214.12311005592346}


Отдельно обучим AutoFeatRegressor, чтобы залогировать его вместе с основной моделью

In [24]:
encoder_afr.fit(X_train[num_features.columns].values, y_train.values)

  if np.max(np.abs(correlations[c].ravel()[:i])) < 0.9:


Также сохраним обогащенные данные для использования на следующих этапах

In [None]:
enriched_X = preprocessor.fit_transform(X_train)

In [None]:
type(enriched_X)

In [None]:
enriched_X.get_features_names_out()

Устанавливаем параметры MLflow Tracking Server и MLflow Model Registry, а также идентификаторы эксперимента, запуска и модели
(сначала поднимаем MLflow, выполнив скрипт "../run_mlflow_server" из командной строки терминала)

In [None]:
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" 
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID") 
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY") 

TRACKING_SERVER_HOST = '127.0.0.1' 
TRACKING_SERVER_PORT = 5000

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}") 

EXPERIMENT_NAME = 'mle-project-sprint-2'
RUN_NAME = 'step_3'
REGISTRY_MODEL_NAME = 'flats_price_model_sprint_2_step_3'

Оборачиваем пайплайн в класс CustomMlflowModel для логирования в MLflow

In [None]:
class CustomMlflowModel(mlflow.pyfunc.PythonModel):
    def __init__(self, model):
        super().__init__()
        self._model = model
    
    def predict(self, context, model_input):
        return self._model.predict(model_input)

In [None]:
custom_model = CustomMlflowModel(pipeline)

Логируем модель, метрики, параметры и другие артефакты в MLflow

In [None]:
metadata = {'model_type': 'regression'}
pip_requirements = "../requirements.txt" 
signature = mlflow.models.infer_signature(
    X,
    pipeline.predict(X)
) 
input_example = data[:10]

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    mlflow.log_metrics(metrics)
    mlflow.log_params(params)
        
    model_info = mlflow.pyfunc.log_model( 
        python_model=custom_model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        metadata=metadata,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements
    )
    
    afr_info = mlflow.sklearn.log_model(
        af_reg, 
        artifact_path="models",
        registered_model_name='af_reg_sprint_2_step_3'
    )

In [None]:
print(run_id)

run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'

__Выводы по результатам 3-го этапа:__ #TODO

По сравнению с базовой моделью из 1-го этапа средняя ошибка MAPE на кросс-валидации уменьшилась с до . 
При этом увеличилось время обучения и.

### Этап 4: Отбор признаков и обучение новой версии модели <a id="step_4"></a>

In [None]:
Создаем папку для хранения артефактов, относящихся к отбору признаков

In [None]:
FS_ASSETS = "fs_assets"
os.makedirs(f'../{FS_ASSETS}', exist_ok=True)

In [None]:
Будем использовать обогащенные признаки enriched_X, полученные на предыдущем этапе. Создаем для них новый пайплайн

In [None]:
# Pipeline before feature selection

cat_features = enriched_X.select_dtypes(include=['bool', 'object'])
is_bin_cat_features = cat_features.nunique() == 2
bin_cat_features = cat_features[is_bin_cat_features[is_bin_cat_features].index]
other_cat_features = cat_features[is_bin_cat_features[~is_bin_cat_features].index]
num_features = enriched_X.select_dtypes(include=['float'])

preprocessor_before_fs = ColumnTransformer(
    [
        ('bin_cat', OneHotEncoder(drop='if_binary'), bin_cat_features.columns.tolist()),
        ('other_cat', CatBoostEncoder(), other_cat_features.columns.tolist()),
        ('num', StandardScaler(), num_features.columns.tolist())
    ],
    remainder='drop',
    verbose_feature_names_out=True
)

# Создаем модель с параметрами по умолчанию
model = CatBoostRegressor(loss_function='MAPE', random_state=42)
  
params = {
    'tree_count': model.tree_count_,
    'learning_rate': model.learning_rate_
}

pipeline_before_fs = Pipeline(
    [
        ('preprocessor', preprocessor_before_fs),
        ('model', model)
    ]
)

Отбираем признаки с помощью метода Sequential Backward Selection

In [None]:
sbs = SequentialFeatureSelector(
    pipeline_before_fs,    
    forward=False,      
    floating=True,     
    scoring='neg_mean_absolute_percentage_error', 
    cv=5,               
    n_jobs=-1
) 

sbs = sbs.fit(enriched_X, y)

sbs_df = pd.DataFrame.from_dict(sbs.get_metric_dict()).T
sbs_df.to_csv(f"../{FS_ASSETS}/sbs.csv")

График отбора признаков для SBS

In [None]:
fig = plot_sfs(sbs.get_metric_dict(), kind='std_dev')

plt.title('Sequential Backward Selection (w. StdDev)')
plt.grid()
plt.show()
plt.savefig(f"../{FS_ASSETS}/sbs.png")

На графике видно, что #TODO

Отбираем признаки с помощью метода Sequential Forward Selection

In [None]:
sfs = SequentialFeatureSelector(
    pipeline_before_fs,
    forward=True,      
    floating=True,     
    scoring='neg_mean_absolute_percentage_error', 
    cv=5,               
    n_jobs=-1
) 

sfs = sfs.fit(enriched_X, y)

sfs_df = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
sfs_df.to_csv(f"../{ASSETS}/sfs.csv")

In [None]:
График отбора признаков для SFS

In [None]:
fig = plot_sfs(sfs.get_metric_dict(), kind='std_dev')

plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()
plt.savefig(f"../{FS_ASSETS}/sfs.png")

In [None]:
Из графика следует, что #TODO

In [None]:
Объединяем индексы отобранных признаков из SBS и SFS

In [None]:
selected_features_indices = list(set(sbs.k_feature_idx_ + sfs.k_feature_idx_))
selected_enriched_X = enriched_X[:, selected_features_indices]

In [None]:
Создаем новый пайплайн для работы с обогащенными признаками после их отбора

In [None]:
# Pipeline after feature selection

cat_features = selected_enriched_X.select_dtypes(include=['bool', 'object'])
is_bin_cat_features = cat_features.nunique() == 2
bin_cat_features = cat_features[is_bin_cat_features[is_bin_cat_features].index]
other_cat_features = cat_features[is_bin_cat_features[~is_bin_cat_features].index]
num_features = selected_enriched_X.select_dtypes(include=['float'])

preprocessor_after_fs = ColumnTransformer(
    [
        ('bin_cat', OneHotEncoder(drop='if_binary'), bin_cat_features.columns.tolist()),
        ('other_cat', CatBoostEncoder(), other_cat_features.columns.tolist()),
        ('num', StandardScaler(), num_features.columns.tolist())
    ],
    remainder='drop',
    verbose_feature_names_out=True
)

# Создаем модель с параметрами по умолчанию
model = CatBoostRegressor(loss_function='MAPE', random_state=42)
  
params = {
    'tree_count': model.tree_count_,
    'learning_rate': model.learning_rate_
}


pipeline_after_fs = Pipeline(
    [
        ('preprocessor', preprocessor_after_fs),
        ('model', model)
    ]
)

In [None]:
Обучаем новый пайплайн на отобранных признаках

In [None]:
pipeline_after_fs.fit(selected_enriched_X, y)

In [None]:
Проводим кросс-валидацию

In [None]:
cv_res = cross_validate(
    pipeline_after_fs,
    selected_enriched_X,
    y,
    cv=cv_strategy,
    n_jobs=-1,
    scoring='neg_mean_absolute_percentage_error'
)

In [None]:
metrics = {}

metrics['fit_time'] = round(cv_res['fit_time'].mean(), 3)
metrics['score_time'] = round(cv_res['score_time'].mean(), 3)
metrics['MAPE'] = round(-cv_res['test_score'].mean(), 3)

In [None]:
metrics

In [None]:
RUN_NAME = 'step_4'
REGISTRY_MODEL_NAME = 'flats_price_model_sprint_2_step_4'

In [None]:
custom_model = CustomMlflowModel(pipeline_after_fs)

In [None]:
metadata = {'model_type': 'regression'}
pip_requirements = "../requirements.txt" 
signature = mlflow.models.infer_signature(
    selected_enriched_X,
    pipeline_after_fs.predict(selected_enriched_X)
) 
input_example = pd.concat([selected_enriched_X, data['price']], axis=1)[:10]

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    mlflow.log_metrics(metrics)
    mlflow.log_params(params)
    mlflow.log_artifacts(FS_ASSETS)
        
    model_info = mlflow.pyfunc.log_model( 
        python_model=custom_model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        metadata=metadata,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements
    )   

In [None]:
print(run_id)

run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'

__Выводы по результатам 4-го этапа:__ #TODO
    
После отбора признаков двумя методами и их объединения средняя ошибка MAPE на кросс-валидации уменьшилась с до. 
Время обучения

### Этап 5: Подбор гиперпараметров и обучение новой версии модели <a id="step_5"></a>

Будем использовать признаки selected_enriched_X, обогащенные и отобранные на предыдущих этапах, а также созданный
для них папйлайн pipeline_after_fs

__Randomized Search__

In [None]:
# Задаем сетку параметров
params = {
    'iterations': [100, 200, 300, 400],  
    'learning_rate': np.logspace(-3, -1, 5), 
    'depth': [3, 4, 5, 6, 7],
    'l2_leaf_reg': np.logspace(-2, 0, 3)
}

search = RandomizedSearchCV(
    estimator=pipeline_after_fs, 
    param_distributions=params, 
    n_iter=20, 
    cv=5, 
    random_state=random_seed, 
    n_jobs=-1
)

search.fit(selected_enriched_X, y)

In [None]:
cv_results = pd.DataFrame(search.cv_results_)
print(cv_results)

In [None]:
best_params_random = search.best_params_ 
print(best_params_random)

__Bayesian Search__

In [12]:
def objective(trial: optuna.Trial) -> float:
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, 1, log=True),
        "depth": trial.suggest_int("depth", 3, 8, 1),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.01, 1, 1),
        "iterations": trial.suggest_int("iterations", 100, 400, 100),
        "random_state": 42,
        "loss_function": "MAPE"
    } 
       
    model = CatBoostRegressor(**params)
    pipeline = Pipeline(
        [
            ('preprocessor', preprocessor_after_fs),
            ('model', model)
        ]
    )
     
    skf = StratifiedKFold(n_splits=2) 
    metrics = defaultdict(list)

    for i, (train_index, val_index) in enumerate(skf.split(selected_enriched_X, y)):
        X_train = selected_enriched_X.iloc[train_index]
        y_train = y.iloc[train_index]
        X_val = selected_enriched_X.iloc[val_index]
        y_val = y.iloc[val_index]
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_val)
        
        mape = mean_absolute_percentage_error(y_val, y_pred)
        metrics["mape"].append(mape)
        
    mape = median(metrics["mape"])
    
    return mape

In [20]:
TRACKING_SERVER_HOST = "127.0.0.1" 
TRACKING_SERVER_PORT = 5000 

EXPERIMENT_NAME = 'mle-project-sprint-2' 
RUN_NAME = 'step_5' 
REGISTRY_MODEL_NAME = 'flats_price_model_sprint_2_step_5' 

os.environ["MLFLOW_S3_ENDPOINT_URL"] = 'https://storage.yandexcloud.net'
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY")

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

STUDY_DB_NAME = "sqlite:///local.study.db"
STUDY_NAME = "flats_price_model"

In [21]:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as parent_run:
    parent_run_id = parent_run.info.run_id

mlflc = MLflowCallback(
    tracking_uri=f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}",
    metric_name="MAPE",
    create_experiment=False,
    mlflow_kwargs={'experiment_id': experiment_id, 'tags': {MLFLOW_PARENT_RUN_ID: parent_run_id}}
)
    
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(),
    direction='minimize',
    study_name=STUDY_NAME,
    storage=STUDY_DB_NAME,
    load_if_exists=True
) 
study.optimize(objective, n_trials=10, callbacks=[mlflc]) 
best_params_optuna = study.best_params 
print(best_params_optuna)

  mlflc = MLflowCallback(
[I 2024-04-09 17:32:36,157] Using an existing study with name 'churn_model' instead of creating a new one.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-04-09 17:32:37,524] Trial 111 finished with value: 0.8249230415606885 and parameters: {'learning_rate': 0.0468416501053928, 'depth': 4, 'l2_leaf_reg': 0.4978396379083133, 'random_strength': 4.74396931529672}. Best is trial 106 with value: 0.8335079990917593.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-04-09 17:32:39,809] Trial 112 finished with value: 0.816866211192768 and parameters: {'learning_rate': 0.049864992196812445, 'depth': 6, 'l2_leaf_reg': 0.2944113725725097, 'random_strength': 4.877493509858359}. Best is trial 106 with value: 0.8335079990917593.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-04-09 17:32:41,282] Trial 113 finished with value: 0.8193538154768124 and parameters: {'learning_rate': 

0:	learn: 0.6772090	total: 2.44ms	remaining: 2.44s
1:	learn: 0.6603691	total: 4.87ms	remaining: 2.43s
2:	learn: 0.6411902	total: 6.73ms	remaining: 2.24s
3:	learn: 0.6291897	total: 8.42ms	remaining: 2.1s
4:	learn: 0.6150240	total: 10.3ms	remaining: 2.06s
5:	learn: 0.6010801	total: 12.3ms	remaining: 2.03s
6:	learn: 0.5920378	total: 14ms	remaining: 1.99s
7:	learn: 0.5806338	total: 16ms	remaining: 1.98s
8:	learn: 0.5714374	total: 17.9ms	remaining: 1.97s
9:	learn: 0.5609182	total: 19.9ms	remaining: 1.97s
10:	learn: 0.5510304	total: 22ms	remaining: 1.97s
11:	learn: 0.5420405	total: 23.9ms	remaining: 1.97s
12:	learn: 0.5325409	total: 25.8ms	remaining: 1.96s
13:	learn: 0.5248101	total: 28ms	remaining: 1.97s
14:	learn: 0.5172767	total: 30ms	remaining: 1.97s
15:	learn: 0.5107952	total: 32ms	remaining: 1.97s
16:	learn: 0.5050981	total: 33.9ms	remaining: 1.96s
17:	learn: 0.5013743	total: 35.7ms	remaining: 1.95s
18:	learn: 0.4957781	total: 37.7ms	remaining: 1.95s
19:	learn: 0.4900548	total: 39.6ms	

Registered model 'best_cb_clf' already exists. Creating a new version of this model...
2024/04/09 17:32:55 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: best_cb_clf, version 26


Number of finished trials: 121
Best params: {'learning_rate': 0.03946119818412257, 'depth': 3, 'l2_leaf_reg': 0.11267435830906039, 'random_strength': 4.797122330990984}


Created version '26' of model 'best_cb_clf'.


In [None]:
Обучаем модель с лучшими параметрами

In [None]:
model = CatBoostRegressor(**best_params)
best_pipeline_after_fs = Pipeline(
    [
        ('preprocessor', preprocessor_after_fs),
        ('model', model)
    ]
)
best_pipeline_after_fs.fit(selected_enriched_X, y)

In [None]:
Проводим кросс-валидацию модели с лучшими параметрами

In [None]:
cv_strategy = KFold(
    n_splits=5, 
    shuffle=True,
    random_state=42
)

cv_res = cross_validate(
    best_pipeline_after_fs,
    selected_enriched_X,
    y,
    cv=cv_strategy,
    n_jobs=-1,
    scoring='neg_mean_absolute_percentage_error'
)

In [None]:
metrics = {}

metrics['fit_time'] = round(cv_res['fit_time'].mean(), 3)
metrics['score_time'] = round(cv_res['score_time'].mean(), 3)
metrics['MAPE'] = round(-cv_res['test_score'].mean(), 3)

print(metrics)

In [None]:
Создаем кастомную модель

In [None]:
class CustomMlflowModel(mlflow.pyfunc.PythonModel):
    def __init__(self, model):
        super().__init__()
        self._model = model
    
    def predict(self, context, model_input):
        return self._model.predict(model_input)
    
custom_model = CustomMlflowModel(best_pipeline_after_fs)

In [None]:
Логируем лучшую модель

In [None]:
metadata = {'model_type': 'regression'}
pip_requirements = "../requirements.txt" 
signature = mlflow.models.infer_signature(
    selected_enriched_X,
    best_pipeline_after_fs.predict(selected_enriched_X)
) 
input_example = pd.concat([selected_enriched_X, data['price']], axis=1)[:10]


with mlflow.start_run(run_id=parent_run_id) as run:
    mlflow.log_params(best_params)
    mlflow.log_metrics(metrics)
    
    model_info = mlflow.pyfunc.log_model(
        python_model=custom_model,
        artifact_path="models",
        await_registration_for=60,
        registered_model_name=REGISTRY_MODEL_NAME,
        metadata=metadata,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements
    )


print(f"Number of finished trials: {len(study.trials)}")
print(f"Best params: {best_params}")