# Построение модели (CatBoostClassifier+optuna)

### ВАЖНО:
Для подбора гиперпараметров использовалась optuna, реализующая баесовский поиск оптимальных параметров по метрике

При обучении минимизировалась кросэнтропия в бинарном признаке satisfaction (параметр loss_function)

Обучение проводилось с помощью стратифицированной кросс-валидации и усредненной оценки f1-score

В данном notebook представлен код для подбора как через cpu, так и через gpu


In [1]:
import time
from catboost import CatBoostClassifier
import pandas as pd
import matplotlib.pyplot as plt

import optuna
from optuna.visualization import plot_slice

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import f1_score, roc_auc_score, classification_report

In [2]:
RANDOM_STATE = 42

In [3]:
df_prep_fact = pd.read_csv('../data/pilot_prep_factor.csv')

# перевод столбцов с вариантами ответа в uint8 для передачи в catboost как категориальных фичей
for name_col in ['комфорт, еда, развлечения', 'сервис/обслуживание', 'интернет сервис', 'основные функции']:
    df_prep_fact[name_col] = df_prep_fact[name_col].astype('uint8')
    
y = df_prep_fact['satisfaction']
X = df_prep_fact.drop(columns = ['satisfaction'])

In [4]:
# проверка на дисбалланс классов - особой нет, но лучше все равно использовать не accuracy
y.value_counts()

0    57802
1    44237
Name: satisfaction, dtype: int64

## Через CPU и мультипроцессинг optuna

In [7]:
def objective(trial):
    depth = trial.suggest_int('depth', 2, 10)
    n_estimators = trial.suggest_int('n_estimators', 20, 300, step=5)
    l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 0.01, 0.9, step=0.01)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.9, step=0.01)
    
    loss_function = 'CrossEntropy'
    cat_features = ['gender', 'customer_type', 'type_of_travel', 'class']
    cat_features.extend(['комфорт, еда, развлечения', 'сервис/обслуживание', 'интернет сервис', 'основные функции'])
    
    params = {
        'depth': depth,
        'n_estimators': n_estimators,   
        'l2_leaf_reg': l2_leaf_reg,
        'learning_rate': learning_rate,
        
        'loss_function': loss_function,
        'cat_features': cat_features,
        'random_state': RANDOM_STATE,
        'verbose': 0
    }
    
    model = CatBoostClassifier(**params)
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
    pred = cross_val_predict(model, X, y, cv=skf)
    
    result = f1_score(y, pred)
    return result    

In [8]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=-1)

[I 2024-01-21 01:39:35,768] A new study created in memory with name: no-name-861ec8ac-99d1-4341-b73b-ffe83cda7b05
[I 2024-01-21 01:39:39,138] Trial 2 finished with value: 0.8663107783324089 and parameters: {'depth': 2, 'n_estimators': 30, 'l2_leaf_reg': 0.62, 'learning_rate': 0.8}. Best is trial 2 with value: 0.8663107783324089.
[I 2024-01-21 01:39:55,478] Trial 1 finished with value: 0.8805053234485786 and parameters: {'depth': 4, 'n_estimators': 195, 'l2_leaf_reg': 0.12, 'learning_rate': 0.59}. Best is trial 1 with value: 0.8805053234485786.
[I 2024-01-21 01:40:03,403] Trial 3 finished with value: 0.8759477821318432 and parameters: {'depth': 2, 'n_estimators': 210, 'l2_leaf_reg': 0.67, 'learning_rate': 0.8300000000000001}. Best is trial 1 with value: 0.8805053234485786.
[I 2024-01-21 01:41:14,360] Trial 4 finished with value: 0.8528473595756115 and parameters: {'depth': 10, 'n_estimators': 240, 'l2_leaf_reg': 0.28, 'learning_rate': 0.8400000000000001}. Best is trial 1 with value: 0.8

103.71


## Через GPU и 85% загрузку его

In [13]:
def objective_gpu(trial):
    depth = trial.suggest_int('depth', 10, 16)
    n_estimators = trial.suggest_int('n_estimators', 20, 200, step=1)
    l2_leaf_reg = trial.suggest_float('l2_leaf_reg', 0.01, 0.9, step=0.01)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 0.05, step=0.0001)
    
    loss_function = 'CrossEntropy'
    cat_features = ['gender', 'customer_type', 'type_of_travel', 'class']
    # cat_features.extend(['комфорт, еда, развлечения', 'сервис/обслуживание', 'интернет сервис', 'основные функции'])
    
    params = {
        'depth': depth,
        'n_estimators': n_estimators,   
        'l2_leaf_reg': l2_leaf_reg,
        'learning_rate': learning_rate,
        
        'loss_function': loss_function,
        'cat_features': cat_features,
        'random_state': RANDOM_STATE,
        'task_type': 'GPU',
        'gpu_ram_part': 0.85,
        
        'verbose': 0
    }
    
    model = CatBoostClassifier(**params)
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
    pred = cross_val_predict(model, X, y, cv=skf)
    
    result = f1_score(y, pred)
    return result    

In [23]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_gpu, n_trials=100)

[I 2024-01-21 01:47:55,556] A new study created in memory with name: no-name-7b28e2e7-6cd4-4d79-b01c-b1304282e63f
[I 2024-01-21 01:48:19,727] Trial 0 finished with value: 0.871788392846888 and parameters: {'depth': 7, 'n_estimators': 280, 'l2_leaf_reg': 0.5800000000000001, 'learning_rate': 0.63}. Best is trial 0 with value: 0.871788392846888.
[I 2024-01-21 01:48:22,245] Trial 1 finished with value: 0.881171789400842 and parameters: {'depth': 5, 'n_estimators': 55, 'l2_leaf_reg': 0.48000000000000004, 'learning_rate': 0.68}. Best is trial 1 with value: 0.881171789400842.
[I 2024-01-21 01:48:29,247] Trial 2 finished with value: 0.8758482666605324 and parameters: {'depth': 10, 'n_estimators': 135, 'l2_leaf_reg': 0.31, 'learning_rate': 0.42000000000000004}. Best is trial 1 with value: 0.881171789400842.
[I 2024-01-21 01:48:31,703] Trial 3 finished with value: 0.761172627198137 and parameters: {'depth': 7, 'n_estimators': 40, 'l2_leaf_reg': 0.03, 'learning_rate': 0.01}. Best is trial 1 with 

[I 2024-01-21 01:54:08,150] Trial 36 finished with value: 0.8798571826022442 and parameters: {'depth': 7, 'n_estimators': 140, 'l2_leaf_reg': 0.45, 'learning_rate': 0.06999999999999999}. Best is trial 18 with value: 0.8827215549094785.
[I 2024-01-21 01:54:14,485] Trial 37 finished with value: 0.8116640036521342 and parameters: {'depth': 9, 'n_estimators': 150, 'l2_leaf_reg': 0.34, 'learning_rate': 0.01}. Best is trial 18 with value: 0.8827215549094785.
[I 2024-01-21 01:54:33,826] Trial 38 finished with value: 0.8747550658152562 and parameters: {'depth': 6, 'n_estimators': 265, 'l2_leaf_reg': 0.27, 'learning_rate': 0.66}. Best is trial 18 with value: 0.8827215549094785.
[I 2024-01-21 01:54:57,035] Trial 39 finished with value: 0.8691335325212595 and parameters: {'depth': 8, 'n_estimators': 220, 'l2_leaf_reg': 0.41000000000000003, 'learning_rate': 0.5800000000000001}. Best is trial 18 with value: 0.8827215549094785.
[I 2024-01-21 01:55:00,338] Trial 40 finished with value: 0.878327919262

[I 2024-01-21 01:59:27,441] Trial 72 finished with value: 0.8795155709342559 and parameters: {'depth': 9, 'n_estimators': 170, 'l2_leaf_reg': 0.19, 'learning_rate': 0.03}. Best is trial 54 with value: 0.8827303469680826.
[I 2024-01-21 01:59:34,091] Trial 73 finished with value: 0.8824073645741777 and parameters: {'depth': 8, 'n_estimators': 185, 'l2_leaf_reg': 0.14, 'learning_rate': 0.15000000000000002}. Best is trial 54 with value: 0.8827303469680826.
[I 2024-01-21 01:59:41,387] Trial 74 finished with value: 0.8824376422499884 and parameters: {'depth': 10, 'n_estimators': 140, 'l2_leaf_reg': 0.49, 'learning_rate': 0.09999999999999999}. Best is trial 54 with value: 0.8827303469680826.
[I 2024-01-21 02:00:36,854] Trial 75 finished with value: 0.8779455144889814 and parameters: {'depth': 10, 'n_estimators': 295, 'l2_leaf_reg': 0.47000000000000003, 'learning_rate': 0.11}. Best is trial 54 with value: 0.8827303469680826.
[I 2024-01-21 02:00:42,348] Trial 76 finished with value: 0.880116009

## Визуализация подбора гиперпараметров optuna

In [15]:
params = study.best_params.keys()
plot_slice(study, params=params)

Результат:

- качество модели незначительно улучшается с повышением глубины деревьев
- найден оптимальный диапазон learning_rate(0,1-0,3)
- найден оптимальный диапазон n_estimators (110-200 деревьев)
- небольшая регуляризация (ненулевой l2_leaf_reg) положительно влияет на модель

## Но для предсказания обычно у нас нет данных об оценках данного пассажира
## Научимся предсказывать только на том, что реально есть

In [5]:
X.columns

Index(['gender', 'age', 'customer_type', 'type_of_travel', 'class',
       'flight_distance', 'комфорт, еда, развлечения', 'сервис/обслуживание',
       'интернет сервис', 'основные функции'],
      dtype='object')

In [6]:
X = X[['gender', 'age', 'customer_type', 'type_of_travel', 'class','flight_distance']]

In [14]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_gpu, n_trials=100)

[I 2024-01-22 22:22:32,181] A new study created in memory with name: no-name-86c0d0c6-1154-42f0-8055-6cf6c9a5b0a8
[I 2024-01-22 22:22:41,117] Trial 0 finished with value: 0.7744191484386624 and parameters: {'depth': 12, 'n_estimators': 134, 'l2_leaf_reg': 0.44, 'learning_rate': 0.0076}. Best is trial 0 with value: 0.7744191484386624.
[I 2024-01-22 22:22:47,781] Trial 1 finished with value: 0.7667072934583271 and parameters: {'depth': 10, 'n_estimators': 130, 'l2_leaf_reg': 0.3, 'learning_rate': 0.039}. Best is trial 0 with value: 0.7744191484386624.
[I 2024-01-22 22:22:56,009] Trial 2 finished with value: 0.7726057418365112 and parameters: {'depth': 11, 'n_estimators': 142, 'l2_leaf_reg': 0.08, 'learning_rate': 0.014600000000000002}. Best is trial 0 with value: 0.7744191484386624.
[I 2024-01-22 22:23:42,991] Trial 3 finished with value: 0.7690073620460636 and parameters: {'depth': 16, 'n_estimators': 74, 'l2_leaf_reg': 0.45, 'learning_rate': 0.0499}. Best is trial 0 with value: 0.77441

[I 2024-01-22 22:31:19,508] Trial 36 finished with value: 0.7672306120131713 and parameters: {'depth': 11, 'n_estimators': 118, 'l2_leaf_reg': 0.17, 'learning_rate': 0.042300000000000004}. Best is trial 27 with value: 0.7756782684617293.
[I 2024-01-22 22:31:22,464] Trial 37 finished with value: 0.7753365643511039 and parameters: {'depth': 10, 'n_estimators': 43, 'l2_leaf_reg': 0.27, 'learning_rate': 0.040100000000000004}. Best is trial 27 with value: 0.7756782684617293.
[I 2024-01-22 22:31:24,892] Trial 38 finished with value: 0.7745940986964222 and parameters: {'depth': 11, 'n_estimators': 26, 'l2_leaf_reg': 0.45, 'learning_rate': 0.048600000000000004}. Best is trial 27 with value: 0.7756782684617293.
[I 2024-01-22 22:31:29,286] Trial 39 finished with value: 0.7753885229669809 and parameters: {'depth': 12, 'n_estimators': 48, 'l2_leaf_reg': 0.11, 'learning_rate': 0.033}. Best is trial 27 with value: 0.7756782684617293.
[I 2024-01-22 22:31:35,192] Trial 40 finished with value: 0.770630

[I 2024-01-22 22:33:56,966] Trial 72 finished with value: 0.7745618044409485 and parameters: {'depth': 10, 'n_estimators': 59, 'l2_leaf_reg': 0.28, 'learning_rate': 0.0175}. Best is trial 50 with value: 0.7760758570386579.
[I 2024-01-22 22:34:00,731] Trial 73 finished with value: 0.7744986556425816 and parameters: {'depth': 10, 'n_estimators': 70, 'l2_leaf_reg': 0.26, 'learning_rate': 0.014700000000000001}. Best is trial 50 with value: 0.7760758570386579.
[I 2024-01-22 22:34:05,157] Trial 74 finished with value: 0.7753187810676465 and parameters: {'depth': 10, 'n_estimators': 83, 'l2_leaf_reg': 0.32, 'learning_rate': 0.022000000000000002}. Best is trial 50 with value: 0.7760758570386579.
[I 2024-01-22 22:34:10,351] Trial 75 finished with value: 0.7680185504114184 and parameters: {'depth': 10, 'n_estimators': 97, 'l2_leaf_reg': 0.2, 'learning_rate': 0.028600000000000004}. Best is trial 50 with value: 0.7760758570386579.
[I 2024-01-22 22:34:15,128] Trial 76 finished with value: 0.7757534

## Обучение и сохранение модели с наилучшими показателями

In [16]:
best_model = CatBoostClassifier(**study.best_params)
best_model.fit(X, y, verbose=0)


<catboost.core.CatBoostClassifier at 0x1d9a51deee0>

In [18]:
best_model.save_model('../data/best_CatBoost', format="cbm")

## Проверка работы модели

In [19]:
load_model = CatBoostClassifier().load_model("../data/best_CatBoost", format='cbm')

In [20]:
pred_load = load_model.predict(X)
f1_score(y, pred_load)

0.7656276633220078