# Подбор гиперпараметров для RandomForest, CatBoost, XGBosst

In [28]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as mp
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
import optuna

Подготовка данных

In [29]:
train_df = pd.read_csv('../data/train_preprocessed.csv')
test_df = pd.read_csv('../data/test_preprocessed.csv')

X_train = train_df.drop('Churn', axis=1)
y_train = train_df['Churn']
X_test = test_df.drop('Churn', axis=1)
y_test = test_df['Churn']

scaler = StandardScaler()

pipeline = mp(
    scaler,
    SMOTE(random_state=42)
)

X_train_processed, y_train_processed = pipeline.fit_resample(X_train, y_train)

X_test_processed = scaler.transform(X_test)

### Использование RandomiseSearchCV для подбора гиперпараметров моделей

In [30]:
import warnings
warnings.filterwarnings('ignore')

scoring = ['accuracy', 'f1', 'recall', 'roc_auc']

#### Random Forest

In [31]:
params = {
    'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500],
    'max_depth': [3, 5, 7, 10, 12, 15, 17, 20, None],
    'min_samples_split': [2, 5, 8, 10, 12, 15, 18, 20],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6, 8, 10, 12, 15],
    'max_features': ['auto', 'sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

model = RandomForestClassifier(random_state=42)

results = {}

rs = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    scoring=scoring,
    random_state=42,
    n_iter=20,
    n_jobs=-1,
    cv=5,
    refit='accuracy',
    return_train_score=True
)

rs.fit(X_train_processed, y_train_processed)

results['best_params'] = rs.best_params_
results['best_score'] = rs.best_score_
results['metrics'] = {
    metric: rs.cv_results_[f'mean_test_{metric}'][rs.best_index_]
    for metric in scoring
}

print(results)

{'best_params': {'n_estimators': 250, 'min_samples_split': 12, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 20, 'criterion': 'gini'}, 'best_score': np.float64(0.8363174832523315), 'metrics': {'accuracy': np.float64(0.8363174832523315), 'f1': np.float64(0.8378455696042787), 'recall': np.float64(0.8603671964904287), 'roc_auc': np.float64(0.914333621161096)}}


#### CatBoost

In [32]:
params = {
    'iterations': [100, 200, 300, 400, 500],
    'depth': [4, 6, 8, 10, 12, 14, 16, 18, 20],
    'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
    'l2_leaf_reg': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}

model = CatBoostClassifier(random_state=42, verbose=False)

results = {}

rs = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    scoring=scoring,
    random_state=42,
    n_iter=20,
    n_jobs=-1,
    cv=5,
    refit='accuracy',
    return_train_score=True
)

rs.fit(X_train_processed, y_train_processed)

results['best_params'] = rs.best_params_
results['best_score'] = rs.best_score_
results['metrics'] = {
    metric: rs.cv_results_[f'mean_test_{metric}'][rs.best_index_]
    for metric in scoring
}

print(results)

KeyboardInterrupt: 

#### XGBoost

In [None]:
params = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 18, 20],
    'learning_rate': [0.01, 0.1, 0.2, 0.3]
}

model = XGBClassifier(random_state=42)

results = {}

rs = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    scoring=scoring,
    random_state=42,
    n_iter=10,
    n_jobs=-1,
    cv=5,
    refit='accuracy',
    return_train_score=True
)

rs.fit(X_train_processed, y_train_processed)

results['best_params'] = rs.best_params_
results['best_score'] = rs.best_score_
results['metrics'] = {
    metric: rs.cv_results_[f'mean_test_{metric}'][rs.best_index_]
    for metric in scoring
}

print(results)