<a href="https://colab.research.google.com/github/ithelga/bank-churn-predictor/blob/main/notebooks/Team2_HW4_Ensemble_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ансамбли моделей

In [54]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from matplotlib.colors import LinearSegmentedColormap
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV

In [2]:
import warnings
# Ignore every warning category for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides library warnings raised while fitting
# models in later cells — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [3]:
# Mount Google Drive at /content/drive so the notebook can read files stored on it.
# NOTE(review): later cells use relative paths starting with 'drive/MyDrive/...';
# presumably the working directory is /content — confirm when running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Relative path of the 'graph' folder on the mounted Drive
# (presumably the place where figures are saved — not used in the visible cells).
graph_path = 'drive/MyDrive/Colab Notebooks/Bank churn predictor/graph'

# Pastel palette as hex colour codes.
colors = [
    "#FFAFCC",
    "#FFC8DD",
    "#CDB4DB",
    "#BDE0FE",
    "#A2D2FF",
]

In [5]:
# Folder on the mounted Drive that holds one CSV per data-preparation stage.
data_path = 'drive/MyDrive/Colab Notebooks/Bank churn predictor/data'

# original data
row_df = pd.read_csv(data_path + '/row_dataset.csv')
# preprocessed data
preprocessed_df = pd.read_csv(data_path + '/preprocessed_dataset.csv')
# generated (derived) features
derived_df = pd.read_csv(data_path + '/derived_dataset.csv')
# selected features
extract_df = pd.read_csv(data_path + '/extract_dataset.csv')
# cleaned of outliers
cleaned_df = pd.read_csv(data_path + '/cleaned_dataset.csv')

# Выбор модели

In [6]:
pip install lightgbm xgboost



In [7]:
def evaluate_models_ensemble(df, dataset_name, is_raw_data=False):
    """
    Train and evaluate three ensemble classifiers (RandomForest, LightGBM, XGBoost).

    The data is split 75/25 (stratified on the target, random_state=42); every
    model is fitted on the training part and scored on the held-out part.

    Parameters:
    - df: DataFrame containing the target column 'Exited' (compared against 0/1
      when the XGBoost class weight is computed). The identifier columns
      'RowNumber', 'CustomerId' and 'Surname' are dropped when present.
      The frame itself is not modified.
    - dataset_name: label written to the 'Dataset' field of every result.
    - is_raw_data: if True, object/category columns are dropped and missing
      values are filled with the column medians of the training split.

    Returns:
    - List of dicts (one per model) with the keys 'Dataset', 'Model', 'F1',
      'ROC-AUC', 'Recall' and 'Precision'.

    Raises:
    - ValueError: if df has no 'Exited' column.
    """
    if 'Exited' not in df.columns:
        raise ValueError("Столбец 'Exited' не найден в данных.")

    non_features = ['RowNumber', 'CustomerId', 'Surname']
    target = df['Exited']
    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    features = df.drop(columns=['Exited'] + [col for col in non_features if col in df.columns])

    if is_raw_data:
        cat_cols = features.select_dtypes(include=['object', 'category']).columns
        features = features.drop(columns=cat_cols)

    X_train, X_test, y_train, y_test = train_test_split(
        features, target,
        test_size=0.25,
        random_state=42,
        stratify=target
    )

    if is_raw_data:
        # Impute AFTER the split and with training-split statistics only, so that
        # nothing about the held-out rows leaks into the preprocessing.
        train_medians = X_train.median()
        X_train = X_train.fillna(train_medians)
        X_test = X_test.fillna(train_medians)

    models = {
        'RandomForest': RandomForestClassifier(
            class_weight='balanced_subsample',
            n_estimators=200,
            min_samples_leaf=20,
            min_samples_split=10,
            max_features='sqrt',
            bootstrap=True,
            random_state=42,
            n_jobs=-1
        ),
        'LightGBM': LGBMClassifier(
            is_unbalance=True,
            n_estimators=200,
            random_state=42,
            n_jobs=-1,
            verbosity=-1,
            force_col_wise=True
        ),
        'XGBoost': XGBClassifier(
            # ratio of label-0 to label-1 rows in the training split
            scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
            use_label_encoder=False,
            eval_metric='logloss',
            n_estimators=200,
            random_state=42,
            n_jobs=-1,
            verbosity=0
        )
    }

    results = []

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # second column of predict_proba is used as the score for ROC-AUC
        y_proba = model.predict_proba(X_test)[:, 1]

        results.append({
            'Dataset': dataset_name,
            'Model': model_name,
            'F1': f1_score(y_test, y_pred),
            'ROC-AUC': roc_auc_score(y_test, y_proba),
            'Recall': recall_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred)
        })

    return results

In [8]:
# Datasets to evaluate, listed as (frame, display name, raw-data flag)
datasets = [
    dict(data=frame, name=label, is_raw=raw_flag)
    for frame, label, raw_flag in [
        (row_df, 'Первоначальные данные', True),
        (preprocessed_df, 'Предобработанные данные', False),
        (derived_df, 'Сгенерированные признаки', False),
        (extract_df, 'Отобранные признаки', False),
        (cleaned_df, 'Отобранные признаки без выбросов', False),
    ]
]

In [9]:
# Evaluate the ensembles on every dataset and flatten the per-model metric
# dicts into a single list of records.
all_results_ensemble = [
    metrics
    for dataset in datasets
    for metrics in evaluate_models_ensemble(
        dataset['data'],
        dataset['name'],
        is_raw_data=dataset['is_raw'],
    )
]

results_ensemble_df = pd.DataFrame(all_results_ensemble)

In [10]:
# Row order for the comparison tables. Derived from `datasets` instead of being
# typed out a second time, so the labels cannot drift apart (a mismatch would
# make the later `.loc[dataset_order]` lookups fail with KeyError).
dataset_order = [entry['name'] for entry in datasets]

In [30]:
def style_comparison_by_column(df, colors=None):
    """
    Build a row-wise styling function for ``df.style.apply(..., axis=1)``.

    Every non-missing cell gets a background colour derived from the base colour
    of its model: the cell's position between the minimum and maximum of its own
    column scales the colour from 30% to 100% intensity (blended towards white).
    A cell is additionally rendered bold when it holds the highest value among
    the columns of the same metric within its row.

    Parameters:
    - df: DataFrame whose column labels are indexable as (metric, model),
      e.g. the two-level columns produced by ``unstack()``.
    - colors: optional mapping model name -> '#RRGGBB' hex string. When omitted,
      the module-level ``model_colors`` is used. In both cases the mapping is
      copied when this function is called, so rebinding or editing
      ``model_colors`` afterwards cannot break a styler that is rendered later.

    Returns:
    - A function that takes one row (Series) and returns a list of CSS strings,
      one per column; missing values get an empty string.

    Raises (from the returned function):
    - KeyError: if a model present in the columns has no entry in the palette.
    """
    # Snapshot the palette now: Styler.apply runs the function lazily at render time.
    palette = dict(model_colors if colors is None else colors)
    min_intensity = 0.3  # colour intensity assigned to the smallest value in a column

    # Column-level statistics do not depend on the row, so compute them once.
    col_bounds = {}
    cols_by_metric = {}
    for col in df.columns:
        col_values = df[col].dropna()
        col_bounds[col] = (col_values.min(), col_values.max())
        cols_by_metric.setdefault(col[0], []).append(col)

    def _style(row):
        styles = []
        for col, val in zip(df.columns, row):
            if pd.isna(val):
                styles.append('')
                continue

            metric, model = col[0], col[1]
            base_color = palette[model]

            # Gradient inside one column (by value)
            min_val, max_val = col_bounds[col]
            norm = 0 if max_val == min_val else (val - min_val) / (max_val - min_val)

            # Interpolate from 30% of the colour up to 100%
            intensity = min_intensity + norm * (1 - min_intensity)

            # Parse '#RRGGBB' and blend each channel towards white (255)
            r, g, b = (
                int(255 - (255 - int(base_color[pos:pos + 2], 16)) * intensity)
                for pos in (1, 3, 5)
            )

            style = f'background-color: rgb({r},{g},{b}); font-size: 12pt; text-align: center; padding: 5px;'

            # Bold when this is the best value of the metric within the row
            row_max = max(row[c] for c in cols_by_metric[metric] if pd.notna(row[c]))
            if val == row_max:
                style += 'font-weight: bold; color: black;'

            styles.append(style)
        return styles
    return _style

In [33]:
# Pivot the long results table (models become a second column level)
# and put the datasets in the configured order.
pivot_ensemble_df = (
    results_ensemble_df
    .set_index(['Dataset', 'Model'])
    .unstack()
    .loc[dataset_order]
)

# Base colour of each ensemble model
model_colors = dict(
    RandomForest="#CDB4DB",
    LightGBM="#FFAFCC",
    XGBoost="#A2D2FF",
)

# CSS rules for header and data cells of the rendered table
table_styles = [
    dict(
        selector='th',
        props=[
            ('background-color', 'white'),
            ('color', 'black'),
            ('text-align', 'center'),
            ('font-size', '12pt'),
            ('padding', '5px'),
        ],
    ),
    dict(
        selector='th.level0',
        props=[
            ('font-weight', 'bold'),
            ('border-bottom', '1px solid #aaa'),
            ('text-align', 'center'),
        ],
    ),
    dict(
        selector='th.level1',
        props=[('font-style', 'italic')],
    ),
    dict(
        selector='td',
        props=[
            ('padding', '8px'),
            ('border', '1px solid #eee'),
        ],
    ),
]

styled_ensemble_result = (
    pivot_ensemble_df.style
    .format("{:.3f}")
    .apply(style_comparison_by_column(pivot_ensemble_df), axis=1)
    .set_table_styles(table_styles)
    .set_caption(
        f"<span style='font-size:16pt'>Сравнение ансамблей: "
        f"<span style='color:{model_colors['RandomForest']}; font-weight:bold'>RandomForest</span>, "
        f"<span style='color:{model_colors['LightGBM']}; font-weight:bold'>LightGBM</span>, "
        f"<span style='color:{model_colors['XGBoost']}; font-weight:bold'>XGBoost</span></span>"
    )
    .set_properties(**{'min-width': '120px'})
)
styled_ensemble_result

Unnamed: 0_level_0,F1,F1,F1,ROC-AUC,ROC-AUC,ROC-AUC,Recall,Recall,Recall,Precision,Precision,Precision
Model,LightGBM,RandomForest,XGBoost,LightGBM,RandomForest,XGBoost,LightGBM,RandomForest,XGBoost,LightGBM,RandomForest,XGBoost
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Первоначальные данные,0.576,0.597,0.557,0.837,0.85,0.812,0.653,0.724,0.58,0.515,0.508,0.536
Предобработанные данные,0.62,0.619,0.599,0.859,0.869,0.84,0.672,0.717,0.585,0.575,0.544,0.613
Сгенерированные признаки,0.601,0.604,0.59,0.842,0.855,0.824,0.65,0.711,0.57,0.558,0.525,0.612
Отобранные признаки,0.598,0.608,0.579,0.839,0.859,0.821,0.684,0.741,0.619,0.532,0.516,0.543
Отобранные признаки без выбросов,0.539,0.565,0.492,0.827,0.845,0.794,0.64,0.722,0.526,0.465,0.464,0.462


# Обучение модели

Обучите модель из Спринта 2. Оцените метрики. Сделайте вывод.


In [55]:
def evaluate_models(df, dataset_name, is_raw_data=False):
    """
    Train and evaluate two models: a logistic regression and a random forest
    whose hyper-parameters are tuned with RandomizedSearchCV.

    The data is split 75/25 (stratified on the target, random_state=42); both
    models are fitted on the training part and scored on the held-out part.
    The best random-forest parameters are printed.

    Parameters:
    - df: DataFrame containing the target column 'Exited'. The identifier
      columns 'RowNumber', 'CustomerId' and 'Surname' are dropped when present.
      The frame itself is not modified.
    - dataset_name: label written to the 'Dataset' field of every result.
    - is_raw_data: if True, object/category columns are dropped and missing
      values are filled with the column medians of the training split.

    Returns:
    - List with two dicts ('LogisticRegression', then 'RandomForest'), each with
      the keys 'Dataset', 'Model', 'F1', 'ROC-AUC', 'Recall' and 'Precision'.

    Raises:
    - ValueError: if df has no 'Exited' column.
    """
    if 'Exited' not in df.columns:
        raise ValueError("Столбец 'Exited' не найден в данных.")

    non_features = ['RowNumber', 'CustomerId', 'Surname']
    target = df['Exited']
    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    features = df.drop(columns=['Exited'] + [col for col in non_features if col in df.columns])

    if is_raw_data:
        cat_cols = features.select_dtypes(include=['object', 'category']).columns
        features = features.drop(columns=cat_cols)

    X_train, X_test, y_train, y_test = train_test_split(
        features, target,
        test_size=0.25,
        random_state=42,
        stratify=target
    )

    if is_raw_data:
        # Impute AFTER the split and with training-split statistics only, so that
        # nothing about the held-out rows leaks into the preprocessing.
        train_medians = X_train.median()
        X_train = X_train.fillna(train_medians)
        X_test = X_test.fillna(train_medians)

    def _score(model_name, fitted_model):
        # Metrics of one fitted model on the held-out split.
        y_pred = fitted_model.predict(X_test)
        # second column of predict_proba is used as the score for ROC-AUC
        y_proba = fitted_model.predict_proba(X_test)[:, 1]
        return {
            'Dataset': dataset_name,
            'Model': model_name,
            'F1': f1_score(y_test, y_pred),
            'ROC-AUC': roc_auc_score(y_test, y_proba),
            'Recall': recall_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred)
        }

    # --- Logistic Regression ---
    lr_model = LogisticRegression(
        class_weight='balanced',
        max_iter=500,
        solver='lbfgs',
        random_state=42
    )
    lr_model.fit(X_train, y_train)
    results = [_score('LogisticRegression', lr_model)]

    # --- Random Forest + RandomizedSearchCV ---
    rf_base = RandomForestClassifier(
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )

    param_dist = {
        'n_estimators': [100, 200],
        'max_depth': [5, 8, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 5, 10],
        'max_features': ['sqrt', 'log2']
    }

    # 20 sampled configurations, each scored by F1 on 3 stratified folds
    random_search = RandomizedSearchCV(
        rf_base,
        param_distributions=param_dist,
        n_iter=20,
        scoring=make_scorer(f1_score),
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        n_jobs=-1,
        verbose=1,
        random_state=42
    )

    random_search.fit(X_train, y_train)
    results.append(_score('RandomForest', random_search.best_estimator_))

    print(f"[{dataset_name}] Лучшие параметры RandomForest:", random_search.best_params_)

    return results

In [56]:
# Collect the metrics of both models for every dataset into one flat list
all_results = [
    metrics
    for dataset in datasets
    for metrics in evaluate_models(
        dataset['data'],
        dataset['name'],
        is_raw_data=dataset['is_raw'],
    )
]

# Final long-format table
results_df = pd.DataFrame(all_results)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[Первоначальные данные] Лучшие параметры RandomForest: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 10}
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[Предобработанные данные] Лучшие параметры RandomForest: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'max_depth': None}
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[Сгенерированные признаки] Лучшие параметры RandomForest: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': 8}
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[Отобранные признаки] Лучшие параметры RandomForest: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2', 'max_depth': None}
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[Отобранные признаки без вы

In [57]:
# Pivot the table (models become a second column level) and order the datasets
pivoted_df = (
    results_df
    .set_index(['Dataset', 'Model'])
    .unstack()
    .loc[dataset_order]
)

# New model colours
model_colors = dict(
    LogisticRegression="#FFAFCC",  # pink
    RandomForest="#CDB4DB",        # purple
)

# Final table styling
styled_result = pivoted_df.style \
    .format("{:.3f}") \
    .apply(style_comparison_by_column(pivoted_df), axis=1) \
    .set_table_styles(table_styles) \
    .set_caption(
        f"<span style='font-size:16pt'>Сравнение моделей: "
        f"<span style='color:{model_colors['LogisticRegression']}; font-weight:bold'>LogisticRegression</span> "
        f"vs <span style='color:{model_colors['RandomForest']}; font-weight:bold'>RandomForest</span></span>"
    ) \
    .set_properties(**{'min-width': '120px'})

styled_result

Unnamed: 0_level_0,F1,F1,ROC-AUC,ROC-AUC,Recall,Recall,Precision,Precision
Model,LogisticRegression,RandomForest,LogisticRegression,RandomForest,LogisticRegression,RandomForest,LogisticRegression,RandomForest
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Первоначальные данные,0.458,0.599,0.747,0.853,0.69,0.684,0.342,0.532
Предобработанные данные,0.507,0.636,0.783,0.867,0.705,0.642,0.396,0.63
Сгенерированные признаки,0.539,0.617,0.815,0.855,0.715,0.717,0.433,0.542
Отобранные признаки,0.547,0.603,0.818,0.848,0.741,0.646,0.434,0.565
Отобранные признаки без выбросов,0.514,0.573,0.836,0.842,0.758,0.679,0.389,0.495
