<a href="https://colab.research.google.com/github/ithelga/bank-churn-predictor/blob/main/notebooks/Team2_HW5_Final_Train_Pipeline_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import joblib
import os

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [65]:
data_path = 'drive/MyDrive/Colab Notebooks/Bank churn predictor/data'
row_df = pd.read_csv(f'{data_path}/row_dataset.csv')

In [66]:
class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.scaler = MinMaxScaler()
        self.ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.num_cols = ['CreditScore', 'Age', 'Tenure', 'Balance',
                         'NumOfProducts', 'EstimatedSalary']
        self.cat_cols = ['Geography']

    def _clean_data(self, X):
        df = X.copy()

        # Удалим неинформативные столбцы
        df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

        # Заполнение пропусков
        df['Geography'] = df['Geography'].fillna(df['Geography'].mode()[0])
        df['Age'] = df['Age'].fillna(df['Age'].median())

        # Удалим остальные пропуски и дубликаты
        df = df.dropna()
        df = df.drop_duplicates()

        # Gender в 0/1
        df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})

        return df

    def fit(self, X, y=None):
        df = self._clean_data(X)

        # Обучаем scaler и OHE
        self.scaler.fit(df[self.num_cols])
        self.ohe.fit(df[self.cat_cols])

        return self

    def transform(self, X):
        df = self._clean_data(X)

        # One-hot encoding для Geography
        geo_encoded = self.ohe.transform(df[self.cat_cols])
        geo_df = pd.DataFrame(
            geo_encoded,
            columns=self.ohe.get_feature_names_out(self.cat_cols),
            index=df.index
        )

        # Масштабирование числовых
        df_scaled = self.scaler.transform(df[self.num_cols])
        df_scaled = pd.DataFrame(df_scaled, columns=self.num_cols, index=df.index)

        # Итоговый датафрейм
        final_df = pd.concat([df_scaled, geo_df, df['Gender']], axis=1)

        return final_df

In [67]:
def train_and_export_model(row_df, save_dir):
    import os
    import joblib
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier

    os.makedirs(save_dir, exist_ok=True)

    # Разделяем данные
    target = row_df['Exited']
    features = row_df.drop(columns=['Exited'])

    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        features, target, test_size=0.25, stratify=target, random_state=42
    )

    # Предобработка
    preprocessor = Preprocessor()
    X_clean = preprocessor._clean_data(X_train_raw)
    y_clean = y_train.loc[X_clean.index]

    preprocessor.fit(X_train_raw)
    X_train_proc = preprocessor.transform(X_train_raw)
    X_train_proc = X_train_proc.loc[y_clean.index]

    # Модель с уже известными лучшими параметрами
    model = RandomForestClassifier(
        class_weight='balanced',
        n_estimators=200,
        min_samples_split=5,
        min_samples_leaf=5,
        max_features='sqrt',
        max_depth=None,
        random_state=42,
        n_jobs=-1
    )

    model.fit(X_train_proc, y_clean)

    # Сохраняем
    joblib.dump(preprocessor, os.path.join(save_dir, "preprocessor.pkl"))
    joblib.dump(model, os.path.join(save_dir, "random_forest_model.pkl"))
    X_test_raw.assign(Exited=y_test.values).to_csv(os.path.join(save_dir, "test_raw.csv"), index=False)

    print("Модель, препроцессор и тестовые данные сохранены.")

In [68]:
train_and_export_model(row_df, save_dir='drive/MyDrive/Colab Notebooks/Bank churn predictor/final_model')

Модель, препроцессор и тестовые данные сохранены.


In [71]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

def train_and_evaluate(row_df):
    target = row_df['Exited']
    features = row_df.drop(columns=['Exited'])

    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        features, target, test_size=0.25, stratify=target, random_state=42
    )

    # Logistic Regression (baseline)
    lr_model = LogisticRegression(
        class_weight='balanced',
        max_iter=500,
        solver='lbfgs',
        random_state=42
    )

    # Только числовые колонки + заполнение NaN
    X_train_lr = X_train_raw.select_dtypes(include='number').copy()
    X_test_lr = X_test_raw.select_dtypes(include='number').copy()

    # Заполнение пропусков медианой
    X_train_lr = X_train_lr.fillna(X_train_lr.median())
    X_test_lr = X_test_lr.fillna(X_train_lr.median())

    lr_model.fit(X_train_lr, y_train)
    y_pred_lr = lr_model.predict(X_test_lr)
    y_proba_lr = lr_model.predict_proba(X_test_lr)[:, 1]

    print("Logistic Regression (первоначальные данные):")
    print(f"F1: {f1_score(y_test, y_pred_lr):.3f}")
    print(f"Precision: {precision_score(y_test, y_pred_lr):.3f}")
    print(f"Recall: {recall_score(y_test, y_pred_lr):.3f}")
    print(f"ROC-AUC: {roc_auc_score(y_test, y_proba_lr):.3f}")
    print()

    #  Random Forest (предобработка)
    preprocessor = Preprocessor()

    X_train_clean = preprocessor._clean_data(X_train_raw)
    y_train_clean = y_train.loc[X_train_clean.index]
    preprocessor.fit(X_train_clean)
    X_train_proc = preprocessor.transform(X_train_clean)

    X_test_clean = preprocessor._clean_data(X_test_raw)
    y_test_clean = y_test.loc[X_test_clean.index]
    X_test_proc = preprocessor.transform(X_test_clean)

    model = RandomForestClassifier(
        class_weight='balanced',
        n_estimators=200,
        min_samples_split=5,
        min_samples_leaf=5,
        max_features='sqrt',
        max_depth=None,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train_proc, y_train_clean)

    y_pred_rf = model.predict(X_test_proc)
    y_proba_rf = model.predict_proba(X_test_proc)[:, 1]

    print("RandomForest (на предобработанных данных):")
    print(f"F1: {f1_score(y_test_clean, y_pred_rf):.3f}")
    print(f"Precision: {precision_score(y_test_clean, y_pred_rf):.3f}")
    print(f"Recall: {recall_score(y_test_clean, y_pred_rf):.3f}")
    print(f"ROC-AUC: {roc_auc_score(y_test_clean, y_proba_rf):.3f}")

In [72]:
train_and_evaluate(row_df)

Logistic Regression (первоначальные данные):
F1: 0.479
Precision: 0.365
Recall: 0.696
ROC-AUC: 0.752

RandomForest (на предобработанных данных):
F1: 0.612
Precision: 0.580
Recall: 0.647
ROC-AUC: 0.856
