In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# Read the training and test CSV files from the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [3]:
# Preview the first rows of the freshly loaded training frame.
train.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [4]:
# Columns removed from both frames before any feature engineering.
init_drop = ["id", "CustomerId", "Surname"]

# The test ids are captured first because the submission frame needs them
# after the "id" column has been dropped.
test_ids = test["id"].to_list()

train = train.drop(columns=init_drop).copy()
test = test.drop(columns=init_drop).copy()

In [5]:
def feat_age_category(df):
    """Bucket the ``Age`` column into five labelled bands.

    Bins are right-closed with edges 18/30/40/50/60/100; the lowest edge (18)
    is included in the first band. Ages outside [18, 100] become NaN.

    Returns a categorical Series aligned with ``df``.
    """
    age_edges = [18, 30, 40, 50, 60, 100]
    age_labels = ["18-30", "30-40", "40-50", "50-60", "60+"]
    return pd.cut(
        df["Age"],
        bins=age_edges,
        labels=age_labels,
        include_lowest=True,
    )

def feat_credit_score_range(df):
    """Bucket the ``CreditScore`` column into five labelled ranges.

    Bins are right-closed with edges 0/300/600/700/800/900, so a score of 600
    falls in "300-600" and a score of 850 falls in "800-900". Scores outside
    (0, 900] become NaN.

    The top bin was previously labelled "900+", which did not describe the
    (800, 900] interval it covers; it is now "800-900". The label still sorts
    last, so label-encoded codes derived from this column are unchanged.

    Returns a categorical Series aligned with ``df``.
    """
    score_edges = [0, 300, 600, 700, 800, 900]
    score_labels = ["0-300", "300-600", "600-700", "700-800", "800-900"]
    return pd.cut(df["CreditScore"], bins=score_edges, labels=score_labels)

def feat_geo_gender(df):
    """Combine ``Geography`` and ``Gender`` into one "<geography>_<gender>" string."""
    geography = df["Geography"]
    gender = df["Gender"]
    return geography + "_" + gender

def feat_age_gender(df):
    """Combine the age band and ``Gender`` into one "<age band>_<gender>" string.

    Requires ``df`` to already contain an ``Age_Category`` column; it is cast
    to ``str`` before concatenation.
    """
    age_band = df["Age_Category"].astype(str)
    gender = df["Gender"]
    return age_band + "_" + gender

def feat_cred_bal_sal(df):
    """Return ``CreditScore * Balance / EstimatedSalary`` element-wise."""
    score_times_balance = df["CreditScore"].mul(df["Balance"])
    return score_times_balance.div(df["EstimatedSalary"])

def feat_bal_est(df):
    """Return the element-wise ratio of ``Balance`` to ``EstimatedSalary``."""
    balance = df["Balance"]
    salary = df["EstimatedSalary"]
    return balance.div(salary)

def feat_tenure_age_div(df):
    """Return the element-wise ratio of ``Tenure`` to ``Age``."""
    tenure = df["Tenure"]
    age = df["Age"]
    return tenure.div(age)

def feat_age_tenure_prod(df):
    """Return the element-wise product of ``Age`` and ``Tenure``."""
    age = df["Age"]
    tenure = df["Tenure"]
    return age.mul(tenure)

def feat_balance_products(df):
    """Return ``Balance`` divided element-wise by ``NumOfProducts``."""
    balance = df["Balance"]
    product_count = df["NumOfProducts"]
    return balance.div(product_count)

In [6]:
# Registry of engineered features: new column name -> builder function.
# Insertion order matters: "Age_Gender" reads the "Age_Category" column, so
# "Age_Category" has to be created first.
feats = dict(
    Age_Category=feat_age_category,
    Credit_Score_Range=feat_credit_score_range,
    Geo_Gender=feat_geo_gender,
    Age_Gender=feat_age_gender,
    Cred_Bal_Sal=feat_cred_bal_sal,
    Bal_Est=feat_bal_est,
    Tenure_Age_Div=feat_tenure_age_div,
    Tenure_Age_Prod=feat_age_tenure_prod,
    Balance_Products=feat_balance_products,
)

In [7]:
# Add every registered feature to both frames, in registry order
# (train first, then test, for each feature).
for feat_name, build_feat in feats.items():
    for frame in (train, test):
        frame[feat_name] = build_feat(frame)

In [8]:
# Preview the training frame now that the engineered feature columns exist.
train.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Age_Category,Credit_Score_Range,Geo_Gender,Age_Gender,Cred_Bal_Sal,Bal_Est,Tenure_Age_Div,Tenure_Age_Prod,Balance_Products
0,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0,30-40,600-700,France_Male,30-40_Male,0.0,0.0,0.090909,99.0,0.0
1,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0,30-40,600-700,France_Male,30-40_Male,0.0,0.0,0.030303,33.0,0.0
2,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0,30-40,600-700,France_Male,30-40_Male,0.0,0.0,0.25,400.0,0.0
3,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0,30-40,300-600,France_Male,30-40_Male,1022.940581,1.760655,0.058824,68.0,148882.54
4,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0,30-40,700-800,Spain_Male,30-40_Male,0.0,0.0,0.151515,165.0,0.0


In [9]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns that are replaced in place by integer codes.
label_raw = [
    "Geography",
    "Gender",
    "Age_Category",
    "Credit_Score_Range",
    "Geo_Gender",
    "Age_Gender",
]

for label_col in label_raw:
    # The encoder is fitted on the training values only and then reused for
    # the test frame, so both frames share one code mapping.
    encoder = LabelEncoder().fit(train[label_col])

    train[label_col] = encoder.transform(train[label_col])
    test[label_col] = encoder.transform(test[label_col])

In [10]:
# Preview the training frame after the categorical columns were label-encoded.
train.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Age_Category,Credit_Score_Range,Geo_Gender,Age_Gender,Cred_Bal_Sal,Bal_Est,Tenure_Age_Div,Tenure_Age_Prod,Balance_Products
0,668,0,1,33.0,3,0.0,2,1.0,0.0,181449.97,0,1,1,1,3,0.0,0.0,0.090909,99.0,0.0
1,627,0,1,33.0,1,0.0,2,1.0,1.0,49503.5,0,1,1,1,3,0.0,0.0,0.030303,33.0,0.0
2,678,0,1,40.0,10,0.0,2,1.0,0.0,184866.69,0,1,1,1,3,0.0,0.0,0.25,400.0,0.0
3,581,0,1,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,0,1,3,1022.940581,1.760655,0.058824,68.0,148882.54
4,716,2,1,33.0,5,0.0,2,1.0,1.0,15068.83,0,1,2,5,3,0.0,0.0,0.151515,165.0,0.0


In [11]:
from sklearn.preprocessing import OneHotEncoder

# Columns that are expanded into one-hot indicator columns and then dropped.
hot_raw = ["NumOfProducts"]

for hot_col in hot_raw:
    encoder = OneHotEncoder(sparse_output=False)

    # Fit on the training column, then reuse the same categories for test.
    train_hot = encoder.fit_transform(train[[hot_col]])
    test_hot = encoder.transform(test[[hot_col]])
    hot_names = encoder.get_feature_names_out([hot_col])

    train[hot_names] = train_hot
    train = train.drop(columns=[hot_col]).copy()

    test[hot_names] = test_hot
    test = test.drop(columns=[hot_col]).copy()

In [12]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

def calc_performance_metrics(y_test, y_pred, description="Metrics", y_score=None):
    """Compute a confusion matrix and a one-row table of classification metrics.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred : array-like
        Predicted hard labels; used for the confusion matrix, recall,
        precision, F1 and accuracy.
    description : str, default "Metrics"
        Text stored in the "Description" column of the result row.
    y_score : array-like, optional
        Scores or positive-class probabilities used for the ROC AUC. When
        omitted, the ROC AUC is computed from ``y_pred`` exactly as before;
        note that an AUC built from hard labels only has a single operating
        point, so pass probabilities when they are available.

    Returns
    -------
    tuple
        ``(cm, metrics)`` where ``cm`` is the confusion matrix and ``metrics``
        is a single-row DataFrame with columns Description, Recall, Precision,
        F1_score, Accuracy and "ROC Score".
    """
    cm = confusion_matrix(y_test, y_pred)

    # Fall back to the hard labels to stay compatible with existing callers.
    roc_input = y_pred if y_score is None else y_score

    mcols = ["Description", "Recall", "Precision", "F1_score", "Accuracy", "ROC Score"]
    mscores = [(
        description,
        recall_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        f1_score(y_test, y_pred),
        accuracy_score(y_test, y_pred),
        roc_auc_score(y_test, roc_input),
    )]
    metrics = pd.DataFrame(data=mscores, columns=mcols)

    return cm, metrics

In [13]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier

# Split the frame's columns into the target and the model inputs.
all_cols = list(train.columns)
output_cols = ["Exited"]
# Preserve the frame's own column order. A set difference would give an
# arbitrary order that can change between kernel sessions, and that order is
# baked into X, so any later array built from the test frame must match it.
input_cols = [col for col in all_cols if col not in output_cols]

X = np.array(train[input_cols])
y = np.array(train[output_cols]).ravel()

# Stratified 70/30 hold-out split; the seed makes the reported metrics
# reproducible under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

# pipeline: preprocessing steps applied before the stacked classifier
scaler = StandardScaler()
pt = PowerTransformer()
pca = PCA()

# Hyperparameters for the XGBoost base learner.
# NOTE: "device": "cuda" asks XGBoost to train on a GPU.
xgb_params = {
    "max_depth" : 6,
    "min_child_weight" : 7,
    "learning_rate" : 0.022213294578283638,
    "n_estimators" : 797,
    "subsample" : 0.9646514255068099,
    "colsample_bytree" : 0.4208597039793016,
    "verbosity" : 0,
    "device" : "cuda",
}

xgbc = XGBClassifier(**xgb_params)

# Hyperparameters for the LightGBM base learner.
lgb_params = {
    "max_depth" : 8,
    "min_child_samples" : 2,
    "learning_rate" : 0.05530081215180204,
    "n_estimators" : 669,
    "subsample" : 0.30254466503794797,
    "colsample_bytree" : 0.41087854836196963,
    "reg_alpha" : 0.6241271382121083,
    "reg_lambda" : 0.7111824512446496,
    "verbosity" : -1,
}

lgbc = LGBMClassifier(**lgb_params)

# Hyperparameters for the CatBoost base learner.
cat_params = {
    "iterations": 988,
    "depth": 9,
    "min_data_in_leaf": 16,
    "learning_rate": 0.02375062070726108,
    "verbose": 0,
}

cat = CatBoostClassifier(**cat_params)

# Meta-learner that combines the base learners' outputs.
# NOTE(review): per the scikit-learn docs, momentum / nesterovs_momentum only
# apply to solver="sgd" and validation_fraction only applies when
# early_stopping=True, so with this configuration they have no effect.
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    max_iter=1000,
    random_state=42,
    activation="relu",
    learning_rate_init=0.001,
    solver="adam",
    validation_fraction=0.1,
    momentum=0.9,
    nesterovs_momentum=True,
    batch_size=32,
    beta_1=0.9,
    beta_2=0.999
)

# Folds used by the stacker to produce out-of-fold base predictions.
# The seed (same value as the MLP) makes the shuffled folds reproducible.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

stacking_model = StackingClassifier(
    estimators=[
        ("LGBM", lgbc),
        ("XGB", xgbc),
        ("CAT", cat)
    ],
    final_estimator=mlp,
    cv=kf
)

steps = [
    ("scaler", scaler),
    ("pt", pt),
    ("pca", pca),
    ("clf", stacking_model),
]

pipe = Pipeline(steps)
# Train on the 70% split (fit returns the pipeline itself, so no rebinding
# is needed), then score the hard predictions on the held-out 30%.
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
cm, metrics = calc_performance_metrics(y_test, y_pred)

In [15]:
# Display the one-row metrics table computed on the hold-out split.
metrics

Unnamed: 0,Description,Recall,Precision,F1_score,Accuracy,ROC Score
0,Metrics,0.576938,0.717985,0.63978,0.862536,0.75806


In [16]:
# Display the hold-out confusion matrix (sklearn layout: rows are true
# labels, columns are predicted labels).
cm

array([[36661,  2374],
       [ 4432,  6044]], dtype=int64)

In [26]:
# Refit the pipeline on the full training data before scoring the test set.
pipe = pipe.fit(X, y)

# Select the test columns in exactly the order used to build X. Converting
# the whole frame with np.array(test) would follow the frame's own column
# order, which is not guaranteed to match input_cols and would silently feed
# misaligned features to the model.
X_sub = np.array(test[input_cols])

# NOTE(review): despite the "_proba" name this holds hard 0/1 labels from
# predict(); switch to predict_proba(X_sub)[:, 1] if the submission is meant
# to contain probabilities — confirm against the scoring rules.
stacking_test_predictions_proba = pipe.predict(X_sub)

stacking_submission_df = pd.DataFrame({
    "id" : test_ids,
    "Exited" : stacking_test_predictions_proba,
})

In [27]:
# Display the submission frame (test ids with their predicted "Exited" value).
stacking_submission_df

Unnamed: 0,id,Exited
0,165034,1
1,165035,1
2,165036,1
3,165037,1
4,165038,0
...,...,...
110018,275052,0
110019,275053,0
110020,275054,1
110021,275055,1


In [28]:
# Write the submission without the DataFrame index. `index` is documented as
# a bool, so pass False explicitly instead of relying on None being falsy.
stacking_submission_df.to_csv("sub_17-01-2024.csv", index=False)