In [1]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px
import pandas as pd
import numpy as np
import time
import random
from itertools import combinations
from tqdm.notebook import tqdm
import optuna
import math

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.compose import make_column_transformer
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.feature_selection import RFECV, mutual_info_classif

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from concurrent.futures import ThreadPoolExecutor
from category_encoders import TargetEncoder

from lightgbm import LGBMClassifier

from pprint import pprint
import os

# Show every column when a DataFrame is displayed (no truncation)
pd.options.display.max_columns = None

# Prefix used for the artefacts written by this notebook (e.g. the results CSV)
experiment_name = 'lgbm'

In [2]:
# Load the train/test splits from the notebook's working directory.
# Plain relative file names are used instead of Windows-style '.\' paths,
# which are treated as part of the file name on Linux/macOS.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train.head(2)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0


In [3]:
# Names of the training columns that contain at least one missing value
train.columns[train.isnull().any(axis=0)]

Index([], dtype='object')

In [4]:
# Label column to predict
TARGET = 'Exited'

# Columns removed before any preprocessing
drop_cols = ['id']

# Feature groups, each handled by its own encoder in the preprocessing step
text_cols = ['Surname']
categorical_cols = ['Geography', 'NumOfProducts']
binary_cols = ['Gender', 'HasCrCard', 'IsActiveMember']

In [5]:
# Numerical features: the numeric-dtype columns that remain once the
# categorical, binary, dropped and target columns are removed.
# NOTE(review): this leaves 'CustomerId' (an identifier) among the numerical
# features — confirm that is intended.
_non_numerical = categorical_cols + binary_cols + drop_cols + [TARGET]
numerical_cols = (
    train
    .drop(columns=_non_numerical)
    .select_dtypes(include=np.number)
    .columns
)
numerical_cols

Index(['CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance',
       'EstimatedSalary'],
      dtype='object')

# Preprocess Data

In [6]:
# One encoder per column group; every column not listed is passed through as-is.
# NOTE(review): newer scikit-learn releases renamed `sparse=` to
# `sparse_output=` — switch the keyword if this cell raises a TypeError after
# an upgrade.
transformer = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist', drop='if_binary'), binary_cols),
    (OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist'), categorical_cols),
    (TargetEncoder(), text_cols),
    remainder='passthrough')

# Remove the same identifier columns from both splits
df_to_ohe = train.drop(drop_cols, axis=1)
test_ohe = test.drop(drop_cols, axis=1)

# Split the features and the target variable
X = df_to_ohe.drop(TARGET, axis=1)
y = df_to_ohe[TARGET]

# Fit the transformer (the target encoder needs y)
# NOTE(review): the encoders are fitted on the full training set, so the
# target-encoded 'Surname' column has seen every row's label; cross-validation
# scores computed on these features may be optimistic — consider fitting the
# transformer inside each fold.
transformer.fit(X, y)

transformed = transformer.transform(X)

# Get the transformed feature names without the '<transformer>__' prefix
transformed_feat_names = [name.split('__')[-1] for name in transformer.get_feature_names_out()]

# Create DataFrame of the transformed features, keeping the original row index
# so later index-aligned operations against `train` stay correct
df_to_ohe_transformed = pd.DataFrame(transformed, columns=transformed_feat_names, index=X.index)
df_to_ohe_transformed.sample()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary
55676,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.153894,15592762.0,656.0,44.0,6.0,0.0,106095.08


In [7]:
# Apply the transformer fitted on the training data to the test split
transformed_new_data = transformer.transform(test_ohe)

# Create DataFrame of the transformed features; the test index is kept so rows
# can still be matched back to the original test frame
test_transformed = pd.DataFrame(transformed_new_data,
                                columns=transformed_feat_names,
                                index=test_ohe.index)
test_transformed.sample()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary
52793,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.315781,15587735.0,713.0,40.0,6.0,114446.84,87308.18


In [9]:
encoded_binary_cols = []
encoded_categorical_cols = []

# Distinct-value count per encoded column (NaN counted as a value), computed once
unique_counts = df_to_ohe_transformed.nunique(dropna=False)

# Split the encoded columns into binary (exactly 2 values) and
# low-cardinality categorical (3 to 9 values)
for col, unique_values in unique_counts.items():
    print(f'{col} has {unique_values} unique values')
    if unique_values == 2:
        encoded_binary_cols.append(col)
    elif 2 < unique_values < 10:
        encoded_categorical_cols.append(col)

# Everything numeric that is neither binary nor categorical is treated as
# numerical, so the three groups never overlap
encoded_numerical_cols = list(
    df_to_ohe_transformed
    .drop(encoded_binary_cols + encoded_categorical_cols, axis=1)
    .select_dtypes(include=np.number)
    .columns
)
len(encoded_numerical_cols), len(encoded_binary_cols), len(encoded_categorical_cols)

Gender_Male has 2 unique values
HasCrCard_1.0 has 2 unique values
IsActiveMember_1.0 has 2 unique values
Geography_France has 2 unique values
Geography_Germany has 2 unique values
Geography_Spain has 2 unique values
NumOfProducts_1 has 2 unique values
NumOfProducts_2 has 2 unique values
NumOfProducts_3 has 2 unique values
NumOfProducts_4 has 2 unique values
Surname has 858 unique values
CustomerId has 23221 unique values
CreditScore has 457 unique values
Age has 71 unique values
Tenure has 11 unique values
Balance has 30075 unique values
EstimatedSalary has 55298 unique values


(7, 10, 0)

In [10]:
# Re-attach the target to the encoded features.
# The encoded frame has the same rows in the same order as `train`, so the
# values are assigned positionally; this avoids silent NaNs from index
# alignment if the two frames ever carry different indexes, and raises on a
# length mismatch.
df_to_ohe_transformed[TARGET] = train[TARGET].to_numpy()
df_to_ohe_transformed.head()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary,Exited
0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.318008,15674932.0,668.0,33.0,3.0,0.0,181449.97,0
1,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.326667,15749177.0,627.0,33.0,1.0,0.0,49503.5,0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.222052,15694510.0,678.0,40.0,10.0,0.0,184866.69,0
3,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.181991,15741417.0,581.0,34.0,2.0,148882.54,84560.88,0
4,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.146341,15766172.0,716.0,33.0,5.0,0.0,15068.83,0


# Training Machine Learning Methods

In [11]:
# Model inputs: target vector, encoded feature matrix and encoded test set
y = df_to_ohe_transformed[TARGET]
X = df_to_ohe_transformed.drop(columns=[TARGET])
test_features = test_transformed.copy()

# Shuffled, stratified 10-fold splitter with a fixed seed for repeatable folds
sk = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

In [12]:
# Estimators to evaluate
models = [
    LGBMClassifier(objective='binary', random_state=5, n_jobs=-1),
]

### Cross-validation Function

In [16]:
def evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    """Cross-validate each model on its selected features and tabulate ROC AUC.

    Parameters
    ----------
    models : iterable
        Estimators to evaluate. Each must provide ``get_params()``; estimators
        that have selected features are passed to ``cross_validate``.
    X : pandas.DataFrame
        Feature matrix; it is sub-selected with ``X[features]`` per model.
    y : array-like
        Target passed to ``cross_validate``.
    important_features : dict
        Maps an estimator's class name to the list of columns to use. Models
        of the same class therefore share one feature list. A missing or empty
        entry makes the model be skipped and reported with zero scores.
    cv_split : object
        Forwarded as the ``cv`` argument of ``cross_validate``.
    experiment_name : str
        Prefix of the output file ``{experiment_name}_results.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per model with its name, parameters, mean train/test ROC AUC
        and mean fit time per fold, sorted ascending by 'MLA Test ROC AUC'.
        The same table is written to ``{experiment_name}_results.csv``.
    """
    result_columns = ['MLA Name',
                      'MLA Parameters',
                      'MLA Train ROC AUC',
                      'MLA Test ROC AUC',
                      'MLA Time']

    def evaluate_model(alg):
        """Return the result row (a dict) for a single estimator."""
        MLA_name = alg.__class__.__name__
        features = important_features.get(MLA_name, [])

        # Check if the list of important features is empty
        if len(features) == 0:
            # If empty, return results with zero values
            print(f'Skipping {MLA_name} due to no important features.')
            return {
                'MLA Name': MLA_name,
                'MLA Parameters': str(alg.get_params()),
                'MLA Train ROC AUC': 0,
                'MLA Test ROC AUC': 0,
                'MLA Time': "0 min 0.00 sec",
            }

        # Perform cross-validation.
        # The built-in 'roc_auc' scorer is used instead of
        # make_scorer(..., needs_proba=True), whose `needs_proba` argument is
        # deprecated in newer scikit-learn releases.
        cv_results = cross_validate(alg, X[features], y,
                                    cv=cv_split,
                                    scoring={'ROC AUC': 'roc_auc'},
                                    return_train_score=True,
                                    n_jobs=-1)

        # Format the mean fit time per fold as "<m> min <s> sec"
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        print(f'Done with {MLA_name}.')

        # Populate results (train scores exist because return_train_score=True)
        return {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train ROC AUC': cv_results['train_ROC AUC'].mean(),
            'MLA Test ROC AUC': cv_results['test_ROC AUC'].mean(),
            'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

    # Evaluate the models concurrently; results are collected in submission
    # order and any exception raised in a worker is re-raised here.
    # Note: each worker itself runs cross_validate with n_jobs=-1.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(evaluate_model, alg) for alg in models]
        results_list = [future.result() for future in futures]

    # Passing the column names explicitly keeps the table well-formed (and
    # sortable) even when `models` is empty
    MLA_compare = pd.DataFrame(results_list, columns=result_columns)

    # Sort and save results
    MLA_compare = MLA_compare.sort_values(by=['MLA Test ROC AUC'], ascending=True)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

### Baseline Model

In [17]:
# Baseline: every model (keyed by its class name) uses the full feature set
baseline_features = {
    model.__class__.__name__: list(X.columns)
    for model in models
}

In [18]:
# Cross-validate the baseline models on all features and show the comparison table
baseline_models = evaluate_models(
    models=models,
    X=X,
    y=y,
    important_features=baseline_features,
    cv_split=sk,
    experiment_name=experiment_name,
)
baseline_models

Done with LGBMClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC AUC,MLA Test ROC AUC,MLA Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.905661,0.896892,0 min 3.26 sec
