In [20]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px
import pandas as pd
import numpy as np
import time
import random
from itertools import combinations
from tqdm.notebook import tqdm
import optuna
import math

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.compose import make_column_transformer
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.feature_selection import RFECV, mutual_info_classif

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from concurrent.futures import ThreadPoolExecutor
from category_encoders import TargetEncoder

from lightgbm import LGBMClassifier

from pprint import pprint
import os

# Show every column when a DataFrame is displayed (no truncation of wide frames)
pd.set_option('display.max_columns', None)

# Label for this run; passed to evaluate_models, which uses it as the prefix of the results CSV
experiment_name = 'lgbm'

In [21]:
# Directory that holds the CSV inputs, relative to the notebook's working directory.
# Paths are built with os.path.join so the notebook also runs on non-Windows systems
# (a literal '.\\train.csv' is only a valid relative path on Windows).
DATA_DIR = '.'

# Competition train / test splits
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Additional dataset that is appended to the training rows in a later cell
original = pd.read_csv(os.path.join(DATA_DIR, 'original_dataset.csv'))
train.head(2)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0


In [22]:
# Inspect the last rows of the competition training set (before the original dataset is appended)
train.tail(2)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
165032,165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.0,1,0.0,1.0,71173.03,0
165033,165033,15732798,Ulyanov,850,France,Male,31.0,1,0.0,1,1.0,0.0,61581.79,1


In [23]:
# Stack the original dataset under the competition rows with a fresh 0..n-1 index,
# then discard the two row-identifier columns ('id' and 'RowNumber').
train = (
    pd.concat([train, original], ignore_index=True)
    .drop(columns=['id', 'RowNumber'])
)
train.shape

(175036, 13)

In [24]:
# Inspect the tail again now that the original dataset has been appended.
# NOTE(review): the two rows displayed for this cell are identical — the appended data
# appears to contain duplicate rows; consider checking train.duplicated().
train.tail(2)

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
175034,15628319,Walker,792,France,Female,28.0,4,130142.79,1,1.0,0.0,38190.78,0
175035,15628319,Walker,792,France,Female,28.0,4,130142.79,1,1.0,0.0,38190.78,0


In [25]:
# Names of the columns that contain at least one missing value
train.columns[train.isna().any()]

Index(['Geography', 'Age', 'HasCrCard', 'IsActiveMember'], dtype='object')

In [26]:
# Count the missing values in each column.
# Each affected column is missing a single value, so even if the gaps fall on different
# rows at most 4 rows are incomplete. Out of ~175k rows that is negligible, so the rows
# are simply dropped instead of imputed (a time/benefit trade-off).
train.isna().sum()

CustomerId         0
Surname            0
CreditScore        0
Geography          1
Gender             0
Age                1
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          1
IsActiveMember     1
EstimatedSalary    0
Exited             0
dtype: int64

In [27]:
# Discard every row that has a missing value, then confirm that no NaN remains.
# The index is not reset here, so it keeps gaps where rows were removed.
train = train.dropna()
train.isna().sum()

CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [28]:
# Column roles used when building the preprocessing transformer
TARGET = 'Exited'  # label column
binary_cols = ['Gender', 'HasCrCard', 'IsActiveMember']  # one-hot encoded with drop='if_binary'
categorical_cols = ['Geography', 'NumOfProducts']  # one-hot encoded, one indicator per category
text_cols = ['Surname']  # target-encoded
drop_cols = ['id']  # NOTE(review): not referenced in the visible cells ('id' is dropped directly elsewhere) — confirm it is still needed

In [29]:
# Numeric feature columns: whatever is left after removing the categorical, binary and
# target columns, restricted to numeric dtypes.
# NOTE(review): the displayed result includes 'CustomerId', which looks like an identifier
# rather than a measurement — confirm it should be treated as a numeric feature.
numerical_cols = train.drop(categorical_cols + binary_cols + [TARGET], axis=1).select_dtypes(include=np.number).columns
numerical_cols

Index(['CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance',
       'EstimatedSalary'],
      dtype='object')

# Preprocess Data

In [30]:
# Encoding plan, applied per column group:
#   - binary_cols      -> one-hot with drop='if_binary' (the output shows a single 0/1 column each)
#   - categorical_cols -> one-hot, one indicator column per category
#   - text_cols        -> target encoding (fitted against y below)
#   - all other columns are passed through unchanged
# NOTE(review): `sparse=` was renamed to `sparse_output=` in newer scikit-learn releases —
# confirm against the installed version before upgrading.
transformer = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist', drop='if_binary'), binary_cols),
    (OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist'), categorical_cols),
    (TargetEncoder(), text_cols),
    remainder='passthrough')

# Work on a copy so `train` itself is not modified by this cell
df_to_ohe = train.copy()
test_ohe = test.drop('id', axis=1)

# Split the features and the target variable
X = df_to_ohe.drop(TARGET, axis=1)
y = df_to_ohe[TARGET]

# Fit the transformer on the full training frame.
# NOTE(review): the target encoder therefore sees the labels of every row, including rows
# that later serve as validation folds in cross-validation, so the CV scores may be
# optimistic (target leakage). Consider fitting the encoding inside each fold.
transformer.fit(X, y)

transformed = transformer.transform(X)

# Get the transformed feature names, keeping only the part after the last '__'
transformed_feat_names = [name.split('__')[-1] for name in transformer.get_feature_names_out()]

# Create DataFrame of the transformed features.
# No index is passed, so the new frame gets a default index rather than train's index.
df_to_ohe_transformed = pd.DataFrame(transformed, columns=transformed_feat_names)
df_to_ohe_transformed.sample()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary
62300,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.272,15642336.0,652.0,31.0,4.0,149136.31,162889.1


In [31]:
# Encode the test set with the transformer that was fitted on the training data
# (transform only — nothing is re-fitted on the test rows)
transformed_new_data = transformer.transform(test_ohe)

# Create DataFrame of the transformed features, reusing the training feature names
test_transformed = pd.DataFrame(transformed_new_data, columns=transformed_feat_names)
test_transformed.sample()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary
107983,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.324215,15742632.0,729.0,36.0,1.0,0.0,122453.96


In [32]:
encoded_binary_cols = []
encoded_categorical_cols = []

# Classify every encoded column by its cardinality (NaN counts as a value):
#   exactly 2 distinct values -> binary
#   3 to 9 distinct values    -> low-cardinality categorical
for col in df_to_ohe_transformed.columns:
    unique_values = df_to_ohe_transformed[col].nunique(dropna=False)
    print(f'{col} has {unique_values} unique values')
    if unique_values == 2:
        encoded_binary_cols.append(col)
    elif 2 < unique_values < 10:
        encoded_categorical_cols.append(col)

# Numerical columns are the numeric-dtype columns that are neither binary nor categorical.
# Both groups are excluded so that a column can never be listed as categorical AND
# numerical at the same time.
encoded_numerical_cols = list(
    df_to_ohe_transformed
    .drop(encoded_binary_cols + encoded_categorical_cols, axis=1)
    .select_dtypes(include=np.number)
    .columns
)
len(encoded_numerical_cols), len(encoded_binary_cols), len(encoded_categorical_cols)

Gender_Male has 2 unique values
HasCrCard_1.0 has 2 unique values
IsActiveMember_1.0 has 2 unique values
Geography_France has 2 unique values
Geography_Germany has 2 unique values
Geography_Spain has 2 unique values
NumOfProducts_1 has 2 unique values
NumOfProducts_2 has 2 unique values
NumOfProducts_3 has 2 unique values
NumOfProducts_4 has 2 unique values
Surname has 907 unique values
CustomerId has 23421 unique values
CreditScore has 460 unique values
Age has 73 unique values
Tenure has 11 unique values
Balance has 30239 unique values
EstimatedSalary has 55581 unique values


(7, 10, 0)

In [33]:
# Column assignment aligns on index labels, so both frames are given the same
# 0..n-1 index before the target is copied across.
train = train.reset_index(drop=True)
df_to_ohe_transformed = df_to_ohe_transformed.reset_index(drop=True)
df_to_ohe_transformed[TARGET] = train[TARGET]

# Spot-check a single row by position
df_to_ohe_transformed.iloc[165040]

Gender_Male           1.000000e+00
HasCrCard_1.0         0.000000e+00
IsActiveMember_1.0    0.000000e+00
Geography_France      1.000000e+00
Geography_Germany     0.000000e+00
Geography_Spain       0.000000e+00
NumOfProducts_1       0.000000e+00
NumOfProducts_2       1.000000e+00
NumOfProducts_3       0.000000e+00
NumOfProducts_4       0.000000e+00
Surname               1.659335e-01
CustomerId            1.576782e+07
CreditScore           5.280000e+02
Age                   3.100000e+01
Tenure                6.000000e+00
Balance               1.020167e+05
EstimatedSalary       8.018112e+04
Exited                0.000000e+00
Name: 165040, dtype: float64

In [34]:
# Sanity check: prints True when both frames carry an identical index
print(df_to_ohe_transformed.index.equals(train.index))

True


In [35]:
# List the rows whose target is neither 0 nor 1 (e.g. NaN left behind by a misaligned
# assignment). An empty result means every row received a valid label.
condition_1 = df_to_ohe_transformed[TARGET] != 1
condition_2 = df_to_ohe_transformed[TARGET] != 0
df_to_ohe_transformed[(condition_1) & (condition_2)]

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary,Exited


# Training Machine Learning Methods

In [36]:
# Separate the label from the predictors and keep a working copy of the encoded test set
y = df_to_ohe_transformed[TARGET]
X = df_to_ohe_transformed.drop(columns=[TARGET])
test_features = test_transformed.copy()

# 10-fold stratified splitter, shuffled with a fixed seed
sk = StratifiedKFold(n_splits=10, random_state=5, shuffle=True)

In [37]:
# Estimators to compare; evaluate_models looks up each one's features by its class name
models = [
    LGBMClassifier(objective='binary', random_state=5, n_jobs=-1),
]

### Cross-validation Function

In [38]:
def evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    """Cross-validate each model on its own feature subset and tabulate ROC AUC.

    Parameters
    ----------
    models : iterable
        Estimators to evaluate. Each must provide ``get_params()``; those that are
        actually evaluated are handed to ``cross_validate``.
    X : pandas.DataFrame
        Feature table; it is indexed as ``X[features]`` for every evaluated model.
    y : array-like
        Target passed to ``cross_validate``.
    important_features : dict
        Maps an estimator's class name to the list of column names to use. A model
        whose class name is missing or maps to an empty list is skipped and reported
        with zero scores.
    cv_split : object
        Forwarded as the ``cv`` argument of ``cross_validate``.
    experiment_name : str
        Prefix of the CSV file written: ``f'{experiment_name}_results.csv'``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns 'MLA Name', 'MLA Parameters',
        'MLA Train ROC AUC', 'MLA Test ROC AUC' and 'MLA Time', sorted by
        'MLA Test ROC AUC' in ascending order (best model last). When ``models``
        is empty the frame is empty but still has these columns.
    """
    result_columns = ['MLA Name',
                      'MLA Parameters',
                      'MLA Train ROC AUC',
                      'MLA Test ROC AUC',
                      'MLA Time']

    def evaluate_model(alg):
        """Build the result row (a dict keyed by ``result_columns``) for one estimator."""
        MLA_name = alg.__class__.__name__
        features = important_features.get(MLA_name, [])

        # Nothing to train on: report a zero-filled row instead of failing
        if len(features) == 0:
            print(f'Skipping {MLA_name} due to no important features.')
            return {
                'MLA Name': MLA_name,
                'MLA Parameters': str(alg.get_params()),
                'MLA Train ROC AUC': 0,
                'MLA Test ROC AUC': 0,
                'MLA Time': "0 min 0.00 sec",
            }

        # ROC AUC computed from predicted probabilities.
        # NOTE(review): `needs_proba` is deprecated in newer scikit-learn releases in
        # favour of `response_method` — confirm against the installed version.
        roc_scorer = make_scorer(roc_auc_score, greater_is_better=True, needs_proba=True)

        cv_results = cross_validate(alg, X[features], y,
                                    cv=cv_split,
                                    scoring={'ROC AUC': roc_scorer},
                                    return_train_score=True,
                                    n_jobs=-1)

        # Report the mean fit time per fold as "M min S.SS sec"
        minutes, seconds = divmod(cv_results['fit_time'].mean(), 60)

        print(f'Done with {MLA_name}.')

        return {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train ROC AUC': cv_results['train_ROC AUC'].mean(),
            'MLA Test ROC AUC': cv_results['test_ROC AUC'].mean(),
            'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

    # Evaluate the models concurrently; results are collected in submission order.
    # NOTE(review): every worker also runs cross_validate with n_jobs=-1, which may
    # oversubscribe the CPU when several models are evaluated at once.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(evaluate_model, alg) for alg in models]
        results_list = [future.result() for future in futures]

    # Passing the column list keeps the schema stable even when there are no results;
    # without it an empty frame has no columns and the sort below raises KeyError.
    MLA_compare = pd.DataFrame(results_list, columns=result_columns)

    # Sort and save results
    MLA_compare.sort_values(by=['MLA Test ROC AUC'], ascending=True, inplace=True)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

### Baseline Model

In [39]:
# Baseline: every model is evaluated on the complete set of encoded columns.
# The mapping is keyed by estimator class name, as evaluate_models expects.
baseline_features = {}

for model in models:
    model_name = type(model).__name__
    baseline_features[model_name] = X.columns.tolist()

In [40]:
# Cross-validate the baseline configuration; the scores are also written to
# '<experiment_name>_results.csv' by evaluate_models.
baseline_models = evaluate_models(
    models=models,
    X=X,
    y=y,
    important_features=baseline_features,
    cv_split=sk,
    experiment_name=experiment_name,
)
baseline_models

Done with LGBMClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC AUC,MLA Test ROC AUC,MLA Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.903886,0.895704,0 min 7.93 sec
