In [1]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px
import pandas as pd
import numpy as np
import time
import random
from itertools import combinations
from tqdm.notebook import tqdm
import optuna
import math

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate, RandomizedSearchCV
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.feature_selection import RFECV, mutual_info_classif

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from concurrent.futures import ThreadPoolExecutor
from category_encoders import TargetEncoder

from lightgbm import LGBMClassifier

from pprint import pprint
import os

# Show every column when a wide DataFrame is displayed
pd.set_option('display.max_columns', None)

# Prefix used when naming the result files written by this notebook
experiment_name = 'lgbm'

In [2]:
# Load the competition data from the notebook's working directory.
# Plain relative file names are used (rather than the Windows-only r'.\train.csv'
# form) so the cell also runs on Linux/macOS and matches the other read_csv
# calls in this notebook.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train.head(2)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0


In [3]:
# Inspect the last rows as well, to see where the training ids end
train.tail(2)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
165032,165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.0,1,0.0,1.0,71173.03,0
165033,165033,15732798,Ulyanov,850,France,Male,31.0,1,0.0,1,1.0,0.0,61581.79,1


In [4]:
# List the columns that contain at least one missing value (an empty Index means none)
train.columns[train.isna().any()]

Index([], dtype='object')

In [5]:
# Name of the label column
TARGET = 'Exited'
# Column groups; each group is handled by its own step of the preprocessing transformer
binary_cols = ['Gender', 'HasCrCard', 'IsActiveMember']
categorical_cols = ['Geography', 'NumOfProducts']
text_cols = ['Surname']
# Identifier columns removed before modelling
drop_cols = ['id']

In [6]:
# Numerical features = every numeric column not already claimed by another group
_already_grouped = categorical_cols + binary_cols + drop_cols + [TARGET]
numerical_cols = (
    train
    .drop(columns=_already_grouped)
    .select_dtypes(include=np.number)
    .columns
)
numerical_cols

Index(['CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance',
       'EstimatedSalary'],
      dtype='object')

# Preprocess Data

In [7]:
# Column-wise preprocessing:
#   * binary columns      -> a single 0/1 indicator each (drop='if_binary')
#   * categorical columns -> one-hot indicators
#   * text columns        -> target encoding
#   * numerical columns   -> standardisation
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version before upgrading.
transformer = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist', drop='if_binary'), binary_cols),
    (OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist'), categorical_cols),
    (TargetEncoder(), text_cols),
    (StandardScaler(), numerical_cols),
    remainder='passthrough')

# Remove the same identifier columns from both data sets
df_to_ohe = train.drop(drop_cols, axis=1)
test_ohe = test.drop(drop_cols, axis=1)

# Split the features and the target variable
X = df_to_ohe.drop(TARGET, axis=1)
y = df_to_ohe[TARGET]

# NOTE(review): the transformer (including the TargetEncoder) is fitted on the
# full training set, so the encoded text column has seen the labels of rows that
# later act as validation folds; cross-validated scores may be optimistic.
transformer.fit(X, y)

transformed = transformer.transform(X)

# Strip the '<step>__' prefix that the column transformer adds to feature names
transformed_feat_names = [name.split('__')[-1] for name in transformer.get_feature_names_out()]

# Create DataFrame of the transformed features
df_to_ohe_transformed = pd.DataFrame(transformed, columns=transformed_feat_names)
df_to_ohe_transformed.sample()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary
130744,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.5,1.471478,0.718395,-1.480278,-0.363613,0.56657,0.405717


In [8]:
# Apply the transformer fitted on the training data to the test set and
# wrap the result in a DataFrame with the same feature names
test_transformed = pd.DataFrame(
    transformer.transform(test_ohe),
    columns=transformed_feat_names,
)
test_transformed.sample()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary
77280,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.153894,0.968436,-0.367706,-0.014197,1.061827,1.424393,-1.287318


In [9]:
# Group the transformed columns by cardinality:
# exactly 2 distinct values -> binary, 3 to 9 -> categorical
encoded_binary_cols, encoded_categorical_cols = [], []

for column_name in df_to_ohe_transformed.columns:
    n_unique = df_to_ohe_transformed[column_name].nunique(dropna=False)
    print(f'{column_name} has {n_unique} unique values')
    if n_unique == 2:
        encoded_binary_cols.append(column_name)
    elif 2 < n_unique < 10:
        encoded_categorical_cols.append(column_name)

# Every numeric column that is not binary counts as numerical
non_binary_frame = df_to_ohe_transformed.drop(columns=encoded_binary_cols)
encoded_numerical_cols = non_binary_frame.select_dtypes(include=np.number).columns.tolist()
len(encoded_numerical_cols), len(encoded_binary_cols), len(encoded_categorical_cols)

Gender_Male has 2 unique values
HasCrCard_1.0 has 2 unique values
IsActiveMember_1.0 has 2 unique values
Geography_France has 2 unique values
Geography_Germany has 2 unique values
Geography_Spain has 2 unique values
NumOfProducts_1 has 2 unique values
NumOfProducts_2 has 2 unique values
NumOfProducts_3 has 2 unique values
NumOfProducts_4 has 2 unique values
Surname has 858 unique values
CustomerId has 23221 unique values
CreditScore has 457 unique values
Age has 71 unique values
Tenure has 11 unique values
Balance has 30075 unique values
EstimatedSalary has 55298 unique values


(7, 10, 0)

In [10]:
# Give both frames a fresh 0..n-1 index so the label column lines up row by row
df_to_ohe_transformed = df_to_ohe_transformed.reset_index(drop=True)
train = train.reset_index(drop=True)
df_to_ohe_transformed[TARGET] = train[TARGET]

df_to_ohe_transformed.head()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary,Exited
0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.318008,-0.239126,0.144135,-0.578074,-0.719973,-0.883163,1.369486,0
1,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.326667,0.800755,-0.367706,-0.578074,-1.432694,-0.883163,-1.254085,0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.222052,0.035085,0.268974,0.211354,1.774548,-0.883163,1.437422,0
3,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.181991,0.692068,-0.941966,-0.465299,-1.076334,1.486918,-0.557018,0
4,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.146341,1.038788,0.743362,-0.578074,-0.007253,-0.883163,-1.93877,0


In [11]:
# Confirm that both frames share an identical index (True means they are aligned)
print(df_to_ohe_transformed.index.equals(train.index))

True


In [12]:
# Show any row whose target is neither 0 nor 1 (e.g. a missing label left by a
# misaligned assignment); an empty frame means every row has a valid label
has_valid_target = df_to_ohe_transformed[TARGET].isin([0, 1])
df_to_ohe_transformed[~has_valid_target]

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary,Exited


# Feature Selection

- Mutual Information

In [None]:
# # Generate a random feature
# np.random.seed(5)
# df_to_ohe_transformed['random_control_feature'] = np.round(np.random.uniform(-2, 2, df_to_ohe_transformed.shape[0]), 6)
# df_to_ohe_transformed.shape

In [None]:
# # Calculate mutual information - this can be memory and CPU intensive
# # The plan is to drop every feature whose mutual information is lower than that of the random control feature, since such a feature is unlikely to be informative
# mi_scores = mutual_info_classif(df_to_ohe_transformed, df_to_ohe_transformed[TARGET], discrete_features='auto', n_neighbors=20, copy=True, random_state=5)

# # Make results easier to interpret by placing them in a DataFrame
# mi_scores = pd.Series(mi_scores, name="MI Scores", index=df_to_ohe_transformed.columns)
# mi_scores = mi_scores.sort_values(ascending=False)

# # Now you have the MI scores sorted from the most to the least informative
# mi_scores.to_csv(f'{experiment_name}_20_mi_scores.csv')

In [13]:
# Load the stored mutual-information scores
mi_df = pd.read_csv('lgbm_mi_scores.csv')

# The score of the random control feature is the cut-off
is_control_row = mi_df['Feature'] == 'random_control_feature'
mi_threshold = mi_df.loc[is_control_row, 'Average'].iloc[0]

# Keep only the features that score strictly above the control feature
beats_control = mi_df['Average'] > mi_threshold
mi_feats = mi_df.loc[beats_control, 'Feature'].tolist()
mi_feats

['Age',
 'NumOfProducts_2',
 'NumOfProducts_1',
 'IsActiveMember_1.0',
 'Surname',
 'Geography_Germany',
 'NumOfProducts_3',
 'Balance',
 'Gender_Male',
 'Geography_France',
 'HasCrCard_1.0',
 'EstimatedSalary',
 'CreditScore',
 'NumOfProducts_4',
 'Tenure',
 'Geography_Spain']

# Training Machine Learning Methods

In [14]:
# Restrict the model inputs to the features that passed the mutual-information filter
X = df_to_ohe_transformed[mi_feats]
y = df_to_ohe_transformed[TARGET]

# 10-fold stratified splitter with a fixed seed, shared by the cross-validation steps below
sk = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

In [15]:
# Estimators to compare; currently a single LightGBM binary classifier
models = [LGBMClassifier(objective='binary', random_state=5, n_jobs=-1)]

### Cross-validation Function

In [16]:
def evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    """Cross-validate each model on its own feature subset and tabulate ROC AUC.

    Parameters
    ----------
    models : iterable of estimators
        Estimators to evaluate. An empty iterable yields an empty result table.
    X : pandas.DataFrame
        Feature matrix; each model is evaluated on ``X[features]``.
    y : array-like
        Target passed to ``cross_validate``.
    important_features : dict
        Maps an estimator's class name to the list of column names to use.
        Models with a missing or empty entry are reported with zero scores.
    cv_split : cross-validation splitter
        Passed to ``cross_validate`` as ``cv``.
    experiment_name : str
        Prefix of the output file ``f'{experiment_name}_results.csv'``.

    Returns
    -------
    pandas.DataFrame
        One row per model, sorted by 'MLA Test ROC AUC' in ascending order
        (the best model is the last row). The same table is written to CSV.
    """
    result_columns = ['MLA Name',
                      'MLA Parameters',
                      'MLA Train ROC AUC',
                      'MLA Test ROC AUC',
                      'MLA Time']

    def evaluate_model(alg):
        """Return the result row (a dict) for a single estimator."""
        MLA_name = alg.__class__.__name__
        features = important_features.get(MLA_name, [])

        # Nothing to train on: report a zero-score placeholder row
        if len(features) == 0:
            print(f'Skipping {MLA_name} due to no important features.')
            return {
                'MLA Name': MLA_name,
                'MLA Parameters': str(alg.get_params()),
                'MLA Train ROC AUC': 0,
                'MLA Test ROC AUC': 0,
                'MLA Time': "0 min 0.00 sec",
            }

        # ROC AUC computed from predicted probabilities
        roc_scorer = make_scorer(roc_auc_score, greater_is_better=True, needs_proba=True)

        cv_results = cross_validate(alg, X[features], y,
                                    cv=cv_split,
                                    scoring={'ROC AUC': roc_scorer},
                                    return_train_score=True,
                                    n_jobs=-1)

        # Mean fit time per fold, formatted as minutes and seconds
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        print(f'Done with {MLA_name}.')

        return {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train ROC AUC': cv_results['train_ROC AUC'].mean() if 'train_ROC AUC' in cv_results else 0,
            'MLA Test ROC AUC': cv_results['test_ROC AUC'].mean() if 'test_ROC AUC' in cv_results else 0,
            'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

    # Models are submitted to a thread pool; results are collected in submission order.
    # NOTE(review): each task also runs cross_validate with n_jobs=-1, so with
    # several models the machine may be oversubscribed.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(evaluate_model, alg) for alg in models]
        results_list = [future.result() for future in futures]

    # Passing the column list keeps the schema even when there are no results,
    # so the sort below cannot fail on a missing column.
    MLA_compare = pd.DataFrame(results_list, columns=result_columns)

    # Sort and save results
    MLA_compare = MLA_compare.sort_values(by=['MLA Test ROC AUC'], ascending=True)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

### Baseline Model

In [17]:
# Baseline: every model is evaluated on all columns of X
baseline_features = {
    estimator.__class__.__name__: list(X.columns)
    for estimator in models
}

In [None]:
# Cross-validate the baseline feature set; results go to '<experiment_name>_results.csv'
baseline_models = evaluate_models(
    models, X, y, baseline_features, sk, experiment_name
)
baseline_models

- SFS

In [None]:
# Backward sequential feature selection per model, starting from the
# mutual-information-filtered features; results are keyed by model class name
sfs_features = {}

for estimator in models:
    MLA_name = estimator.__class__.__name__

    try:
        candidate_feats = mi_feats

        # Nothing to select from: move on to the next model
        if not candidate_feats:
            continue

        X_sfs = X[candidate_feats]

        print(f'Running backward feature selection with {MLA_name}')

        selector = SFS(
            estimator,
            k_features='best',
            forward=False,
            floating=False,
            scoring='roc_auc',
            verbose=2,
            n_jobs=-1,
            cv=sk,
        ).fit(X_sfs, y)

        # Translate the selected column positions back into column names
        sfs_features[MLA_name] = [X_sfs.columns[position] for position in selector.k_feature_idx_]

        print(f'Done with {MLA_name}', end='\n\n')

    except KeyError:
        print(f'{MLA_name} not in the dictionary.')

In [None]:
# Persist the selected features as a pretty-printed dict so they can be reused
# without re-running the selection
with open('sfs_features_lgbm.txt', mode='w') as f:
    pprint(sfs_features, stream=f)

In [None]:
# Cross-validate each model on its SFS-selected features; results go to '<experiment_name>_sfs_results.csv'
sfs_models = evaluate_models(models, X, y, sfs_features, sk, f'{experiment_name}_sfs')
sfs_models

### Partial Dependence

In [18]:
# Fresh LightGBM classifier used for the partial-dependence plots
lg = LGBMClassifier(n_jobs=-1, random_state=5, objective='binary')
# Hard-coded feature list so this cell does not require the SFS cell to have run;
# presumably a copy of sfs_features['LGBMClassifier'] — verify if SFS is re-run.
# features = sfs_features['LGBMClassifier']
features = ['Age',
'NumOfProducts_2',
'NumOfProducts_1',
'IsActiveMember_1.0',
'Surname',
'Geography_Germany',
'Balance',
'Gender_Male',
'HasCrCard_1.0',
'EstimatedSalary',
'CreditScore',
'NumOfProducts_4',
'Tenure']

In [None]:
# Fit LightGBM on the chosen features and draw one partial-dependence + ICE
# panel per feature, three panels per row.
model = lg.fit(X[features], y)

# Number of rows for the subplot grid
num_features = len(features)
num_cols = 3
num_rows = math.ceil(num_features / num_cols)

# squeeze=False keeps `axs` two-dimensional even when a single row is enough,
# so the [row, col] indexing below always works
fig, axs = plt.subplots(num_rows, num_cols, figsize=(20, 5 * num_rows), squeeze=False)
fig.suptitle(f'Partial Dependence Plots of {TARGET} for LGBM', fontsize=16)

# Adjust the layout
plt.tight_layout(rect=[0, 0.03, 1, 0.95], pad=4.0, h_pad=4.0, w_pad=2.0)

# Iterate over features and create each plot
for i, feature in enumerate(features):
    ax = axs[i // num_cols, i % num_cols]
    # NOTE(review): scikit-learn documents `target` as ignored for binary
    # classification, so target=0 presumably yields the same curves as
    # target=1 — confirm before interpreting this figure as the negative class.
    PartialDependenceDisplay.from_estimator(model,
                                            X[features],
                                            [feature],
                                            pd_line_kw={'color': 'red'},
                                            ice_lines_kw={'color': 'steelblue'},
                                            kind='both',
                                            target=0,
                                            response_method='predict_proba',
                                            subsample=500,
                                            random_state=5,
                                            ax=ax)
    ax.set_title(f'LGBM Partial Plot - {feature}')

# Hide any unused subplots
for i in range(num_features, num_rows * num_cols):
    axs[i // num_cols, i % num_cols].set_visible(False)

plt.show()

In [None]:
# Fit LightGBM on the chosen features and draw one partial-dependence + ICE
# panel per feature, three panels per row.
model = lg.fit(X[features], y)

# Number of rows for the subplot grid
num_features = len(features)
num_cols = 3
num_rows = math.ceil(num_features / num_cols)

# squeeze=False keeps `axs` two-dimensional even when a single row is enough,
# so the [row, col] indexing below always works
fig, axs = plt.subplots(num_rows, num_cols, figsize=(20, 5 * num_rows), squeeze=False)
fig.suptitle(f'Partial Dependence Plots of {TARGET} for LGBM', fontsize=16)

# Adjust the layout
plt.tight_layout(rect=[0, 0.03, 1, 0.95], pad=4.0, h_pad=4.0, w_pad=2.0)

# Iterate over features and create each plot
for i, feature in enumerate(features):
    ax = axs[i // num_cols, i % num_cols]
    # NOTE(review): scikit-learn documents `target` as ignored for binary
    # classification; the argument is kept as written but presumably has no effect.
    PartialDependenceDisplay.from_estimator(model,
                                            X[features],
                                            [feature],
                                            pd_line_kw={'color': 'red'},
                                            ice_lines_kw={'color': 'steelblue'},
                                            kind='both',
                                            target=1,
                                            response_method='predict_proba',
                                            subsample=500,
                                            random_state=5,
                                            ax=ax)
    ax.set_title(f'LGBM Partial Plot - {feature}')

# Hide any unused subplots
for i in range(num_features, num_rows * num_cols):
    axs[i // num_cols, i % num_cols].set_visible(False)

plt.show()

In [None]:
# Features flagged for removal (presumably based on the partial-dependence plots above)
pdp_to_drop = ['HasCrCard_1.0', 'NumOfProducts_4']

# Candidate features before the removal
_pdp_candidates = ['Age',
                   'NumOfProducts_2',
                   'NumOfProducts_1',
                   'IsActiveMember_1.0',
                   'Surname',
                   'Geography_Germany',
                   'Balance',
                   'Gender_Male',
                   'HasCrCard_1.0',
                   'EstimatedSalary',
                   'CreditScore',
                   'NumOfProducts_4',
                   'Tenure']

# Apply the drop list. Previously `pdp_to_drop` was defined but never used, so
# the "pdp" evaluation ran on exactly the same features as before.
pdp_features = {'LGBMClassifier': [feat for feat in _pdp_candidates if feat not in pdp_to_drop]}

In [None]:
# Cross-validate each model on the features listed in pdp_features; results go to '<experiment_name>_pdp_results.csv'
pdp_models = evaluate_models(models, X, y, pdp_features, sk, f'{experiment_name}_pdp')
pdp_models

### Hyperparameter Tuning using Optuna

In [19]:
# Feature set used for hyperparameter tuning, the ensemble and the final predictions.
# Hard-coded so the cells below do not require the SFS cell to have run;
# presumably a copy of sfs_features['LGBMClassifier'] — verify if SFS is re-run.
# lgbm_features = sfs_features['LGBMClassifier']
lgbm_features = ['Age',
'NumOfProducts_2',
'NumOfProducts_1',
'IsActiveMember_1.0',
'Surname',
'Geography_Germany',
'Balance',
'Gender_Male',
'HasCrCard_1.0',
'EstimatedSalary',
'CreditScore',
'NumOfProducts_4',
'Tenure']

In [21]:
def objective(trial):
    """Optuna objective: mean ROC AUC of an LGBMClassifier across the `sk` folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters and to report per-fold scores.

    Returns
    -------
    float
        Mean ROC AUC over all folds.

    Raises
    ------
    optuna.exceptions.TrialPruned
        When the pruner decides to stop the trial after a fold.
    """
    # Hyperparameters are fixed for the whole trial, so they are sampled once
    # up front instead of being re-requested inside every fold.
    n_estimators = trial.suggest_int('lgbm_n_estimators', 50, 4000)
    max_depth = trial.suggest_int('lgbm_max_depth', 1, 50)
    num_leaves = trial.suggest_int('lgbm_num_leaves', 2, 256)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    learning_rate = trial.suggest_float('lgbm_learning_rate', 0.001, 0.2, log=True)
    reg_alpha = trial.suggest_float('lgbm_reg_alpha', 0.0, 10.0)
    reg_lambda = trial.suggest_float('lgbm_reg_lambda', 0.0, 10.0)
    # NOTE(review): LightGBM documents that bagging (`subsample`) only takes
    # effect when `subsample_freq` > 0, which is left at its default here —
    # confirm whether tuning `subsample` changes the model at all.
    subsample = trial.suggest_float('lgbm_subsample', 0.1, 1.0)
    colsample_bytree = trial.suggest_float('lgbm_colsample_bytree', 0.1, 1.0)

    lgbm_scores = []

    for i, (train_idx, test_idx) in enumerate(sk.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Train on this fold's training part with the trial's hyperparameters
        lgbm = LGBMClassifier(
            objective='binary',
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            n_jobs=-1
        ).fit(X_train[lgbm_features], y_train)

        # Score the held-out part using the probability of class 1
        lgbm_pred_proba = lgbm.predict_proba(X_test[lgbm_features])[:, 1]
        lgbm_score = roc_auc_score(y_test, lgbm_pred_proba)
        lgbm_scores.append(lgbm_score)

        # Report the fold score so the pruner can compare it with other trials
        trial.report(lgbm_score, i)

        # Stop early when the pruner judges this trial unpromising
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return np.mean(lgbm_scores)

n_trials = 20

# Progress bar advanced once per finished trial
progress_bar = tqdm(total=n_trials, desc='Optimizing', position=0)


def callback(study, trial):
    """Optuna callback: advance the progress bar after each trial."""
    progress_bar.update(1)


# Median pruner: no pruning during the first 3 trials or the first 3 reported steps
pruner = optuna.pruners.MedianPruner(n_startup_trials=3, n_warmup_steps=3)
study = optuna.create_study(study_name="lgbm_optimization", direction='maximize', pruner=pruner)

try:
    study.optimize(objective, n_trials=n_trials, callbacks=[callback])
finally:
    # Always release the progress bar, even if the search fails or is interrupted
    progress_bar.close()

# Best hyperparameters
print(study.best_params)

Optimizing:   0%|          | 0/20 [00:00<?, ?it/s]

[I 2024-01-10 06:01:41,089] A new study created in memory with name: lgbm_optimization
[I 2024-01-10 06:03:04,500] Trial 0 finished with value: 0.896486300498772 and parameters: {'lgbm_n_estimators': 261, 'lgbm_max_depth': 47, 'lgbm_num_leaves': 232, 'lgbm_learning_rate': 0.01493490401614091, 'lgbm_reg_alpha': 1.7220377366909212, 'lgbm_reg_lambda': 7.77759154699412, 'lgbm_subsample': 0.8110731561878076, 'lgbm_colsample_bytree': 0.6798682603831786}. Best is trial 0 with value: 0.896486300498772.
[I 2024-01-10 06:09:54,455] Trial 1 finished with value: 0.878098238753604 and parameters: {'lgbm_n_estimators': 2605, 'lgbm_max_depth': 25, 'lgbm_num_leaves': 172, 'lgbm_learning_rate': 0.11303952701199936, 'lgbm_reg_alpha': 1.9642049595489985, 'lgbm_reg_lambda': 6.53909319939304, 'lgbm_subsample': 0.11811292159577355, 'lgbm_colsample_bytree': 0.7332451044541862}. Best is trial 0 with value: 0.896486300498772.
[I 2024-01-10 06:13:03,378] Trial 2 finished with value: 0.8967298035184669 and param

{'lgbm_n_estimators': 3124, 'lgbm_max_depth': 34, 'lgbm_num_leaves': 102, 'lgbm_learning_rate': 0.005412223162683512, 'lgbm_reg_alpha': 3.507801184150934, 'lgbm_reg_lambda': 8.456634587433385, 'lgbm_subsample': 0.7681541520182114, 'lgbm_colsample_bytree': 0.3465406359524291}


[I 2024-01-10 02:18:58,967] Trial 61 finished with value: 0.8977757296716001 and parameters: {'lgbm_n_estimators': 3414, 'lgbm_max_depth': 40, 'lgbm_num_leaves': 25, 'lgbm_learning_rate': 0.0062645811308150124, 'lgbm_reg_alpha': 8.873443530076086, 'lgbm_reg_lambda': 5.885447781455638, 'lgbm_subsample': 0.7884773833791413, 'lgbm_colsample_bytree': 0.5089526054590882}. Best is trial 61 with value: 0.8977757296716001.

[I 2024-01-10 06:39:37,922] Trial 14 finished with value: 0.8977177816563057 and parameters: {'lgbm_n_estimators': 3124, 'lgbm_max_depth': 34, 'lgbm_num_leaves': 102, 'lgbm_learning_rate': 0.005412223162683512, 'lgbm_reg_alpha': 3.507801184150934, 'lgbm_reg_lambda': 8.456634587433385, 'lgbm_subsample': 0.7681541520182114, 'lgbm_colsample_bytree': 0.3465406359524291}. Best is trial 14 with value: 0.8977177816563057.

Best parameters: {'subsample': 0.487015585813217, 'reg_alpha': 0.534558240211952, 'num_leaves': 70, 'learning_rate': 0.11404268301840845, 'colsample_bytree': 0.5130103309743103}
Best ROC AUC score: 0.8973431825644113

Best parameters: {'subsample': 0.4583793258901043, 'reg_alpha': 6.898808410934859, 'num_leaves': 27, 'learning_rate': 0.16883582860379132, 'colsample_bytree': 0.48557488149384453}
Best ROC AUC score: 0.8974238313058602

In [22]:
# Plot how much each hyperparameter influenced the objective value, to decide
# which ones are worth searching in the RandomizedSearchCV step below.
# NOTE: the recorded run of this cell failed because plot rendering needs nbformat>=4.2.0.
optuna.visualization.plot_param_importances(study)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

### Hyperparameter Tuning with Randomized Search

In [31]:
# Candidate values for RandomizedSearchCV: 50 random draws per hyperparameter.
# A seeded generator is used so that re-running the notebook produces the same
# candidate pool (the previous unseeded np.random calls did not).
# n_estimators, max_depth and reg_lambda are not searched here and stay at the
# estimator's own settings.
param_rng = np.random.default_rng(5)

param_dist = {
    'num_leaves': param_rng.integers(2, 257, 50),
    'learning_rate': param_rng.uniform(0.001, 0.2, 50),
    'reg_alpha': param_rng.uniform(0.0, 10.0, 50),
    'subsample': param_rng.uniform(0.1, 1.0, 50),
    'colsample_bytree': param_rng.uniform(0.1, 1.0, 50)
}

In [32]:
# Sample 200 hyperparameter combinations from param_dist and score each one
# with ROC AUC on the shared stratified folds
random_search = RandomizedSearchCV(
    estimator=lg,
    param_distributions=param_dist,
    n_iter=200,
    scoring='roc_auc',
    cv=sk,
    verbose=2,
    random_state=5,
    n_jobs=-1,
).fit(X[lgbm_features], y)

# Report the winning combination and its cross-validated score
print("Best parameters:", random_search.best_params_)
print("Best ROC AUC score:", random_search.best_score_)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits
Best parameters: {'subsample': 0.4583793258901043, 'reg_alpha': 6.898808410934859, 'num_leaves': 27, 'learning_rate': 0.16883582860379132, 'colsample_bytree': 0.48557488149384453}
Best ROC AUC score: 0.8974238313058602


### Ensemble

- Weighted Models

In [37]:
# Hyperparameter sets of the four ensemble members, copied from the recorded
# tuning outputs above with random_state fixed to 5:
#   params_1 / params_2 - Optuna trials (the recorded trial 61 and trial 14)
#   params_3 / params_4 - the two recorded "Best parameters" results of the randomized search
params_1 = {'n_estimators': 3414, 'max_depth': 40, 'num_leaves': 25, 'learning_rate': 0.0062645811308150124, 'reg_alpha': 8.873443530076086, 'reg_lambda': 5.885447781455638, 'subsample': 0.7884773833791413, 'colsample_bytree': 0.5089526054590882, 'random_state':5}
params_2 = {'n_estimators': 3124, 'max_depth': 34, 'num_leaves': 102, 'learning_rate': 0.005412223162683512, 'reg_alpha': 3.507801184150934, 'reg_lambda': 8.456634587433385, 'subsample': 0.7681541520182114, 'colsample_bytree': 0.3465406359524291, 'random_state':5}
params_3 = {'subsample': 0.487015585813217, 'reg_alpha': 0.534558240211952, 'num_leaves': 70, 'learning_rate': 0.11404268301840845, 'colsample_bytree': 0.5130103309743103, 'random_state':5}
params_4 = {'subsample': 0.4583793258901043, 'reg_alpha': 6.898808410934859, 'num_leaves': 27, 'learning_rate': 0.16883582860379132, 'colsample_bytree': 0.48557488149384453, 'random_state':5}

# One classifier per hyperparameter set
model1 = LGBMClassifier(n_jobs=-1, **params_1)
model2 = LGBMClassifier(n_jobs=-1, **params_2)
model3 = LGBMClassifier(n_jobs=-1, **params_3)
model4 = LGBMClassifier(n_jobs=-1, **params_4)

In [45]:
# Input features for the ensemble: the tuned feature subset of the transformed training data
X_ensemble = df_to_ohe_transformed[lgbm_features]

In [40]:
# Collect, for every fold, each member's predicted probability of class 1 on
# the held-out part, together with the matching true labels
ensemble_members = (model1, model2, model3, model4)
per_member_results = [[] for _ in ensemble_members]
y_test_list = []

for i, (train_index, test_index) in enumerate(sk.split(X_ensemble, y)):
    X_train, X_test = X_ensemble.iloc[train_index], X_ensemble.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Same fit / predict step for each member, in order model1..model4
    for member, member_results in zip(ensemble_members, per_member_results):
        member.fit(X_train, y_train)
        member_results.append(member.predict_proba(X_test)[:, 1])

    y_test_list.append(y_test)

    print(f'Done with fold {i+1}.')

model1_results, model2_results, model3_results, model4_results = per_member_results

Done with fold 1.
Done with fold 2.
Done with fold 3.
Done with fold 4.
Done with fold 5.
Done with fold 6.
Done with fold 7.
Done with fold 8.
Done with fold 9.
Done with fold 10.


In [42]:
# Random search over ensemble weights: draw 1000 weight vectors and score each
# by the mean ROC AUC of the weighted out-of-fold predictions.
model1_weights, model2_weights, model3_weights, model4_weights, scores = [], [], [], [], []

# Seeded generator: the final submission uses the best weights found here, so
# the draw must be reproducible.
weight_rng = np.random.default_rng(5)

# Use the number of folds actually collected instead of a hard-coded 10
n_folds = len(y_test_list)

for i in tqdm(range(1000)):
    # Four independent weights in [0, 1); ROC AUC depends only on the ranking
    # of the scores, so the weights need not sum to 1
    weight_1, weight_2, weight_3, weight_4 = weight_rng.random(4)

    model1_weights.append(weight_1)
    model2_weights.append(weight_2)
    model3_weights.append(weight_3)
    model4_weights.append(weight_4)

    scores_in = []

    for j in range(n_folds):
        weighted_pred = weight_1 * model1_results[j] + weight_2 * model2_results[j] + weight_3 * model3_results[j] + weight_4 * model4_results[j]
        scores_in.append(roc_auc_score(y_test_list[j], weighted_pred))

    scores.append(np.mean(scores_in))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [43]:
# One row per sampled weight vector, best mean ROC AUC first
results_df = (
    pd.DataFrame({
        'model_1': model1_weights,
        'model_2': model2_weights,
        'model_3': model3_weights,
        'model_4': model4_weights,
        'score': scores,
    })
    .sort_values(by='score', ascending=False)
    .reset_index(drop=True)
)
results_df.head(10)

Unnamed: 0,model_1,model_2,model_3,model_4,score
0,0.934615,0.854905,0.495941,0.001262,0.898007
1,0.711355,0.712237,0.49505,0.007917,0.898003
2,0.620431,0.68736,0.344372,0.04446,0.898003
3,0.882003,0.866898,0.34025,0.01627,0.898001
4,0.48051,0.545759,0.264298,0.039771,0.898001
5,0.734937,0.817039,0.421376,0.087147,0.898001
6,0.678601,0.456672,0.263686,0.032362,0.898001
7,0.66562,0.710254,0.401547,0.117435,0.898
8,0.781506,0.886821,0.378908,0.015999,0.898
9,0.493296,0.599164,0.367289,0.024574,0.898


### Final Predictions

In [46]:
# Refit every ensemble member on the complete training data
model1_final, model2_final, model3_final, model4_final = [
    member.fit(X_ensemble, y)
    for member in (model1, model2, model3, model4)
]

In [48]:
# Work on a copy so the transformed test frame itself stays untouched
test_features = test_transformed.copy()
test_features.sample()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary
1055,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.290123,-0.829328,0.718395,-0.803625,0.705467,1.224714,0.720729


In [53]:
# Weighted sum of the members' class probabilities, using the best-scoring
# weight vector (row 0 of results_df)
X_test_final = test_features[lgbm_features]
weighted_members = [
    ('model_1', model1_final),
    ('model_2', model2_final),
    ('model_3', model3_final),
    ('model_4', model4_final),
]
ensemble_pred = sum(
    results_df[weight_col][0] * member.predict_proba(X_test_final)
    for weight_col, member in weighted_members
)

# Rescale every row so the two class probabilities sum to 1 again
ensemble_df = pd.DataFrame(ensemble_pred)
row_totals = ensemble_df.sum(axis=1)
ensemble_df = ensemble_df.div(row_totals, axis=0)
ensemble_df.head()

Unnamed: 0,0,1
0,0.980836,0.019164
1,0.164041,0.835959
2,0.977779,0.022221
3,0.783393,0.216607
4,0.627546,0.372454


In [54]:
# Keep only column 1, the probability of the positive class (Exited = 1).
# NOTE(review): this rebinds `ensemble_df` from a DataFrame to a Series, so
# running this cell a second time without re-running the previous one fails.
ensemble_df = ensemble_df.iloc[:, 1]
ensemble_df.head()

0    0.019164
1    0.835959
2    0.022221
3    0.216607
4    0.372454
Name: 1, dtype: float64

In [55]:
# Fill the sample submission with the ensemble probabilities.
# NOTE(review): values are assigned by position — this assumes the rows of
# sample_submission.csv are in the same order as test.csv; confirm.
submission = pd.read_csv('sample_submission.csv')
submission.loc[:, 'Exited'] = ensemble_df.values
submission.head()

Unnamed: 0,id,Exited
0,165034,0.019164
1,165035,0.835959
2,165036,0.022221
3,165037,0.216607
4,165038,0.372454


In [None]:
# Write the submission file; the file name carries a hard-coded CV score
# (0.898007) that must be updated by hand if the ensemble search is re-run
submission.to_csv('submission_ensemble_0.898007cv.csv', index=False)