# Load data

In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier

data = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

data.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [11]:
print('Training size: ', data.shape[0])

Training size:  165034


# Data processing

## Determine ranges

In [12]:
print(data['Age'].describe(), end='\n\n')
print(data['EstimatedSalary'].describe(), end='\n\n')

count    165034.000000
mean         38.125888
std           8.867205
min          18.000000
25%          32.000000
50%          37.000000
75%          42.000000
max          92.000000
Name: Age, dtype: float64

count    165034.000000
mean     112574.822734
std       50292.865585
min          11.580000
25%       74637.570000
50%      117948.000000
75%      155152.467500
max      199992.480000
Name: EstimatedSalary, dtype: float64



We can see that salary ranges from 11.58 (almost 0) to 199,992.48 (around 200k). First and third quartiles are near multiples of 50k, with median over twice of 50k. So it makes sense to stratefy this column into 0 - 50k, 50k - 100k, 100k - 200k ranges.

## Binning

In [13]:
def cut(data, col, bins):
    data[f'{col}Group'] = pd.cut(data[col], bins=bins, labels=False)

## Feature engineering

In [14]:
def engineer(data):
    # Age groups
    cut(data, 'Age', 5)
    
    # https://www.experian.com/blogs/ask-experian/infographic-what-are-the-different-scoring-ranges/
    cut(data, 'CreditScore', [0, 300, 580, 670, 740, 800, np.inf])
    
    cut(data, 'EstimatedSalary', [0, 50000, 100000, 200000, np.inf])
    
    data['BalanceToSalaryRatio'] = data['Balance'] / data['EstimatedSalary']
    
#     data['TenureGroup'] = pd.cut(data['Tenure'], bins=[0, 2, 5, 10], labels=False)
    
#     data['CreditCard_ActiveMember'] = data['HasCrCard'] * data['IsActiveMember']
    
    return data

## Remove outliers
Outliers can make our model more biased towards such samples. Removing them might increase accuracy on test data.

In [15]:
def remove_outliers(df):
    quant_cols = [
        'CreditScore',
        'Age', 
        'Balance', 
        'EstimatedSalary'
    ]
    
    for col in quant_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3-Q1

        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        
        df = df[(df[col] >= lower) & (df[col] <= upper)]
    return df

## Label Encoding

In [16]:
def encode(train, test, cols):
    le = LabelEncoder()
    for col in cols:
        train[col] = le.fit_transform(train[col])
        test[col] = le.fit_transform(test[col])


## Combined data processing

In [17]:
# data = remove_outliers(data)
data = engineer(data)
encode(data, test, ['Geography','Gender'])

test = engineer(test)
encode(data, test, ['Geography','Gender'])

# Split training data

In [18]:
from imblearn.over_sampling import SMOTE

drop_cols_test = ['id', 'CustomerId', 'Surname']
drop_cols = drop_cols_test + ['Exited']

x_data = pd.get_dummies(data.drop(drop_cols, axis=1)).astype(np.float32)
y_data = (data['Exited']).astype(np.float32)

scaler = StandardScaler()
x_data = scaler.fit_transform(x_data)

x_data, y_data = SMOTE().fit_resample(x_data, y_data)

# Large data set therefore we just need a small test size for validation purpose
x_train, x_, y_train, y_ = train_test_split(x_data, y_data, test_size=0.2, random_state=0)

In [19]:
def predict(model, x, y):
    p = model.predict(x)
    p = p.reshape(1, len(p))[0]
    p = np.array(list(map(lambda p: 1 if p >= 0.5 else 0, p)))

    accu = round(accuracy_score(p,  y) , 4)
    
    return accu

def evaluate(model):
    accu_train = predict(model, x_train, y_train)
    accu_dev = predict(model, x_, y_)
    
    print(f'Train: {accu_train}\tDev: {accu_dev}')

    return accu_dev

# Hyper parameter tuning

Run this code separately.

In [20]:
# from sklearn.model_selection import  cross_val_score
# from sklearn.model_selection import StratifiedKFold

# # Assuming 'skf' is your StratifiedKFold object
# skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# def objective_xgb(trial):
#     """Define the objective function for XGBClassifier"""

#     params = {
#         'max_depth': trial.suggest_int('max_depth', 5, 10),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
#         'n_estimators': trial.suggest_int('n_estimators', 150, 1000),
#         'subsample': trial.suggest_float('subsample', 0.01, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
#         'random_state': trial.suggest_categorical('random_state', [42]),
#         'tree_method': 'hist',  # Use GPU for training
#         'device' : 'cuda',
#         'eval_metric': 'auc',  # Evaluation metric
#         'verbosity': 2,  # Set verbosity to 0 for less output
#     }

#     xgb_model = XGBRegressor(**params)

#     cv = abs(cross_val_score(xgb_model, x_train, y_train, cv=skf, scoring='roc_auc').mean())

#     return cv

# study = optuna.create_study(direction='maximize')
# study.optimize(objective_xgb, n_trials=50)

# # Get the best parameters
# best_params_xgb = study.best_params
# print("Best Hyperparameters for XGBoost:", best_params_xgb)

Best Hyperparameters found using Optuna
```
{
    'max_depth': 6, 
    'min_child_weight': 6, 
    'learning_rate': 0.015456088221076969, 
    'n_estimators': 535, 
    'subsample': 0.25556380739804013, 
    'colsample_bytree': 0.6705216911932783, 
    'random_state': 42
}
```

In [35]:
from sklearn.model_selection import  cross_val_score
from sklearn.model_selection import StratifiedKFold
import optuna
from catboost import CatBoostClassifier

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def objective_cb(trial):
    params = {
        'depth': trial.suggest_int('depth', 5, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'iterations': trial.suggest_int('iterations', 100, 500),
        'verbose': False,
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 0.2, 2),
        'task_type':"GPU",  # Use GPU for training
        'devices' : '0:1',
        # 'eval_metric': 'AUC',  # Evaluation metric
    }

    cb_model = CatBoostClassifier(**params)

    cv = abs(cross_val_score(cb_model, x_train, y_train, cv=skf, scoring='roc_auc').mean())

    return cv

study = optuna.create_study(direction='maximize')
study.optimize(objective_cb, n_trials=5)

# Get the best parameters
best_params_cb = study.best_params
print("Hyperparameters found:", best_params_cb)

[I 2024-01-14 16:57:53,941] A new study created in memory with name: no-name-e797883b-738f-48a9-921b-d8fe8d544743
[I 2024-01-14 16:58:48,195] Trial 0 finished with value: 0.9662344526372907 and parameters: {'depth': 9, 'learning_rate': 0.028886480232587365, 'iterations': 400, 'l2_leaf_reg': 1}. Best is trial 0 with value: 0.9662344526372907.
[I 2024-01-14 16:59:01,366] Trial 1 finished with value: 0.9616263735144968 and parameters: {'depth': 8, 'learning_rate': 0.07826093696342286, 'iterations': 174, 'l2_leaf_reg': 1}. Best is trial 0 with value: 0.9662344526372907.
[I 2024-01-14 17:00:07,405] Trial 2 finished with value: 0.9661093367899088 and parameters: {'depth': 9, 'learning_rate': 0.023064545609397948, 'iterations': 486, 'l2_leaf_reg': 1}. Best is trial 0 with value: 0.9662344526372907.
[I 2024-01-14 17:00:52,126] Trial 3 finished with value: 0.9672109136365223 and parameters: {'depth': 9, 'learning_rate': 0.0519263317978695, 'iterations': 325, 'l2_leaf_reg': 1}. Best is trial 3 w

Hyperparameters found: {'depth': 9, 'learning_rate': 0.0519263317978695, 'iterations': 325, 'l2_leaf_reg': 1}


# Train

In [36]:
from xgboost import XGBRegressor

params_xgb = {
    'max_depth': 6, 
    'min_child_weight': 6, 
    'learning_rate': 0.015456088221076969, 
    'n_estimators': 535, 
    'subsample': 0.25556380739804013, 
    'colsample_bytree': 0.6705216911932783, 
    'random_state': 42
}

# Params found using Optuna
model_xgb = XGBRegressor(**params_xgb)
model_xgb.fit(x_train, y_train)

In [37]:
params_cb = {'depth': 7, 'learning_rate': 0.054768705429762574, 'iterations': 467, 'l2_leaf_reg': 1}

model_cb = CatBoostClassifier(**params_cb)
model_cb.fit(x_train, y_train)

0:	learn: 0.6579096	total: 43.1ms	remaining: 20.1s
1:	learn: 0.6271947	total: 81.4ms	remaining: 18.9s
2:	learn: 0.6008972	total: 119ms	remaining: 18.4s
3:	learn: 0.5783039	total: 157ms	remaining: 18.2s
4:	learn: 0.5582040	total: 192ms	remaining: 17.8s
5:	learn: 0.5407940	total: 225ms	remaining: 17.3s
6:	learn: 0.5241579	total: 259ms	remaining: 17s
7:	learn: 0.5111016	total: 309ms	remaining: 17.8s
8:	learn: 0.4983988	total: 360ms	remaining: 18.3s
9:	learn: 0.4877390	total: 401ms	remaining: 18.3s
10:	learn: 0.4778097	total: 449ms	remaining: 18.6s
11:	learn: 0.4687140	total: 487ms	remaining: 18.5s
12:	learn: 0.4617282	total: 524ms	remaining: 18.3s
13:	learn: 0.4546214	total: 560ms	remaining: 18.1s
14:	learn: 0.4483422	total: 592ms	remaining: 17.8s
15:	learn: 0.4431614	total: 636ms	remaining: 17.9s
16:	learn: 0.4373532	total: 676ms	remaining: 17.9s
17:	learn: 0.4322498	total: 716ms	remaining: 17.9s
18:	learn: 0.4267109	total: 758ms	remaining: 17.9s
19:	learn: 0.4217703	total: 783ms	remaini

<catboost.core.CatBoostClassifier at 0x1d549c87880>

# Evaluate

In [34]:
models = [model_xgb, model_cb]
accu_rates = []

for model in models:
    accu_rates.append(evaluate(model))

# Train: 0.8874	Dev: 0.8831
# Train: 0.9154	Dev: 0.9084

Train: 0.8874	Dev: 0.8831
Train: 0.912	Dev: 0.9082


In [25]:
best_model = models[np.argmax(accu_rates)]

# Submit

In [27]:
x_test = test.drop(drop_cols_test, axis=1)
x_test = scaler.transform(x_test)

test_pred = best_model.predict(x_test)
[test_pred] = test_pred.reshape(1, len(test_pred))
output = pd.DataFrame({'id': test.id, 'Exited': test_pred})
output.to_csv('submission.csv', index=False)