# Load data

In [17]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
import optuna

# Load the training set from the local data directory and preview the first rows.
data = pd.read_csv('./data/train.csv')
data.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [6]:
# Row count of the training frame.
print('Training size: ', len(data))

Training size:  165034


In [7]:
# Frequency of each surname (also shows how many distinct surnames exist).
data['Surname'].value_counts()

Surname
Hsia         2456
T'ien        2282
Hs?          1611
Kao          1577
Maclean      1577
             ... 
Samaniego       1
Lawley          1
Bonwick         1
Tennant         1
Elkins          1
Name: count, Length: 2797, dtype: int64

# Feature engineering

## Define helper function

In [8]:
def cut(col, bins, df=None):
    """Bin a numeric column and store the integer bin index as ``{col}Group``.

    Prints the labelled interval counts as a preview, then writes the bin
    codes (``labels=False``) into a new ``f'{col}Group'`` column of the target
    frame. The target frame is modified in place.

    Args:
        col: Name of the column to bin.
        bins: Passed straight to ``pd.cut`` (a bin count or explicit edges).
        df: Frame to read from and write to. When omitted, the notebook-level
            ``data`` frame is used, which is the original behaviour.

    Returns:
        None.
    """
    # Fall back to the notebook-level frame so existing calls keep working.
    frame = data if df is None else df

    preview = pd.cut(frame[col], bins=bins)
    print('*** PREVIEW with LABELS ***')
    print(preview.value_counts())

    # labels=False stores the ordinal bin index instead of the interval label.
    frame[f'{col}Group'] = pd.cut(frame[col], bins=bins, labels=False)
    print(f'{col}Group (no labels) created')
    print()

## Age groups and credit score bands

In [9]:
# Columns to bin and their bin specs.
# Age: 5 equal-width bins.
# CreditScore: explicit edges following the score ranges described at
# https://www.experian.com/blogs/ask-experian/infographic-what-are-the-different-scoring-ranges/
binning_specs = (
    ('Age', 5),
    ('CreditScore', [0, 300, 580, 670, 740, 800, 850]),
)
for column_name, bin_spec in binning_specs:
    cut(column_name, bin_spec)

*** PREVIEW with LABELS ***
Age
(32.8, 47.6]      99773
(17.926, 32.8]    43675
(47.6, 62.4]      18789
(62.4, 77.2]       2711
(77.2, 92.0]         86
Name: count, dtype: int64
AgeGroup (no labels) created

*** PREVIEW with LABELS ***
CreditScore
(580, 670]    62141
(670, 740]    51927
(300, 580]    28114
(740, 800]    16257
(800, 850]     6595
(0, 300]          0
Name: count, dtype: int64
CreditScoreGroup (no labels) created



## Salary binning

In [10]:
# Summary statistics for EstimatedSalary, used to choose the salary bin edges.
data['EstimatedSalary'].describe()

count    165034.000000
mean     112574.822734
std       50292.865585
min          11.580000
25%       74637.570000
50%      117948.000000
75%      155152.467500
max      199992.480000
Name: EstimatedSalary, dtype: float64

We can see that salary ranges from 11.58 (almost 0) to 199,992.48 (around 200k). The first and third quartiles are near multiples of 50k, and the median is over twice 50k. So it makes sense to stratify this column into 0 - 50k, 50k - 100k, and 100k - 200k ranges.

In [11]:
# Salary bands: 0 - 50k, 50k - 100k, 100k - 200k.
cut('EstimatedSalary', [0, 50_000, 100_000, 200_000])

*** PREVIEW with LABELS ***
EstimatedSalary
(100000, 200000]    98273
(50000, 100000]     46817
(0, 50000]          19944
Name: count, dtype: int64
EstimatedSalaryGroup (no labels) created



## Balance to Salary ratio

In [12]:
# Account balance expressed as a multiple of the estimated salary.
data['BalanceToSalaryRatio'] = data['Balance'].div(data['EstimatedSalary'])

# Split training data

In [13]:
# Display the frame to check the engineered columns before encoding and splitting.
data

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,AgeGroup,CreditScoreGroup,EstimatedSalaryGroup,BalanceToSalaryRatio
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.00,2,1.0,0.0,181449.97,0,1,2,2,0.000000
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.00,2,1.0,1.0,49503.50,0,1,2,0,0.000000
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.00,2,1.0,0.0,184866.69,0,1,3,2,0.000000
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0,1,2,1,1.760655
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.00,2,1.0,1.0,15068.83,0,1,3,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,165029,15667085,Meng,667,Spain,Female,33.0,2,0.00,1,1.0,1.0,131834.75,0,1,2,2,0.000000
165030,165030,15665521,Okechukwu,792,France,Male,35.0,3,0.00,1,0.0,0.0,131834.45,0,1,4,2,0.000000
165031,165031,15664752,Hsia,565,France,Male,31.0,5,0.00,1,1.0,1.0,127429.56,0,0,1,2,0.000000
165032,165032,15689614,Hsiung,554,Spain,Female,30.0,7,161533.00,1,0.0,1.0,71173.03,0,0,1,1,2.269582


In [14]:
# Identifier columns and the target are excluded from the feature matrix.
drop_cols = ['id', 'CustomerId', 'Surname', 'Exited']

# One-hot encode the remaining categorical columns. x_data stays a DataFrame
# so the encoded feature names remain available.
x_data = pd.get_dummies(data.drop(columns=drop_cols)).astype(np.float32)
y_data = data['Exited'].astype(np.float32)

# Large data set, so a small hold-out (5%) is enough for validation.
# Split BEFORE scaling so the hold-out rows do not influence the scaler.
x_train_raw, x_dev_raw, y_train, y_ = train_test_split(
    x_data, y_data, test_size=0.05, random_state=0
)

# Fit the scaler on the training rows only, then apply the same transform
# to the hold-out rows.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train_raw)
x_ = scaler.transform(x_dev_raw)

In [15]:
def predict(model, x, y, threshold=0.5):
    """Return the accuracy of ``model`` on ``(x, y)`` after thresholding.

    The model's raw predictions are turned into 0/1 labels (1 when the
    prediction is ``>= threshold``) and compared element-wise with ``y``.

    Args:
        model: Object exposing ``predict(x)`` that returns one score per row,
            shaped ``(n,)`` or ``(n, 1)``.
        x: Features passed unchanged to ``model.predict``.
        y: True 0/1 labels, one per row of ``x``.
        threshold: Cut-off for the positive class. Defaults to 0.5.

    Returns:
        Accuracy rounded to 4 decimal places.
    """
    scores = np.asarray(model.predict(x))
    # Flatten (n,) / (n, 1) outputs to 1-D; wider outputs raise, as before.
    scores = scores.reshape(len(scores))

    # Vectorised thresholding instead of a per-element Python lambda.
    labels = (scores >= threshold).astype(int)

    return round(1 - np.mean(labels != y), 4)

def evaluate(model):
    """Print train and dev accuracy for ``model`` and return the dev accuracy.

    Scores the model with ``predict`` on the notebook-level
    ``x_train``/``y_train`` and ``x_``/``y_`` splits.
    """
    splits = ((x_train, y_train), (x_, y_))
    accu_train, accu_dev = [
        predict(model, features, target) for features, target in splits
    ]

    print(f'Train: {accu_train}\tDev: {accu_dev}')

    return accu_dev

## Hyperparameter tuning

In [18]:
from sklearn.model_selection import  cross_val_score
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified CV splitter (shuffled, fixed seed) used as `cv` by objective_xgb.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def objective_xgb(trial):
    """Optuna objective: mean cross-validated ROC AUC of an XGBRegressor.

    Samples one hyperparameter configuration from ``trial``, scores it with
    ``cross_val_score`` on the notebook-level ``x_train``/``y_train`` using
    the ``skf`` splitter, and returns the mean ``roc_auc`` over the folds.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC across the CV folds (higher is better).
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 150, 1000),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        # Single-choice categorical: pins the seed while still recording it
        # among the trial's parameters.
        'random_state': trial.suggest_categorical('random_state', [42]),
        'tree_method': 'hist',  # Histogram-based tree construction
        'device': 'cuda',  # Train on the GPU
        'eval_metric': 'auc',  # Evaluation metric
        'verbosity': 2,  # Info-level logging; set to 0 for less output
    }

    xgb_model = XGBRegressor(**params)

    fold_scores = cross_val_score(
        xgb_model, x_train, y_train, cv=skf, scoring='roc_auc'
    )

    # ROC AUC is never negative, so the mean is returned as-is.
    return fold_scores.mean()

# Seed the sampler so repeated runs propose the same hyperparameter sequence.
# NOTE(review): GPU training may still add run-to-run variation — confirm.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective_xgb, n_trials=50)

# Best hyperparameters found across all trials
best_params_xgb = study.best_params
print("Best Hyperparameters for XGBoost:", best_params_xgb)

[I 2024-01-12 22:55:56,569] A new study created in memory with name: no-name-25485022-c0a0-4117-9602-e5a85e101182
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


[I 2024-01-12 22:57:03,092] Trial 0 finished with value: 0.8130533023221085 and parameters: {'max_depth': 9, 'min_child_weight': 3, 'learning_rate': 0.4848297954781477, 'n_estimators': 727, 'subsample': 0.5560024566363885, 'colsample_bytree': 0.23598936398330203, 'random_state': 42}. Best is trial 0 with value: 0.8130533023221085.
[I 2024-01-12 22:57:14,014] Trial 1 finished with value: 0.8767941064545266 and parameters: {'max_depth': 6, 'min_child_weight': 6, 'learning_rate': 0.5305176932234652, 'n_estimators': 281, 'subsample': 0.54349268447752, 'colsample_bytree': 0.17126402957453632, 'random_state': 42}. Best is trial 1 with value: 0.8767941064545266.
[I 2024-01-12 22:58:16,685] Trial 2 finished with value: 0.76952495

Best Hyperparameters for XGBoost: {'max_depth': 6, 'min_child_weight': 6, 'learning_rate': 0.015456088221076969, 'n_estimators': 535, 'subsample': 0.25556380739804013, 'colsample_bytree': 0.6705216911932783, 'random_state': 42}


In [19]:
from xgboost import XGBRegressor

# Params found by the Optuna study (study.best_params), not by grid search.
# best_params_xgb holds only the values sampled through trial.suggest_*, so the
# fixed search-time settings (tree_method, device, eval_metric, verbosity) are
# not passed to this final model.
model_xgb = XGBRegressor(**best_params_xgb)
model_xgb.fit(x_train, y_train)

In [21]:
# Candidate models to compare; accu_rates[i] is the dev accuracy of models[i].
models = [model_xgb]
accu_rates = [evaluate(candidate) for candidate in models]

# Train: 0.8667	Dev: 0.8644

Train: 0.868	Dev: 0.8644


In [276]:
# Keep the model with the highest dev accuracy (np.argmax picks the first on ties).
best_model = models[np.argmax(accu_rates)]

In [None]:
# NOTE(review): this submission cell is disabled. As written it cannot run:
#   - `features` is not defined anywhere in the visible notebook;
#   - the test frame never receives the engineered columns (AgeGroup,
#     CreditScoreGroup, EstimatedSalaryGroup, BalanceToSalaryRatio), which is
#     what the recorded "feature names should match" ValueError reports.
# Apply the same feature engineering and column selection to test_data before
# re-enabling it.
# test_data = pd.read_csv("/kaggle/input/playground-series-s4e1/test.csv")
# x_test = pd.get_dummies(test_data[features]).astype('float32')

# x_test = scaler.transform(x_test)

# test_pred = best_model.predict(x_test)
# [test_pred] = test_pred.reshape(1, len(test_pred))
# output = pd.DataFrame({'id': test_data.id, 'Exited': test_pred})
# output.to_csv('submission.csv', index=False)

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- AgeGroup
- BalanceToSalaryRatio
- CreditScoreGroup
- EstimatedSalaryGroup
