# Load data

In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score

data = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

data.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [14]:
print('Training size: ', data.shape[0])

Training size:  165034


# Feature engineering

## Determine ranges

In [15]:
print(data['Age'].describe(), end='\n\n')
print(data['EstimatedSalary'].describe(), end='\n\n')

count    165034.000000
mean         38.125888
std           8.867205
min          18.000000
25%          32.000000
50%          37.000000
75%          42.000000
max          92.000000
Name: Age, dtype: float64

count    165034.000000
mean     112574.822734
std       50292.865585
min          11.580000
25%       74637.570000
50%      117948.000000
75%      155152.467500
max      199992.480000
Name: EstimatedSalary, dtype: float64



We can see that salary ranges from 11.58 (almost 0) to 199,992.48 (around 200k). First and third quartiles are near multiples of 50k, with median over twice of 50k. So it makes sense to stratefy this column into 0 - 50k, 50k - 100k, 100k - 200k ranges.

## Binning

In [16]:
def cut(data, col, bins):
    preview = pd.cut(data[col], bins=bins)
    print('*** PREVIEW with LABELS ***')
    print(preview.value_counts())
    
    data[f'{col}Group'] = pd.cut(data[col], bins=bins, labels=False)
    print(f'{col}Group (no labels) created')
    print()

## Feature engineering

In [17]:
def engineer(data):
    # Age groups
    cut(data, 'Age', 5)
    
    # https://www.experian.com/blogs/ask-experian/infographic-what-are-the-different-scoring-ranges/
    cut(data, 'CreditScore', [0, 300, 580, 670, 740, 800, np.inf])
    
    cut(data, 'EstimatedSalary', [0, 50000, 100000, np.inf])
    
    data['BalanceToSalaryRatio'] = data['Balance'] / data['EstimatedSalary']
    print(data['BalanceToSalaryRatio'].describe())
    print()
    
    data['TenureGroup'] = pd.cut(data['Tenure'], bins=[0, 2, 5, 10, np.inf], labels=False)
    
    data['CreditCard_ActiveMember'] = data['HasCrCard'] * data['IsActiveMember']
    
    return data

## Remove outliers
Outliers can make our model more biased towards such samples. Removing them might increase accuracy on test data.

In [18]:
def remove_outliers(df):
    quant_cols = [
        'CreditScore',
        'Age', 
        'Balance', 
        'EstimatedSalary'
    ]
    
    for col in quant_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3-Q1

        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        
        df = df[(df[col] >= lower) & (df[col] <= upper)]
    return df

## Label Encoding

In [19]:
class SurnameEncoder:
    def __init__(self, train_set, test_set):
        self.le = LabelEncoder()
        self.le.fit(pd.concat([train_set, test])['Surname'])
    
    def transform(self, df):
        df['Surname'] = self.le.transform(df['Surname'])
        return df


## Combined data processing

In [20]:
se = SurnameEncoder(data, test)

data = remove_outliers(data)
data = engineer(data)
data = se.transform(data)

test = engineer(test)
test = se.transform(test)

*** PREVIEW with LABELS ***
Age
(33.6, 41.4]      67043
(25.8, 33.6]      44887
(41.4, 49.2]      28876
(49.2, 57.0]      10510
(17.961, 25.8]     7084
Name: count, dtype: int64
AgeGroup (no labels) created

*** PREVIEW with LABELS ***
CreditScore
(580.0, 670.0]    59741
(670.0, 740.0]    50004
(300.0, 580.0]    26758
(740.0, 800.0]    15591
(800.0, inf]       6306
(0.0, 300.0]          0
Name: count, dtype: int64
CreditScoreGroup (no labels) created

*** PREVIEW with LABELS ***
EstimatedSalary
(100000.0, inf]        94558
(50000.0, 100000.0]    44869
(0.0, 50000.0]         18973
Name: count, dtype: int64
EstimatedSalaryGroup (no labels) created

count    158400.000000
mean          2.111896
std          93.404463
min           0.000000
25%           0.000000
50%           0.000000
75%           0.973252
max       12863.796200
Name: BalanceToSalaryRatio, dtype: float64

*** PREVIEW with LABELS ***
Age
(32.8, 47.6]      66677
(17.926, 32.8]    29148
(47.6, 62.4]      12294
(62.4, 77.2] 

# Split training data

In [21]:
drop_cols_test = ['id', 'CustomerId']
drop_cols = ['id', 'CustomerId', 'Exited']
x_data = pd.get_dummies(data.drop(columns=drop_cols)).astype(np.float32)
y_data = (data['Exited']).astype(np.float32)

scaler = StandardScaler()
x_data = scaler.fit_transform(x_data)

# Large data set therefore we just need a small test size for validation purpose
x_train, x_, y_train, y_ = train_test_split(x_data, y_data, test_size=0.2, random_state=0)

In [22]:
def predict(model, x, y):
    p = model.predict(x)
    p = p.reshape(1, len(p))[0]
    p = np.array(list(map(lambda p: 1 if p >= 0.5 else 0, p)))

    accu = round(accuracy_score(p,  y) , 4)
    
    return accu

def evaluate(model):
    accu_train = predict(model, x_train, y_train)
    accu_dev = predict(model, x_, y_)
    
    print(f'Train: {accu_train}\tDev: {accu_dev}')

    return accu_dev

# Hyper parameter tuning

Run this code separately.

In [23]:
from sklearn.model_selection import  cross_val_score
from sklearn.model_selection import StratifiedKFold
import optuna
from xgboost import XGBRegressor

# Assuming 'skf' is your StratifiedKFold object
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def objective_xgb(trial):
    """Define the objective function for XGBClassifier"""

    params = {
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 150, 1000),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'random_state': trial.suggest_categorical('random_state', [42]),
        'tree_method': 'hist',  # Use GPU for training
        'device' : 'cuda',
        'eval_metric': 'auc',  # Evaluation metric
        'verbosity': 2,  # Set verbosity to 0 for less output
    }

    xgb_model = XGBRegressor(**params)

    cv = abs(cross_val_score(xgb_model, x_train, y_train, cv=skf, scoring='roc_auc').mean())

    return cv

study = optuna.create_study(direction='maximize')
study.optimize(objective_xgb, n_trials=50)

# Get the best parameters
best_params_xgb = study.best_params
print("Best Hyperparameters for XGBoost:", best_params_xgb)

[I 2024-01-14 15:31:13,624] A new study created in memory with name: no-name-ad1a83e2-aa6e-4c4e-83a5-02bd0f95bc4b
[I 2024-01-14 15:31:23,472] Trial 0 finished with value: 0.8383001448444226 and parameters: {'max_depth': 5, 'min_child_weight': 4, 'learning_rate': 0.9747891959400499, 'n_estimators': 258, 'subsample': 0.6520768269723052, 'colsample_bytree': 0.802880629092744, 'random_state': 42}. Best is trial 0 with value: 0.8383001448444226.
[I 2024-01-14 15:31:36,774] Trial 1 finished with value: 0.8230423303656433 and parameters: {'max_depth': 8, 'min_child_weight': 10, 'learning_rate': 0.7158424645307248, 'n_estimators': 189, 'subsample': 0.8444203465265168, 'colsample_bytree': 0.6169176395534726, 'random_state': 42}. Best is trial 0 with value: 0.8383001448444226.
[I 2024-01-14 15:32:02,248] Trial 2 finished with value: 0.8604539297754205 and parameters: {'max_depth': 5, 'min_child_weight': 4, 'learning_rate': 0.32580211834388895, 'n_estimators': 816, 'subsample': 0.5270860253887788

Best Hyperparameters for XGBoost: {'max_depth': 6, 'min_child_weight': 3, 'learning_rate': 0.011514316949476584, 'n_estimators': 520, 'subsample': 0.3005629662407421, 'colsample_bytree': 0.7104732253814815, 'random_state': 42}


Best Hyperparameters found using Optuna
```
{
    'max_depth': 6, 
    'min_child_weight': 3, 
    'learning_rate': 0.011514316949476584, 
    'n_estimators': 520, 
    'subsample': 0.3005629662407421, 
    'colsample_bytree': 0.7104732253814815, 
    'random_state': 42
}
```

# Train

In [28]:
from xgboost import XGBRegressor

params = {
    'max_depth': 6, 
    'min_child_weight': 3, 
    'learning_rate': 0.011514316949476584, 
    'n_estimators': 520, 
    'subsample': 0.3005629662407421, 
    'colsample_bytree': 0.7104732253814815, 
    'random_state': 42
}

# Params found using Optuna
model_xgb = XGBRegressor(**params)
model_xgb.fit(x_train, y_train)

# Evaluate

In [29]:
models = [model_xgb]
accu_rates = []

for model in models:
    accu_rates.append(evaluate(model))

Train: 0.8722	Dev: 0.8664


In [31]:
best_model = models[np.argmax(accu_rates)]

# Submit

In [33]:
x_test = pd.get_dummies(test.drop(columns=drop_cols_test)).astype('float32')

x_test = scaler.transform(x_test)

test_pred = best_model.predict(x_test)
[test_pred] = test_pred.reshape(1, len(test_pred))
output = pd.DataFrame({'id': test.id, 'Exited': test_pred})
output.to_csv('submission.csv', index=False)