# Load data

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score

# Competition training and test splits. `test` is read here so that the
# surname encoder further down can be fitted on the names of both splits.
data = pd.read_csv('/kaggle/input/playground-series-s4e1/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e1/test.csv')

# Show the first rows to confirm the columns were parsed as expected.
data.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [2]:
# Number of rows available for training (first axis of the frame).
print('Training size: ', data.shape[0])

Training size:  165034


In [3]:
# How often each surname occurs in the training data, most frequent first.
data['Surname'].value_counts()

Surname
Hsia         2456
T'ien        2282
Hs?          1611
Kao          1577
Maclean      1577
             ... 
Samaniego       1
Lawley          1
Bonwick         1
Tennant         1
Elkins          1
Name: count, Length: 2797, dtype: int64

# Feature engineering

## Define helper function

In [4]:
def cut(data, col, bins):
    """Bin ``data[col]`` and store the integer bin index in ``data[f'{col}Group']``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to read from; it is modified in place (a new column is added
        or overwritten).
    col : str
        Name of the numeric column to bin.
    bins : int or sequence of scalars
        Forwarded unchanged to ``pd.cut``. An integer asks pandas for that many
        equal-width bins over the column's own range; a sequence gives
        explicit edges.

    Returns
    -------
    None
        The result is the new ``<col>Group`` column plus a printed preview of
        how many rows fall into each labelled interval.
    """
    source = data[col]

    # Interval-labelled version, only used for the human-readable preview.
    labelled = pd.cut(source, bins=bins)
    print('*** PREVIEW with LABELS ***')
    print(labelled.value_counts())

    # labels=False stores the ordinal bin number instead of the interval.
    bin_index = pd.cut(source, bins=bins, labels=False)
    data[f'{col}Group'] = bin_index
    print(f'{col}Group (no labels) created')
    print()

## Age Group and Credit Score bands

In [5]:
# Age: five equal-width groups spanning the observed age range.
cut(data, col='Age', bins=5)

# Credit score: edges follow Experian's published scoring ranges, see
# https://www.experian.com/blogs/ask-experian/infographic-what-are-the-different-scoring-ranges/
cut(data, col='CreditScore', bins=[0, 300, 580, 670, 740, 800, 850])

*** PREVIEW with LABELS ***
Age
(32.8, 47.6]      99773
(17.926, 32.8]    43675
(47.6, 62.4]      18789
(62.4, 77.2]       2711
(77.2, 92.0]         86
Name: count, dtype: int64
AgeGroup (no labels) created

*** PREVIEW with LABELS ***
CreditScore
(580, 670]    62141
(670, 740]    51927
(300, 580]    28114
(740, 800]    16257
(800, 850]     6595
(0, 300]          0
Name: count, dtype: int64
CreditScoreGroup (no labels) created



## Salary binning

In [6]:
# Summary statistics (range and quartiles) of the salary column.
data['EstimatedSalary'].describe()

count    165034.000000
mean     112574.822734
std       50292.865585
min          11.580000
25%       74637.570000
50%      117948.000000
75%      155152.467500
max      199992.480000
Name: EstimatedSalary, dtype: float64

We can see that salary ranges from 11.58 (almost 0) to 199,992.48 (around 200k). The first and third quartiles are near multiples of 50k, and the median is a little over twice 50k. So it makes sense to stratify this column into 0 - 50k, 50k - 100k and 100k - 200k ranges.

In [7]:
# Three salary bands: up to 50k, 50k-100k, and 100k-200k.
cut(data, col='EstimatedSalary', bins=[0, 50_000, 100_000, 200_000])

*** PREVIEW with LABELS ***
EstimatedSalary
(100000, 200000]    98273
(50000, 100000]     46817
(0, 50000]          19944
Name: count, dtype: int64
EstimatedSalaryGroup (no labels) created



## Balance to Salary ratio

In [8]:
# Account balance expressed as a multiple of the estimated yearly salary.
data['BalanceToSalaryRatio'] = data['Balance'].div(data['EstimatedSalary'])

## Label Encoding

In [9]:
# Fit the encoder on the surnames of BOTH splits so that names which only
# occur in the test set still receive a code at prediction time.
all_surnames = pd.concat([data['Surname'], test['Surname']])
le = LabelEncoder()
le.fit(all_surnames)

# NOTE: this overwrites the raw surnames with their integer codes, so the cell
# is not safe to run twice without reloading `data`.
data['Surname'] = le.transform(data['Surname'])

## Wrapping things up

In [10]:
def engineer(data):
    """Add every engineered feature to ``data`` and return the same frame.

    Creates ``AgeGroup``, ``CreditScoreGroup`` and ``EstimatedSalaryGroup``
    through ``cut`` and adds ``BalanceToSalaryRatio``. The frame is modified in
    place; it is returned only so the call can be written as an assignment.

    Along the way it prints the per-bin previews produced by ``cut`` and the
    summary statistics of the new ratio column.

    NOTE(review): ``Age`` is binned with ``bins=5``, so the edges come from the
    age range of whichever frame is passed in. Train and test only get the same
    edges if their age ranges coincide.
    """
    bin_specs = (
        ('Age', 5),
        ('CreditScore', [0, 300, 580, 670, 740, 800, 850]),
        ('EstimatedSalary', [0, 50000, 100000, 200000]),
    )
    for column, edges in bin_specs:
        cut(data, column, edges)

    ratio = data['Balance'] / data['EstimatedSalary']
    data['BalanceToSalaryRatio'] = ratio
    # Describe the stored column (not `ratio`) so the printed Name is the column name.
    print(data['BalanceToSalaryRatio'].describe())
    print()

    return data

# Split training data

In [11]:
# Run the full feature pipeline on the training frame. engineer() makes the same
# cut()/ratio calls as the exploratory cells above, so it recomputes those columns.
data = engineer(data)

*** PREVIEW with LABELS ***
Age
(32.8, 47.6]      99773
(17.926, 32.8]    43675
(47.6, 62.4]      18789
(62.4, 77.2]       2711
(77.2, 92.0]         86
Name: count, dtype: int64
AgeGroup (no labels) created

*** PREVIEW with LABELS ***
CreditScore
(580, 670]    62141
(670, 740]    51927
(300, 580]    28114
(740, 800]    16257
(800, 850]     6595
(0, 300]          0
Name: count, dtype: int64
CreditScoreGroup (no labels) created

*** PREVIEW with LABELS ***
EstimatedSalary
(100000, 200000]    98273
(50000, 100000]     46817
(0, 50000]          19944
Name: count, dtype: int64
EstimatedSalaryGroup (no labels) created

count    165034.000000
mean          2.100904
std          91.642444
min           0.000000
25%           0.000000
50%           0.000000
75%           0.978816
max       12863.796200
Name: BalanceToSalaryRatio, dtype: float64



In [12]:
# Identifier columns are dropped from both splits; the training frame
# additionally loses the target so it cannot leak into the features.
drop_cols_test = ['id', 'CustomerId']
drop_cols = drop_cols_test + ['Exited']

# One-hot encode the remaining string columns and cast everything to float32.
x_data = pd.get_dummies(data.drop(columns=drop_cols)).astype(np.float32)
y_data = data['Exited'].astype(np.float32)

# NOTE(review): the scaler is fitted on all rows before the split, so the
# held-out rows contribute to the mean/variance used for scaling.
scaler = StandardScaler()
x_data = scaler.fit_transform(x_data)

# The data set is large, so a 1% hold-out is enough for validation.
x_train, x_, y_train, y_ = train_test_split(
    x_data,
    y_data,
    test_size=0.01,
    random_state=0,
)

In [13]:
def predict(model, x, y):
    """Return the accuracy of ``model`` on ``(x, y)`` using a 0.5 decision threshold.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(x)``; its output is treated as one
        continuous score per row and may be shaped ``(n,)`` or ``(n, 1)``.
    x : array-like
        Features handed to ``model.predict``.
    y : array-like
        True 0/1 labels, one per row of ``x``.

    Returns
    -------
    float
        Fraction of rows whose thresholded score equals the label, rounded to
        4 decimal places.
    """
    # Flatten so (n,) and (n, 1) outputs are handled the same way.
    scores = np.ravel(model.predict(x))

    # Vectorised thresholding: scores >= 0.5 become class 1, everything else 0.
    labels = (scores >= 0.5).astype(int)

    # accuracy_score expects (y_true, y_pred).
    return round(accuracy_score(y, labels), 4)

def evaluate(model):
    """Print the train and dev accuracy of ``model`` and return the dev accuracy.

    Uses the module-level ``x_train``/``y_train`` and ``x_``/``y_`` splits and
    the ``predict`` helper for scoring.
    """
    splits = ((x_train, y_train), (x_, y_))
    # Evaluated in order: training split first, then the held-out split.
    accu_train, accu_dev = (predict(model, feats, target) for feats, target in splits)

    print(f'Train: {accu_train}\tDev: {accu_dev}')

    return accu_dev

# Hyper parameter tuning

Best hyperparameters found using Optuna:
```
{
    'max_depth': 6, 
    'min_child_weight': 6, 
    'learning_rate': 0.015456088221076969, 
    'n_estimators': 535, 
    'subsample': 0.25556380739804013, 
    'colsample_bytree': 0.6705216911932783, 
    'random_state': 42
}
```

# Train

In [14]:
# NOTE(review): this import sits in the training cell rather than with the other
# imports at the top of the notebook.
from xgboost import XGBRegressor

# Hyperparameters copied from the Optuna search result listed above.
params = {
    'max_depth': 6, 
    'min_child_weight': 6, 
    'learning_rate': 0.015456088221076969, 
    'n_estimators': 535, 
    'subsample': 0.25556380739804013, 
    'colsample_bytree': 0.6705216911932783, 
    'random_state': 42
}

# Params found using Optuna
# A regressor (not a classifier) is fitted on the 0/1 target, so model.predict
# yields continuous scores; predict() turns them into labels with a 0.5 cut-off.
model_xgb = XGBRegressor(**params)
model_xgb.fit(x_train, y_train)

# Evaluate

In [15]:
# Candidate models and, in the same order, their dev-set accuracies.
models = [model_xgb]
accu_rates = []

for model in models:
    dev_accuracy = evaluate(model)
    accu_rates.append(dev_accuracy)

# Best: [Train: 0.8701	Dev: 0.8651]

Train: 0.8696	Dev: 0.871


In [16]:
# Keep the candidate with the highest dev accuracy (first one wins on ties).
best_index = int(np.argmax(accu_rates))
best_model = models[best_index]

# Submit

In [17]:
# Build the test features with the same pipeline that produced the training features.
test_data = pd.read_csv("/kaggle/input/playground-series-s4e1/test.csv")
test_data = engineer(test_data)
test_data['Surname'] = le.transform(test_data['Surname'])

# engineer() bins Age into 5 equal-width groups over the range of the frame it
# receives, so the test set would otherwise get edges derived from its own
# min/max. Re-bin with the edges of the training ages so AgeGroup has the same
# meaning at fit and predict time. Ages outside the training range become NaN.
age_edges = pd.cut(data['Age'], bins=5, retbins=True)[1]
test_data['AgeGroup'] = pd.cut(test_data['Age'], bins=age_edges, labels=False)

x_test = pd.get_dummies(test_data.drop(columns=drop_cols_test)).astype('float32')

# One-hot columns depend on which categories occur in the frame. Align them to
# the columns (and order) the scaler saw at fit time: categories missing from
# the test set become all-zero columns, unseen extras are dropped.
train_columns = getattr(scaler, 'feature_names_in_', None)
if train_columns is not None:
    x_test = x_test.reindex(columns=train_columns, fill_value=0.0)

x_test = scaler.transform(x_test)

# Flatten to one score per row regardless of whether predict returns (n,) or (n, 1).
test_pred = np.ravel(best_model.predict(x_test))
output = pd.DataFrame({'id': test_data['id'], 'Exited': test_pred})
output.to_csv('submission.csv', index=False)

*** PREVIEW with LABELS ***
Age
(32.8, 47.6]      66677
(17.926, 32.8]    29148
(47.6, 62.4]      12294
(62.4, 77.2]       1829
(77.2, 92.0]         75
Name: count, dtype: int64
AgeGroup (no labels) created

*** PREVIEW with LABELS ***
CreditScore
(580, 670]    41333
(670, 740]    34596
(300, 580]    18723
(740, 800]    10923
(800, 850]     4448
(0, 300]          0
Name: count, dtype: int64
CreditScoreGroup (no labels) created

*** PREVIEW with LABELS ***
EstimatedSalary
(100000, 200000]    65148
(50000, 100000]     31290
(0, 50000]          13585
Name: count, dtype: int64
EstimatedSalaryGroup (no labels) created

count    110023.000000
mean          2.115039
std          86.120211
min           0.000000
25%           0.000000
50%           0.000000
75%           0.978741
max       10614.655440
Name: BalanceToSalaryRatio, dtype: float64

