In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    GridSearchCV
)

In [3]:
!pip install dask[dataframe] > /tmp/null

In [4]:
import xgboost as xgb
import lightgbm as lgb

In [5]:
df = pd.read_csv('churn_modelling.csv')
df.shape

(10000, 13)

In [6]:
df.head(8).T

Unnamed: 0,0,1,2,3,4,5,6,7
CustomerId,15634602,15647311,15619304,15701354,15737888,15574012,15592531,15656148
Surname,Hargrave,Hill,Onio,Boni,Mitchell,Chu,Bartlett,Obinna
CreditScore,619,608,502,699,850,645,822,376
Geography,France,Spain,France,France,Spain,Spain,France,Germany
Gender,Female,Female,Female,Female,Female,Male,Male,Female
Age,42,41,42,39,43,44,50,29
Tenure,2,1,8,1,2,8,7,4
Balance,0.0,83807.86,159660.8,0.0,125510.82,113755.78,0.0,115046.74
NumOfProducts,1,1,3,2,1,2,2,4
HasCrCard,1,0,1,0,1,1,1,1


In [7]:
df = df.drop(['CustomerId', 'Surname'], axis= 1)
df.shape

(10000, 11)

In [8]:
df.dtypes

Unnamed: 0,0
CreditScore,int64
Geography,object
Gender,object
Age,int64
Tenure,int64
Balance,float64
NumOfProducts,int64
HasCrCard,int64
IsActiveMember,int64
EstimatedSalary,float64


In [9]:
df.isnull().sum()

Unnamed: 0,0
CreditScore,0
Geography,0
Gender,0
Age,0
Tenure,0
Balance,0
NumOfProducts,0
HasCrCard,0
IsActiveMember,0
EstimatedSalary,0


In [10]:
df['Gender'].value_counts()

Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
Male,5457
Female,4543


In [11]:
df['is_male'] = (df['Gender'] == 'Male').astype('int')
df['is_male'].value_counts()

Unnamed: 0_level_0,count
is_male,Unnamed: 1_level_1
1,5457
0,4543


In [12]:
df['Geography'].value_counts()

Unnamed: 0_level_0,count
Geography,Unnamed: 1_level_1
France,5014
Germany,2509
Spain,2477


In [13]:
cols_cat = ['Geography']
df_1hot = pd.get_dummies(df[cols_cat], drop_first=True).astype('int')
df_1hot.head(10)

Unnamed: 0,Geography_Germany,Geography_Spain
0,0,0
1,0,1
2,0,0
3,0,0
4,0,1
5,0,1
6,0,0
7,1,0
8,0,0
9,0,0


In [14]:
cols2drop = ['Gender']
cols2drop += cols_cat
df = df.drop(cols2drop, axis=1)

In [15]:
df = pd.concat([df, df_1hot], axis=1)
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,is_male,Geography_Germany,Geography_Spain
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,0,1


In [16]:
col_target = 'Exited'

X = df.drop(col_target, axis=1).values
y = df[col_target].values

In [17]:
X_trn, X_tst, y_trn, y_tst = train_test_split(X, y,
                                              test_size=.2,
                                              random_state=1)

X_trn.shape[0], X_tst.shape[0]

(8000, 2000)

In [18]:
cls_lgb = lgb.LGBMClassifier()

scores_lgb = cross_val_score(cls_lgb,
                             X_trn, y_trn,
                             cv=5, scoring='accuracy')
scores_lgb

[LightGBM] [Info] Number of positive: 1298, number of negative: 5102
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000480 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 855
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.202813 -> initscore=-1.368808
[LightGBM] [Info] Start training from score -1.368808
[LightGBM] [Info] Number of positive: 1298, number of negative: 5102
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000444 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 857
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:

array([0.8625  , 0.858125, 0.843125, 0.8625  , 0.858125])

In [19]:
scores_lgb.mean().round(3), scores_lgb.std().round(3)

(0.857, 0.007)

In [20]:
params_lgb = [{
    'n_estimators': [10, 50, 100,],
    'learning_rate': [.01, .1],
    'max_depth': [5, -1],
    'colsample_bytree': [.6]
}]

grid_lgb = GridSearchCV(cls_lgb, params_lgb,
                          scoring='accuracy',
                          cv=5)

In [21]:
grid_lgb.fit(X_trn, y_trn)

[LightGBM] [Info] Number of positive: 1298, number of negative: 5102
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001631 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 855
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.202813 -> initscore=-1.368808
[LightGBM] [Info] Start training from score -1.368808
[LightGBM] [Info] Number of positive: 1298, number of negative: 5102
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001400 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 857
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.202813 -> initscore=-1.368808
[LightGBM] [Info] Start training from score -1.368808
[LightGBM] [Info] Nu

In [22]:
grid_lgb.best_score_

0.863875

In [23]:
grid_lgb.best_params_

{'colsample_bytree': 0.6,
 'learning_rate': 0.1,
 'max_depth': 5,
 'n_estimators': 50}

In [24]:
grid_lgb.best_estimator_