# Santander Customer Satisfaction

In [1]:
# module import

# EDA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Separate
from sklearn.model_selection import train_test_split

# models
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# validation
from sklearn.model_selection import GridSearchCV
# ETC
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Load the training set and report the class balance of the TARGET label
cust_data = pd.read_csv('../Data/train.csv')
cust_data.head(1)
print(cust_data['TARGET'].value_counts())

# Share of rows with TARGET == 1 (counted below as "unsatisfied")
total_cnt = cust_data['TARGET'].count()
unsatisfied_cnt = cust_data.loc[cust_data['TARGET'] == 1, 'TARGET'].count()
print(f"unsatisfied ratio > {unsatisfied_cnt / total_cnt:.2f}")

0    73012
1     3008
Name: TARGET, dtype: int64
unsatisfied ratio > 0.04


In [3]:
# Clean-up: neutralise the var3 placeholder and drop the row identifier.
# var3 carries the sentinel -999999 (the original note points at its minimum in
# describe()); it is replaced with 2 -- presumably var3's most common value,
# TODO confirm against cust_data['var3'].value_counts().
cust_data['var3'] = cust_data['var3'].replace(-999999, 2)

# Rebind instead of inplace=True and tolerate an already-missing column, so
# re-running this cell no longer raises KeyError once 'ID' has been dropped.
cust_data = cust_data.drop(columns='ID', errors='ignore')

# Kept as the last expression so the summary is actually rendered; as the first
# statement of the cell its result was computed and thrown away.
cust_data.describe()

In [4]:
# Separate features from the label: every column but the last is a feature,
# the last column is the label.
X, y = cust_data.iloc[:, :-1], cust_data.iloc[:, -1]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified, so the class ratio of the two
# parts is only approximately equal -- consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=150
)

# Shapes first, then the class proportions of each split
# (the printed labels read "train distribution ratio" / "test distribution ratio").
print(X_train.shape, X_test.shape)
train_cnt = y_train.count()
print('학습 분포 비율:\n', y_train.value_counts() / train_cnt)
test_cnt = y_test.count()
print('\n 테스트 분포 비율:\n', y_test.value_counts() / test_cnt)

(60816, 369) (15204, 369)
학습 분포 비율:
 0    0.960701
1    0.039299
Name: TARGET, dtype: float64

 테스트 분포 비율:
 0    0.959353
1    0.040647
Name: TARGET, dtype: float64


In [5]:
# Inspect the training labels (rendered as a truncated Series repr)
y_train

45327    0
23334    0
48217    0
12634    0
68486    0
        ..
2354     0
496      0
49177    0
28155    0
62692    0
Name: TARGET, Length: 60816, dtype: int64

### XGBoost Model

In [6]:
# XGBoost classifier: up to 500 boosting rounds, seeded for reproducibility
xgb_clf = XGBClassifier(
    n_estimators=500,
    random_state=150,
)

In [7]:
# Train with early stopping, logging AUC on both evaluation sets every round.
# NOTE(review): the test split doubles as the early-stopping set here, so the
# AUC later reported on X_test is optimistic; a validation split carved out of
# X_train would avoid that.
evals = [(X_train, y_train), (X_test, y_test)]
xgb_clf.fit(
    X_train,
    y_train,
    eval_set=evals,
    eval_metric="auc",
    early_stopping_rounds=100,
)

[0]	validation_0-auc:0.82496	validation_1-auc:0.81473
[1]	validation_0-auc:0.83359	validation_1-auc:0.82282
[2]	validation_0-auc:0.84137	validation_1-auc:0.82665
[3]	validation_0-auc:0.84633	validation_1-auc:0.82918
[4]	validation_0-auc:0.84861	validation_1-auc:0.82973
[5]	validation_0-auc:0.85185	validation_1-auc:0.83182
[6]	validation_0-auc:0.85599	validation_1-auc:0.83561
[7]	validation_0-auc:0.85908	validation_1-auc:0.83554
[8]	validation_0-auc:0.86367	validation_1-auc:0.83485
[9]	validation_0-auc:0.86616	validation_1-auc:0.83632
[10]	validation_0-auc:0.86949	validation_1-auc:0.83733
[11]	validation_0-auc:0.87153	validation_1-auc:0.83751
[12]	validation_0-auc:0.87418	validation_1-auc:0.83832
[13]	validation_0-auc:0.87696	validation_1-auc:0.83890
[14]	validation_0-auc:0.87867	validation_1-auc:0.83840
[15]	validation_0-auc:0.88074	validation_1-auc:0.83924
[16]	validation_0-auc:0.88313	validation_1-auc:0.84013
[17]	validation_0-auc:0.88407	validation_1-auc:0.83973
[18]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=12, num_parallel_tree=1,
              random_state=150, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [8]:
# Score the hold-out rows: positive-class probabilities and hard class labels
xgb_clf_proba = xgb_clf.predict_proba(X_test)[:, 1]
xgb_clf_preds = xgb_clf.predict(X_test)

In [9]:
# ROC AUC of the XGBoost probabilities on the test split
xgb_clf_auc = roc_auc_score(y_test, xgb_clf_proba, average='macro')
print(f"roc_auc : {xgb_clf_auc:.4f}")

roc_auc : 0.8402


In [None]:
# 2nd validation: grid-search a few tree-shape parameters with 3-fold CV.
# (GridSearchCV is already imported in the first cell, so it is not re-imported here.)
xgb_clf = XGBClassifier(n_estimators=100)
params = {
    'max_depth': [5, 7],
    'min_child_weight': [1, 3],
    'colsample_bytree': [0.5, 0.75],
}

# scoring='roc_auc' ranks the candidates by the metric this notebook reports.
# Without it GridSearchCV falls back to the estimator's default score
# (accuracy), which barely separates candidates when ~96% of rows are class 0.
gridcv = GridSearchCV(xgb_clf, param_grid=params, scoring='roc_auc', cv=3)

# NOTE(review): X_test is used as the early-stopping set for every candidate,
# so the AUC printed below is not a clean hold-out estimate.
gridcv.fit(
    X_train,
    y_train,
    early_stopping_rounds=30,
    eval_metric='auc',
    eval_set=[(X_train, y_train), (X_test, y_test)],
)

# Best parameter combination and the best estimator's AUC on the test split
print('GV 최적 파라미터:', gridcv.best_params_)
xgb_roc_score = roc_auc_score(y_test, gridcv.predict_proba(X_test)[:, 1])
print('AUC:{:.4f}'.format(xgb_roc_score))

[0]	validation_0-auc:0.80729	validation_1-auc:0.79441
[1]	validation_0-auc:0.81724	validation_1-auc:0.80286
[2]	validation_0-auc:0.82363	validation_1-auc:0.81132
[3]	validation_0-auc:0.82988	validation_1-auc:0.81728
[4]	validation_0-auc:0.83261	validation_1-auc:0.82028
[5]	validation_0-auc:0.83461	validation_1-auc:0.81962
[6]	validation_0-auc:0.83276	validation_1-auc:0.81758
[7]	validation_0-auc:0.84359	validation_1-auc:0.82437
[8]	validation_0-auc:0.84671	validation_1-auc:0.82665
[9]	validation_0-auc:0.84815	validation_1-auc:0.82362
[10]	validation_0-auc:0.84744	validation_1-auc:0.82007
[11]	validation_0-auc:0.84790	validation_1-auc:0.81824
[12]	validation_0-auc:0.85346	validation_1-auc:0.82656
[13]	validation_0-auc:0.85699	validation_1-auc:0.82975
[14]	validation_0-auc:0.85938	validation_1-auc:0.83134
[15]	validation_0-auc:0.86102	validation_1-auc:0.83040
[16]	validation_0-auc:0.86258	validation_1-auc:0.83243
[17]	validation_0-auc:0.86442	validation_1-auc:0.83319
[18]	validation_0-au

In [None]:
# Report the winning grid-search parameters and the resulting test-set AUC
print('GV 최적 파라미터:', gridcv.best_params_)
xgb_roc_score = roc_auc_score(
    y_test,
    gridcv.predict_proba(X_test)[:, 1],
)
print(f'AUC:{xgb_roc_score:.4f}')

In [None]:
# Column 1 of the grid search's predict_proba output for the test rows
gridcv.predict_proba(X_test)[:,1]

In [None]:
# Refit with explicit hyper-parameters, more rounds and a longer patience.
# Fix: the keyword is `colsample_bytree`; the misspelt `colsample_bytes` is not
# an XGBoost parameter, so the 0.75 column sampling was never applied.
xgb_clf = XGBClassifier(
    n_estimators=1000,
    random_state=150,
    colsample_bytree=0.75,
    max_depth=5,
    min_child_weight=1,
    reg_alpha=0.02,
)
# `evals` is the evaluation list defined in the earlier XGBoost fit cell
xgb_clf.fit(
    X_train,
    y_train,
    early_stopping_rounds=200,
    eval_metric='auc',
    eval_set=evals,
)

In [None]:
# Test-set ROC AUC of the current xgb_clf model
xgb_roc_score = roc_auc_score(
    y_test,
    xgb_clf.predict_proba(X_test)[:, 1],
)
print(f'AUC:{xgb_roc_score:.4f}')

In [None]:
# LightGBM classifier: up to 500 boosting rounds, seeded for reproducibility
lgbm_wrapper = LGBMClassifier(
    n_estimators=500,
    random_state=150,
)

In [None]:
# Fit LightGBM with early stopping on AUC.
# Fix: the evaluation list held (X_test, y_test) twice, which only duplicated
# the log column and overwrote the `evals` used by the XGBoost cells with a
# different list. It now matches the XGBoost definition (train + test), so
# re-running earlier cells afterwards behaves the same. LightGBM ignores the
# training set when checking early stopping, so stopping is still driven by
# the test split alone.
evals = [(X_train, y_train), (X_test, y_test)]
lgbm_wrapper.fit(
    X_train,
    y_train,
    early_stopping_rounds=100,
    eval_metric='auc',
    eval_set=evals,
    verbose=True,
)

In [None]:
# Score the hold-out rows: positive-class probabilities and hard class labels
lgbm_wrapper_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]
lgbm_wrapper_preds = lgbm_wrapper.predict(X_test)

In [None]:
# ROC AUC of the LightGBM probabilities on the test split
lgb_roc_score = roc_auc_score(y_test, lgbm_wrapper_proba, average='macro')
print(f"roc_auc : {lgb_roc_score:.4f}")

In [None]:
# 2nd Validation: start of a LightGBM grid search (unfinished).
grid_lgb_clf = LGBMClassifier()
# Print the estimator's parameters to see what can be tuned
print(grid_lgb_clf.get_params())
# Fix: the key must be the string 'max_depth'; the bare name raised NameError.
# TODO: fill in the candidate depths -- the grid is still empty.
params = {'max_depth': []}