# XGBOOST, LIGHTGBM, CATBOOST 실습

![Alt text](../images/lgb.png)

In [1]:
! pip install xgboost lightgbm catboost



# preprocessing

In [2]:
import numpy as np
import random 
import warnings
warnings.simplefilter("ignore", UserWarning)
np.random.seed(1)
random.seed(1)

In [3]:
# Load libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
import pandas as pd
from sklearn.metrics import classification_report

In [4]:
filename = '../dataset/pima-indians-diabetes.data.csv'
dataframe = pd.read_csv(filename, header =None)
dataframe.columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Class']
dataframe.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
X = dataframe.iloc[:, :-1]
y = dataframe.iloc[:, -1] 

In [6]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [7]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Class, dtype: int64

In [8]:
# 데이터 셋 분할하기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0) 

# xgboost 모델

In [9]:
import xgboost as xgb
# Parameter Tuning
model = xgb.XGBClassifier()
param_dist = {"max_depth": [10,30,50],
              "min_child_weight" : [1,3,6],
              "n_estimators": [200],
              "learning_rate": [0.05, 0.1,0.16]}

grid_search = GridSearchCV(model, param_grid=param_dist, cv = 3,scoring="accuracy", 
                                   verbose=10, n_jobs=-1)
grid_search.fit(X_train, y_train)



Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  59 out of  81 | elapsed:    3.2s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  68 out of  81 | elapsed:    3.3s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  77 out of  81 | elapsed:    3.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:    3.4s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, 

In [10]:
grid_search.best_estimator_.get_params()

{'objective': 'binary:logistic',
 'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'gpu_id': -1,
 'importance_type': 'gain',
 'interaction_constraints': '',
 'learning_rate': 0.05,
 'max_delta_step': 0,
 'max_depth': 10,
 'min_child_weight': 6,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 200,
 'n_jobs': 0,
 'num_parallel_tree': 1,
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}

In [11]:
# 1) model 선언
model = xgb.XGBClassifier(**grid_search.best_estimator_.get_params())
# 2) model 학습
model = model.fit(X_train,y_train)

In [12]:
# 3) test 예측
pred_y = model.predict(X_test)
pred_y

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [13]:
print(classification_report(y_test, pred_y))

              precision    recall  f1-score   support

           0       0.87      0.87      0.87       107
           1       0.70      0.70      0.70        47

    accuracy                           0.82       154
   macro avg       0.79      0.79      0.79       154
weighted avg       0.82      0.82      0.82       154



In [14]:
x_gbm_results = classification_report(y_test, pred_y, output_dict=True)

In [15]:
x_gbm_results['1']['f1-score']

0.7021276595744681

In [16]:
model.feature_importances_

array([0.09198124, 0.303577  , 0.0766286 , 0.0744144 , 0.09281166,
       0.14462894, 0.09162459, 0.12433354], dtype=float32)

In [17]:
var_df = pd.Series(model.feature_importances_, index = dataframe.columns[:-1])
var_df.sort_values(ascending=False)

Glucose                     0.303577
BMI                         0.144629
Age                         0.124334
Insulin                     0.092812
Pregnancies                 0.091981
DiabetesPedigreeFunction    0.091625
BloodPressure               0.076629
SkinThickness               0.074414
dtype: float32

----

# LightGBM 모델

In [18]:
import lightgbm as lgb

lg = lgb.LGBMClassifier(silent=True)
param_dict = { "objective":['binary'], # multiclass, regression
              "max_depth": [25,50, 75],
              "learning_rate" : [0.01,0.05,0.1],
              "num_leaves": [300,900,1200],
              "n_estimators": [200]
             }
grid_search = GridSearchCV(lg, n_jobs=-1, param_grid=param_dict, cv = 3, scoring="accuracy")
grid_search.fit(X_train,y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambda=0.0, silent=True,
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'learning_rate': [0.01, 0.05, 0.1],
                         'max_depth

In [19]:
grid_search.best_estimator_.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.01,
 'max_depth': 25,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 200,
 'n_jobs': -1,
 'num_leaves': 300,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [20]:
# model 선언
model = lgb.LGBMClassifier( **grid_search.best_estimator_.get_params())
print(model)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.01, max_depth=25,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=200, n_jobs=-1, num_leaves=300, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


In [21]:
# 모델 학습
model = model.fit(X_train,y_train)

In [22]:
# 예측
pred_y = model.predict(X_test)
pred_y

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [23]:
print(classification_report(y_test, pred_y))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86       107
           1       0.69      0.66      0.67        47

    accuracy                           0.81       154
   macro avg       0.77      0.76      0.77       154
weighted avg       0.80      0.81      0.80       154



In [24]:
light_gbm_results = classification_report(y_test, pred_y, output_dict=True)

In [25]:
light_gbm_results['1']['f1-score']

0.6739130434782609

In [26]:
model.feature_importances_

array([293, 849, 411, 319, 323, 859, 757, 611], dtype=int32)

In [27]:
var_df = pd.Series(model.feature_importances_, index = dataframe.columns[:-1])
var_df.sort_values(ascending=False)

BMI                         859
Glucose                     849
DiabetesPedigreeFunction    757
Age                         611
BloodPressure               411
Insulin                     323
SkinThickness               319
Pregnancies                 293
dtype: int32

- categorical variable?

In [28]:
# Initialize data
train_data = [[1, 0, 1, 4, 5, 6],
              [1, 0, 4, 5, 6, 7],
              [0, 1, 30, 40, 50, 60]]
train_labels = [1, 1, -1]
eval_data = [[1, 0, 2, 4, 6, 8],
             [1, 0, 1, 4, 50, 60]]

# Initialize CatBoostClassifier
model = lgb.LGBMClassifier()

# Fit model
model.fit(X = train_data,
          y = train_labels, 
          feature_name = ['c1','c2', 'c3', 'c4', 'c5', 'c6'],
          categorical_feature = ['c1','c2'])
# Get predicted classes
preds_class = model.predict(eval_data)
print('\npreds_class:\n', preds_class)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_data)
print('preds_proba:\n', preds_proba)




preds_class:
 [1 1]
preds_proba:
 [[0.33333333 0.66666667]
 [0.33333333 0.66666667]]


---

# CatBoost 모델

In [30]:
import catboost as cb

cbm = cb.CatBoostClassifier(silent=True)

params = {'depth': [4, 7, 10],
          'learning_rate' : [0.03, 0.1, 0.15],
         'l2_leaf_reg': [1,4,9],
         'iterations': [300]}


grid_search = GridSearchCV(cbm, params, scoring="accuracy", cv = 3)
grid_search.fit(X_train, y_train)


GridSearchCV(cv=3, error_score=nan,
             estimator=<catboost.core.CatBoostClassifier object at 0x7fbd38e086a0>,
             iid='deprecated', n_jobs=None,
             param_grid={'depth': [4, 7, 10], 'iterations': [300],
                         'l2_leaf_reg': [1, 4, 9],
                         'learning_rate': [0.03, 0.1, 0.15]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [31]:
model = cb.CatBoostClassifier(**grid_search.best_estimator_.get_params())

In [32]:
# 모델 학습
model = model.fit(X_train,y_train)

In [33]:
# 3) test 예측
pred_y = model.predict(X_test)
pred_y

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [34]:
print(classification_report(y_test, pred_y))

              precision    recall  f1-score   support

           0       0.87      0.88      0.87       107
           1       0.72      0.70      0.71        47

    accuracy                           0.82       154
   macro avg       0.79      0.79      0.79       154
weighted avg       0.82      0.82      0.82       154



In [35]:
catboost_results = classification_report(y_test, pred_y, output_dict=True)

In [36]:
catboost_results['1']['f1-score']

0.7096774193548387

In [37]:
model.feature_importances_

array([ 7.65620222, 25.06011137,  6.7629248 ,  3.83782887,  6.89193718,
       21.98133846, 12.51304823, 15.29660887])

In [38]:
var_df = pd.Series(model.feature_importances_, index = dataframe.columns[:-1])
var_df.sort_values(ascending=False)

Glucose                     25.060111
BMI                         21.981338
Age                         15.296609
DiabetesPedigreeFunction    12.513048
Pregnancies                  7.656202
Insulin                      6.891937
BloodPressure                6.762925
SkinThickness                3.837829
dtype: float64

- categorical variable?

In [39]:
# Initialize data
cat_features = [0, 1]
train_data = [["a", "b", 1, 4, 5, 6],
              ["a", "b", 4, 5, 6, 7],
              ["c", "d", 30, 40, 50, 60]]
train_labels = [1, 1, -1]
eval_data = [["a", "b", 2, 4, 6, 8],
             ["a", "d", 1, 4, 50, 60]]

# Initialize CatBoostClassifier
model = cb.CatBoostClassifier(iterations=2,
                           learning_rate=1,
                           depth=2)
# Fit model
model.fit(train_data, train_labels, cat_features)
# Get predicted classes
preds_class = model.predict(eval_data)
print('\npreds_class:\n', preds_class)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(eval_data)
print('preds_proba:\n', preds_proba)




0:	learn: 0.5800330	total: 782us	remaining: 782us
1:	learn: 0.4935379	total: 1.84ms	remaining: 0us

preds_class:
 [1 1]
preds_proba:
 [[0.37014499 0.62985501]
 [0.4641579  0.5358421 ]]


---

# Summary

In [40]:
pd.DataFrame({'Acc': [x_gbm_results['accuracy'], 
                      light_gbm_results['accuracy'],
                      catboost_results['accuracy']],
              'f1':[x_gbm_results['1']['f1-score'], 
                          light_gbm_results['1']['f1-score'], 
                          catboost_results['1']['f1-score']]},
             index =['Xgb', 'Lightgbm', 'Catboost'])

Unnamed: 0,Acc,f1
Xgb,0.818182,0.702128
Lightgbm,0.805195,0.673913
Catboost,0.824675,0.709677
