# [XGBoost (eXtreme Gradient Boost)](https://github.com/kyopark2014/ML-Algorithms/blob/main/boosting.md#xgboost-extreme-gradient-boost)

In [None]:
!pip install xgboost

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

wine = pd.read_csv('https://bit.ly/wine_csv_data')

In [2]:
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [3]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   alcohol  6497 non-null   float64
 1   sugar    6497 non-null   float64
 2   pH       6497 non-null   float64
 3   class    6497 non-null   float64
dtypes: float64(4)
memory usage: 203.2 KB


In [4]:
wine.isna().sum().sum()

0

In [5]:
wine['class'].value_counts()

1.0    4898
0.0    1599
Name: class, dtype: int64

In [6]:
wine['class']

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
6492    1.0
6493    1.0
6494    1.0
6495    1.0
6496    1.0
Name: class, Length: 6497, dtype: float64

In [7]:
from sklearn.utils import shuffle
wine = shuffle(wine, random_state=2)

wine.head()

Unnamed: 0,alcohol,sugar,pH,class
799,10.7,3.6,3.29,0.0
1153,11.1,2.1,3.23,0.0
4915,12.0,1.1,3.16,1.0
2602,10.4,1.3,3.31,1.0
5267,11.2,11.0,3.07,1.0


### Split Train/Test Dataset

In [8]:
data = wine[['alcohol', 'sugar', 'pH']].to_numpy()
target = wine['class'].to_numpy()

train_input, test_input, train_target, test_target = train_test_split(data, target, test_size=0.2, random_state=42)

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl (255.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.9/255.9 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

### XGBClassifier

In [16]:
from xgboost import XGBClassifier

xgb = XGBClassifier(tree_method='hist', random_state=42)

In [17]:
xgb.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': 42,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': 'hist',
 'validate_parameters': None,
 'verbosity': None}

### cross_validate

In [10]:
from sklearn.model_selection import cross_validate

scores = cross_validate(xgb, train_input, train_target, return_train_score=True, n_jobs=-1)

print(np.mean(scores['train_score']), np.mean(scores['test_score']))

0.9531942215608321 0.8770448656252313


### Evaluation

In [11]:
import time
start = time.time()

xgb.fit(train_input, train_target)

y_pred = xgb.predict(test_input)

from sklearn.metrics import accuracy_score
score = accuracy_score(y_pred, test_target)

print('Accuracy:', np.round(score, 3))

print('\nElased time: %0.2fs' % (time.time()-start))

Accuracy: 0.885

Elased time: 0.45s


In [12]:
from sklearn.metrics import classification_report
print(classification_report(y_true=test_target, y_pred = y_pred))

              precision    recall  f1-score   support

         0.0       0.77      0.77      0.77       326
         1.0       0.92      0.92      0.92       974

    accuracy                           0.89      1300
   macro avg       0.85      0.85      0.85      1300
weighted avg       0.89      0.89      0.89      1300



### Feature Importance

In [13]:
print(xgb.feature_importances_)

[0.20768014 0.5993436  0.19297631]


In [14]:
import xgboost as xgb

def feature_importance(model, X_test): 
    feature_data = xgb.DMatrix(X_test)
    model.get_booster().feature_names = feature_data.feature_names
    model.get_booster().feature_types = feature_data.feature_types

    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(15, 8))
    xgb.plot_importance(model, ax=ax, importance_type='gain')

In [15]:
feature_importance(model=xgb, X_test=test_input)

AttributeError: module 'xgboost' has no attribute 'get_booster'