# XGBoost (iris) - Classification
[ch5-advanced-xgboost-unveiled.ipynb](https://github.com/kyopark2014/ML-Algorithms/blob/main/xgboost/src/ch5-advanced-xgboost-unveiled.ipynb)

In [1]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [2]:
import warnings
# Blanket-suppress every warning for the rest of the session. This keeps cell output
# short, but it also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
import time

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [4]:
import xgboost as xgb
# Set XGBoost's global verbosity to 0 (presumably the quietest level — confirm against the xgboost docs).
xgb.set_config(verbosity=0)

In [5]:
from sklearn import datasets
# Load the iris dataset bundle; it is indexed below by 'data', 'target' and 'feature_names'.
iris = datasets.load_iris()

In [6]:
# One frame for exploration: the measurement columns followed by the class label
# as a trailing 'target' column.
df = pd.DataFrame(
    np.column_stack((iris['data'], iris['target'])),
    columns=[*iris['feature_names'], 'target'],
)

In [7]:
# Preview the first rows of the assembled frame.
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


### Shuffle

In [8]:
from sklearn.utils import shuffle
# Shuffle the rows with a fixed seed. Only this exploratory frame is reordered:
# the X and y used for modelling are taken from `iris` directly, not from `df`.
df = shuffle(df, random_state=2)

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
6,4.6,3.4,1.4,0.3,0.0
3,4.6,3.1,1.5,0.2,0.0
113,5.7,2.5,5.0,2.0,2.0
12,4.8,3.0,1.4,0.1,0.0
24,4.8,3.4,1.9,0.2,0.0


In [9]:
# Class balance check: number of rows per target value.
df['target'].value_counts()

1.0    50
2.0    50
0.0    50
Name: target, dtype: int64

In [10]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 6 to 15
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    float64
dtypes: float64(5)
memory usage: 7.0 KB


In [11]:
# Total number of missing values across all columns.
df.isna().sum().sum()

0

### Split Train/Test dataset

In [12]:
# Feature matrix and label vector, read straight from the dataset bundle.
X, y = iris['data'], iris['target']

In [13]:
from sklearn.model_selection import train_test_split

# Hold-out split with a fixed seed so the same train/test partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

## classification_model

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, StratifiedKFold

# 5-fold stratified splitter used by classification_model for cross-validation.
kfold = StratifiedKFold(n_splits=5)

def classification_model(model):
    """Cross-validate ``model`` and print the timing and accuracy summary.

    Uses the notebook-level ``X``, ``y`` and ``kfold`` objects. Prints the
    wall-clock time of the cross-validation, the per-fold accuracy rounded to
    two decimals, and the mean accuracy. Returns nothing.
    """
    t0 = time.time()

    fold_acc = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')

    elapsed = time.time() - t0
    print('\nElased time: %0.2fs' % elapsed)
    print('Accuracy:', np.round(fold_acc, 2))
    print('Avg. Accuracy: %0.2f' % fold_acc.mean())

In [15]:
from xgboost import XGBClassifier

# Baseline: tree booster, every other hyperparameter left unset.
classification_model(XGBClassifier(booster='gbtree'))  # Default


Elased time: 1.79s
Accuracy: [0.97 0.97 0.93 0.93 1.  ]
Avg. Accuracy: 0.96


In [16]:
# List the constructor parameters of an unconfigured XGBClassifier.
XGBClassifier().get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'enable_categorical': False,
 'gamma': None,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [17]:
# Same cross-validated evaluation with the linear booster.
classification_model(XGBClassifier(booster='gblinear'))


Elased time: 1.65s
Accuracy: [1.   1.   0.93 0.9  1.  ]
Avg. Accuracy: 0.97


In [18]:
# Same cross-validated evaluation with the DART booster, with one_drop enabled.
classification_model(XGBClassifier(booster='dart', one_drop=True))


Elased time: 3.52s
Accuracy: [0.97 0.97 0.93 0.93 1.  ]
Avg. Accuracy: 0.96


In [19]:
from sklearn.ensemble import RandomForestClassifier

# Non-XGBoost reference point, evaluated with the same cross-validation helper.
classification_model(RandomForestClassifier())


Elased time: 0.57s
Accuracy: [0.97 0.97 0.93 0.93 1.  ]
Avg. Accuracy: 0.96


## HPO (Hyperparameter Optimization)

### Default

In [20]:
start = time.time()

# Baseline before tuning: the linear booster with otherwise default settings,
# fitted on the training split and scored on the held-out split.
# The estimator is bound to `default_clf` rather than `xgb` so that the
# `import xgboost as xgb` module alias is not overwritten by a classifier instance.
default_clf = XGBClassifier(booster='gblinear')

default_clf.fit(X_train, y_train)

y_pred = default_clf.predict(X_test)

# accuracy_score takes (y_true, y_pred) in that order.
default_score = accuracy_score(y_test, y_pred)

print('default score: %0.2f' % (default_score))
print('Elased time: %0.2fs' % (time.time()-start))

# Snapshot of the baseline estimator's parameters.
params = default_clf.get_params()

default score: 0.97
Elased time: 0.32s


## HPO: Randomized Search

In [21]:
from sklearn.model_selection import RandomizedSearchCV

def randomized_search(params, runs=20, booster='gblinear'):
    """Tune an XGBClassifier with a randomized search and return the best estimator.

    Parameters
    ----------
    params : dict
        Search space handed to RandomizedSearchCV.
    runs : int, default 20
        Number of parameter settings to sample (passed as ``n_iter``).
    booster : str, default 'gblinear'
        Booster type given to XGBClassifier. The default keeps the previously
        hard-coded choice; pass e.g. 'gbtree' when the search space is made of
        tree-specific parameters.

    Returns
    -------
    The search's ``best_estimator_``. The best parameters and the best score
    are printed as a side effect.

    Notes
    -----
    Fits on the notebook-level ``X_train`` / ``y_train``.
    """
    # Named `clf` so the notebook's `xgb` module alias is not shadowed inside the function.
    clf = XGBClassifier(booster=booster, objective='multi:softprob',
                        random_state=2, verbosity=0, use_label_encoder=False, n_jobs=-1)

    # Shuffled, seeded stratified folds for the search itself.
    cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

    search = RandomizedSearchCV(clf, params, cv=cv_folds, n_iter=runs, n_jobs=-1,
                                random_state=2, scoring='accuracy')
    search.fit(X_train, y_train)

    print("best parameter:", search.best_params_)
    print("best score: {:.3f}".format(search.best_score_))

    return search.best_estimator_

In [None]:
import time
start = time.time()

# Randomized search sampling 20 settings from the grid below, timed end to end.
# NOTE(review): randomized_search builds its classifier with booster='gblinear';
# max_depth, gamma, min_child_weight, subsample and colsample_bytree look like
# tree-booster settings — confirm they have any effect with the linear booster.
best_model = randomized_search(
    params={
        'n_estimators':[50, 100, 200],
        'learning_rate':[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
        'max_depth':[1, 2, 3, 5, 6, 8],
        'gamma':[0, 0.01, 0.1, 0.5, 1, 2],
        'min_child_weight':[1, 2, 3, 4, 5],
        'subsample':[0.5, 0.7, 0.8, 0.9, 1],
        'colsample_bytree':[0.5, 0.7, 0.8, 0.9, 1],  
        }, 
    runs=20)

print('\nElased time: %0.2fs' % (time.time()-start))

In [None]:
# Full parameter set of the estimator returned by the search.
best_model.get_params()

## Evaluation

In [None]:
# The evaluation cells below refer to the tuned estimator as `model`.
model = best_model

### cross_val_score

In [None]:
# Cross-validate the tuned estimator with the same helper used for the baselines.
classification_model(model)

### Prediction

In [None]:
# Predictions on the held-out split; reused by the metric cells that follow.
y_pred = model.predict(X_test)

### classification_report

In [None]:
from sklearn.metrics import classification_report
# Text report comparing the held-out labels with the tuned model's predictions.
print(classification_report(y_true=y_test, y_pred = y_pred))

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# The iris target has three classes, so precision/recall/F1 need an explicit
# multiclass averaging mode: their default average='binary' raises ValueError on
# multiclass targets. 'macro' is the unweighted mean of the per-class scores.
print('Accuracy Score: %0.2f' % (accuracy_score(y_test,y_pred)))
print('Precision Score: %0.2f' % (precision_score(y_test, y_pred, average='macro')))
print('Recall Score: %0.2f' % (recall_score(y_test, y_pred, average='macro')))
print('F1 Score: %0.2f' % (f1_score(y_test, y_pred, average='macro')))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of held-out labels against the tuned model's predictions.
confusion_matrix(y_test, y_pred)

## Feature Importances

In [None]:
# Per-feature importance values reported by the tuned estimator.
print(model.feature_importances_)

In [None]:
import xgboost as xgb

# Build a DMatrix from the test features and copy its feature names/types onto the
# fitted booster before plotting.
# NOTE(review): X_test is a plain array here, so these are presumably auto-generated
# defaults rather than the iris column names — confirm.
feature_data = xgb.DMatrix(X_test)
model.get_booster().feature_names = feature_data.feature_names
model.get_booster().feature_types = feature_data.feature_types

import matplotlib.pyplot as plt

# NOTE(review): `model` comes from randomized_search, which builds a 'gblinear'
# booster; verify that importance_type='gain' is defined for that booster type.
fig, ax = plt.subplots(figsize=(15, 8))
xgb.plot_importance(model, ax=ax, importance_type='gain')

In [None]:
!pip install graphviz

In [None]:
# Plot tree number 0 of the model with rankdir='LR'.
# NOTE(review): `model` comes from randomized_search, which builds a 'gblinear'
# booster; tree plotting presumably needs a tree booster — confirm this cell runs.
xgb.plot_tree(model, num_trees=0, rankdir='LR')

# Enlarge the current figure before showing it.
fig = plt.gcf()
fig.set_size_inches(50, 15)
plt.show()

### XGBClassifier

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Stand-alone tree-booster classifier with explicit hyperparameters, fitted on the
# training split.
# Note: assigning to `xgb` rebinds the name that `import xgboost as xgb` uses for
# the module, so module-level calls such as xgb.plot_tree stop working after this
# cell until the module is imported again.
xgb = XGBClassifier(booster='gbtree', objective='multi:softprob', 
                    max_depth=6, learning_rate=0.1, n_estimators=100, 
                    n_jobs=-1)

xgb.fit(X_train, y_train)

### accuracy_score

In [None]:
# Score the stand-alone classifier on the held-out split.
y_pred = xgb.predict(X_test)

# accuracy_score takes (y_true, y_pred) in that order.
score = accuracy_score(y_test, y_pred)

print('Accuracy: %0.2f' % (score))

### classification_report

In [None]:
from sklearn.metrics import classification_report
# Text report for the stand-alone classifier's held-out predictions.
print(classification_report(y_true=y_test, y_pred = y_pred))