# XGBoost - diabetes (Regression)
[ch5-advanced-xgboost-unveiled.ipynb](https://github.com/kyopark2014/ML-Algorithms/blob/main/xgboost/src/ch5-advanced-xgboost-unveiled.ipynb)

In [1]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [2]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings emitted by later cells
# are hidden as well.
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
import time

In [4]:
import xgboost as xgb
# Set XGBoost's global verbosity to 0 (the silent level per the XGBoost docs).
xgb.set_config(verbosity=0)

## Define Features and Target

In [5]:
from sklearn import datasets
# Load the scikit-learn diabetes dataset. return_X_y=True yields a
# (features, target) pair and as_frame=True makes them pandas objects.
X, y = datasets.load_diabetes(return_X_y=True, as_frame=True)

In [6]:
# Summary statistics of the target, shown as a one-column frame.
y.to_frame().describe()

Unnamed: 0,target
count,442.0
mean,152.133484
std,77.093005
min,25.0
25%,87.0
50%,140.5
75%,211.5
max,346.0


In [7]:
# How often each distinct target value occurs.
y.to_frame().value_counts()

target
200.0     6
72.0      6
178.0     5
71.0      5
90.0      5
         ..
199.0     1
201.0     1
208.0     1
209.0     1
25.0      1
Length: 214, dtype: int64

In [8]:
# First rows of the target column.
y.to_frame().head()

Unnamed: 0,target
0,151.0
1,75.0
2,141.0
3,206.0
4,135.0


In [9]:
# Column names, dtypes and non-null counts of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


In [10]:
# Dtype and non-null count of the target column.
y.to_frame().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   target  442 non-null    float64
dtypes: float64(1)
memory usage: 3.6 KB


In [11]:
# Total number of missing feature values across all columns.
X.isna().sum().sum()

0

## Regression Model Selection

In [12]:
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

# The target is a continuous quantity, so plain K-fold is the appropriate
# splitter; stratification is defined for class labels, not regression targets.
# Shuffling with a fixed seed keeps the folds reproducible between runs.
kfold = KFold(n_splits=5, shuffle=True, random_state=2)

def cross_validation(model, features=None, target=None, cv=None):
    """Cross-validate ``model`` and print per-fold and average RMSE.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor.
    features, target : optional
        Data to evaluate on. When omitted, the notebook-level ``X`` and ``y``
        are used, which is the original behaviour. Passing them explicitly
        protects against ``X``/``y`` having been rebound by another cell.
    cv : optional
        Cross-validation splitter; defaults to the notebook-level ``kfold``.

    Returns
    -------
    None
        Results are printed only.
    """
    features = X if features is None else features
    target = y if target is None else target
    cv = kfold if cv is None else cv

    start = time.time()

    # The scorer returns negated MSE, so flip the sign before the square root.
    scores = cross_val_score(model, features, target,
                             scoring='neg_mean_squared_error', cv=cv)
    rmse = np.sqrt(-scores)

    print('Cross Validation:')
    print('Elapsed time: %0.2fs' % (time.time()-start))
    print('RMSE:', np.round(rmse, 3))
    print('Avg. RMSE: %0.3f' % (rmse.mean()))

In [None]:
# Baseline: tree booster (gbtree) with all other settings left at their defaults.
cross_validation(XGBRegressor(booster='gbtree'))

In [None]:
# Linear booster (gblinear) instead of trees, all other settings at their defaults.
cross_validation(XGBRegressor(booster='gblinear'))

In [None]:
# DART booster with one_drop=1.
# NOTE(review): per the XGBoost docs this flag forces at least one tree to be
# dropped in each round - confirm against the installed version.
cross_validation(XGBRegressor(booster='dart', one_drop=1))

In [None]:
# DART booster with rate_drop=0.5 (the dropout rate in the XGBoost parameter docs).
cross_validation(XGBRegressor(booster='dart', rate_drop=0.5))

In [None]:
# Same rate_drop as the previous DART run, now with sample_type='weighted'.
cross_validation(XGBRegressor(booster='dart', rate_drop=0.5, sample_type='weighted'))

In [None]:
# Same rate_drop as the previous DART runs, now with normalize_type='forest'.
cross_validation(XGBRegressor(booster='dart', rate_drop=0.5, normalize_type='forest'))

In [None]:
#cross_validation(XGBRegressor(booster='gbtree', num_parallel_tree=25))

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression

# Non-boosted reference point: plain linear regression scored with the same CV helper.
# NOTE(review): LogisticRegression is imported but not used in the visible cells.
cross_validation(LinearRegression())

In [None]:
from sklearn.linear_model import Lasso

# Lasso with its default settings, scored with the same CV helper.
cross_validation(Lasso())

In [None]:
from sklearn.linear_model import Ridge

# Ridge with its default settings, scored with the same CV helper.
cross_validation(Ridge())

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forests are stochastic; seed the estimator (same seed as the rest of
# the notebook) so the reported RMSE is reproducible across runs.
cross_validation(RandomForestRegressor(random_state=2))

## Hyperparameter Optimization (HPO)

### Default

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out a test set using train_test_split's default split size;
# random_state fixes the partition so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

# Baseline regressor with default hyperparameters.
# `use_label_encoder` is a classifier-only option and is not passed here:
# it has no meaning for XGBRegressor.
model = XGBRegressor(booster='gbtree', random_state=2, verbosity=0, n_jobs=-1)

# Hold-out score of a model trained with its current settings
def getAccuracy(model):
    """Fit ``model`` on the training split and print its hold-out RMSE.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. The reported time covers fitting, prediction and scoring.
    Nothing is returned.
    """
    t0 = time.time()

    model.fit(X_train, y_train)
    holdout_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

    elapsed = time.time() - t0
    print('Elapsed time: %0.2fs' % elapsed)
    print('RMSE: %0.3f' % holdout_rmse)

getAccuracy(model)

## HPO: Randomized Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV

def randomized_search(params, runs=20):
    """Run a randomized hyperparameter search for a gbtree ``XGBRegressor``.

    Parameters
    ----------
    params : dict
        Candidate values per hyperparameter, passed to ``RandomizedSearchCV``.
    runs : int, default 20
        Number of sampled parameter settings (``n_iter``).

    Returns
    -------
    estimator
        The best estimator found (``best_estimator_`` of the search).

    Notes
    -----
    Fits on the notebook-level ``X_train`` / ``y_train`` and prints the best
    parameters, the best negated-MSE score and the corresponding RMSE.
    """
    # Local import keeps this cell self-contained.
    from sklearn.model_selection import KFold

    # Named `regressor` so the `xgb` module alias is not shadowed.
    regressor = XGBRegressor(booster='gbtree', random_state=2, verbosity=1, n_jobs=-1)

    # Plain K-fold: the target is continuous, so there are no classes to
    # stratify on (a stratified splitter can even fail when no target value
    # occurs n_splits times in the training split).
    cv = KFold(n_splits=5, shuffle=True, random_state=2)

    rand_reg = RandomizedSearchCV(regressor, params, cv=cv, n_iter=runs, n_jobs=-1,
                                  random_state=2, scoring='neg_mean_squared_error')

    rand_reg.fit(X_train, y_train)

    best_params = rand_reg.best_params_
    print("best parameter:", best_params)

    # best_score_ is the mean negated MSE; also report it on the RMSE scale
    # used by the rest of the notebook.
    best_score = rand_reg.best_score_
    print("best score: {:.3f}".format(best_score))
    print("best RMSE: {:.3f}".format(np.sqrt(-best_score)))

    return rand_reg.best_estimator_

In [None]:
# Wider candidate grid kept for reference (not used by the search below):
#        'n_estimators':[50, 100, 200],
#        'learning_rate':[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5],
#        'max_depth':[1, 2, 3, 5, 6, 8],
#        'gamma':[0, 0.01, 0.1, 0.5, 1, 2],
#        'min_child_weight':[1, 2, 3, 4, 5],
#        'subsample':[0.5, 0.7, 0.8, 0.9, 1],
#        'colsample_bytree':[0.5, 0.7, 0.8, 0.9, 1],  

# Time the whole search, including the work done inside randomized_search.
start = time.time()

# Sample 20 settings from the reduced grid and keep the best estimator.
best_model = randomized_search(
    params={
        'n_estimators':[50, 100, 200],
        'learning_rate':[0.01, 0.1, 0.3, 0.5],
        'max_depth':[1, 2, 3, 5, 7, 9],
        'subsample':[0.5, 0.7, 0.8, 0.9, 1],
        }, 
    runs=20)

print('\nElapsed time: %0.2fs' % (time.time()-start))

In [None]:
# Show the full parameter set of the estimator returned by the randomized search.
best_model.get_params()

## Evaluation

In [None]:
# From here on `model` refers to the tuned estimator.
model = best_model


def model_estimation(model):
    """Print the hold-out RMSE of an already fitted ``model``.

    Predicts on the notebook-level ``X_test`` and scores against ``y_test``;
    the model is not refitted. The reported time covers prediction and
    scoring. Nothing is returned.
    """
    t0 = time.time()

    test_rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

    elapsed = time.time() - t0
    print('Elapsed time: %0.2fs' % elapsed)
    print('RMSE: %0.3f' % test_rmse)


# Cross-validated RMSE first, then the hold-out RMSE of the tuned model.
cross_validation(model)
model_estimation(model)

### HPO: Grid Search (GridSearchCV)

In [None]:
import time
from sklearn.model_selection import GridSearchCV

def grid_search(params, reg=None):
    """Exhaustively search ``params`` for ``reg`` and print the best result.

    Parameters
    ----------
    params : dict or list of dict
        Parameter grid(s) passed to ``GridSearchCV``.
    reg : estimator, optional
        Regressor to tune. Defaults to a fresh ``XGBRegressor`` with the
        ``gblinear`` booster.

    Returns
    -------
    None
        The best parameters, the best RMSE and the elapsed time are printed.

    Notes
    -----
    Uses the notebook-level ``X``, ``y`` and ``kfold``.
    """
    if reg is None:
        # Build the default estimator per call instead of sharing a single
        # instance created when the function was defined.
        reg = XGBRegressor(booster='gblinear')

    grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=kfold)
    start = time.time()

    grid_reg.fit(X, y)

    best_params = grid_reg.best_params_
    print("best parameter:", best_params)

    # best_score_ is a negated MSE; report it on the RMSE scale.
    best_rmse = np.sqrt(-grid_reg.best_score_)
    print("best RMSE:", best_rmse)
    print('Elapsed time: %0.2fs' % (time.time()-start))

In [None]:
# Sweep reg_alpha (the L1 regularization term per the XGBoost parameter docs).
grid_search(params={'reg_alpha':[0.001, 0.01, 0.1, 0.5, 1, 5]})

In [None]:
# Sweep reg_lambda (the L2 regularization term per the XGBoost parameter docs).
grid_search(params={'reg_lambda':[0.001, 0.01, 0.1, 0.5, 1, 5]})

In [None]:
# Single-point grid: the 'shuffle' feature selector with no updater specified.
grid_search(params={'feature_selector':['shuffle']})

In [None]:
# Compare three feature selectors under the coord_descent updater.
grid_search(params={'feature_selector':['random', 'greedy', 'thrifty'], 
                    'updater':['coord_descent'] })

In [None]:
# Greedy/thrifty selectors under coord_descent, additionally sweeping top_k.
grid_search(params={'feature_selector':['greedy', 'thrifty'], 
                    'updater':['coord_descent'], 'top_k':[3, 5, 7, 9]})

In [None]:
# Two separate grids, one per updater, each listing only the feature selectors
# to try with that updater; GridSearchCV evaluates the grids independently.
param_grid = [{'updater':['shotgun'],
               'feature_selector':['cyclic', 'shuffle']},
              {'updater':['coord_descent'],
               'feature_selector':['random', 'greedy', 'thrifty']}]
grid_search(params=param_grid)

In [None]:
# Synthetic one-feature data set: X holds 1..99 as a column vector and each
# target is X scaled by a uniform random factor drawn from [-0.2, 0.2).
# NOTE: this rebinds the notebook-level X and y.
np.random.seed(2)
X = np.arange(1, 100).reshape(-1, 1)
y = X * np.random.uniform(-0.2, 0.2, size=X.shape)

In [None]:
import matplotlib.pyplot as plt

# Fit a linear booster on the current X / y and draw its fitted line over the data.
xgbr = XGBRegressor(booster='gblinear')
xgbr.fit(X, y)

# np.ravel(...)[0] reduces coef_ / intercept_ to plain floats whether they are
# exposed as scalars or as one-element arrays.
slope = float(np.ravel(xgbr.coef_)[0])
intercept = float(np.ravel(xgbr.intercept_)[0])

fig, ax = plt.subplots()
ax.scatter(X, y, label='data')
ax.plot((0, 99), (intercept, slope*99 + intercept), label='gblinear fit')
ax.set(title='gblinear fit', xlabel='X', ylabel='y')
ax.legend()
plt.show()