# [Gradient Boosting](https://github.com/kyopark2014/ML-Algorithms/blob/main/boosting.md#gradient-boosting)

In [6]:
import numpy as np
import pandas as pd

# Download the wine dataset (columns: alcohol, sugar, pH, class) into a DataFrame.
# The source is a remote URL, so this cell needs network access.
wine = pd.read_csv(filepath_or_buffer='https://bit.ly/wine_csv_data')

In [7]:
# Preview the first five rows to sanity-check the columns that were loaded.
wine.head(n=5)

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


### Shuffle

In [10]:
from sklearn.utils import shuffle

# Randomly reorder the rows; the fixed seed keeps the permutation repeatable.
# NOTE: `wine` is rebound to the shuffled frame, so re-running this cell on its
# own shuffles the already-shuffled data again.
wine = shuffle(wine, random_state=2)

# Preview the first ten rows; each row keeps its original index label.
wine.head(n=10)

Unnamed: 0,alcohol,sugar,pH,class
3014,9.3,0.8,3.36,1.0
2725,12.9,1.95,3.12,1.0
3266,11.8,5.9,2.95,1.0
3308,9.8,13.6,3.44,1.0
1096,10.8,5.5,3.35,0.0
124,9.5,1.6,3.39,0.0
2186,10.8,1.7,3.47,1.0
1051,9.2,1.4,3.03,0.0
6112,11.2,2.8,3.12,1.0
3595,8.8,14.6,3.34,1.0


In [4]:
# Summarise column dtypes and non-null counts.
# NOTE(review): the recorded output below reports a RangeIndex and this cell's
# execution count is lower than the shuffle cell's, so the output appears to
# have been captured before the shuffle ran. Re-run the notebook top to bottom
# to refresh it.
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   alcohol  6497 non-null   float64
 1   sugar    6497 non-null   float64
 2   pH       6497 non-null   float64
 3   class    6497 non-null   float64
dtypes: float64(4)
memory usage: 203.2 KB


### Split Train/Test dataset

In [11]:
from sklearn.model_selection import train_test_split

# Feature matrix (alcohol, sugar, pH, in that column order) and label vector.
data = wine.loc[:, ['alcohol', 'sugar', 'pH']].to_numpy()
target = wine.loc[:, 'class'].to_numpy()

# Hold out 20% of the rows as the test split; the seed pins the partition.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

### Gradient Boosting Classifier

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline model: no hyperparameters are overridden; only the random seed is
# fixed, for repeatability.
gb = GradientBoostingClassifier(random_state=42)

### cross_validate

In [14]:
from sklearn.model_selection import cross_validate

# Cross-validate on the training split only (the test split stays untouched).
# Training-fold scores are requested as well so they can be compared with the
# validation-fold scores; n_jobs=-1 runs the folds in parallel.
scores = cross_validate(
    estimator=gb,
    X=train_input,
    y=train_target,
    return_train_score=True,
    n_jobs=-1,
)

# Mean training-fold score, then mean validation-fold score.
print(scores['train_score'].mean(), scores['test_score'].mean())

0.8858958440146797 0.8689675723698823


### HPO (hyperparameter tuning)

In [16]:
# Rebind `gb` to a model with hand-picked hyperparameters: 500 boosting stages
# and a learning rate of 0.2. The seed is unchanged.
gb = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.2,
    random_state=42,
)

# Same cross-validation as for the baseline model, again keeping the
# training-fold scores for comparison.
scores = cross_validate(
    estimator=gb,
    X=train_input,
    y=train_target,
    return_train_score=True,
    n_jobs=-1,
)

# Mean training-fold score, then mean validation-fold score.
print(scores['train_score'].mean(), scores['test_score'].mean())

0.9421784311608704 0.8793566298956097


In [17]:
import time

from sklearn.metrics import accuracy_score

# Imports are done before the timer starts so module loading is not counted.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()

# Fit `gb` on the whole training split, then predict the held-out test split.
gb.fit(train_input, train_target)
y_pred = gb.predict(test_input)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, but keeping the order matches the classification_report call
# and stays correct if the metric is later swapped for an asymmetric one.
score = accuracy_score(test_target, y_pred)

print('Accuracy:', np.round(score, 3))

# Elapsed time covers fitting, prediction and scoring.
print('\nElapsed time: %0.2fs' % (time.perf_counter() - start))

Accuracy: 0.875

Elased time: 1.36s


### classification_report

In [18]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the test-split predictions
# (ground-truth labels first, predictions second).
print(classification_report(test_target, y_pred))

              precision    recall  f1-score   support

         0.0       0.77      0.72      0.74       327
         1.0       0.91      0.93      0.92       973

    accuracy                           0.88      1300
   macro avg       0.84      0.82      0.83      1300
weighted avg       0.87      0.88      0.87      1300



### feature_importances_

In [19]:
# Per-feature importances of the fitted model. Values follow the column order
# used to build `data`: alcohol, sugar, pH.
print(gb.feature_importances_)

[0.17161608 0.6715465  0.15683742]
