In [1]:
import numpy as np
import pandas as pd

In [2]:
wine = pd.read_csv('http://bit.ly/wine-date')
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [3]:
from sklearn.model_selection import train_test_split

data_input = wine[['alcohol', 'sugar', 'pH']].to_numpy()
data_target = wine[['class']].to_numpy()

train_input, test_input, train_target, test_target =\
    train_test_split(data_input, data_target, test_size=0.2, random_state=1234)

print(train_input.shape, test_input.shape)
print(train_target.shape, test_target.shape)

(5197, 3) (1300, 3)
(5197, 1) (1300, 1)


### Random Forest

In [4]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=-1, random_state=1234)
result = cross_validate(rf, train_input, train_target, return_train_score=True, n_jobs=-1)
result_df = pd.DataFrame(result)
result_df

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.39993,0.065825,0.877885,0.998557
1,0.428853,0.065825,0.889423,0.997594
2,0.594411,0.043882,0.890279,0.998076
3,0.524599,0.089761,0.891242,0.997835
4,0.412894,0.062832,0.891242,0.998076


In [5]:
print('Average Train Score:', result_df['train_score'].mean())
print('Average Test Score:', result_df['test_score'].mean())

Average Train Score: 0.9980277128942033
Average Test Score: 0.8880139927445029


In [6]:
rf.fit(train_input, train_target)
print(rf.feature_importances_)

  rf.fit(train_input, train_target)


[0.23273663 0.49520246 0.27206091]


In [7]:
rf = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=1234)
rf.fit(train_input, train_target)
print(rf.oob_score_.round(4))

  rf.fit(train_input, train_target)


0.8978


### Extra Trees

In [8]:
from sklearn.ensemble import ExtraTreesClassifier

et = ExtraTreesClassifier(n_jobs=-1, random_state=1234)
result = cross_validate(et, train_input, train_target, return_train_score=True, n_jobs=-1)
result_df = pd.DataFrame(result)
result_df

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.299201,0.047872,0.869231,0.998557
1,0.263295,0.060838,0.877885,0.997594
2,0.303191,0.043881,0.894129,0.998076
3,0.40292,0.071809,0.885467,0.997835
4,0.379982,0.087766,0.883542,0.998076


In [9]:
print('Average Train Score:', result_df['train_score'].mean())
print('Average Test Score:', result_df['test_score'].mean())

Average Train Score: 0.9980277128942033
Average Test Score: 0.8820506033908343


In [10]:
et.fit(train_input, train_target)
print(et.feature_importances_.round(4))

  et.fit(train_input, train_target)


[0.1965 0.5172 0.2862]


### Gradient Boosting

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(random_state=1234)

result = cross_validate(gb, train_input, train_target, return_train_score=True, n_jobs=-1)
result_df = pd.DataFrame(result)
result_df

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.322608,0.001959,0.860577,0.891268
1,0.341559,0.002956,0.874038,0.885975
2,0.338562,0.005984,0.871992,0.88456
3,0.316625,0.003989,0.875842,0.886724
4,0.333582,0.00299,0.87103,0.886003


In [12]:
print('Average Train Score:', result_df['train_score'].mean())
print('Average Test Score:', result_df['test_score'].mean())

Average Train Score: 0.8869060723041959
Average Test Score: 0.8706959354408825


In [13]:
gb = GradientBoostingClassifier(n_estimators=500, learning_rate=0.2, random_state=1234)

result = cross_validate(gb, train_input, train_target, return_train_score=True, n_jobs=-1)
result_df = pd.DataFrame(result)
result_df

Unnamed: 0,fit_time,score_time,test_score,train_score
0,1.249653,0.010005,0.856731,0.945874
1,1.230706,0.007012,0.882692,0.93962
2,1.262656,0.009974,0.891242,0.94228
3,1.626163,0.009974,0.877767,0.944204
4,1.652094,0.010971,0.87488,0.942039


In [14]:
print('Average Train Score:', result_df['train_score'].mean())
print('Average Test Score:', result_df['test_score'].mean())

Average Train Score: 0.9428035350816202
Average Test Score: 0.876662286221959


In [15]:
gb.fit(train_input, train_target)
print(gb.feature_importances_.round(4))

  return f(**kwargs)


[0.167  0.6625 0.1705]


### Histogram-based Gradient Boosting

In [16]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

hgb = HistGradientBoostingClassifier(random_state=1234)

result = cross_validate(hgb, train_input, train_target, return_train_score=True)
result_df = pd.DataFrame(result)
result_df

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.975875,0.005984,0.865385,0.938417
1,0.880648,0.006978,0.8875,0.933847
2,0.86768,0.004986,0.894129,0.929052
3,0.85671,0.005984,0.873917,0.93266
4,0.863691,0.006981,0.879692,0.9329


In [17]:
print('Average Train Score:', result_df['train_score'].mean())
print('Average Test Score:', result_df['test_score'].mean())

Average Train Score: 0.9333752892569347
Average Test Score: 0.8801245650403494


In [19]:
hgb.fit(train_input, train_target)

  return f(**kwargs)


HistGradientBoostingClassifier(random_state=1234)

In [20]:
print('Finalized Test Score:', hgb.score(test_input, test_target).round(4))

Finalized Test Score: 0.8731


### Histogram-based Gradient Boosting (XGBoost)

In [22]:
! pip install xgboost
from xgboost import XGBClassifier

xgb = XGBClassifier(tree_method='hist', random_state=1234)

result = cross_validate(xgb, train_input, train_target, return_train_score=True)
result_df = pd.DataFrame(result)
result_df

Collecting xgboost
  Downloading xgboost-1.4.2-py3-none-win_amd64.whl (97.8 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.4.2


You should consider upgrading via the 'C:\ProgramData\Anaconda3\python.exe -m pip install --upgrade pip' command.
  return f(**kwargs)




Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.166313,0.010971,0.870192,0.959346
1,0.163563,0.006982,0.886538,0.953813
2,0.155584,0.006981,0.884504,0.960077
3,0.155584,0.004987,0.880654,0.957912
4,0.154587,0.004987,0.876805,0.95671


In [23]:
print('Average Train Score:', result_df['train_score'].mean())
print('Average Test Score:', result_df['test_score'].mean())

Average Train Score: 0.957571580496767
Average Test Score: 0.879738839120456


### Histogram-based Gradient Boosting (LightBGM)

In [24]:
! pip install lightgbm
from lightgbm import LGBMClassifier

lgb = LGBMClassifier(random_state=1234)

result = cross_validate(lgb, train_input, train_target, return_train_score=True, n_jobs=-1)
result_df = pd.DataFrame(result)
result_df

Collecting lightgbm
  Downloading lightgbm-3.2.1-py3-none-win_amd64.whl (1.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.2.1


You should consider upgrading via the 'C:\ProgramData\Anaconda3\python.exe -m pip install --upgrade pip' command.


Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.072806,0.009972,0.858654,0.940101
1,0.095743,0.01197,0.886538,0.93553
2,0.092752,0.009974,0.887392,0.931217
3,0.088758,0.010972,0.877767,0.932419
4,0.090751,0.012967,0.884504,0.931698


In [27]:
print('Avg. Train Score:', result_df['train_score'].mean())
print('Avg. Test Score:', result_df['test_score'].mean())

Avg. Train Score: 0.9341931520666185
Avg. Test Score: 0.8789710890649293
