# Adaboost Classifier
### Used Dataset: Iris Data

종속변수(dependent variable) : 꽃의 종

독립변수(independent variable) : Petal.Length, Petal.Width

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
import warnings
warnings.filterwarnings("ignore")

iris = load_iris()
dfx = pd.DataFrame(iris.data[:,[2,3]]) 
dfy = pd.DataFrame(iris.target)
dfiris=pd.concat([dfx,dfy],axis=1)
dfiris.columns=['Petal.Length','Petal.Width','Species'] 
dfiris['Species'] = dfiris['Species'].map({0: "setosa", 1: "versicolor", 2: "virginica"})

## Split dataset

In [2]:
from sklearn.model_selection import train_test_split # 데이터를 train과 test로 분리

x_train, x_test , y_train, y_test = train_test_split(dfx, dfy, test_size=0.3, random_state=1)

dfXtrain=pd.DataFrame(x_train)
dfytrain=pd.DataFrame(y_train)
dfXtest=pd.DataFrame(x_test)
dfytest=pd.DataFrame(y_test)

dftrain=pd.concat([dfXtrain,dfytrain],axis=1)
dftest=pd.concat([dfXtest,dfytest],axis=1)
dftrain.columns=['Petal.Length','Petal.Width','Species']
dftest.columns=['Petal.Length','Petal.Width','Species']

dftrain

Unnamed: 0,Petal.Length,Petal.Width,Species
118,6.9,2.3,2
18,1.7,0.3,0
4,1.4,0.2,0
45,1.4,0.3,0
59,3.9,1.4,1
39,1.5,0.2,0
36,1.3,0.2,0
117,6.7,2.2,2
139,5.4,2.1,2
107,6.3,1.8,2


## model training

In [3]:
from sklearn.ensemble import AdaBoostClassifier


clf=AdaBoostClassifier(n_estimators=50,  # 반복적으로 학습할 모델의 수
                       learning_rate=1,  # 가중치의 기여도. 작을수록 가중치가 증가 또는 감소량이 작아져 학습속도가 느려짐
                       random_state=0)
# AdaBoostClassifier(n_estimators=50,learning_rate=1,base_estimator='DecisionTreeClassifier'  
#                     random_state=0,algorithm=’SAMME.R’)
#
# algorithm : {‘SAMME’, ‘SAMME.R’}, optional (default='SAMME.R’)

AdaBoost_clas = clf.fit(x_train, y_train)

## Result

In [4]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix

print(classification_report(y_train, AdaBoost_clas.predict(x_train)))
print(confusion_matrix(y_train, AdaBoost_clas.predict(x_train)))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        36
          1       0.91      0.97      0.94        32
          2       0.97      0.92      0.94        37

avg / total       0.96      0.96      0.96       105

[[36  0  0]
 [ 0 31  1]
 [ 0  3 34]]


In [5]:
y_pred=AdaBoost_clas.predict(x_test)

print(classification_report(y_test, y_pred))
print('\nAccuracy: {0:.4f}'.format(accuracy_score(y_test, y_pred)))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        14
          1       0.94      0.94      0.94        18
          2       0.92      0.92      0.92        13

avg / total       0.96      0.96      0.96        45


Accuracy: 0.9556


# Adaboost Regressor
### Used Dataset: 당뇨병 환자 442명의 검사 데이터 from sklearn (442 diabetic patients data from sklearn)
종속변수(dependent variable) : 1년 후의 당뇨병 진행도 (diabetes progression after 1 year)

독립변수(independent variable) : 나이, 성별, BMI지수, 혈압 등 10개의 변수 (age, sex, BMI index, pressure pressure, etc.)

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import statsmodels.formula.api as sm
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
%matplotlib inline

diabetes = datasets.load_diabetes()
dfxx=pd.DataFrame(diabetes.data)
dfyy=pd.DataFrame(diabetes.target)
dfdiabetes=pd.concat([dfxx, dfyy], axis=1)
dfdiabetes.columns=['AGE','SEX', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'Y']
dfdiabetes

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0
5,-0.092695,-0.044642,-0.040696,-0.019442,-0.068991,-0.079288,0.041277,-0.076395,-0.041180,-0.096346,97.0
6,-0.045472,0.050680,-0.047163,-0.015999,-0.040096,-0.024800,0.000779,-0.039493,-0.062913,-0.038357,138.0
7,0.063504,0.050680,-0.001895,0.066630,0.090620,0.108914,0.022869,0.017703,-0.035817,0.003064,63.0
8,0.041708,0.050680,0.061696,-0.040099,-0.013953,0.006202,-0.028674,-0.002592,-0.014956,0.011349,110.0
9,-0.070900,-0.044642,0.039062,-0.033214,-0.012577,-0.034508,-0.024993,-0.002592,0.067736,-0.013504,310.0


## model training

In [7]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

xx_train, xx_test , yy_train, yy_test = train_test_split(dfxx, dfyy, test_size=0.3, random_state=1)
#데이터를 train과 test로 분리

Decision_regr=DecisionTreeRegressor(max_depth=4).fit(xx_train,yy_train) # Decision tree Model
AdaBoost_regr=AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),     # Adaboost Model
                          n_estimators=300, random_state=1).fit(xx_train,yy_train) 

#loss : {‘linear’, ‘square’, ‘exponential’}, optional (default=’linear’)
# base_estimator : object, optional (default=DecisionTreeRegressor)

## Result

In [8]:
dec_y_pred = Decision_regr.predict(xx_test)
ada_y_pred = AdaBoost_regr.predict(xx_test)


print('\nDecisionTreeRegressor\n')
print('MeanSquaredError: %.2f'
     % mean_squared_error(yy_test,dec_y_pred))
print('MeanAbsoluteError: %.2f'
     % mean_absolute_error(yy_test, dec_y_pred))
print('R Square: %.2f'
     % r2_score(yy_test, dec_y_pred), '\n\n','-'*25)

print('\nAdaBoostRegressor\n')
print('MeanSquaredError: %.2f'
     % mean_squared_error(yy_test,ada_y_pred))
print('MeanAbsoluteError: %.2f'
     % mean_absolute_error(yy_test, ada_y_pred))
print('R Square: %.2f'
     % r2_score(yy_test, ada_y_pred), '\n')


DecisionTreeRegressor

MeanSquaredError: 4330.60
MeanAbsoluteError: 52.81
R Square: 0.14 

 -------------------------

AdaBoostRegressor

MeanSquaredError: 3537.40
MeanAbsoluteError: 47.29
R Square: 0.30 



# Gradient Boosting Machine(GBM)
### breast_cancer data (569개의 데이터)

종속변수(dependent variable) : 양성1 , 음성0

독립변수(independent variable) : 심장의 이미지를 설명하는 30개의 변수

In [9]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score
import warnings
from sklearn import svm
warnings.filterwarnings("ignore")

In [10]:
breast_cancer = load_breast_cancer()


X = pd.DataFrame(breast_cancer.data[100:300, :])
y = pd.DataFrame(breast_cancer.target[100:300])

breast_cancer_data=pd.concat([X, y], axis=1)

breast_cancer_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,0.1
0,13.610,24.98,88.05,582.7,0.09488,0.08511,0.086250,0.044890,0.1609,0.05871,...,35.27,108.60,906.5,0.12650,0.19430,0.316900,0.118400,0.2651,0.07397,0
1,6.981,13.43,43.79,143.5,0.11700,0.07568,0.000000,0.000000,0.1930,0.07818,...,19.54,50.41,185.2,0.15840,0.12020,0.000000,0.000000,0.2932,0.09382,1
2,12.180,20.52,77.22,458.7,0.08013,0.04038,0.023830,0.017700,0.1739,0.05677,...,32.84,84.58,547.8,0.11230,0.08862,0.114500,0.074310,0.2694,0.06878,1
3,9.876,19.40,63.95,298.3,0.10050,0.09697,0.061540,0.030290,0.1945,0.06322,...,26.83,72.22,361.2,0.15590,0.23020,0.264400,0.097490,0.2622,0.08490,1
4,10.490,19.29,67.41,336.1,0.09989,0.08578,0.029950,0.012010,0.2217,0.06481,...,23.31,74.22,402.8,0.12190,0.14860,0.079870,0.032030,0.2826,0.07552,1
5,13.110,15.56,87.21,530.2,0.13980,0.17650,0.207100,0.096010,0.1925,0.07692,...,22.40,106.40,827.2,0.18620,0.40990,0.637600,0.198600,0.3147,0.14050,0
6,11.640,18.33,75.17,412.5,0.11420,0.10170,0.070700,0.034850,0.1801,0.06520,...,29.26,85.51,521.7,0.16880,0.26600,0.287300,0.121800,0.2806,0.09097,1
7,12.360,18.54,79.01,466.7,0.08477,0.06815,0.026430,0.019210,0.1602,0.06066,...,27.49,85.56,544.1,0.11840,0.19630,0.193700,0.084420,0.2983,0.07185,1
8,22.270,19.67,152.80,1509.0,0.13260,0.27680,0.426400,0.182300,0.2556,0.07039,...,28.01,206.80,2360.0,0.17010,0.69970,0.960800,0.291000,0.4055,0.09789,0
9,11.340,21.26,72.48,396.5,0.08759,0.06575,0.051330,0.018990,0.1487,0.06529,...,29.15,83.99,518.1,0.16990,0.21960,0.312000,0.082780,0.2829,0.08832,1


## Split dataset

In [11]:
x_train_GMB, x_test_GMB , y_train_GMB, y_test_GMB = train_test_split(X,y, test_size=0.3, random_state=1)

x_train_GMB=x_train_GMB[:]
y_train_GMB=y_train_GMB[:]

dfxxtrain=pd.DataFrame(x_train_GMB)
dfyytrain=pd.DataFrame(y_train_GMB)
dfxxtest=pd.DataFrame(x_test_GMB)
dfyytest=pd.DataFrame(y_test_GMB)
print(dfxxtrain.shape)
print(dfxxtest.shape)

(140, 30)
(60, 30)


## model training

In [12]:
from sklearn.ensemble import GradientBoostingClassifier



GBM_model= GradientBoostingClassifier(n_estimators=100, max_depth=2, random_state=0).fit(x_train_GMB, y_train_GMB)

#GBM_model= GradientBoostingClassifier(criterion='friedman_mse', init=None,
#              learning_rate=0.1, loss='deviance', max_depth=2,
#              max_features=None, max_leaf_nodes=None,
#              min_impurity_decrease=0.0, min_impurity_split=None,
#              min_samples_leaf=1, min_samples_split=2,
#              min_weight_fraction_leaf=0.0, n_estimators=100,
#              presort='auto', random_state=0, subsample=1.0, verbose=0,
#              warm_start=False).fit(x_train, y_train)
# 
# loss : {‘deviance’, ‘exponential’}, optional (default=’deviance’)

## Result

In [13]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(y_train_GMB, GBM_model.predict(x_train_GMB)))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        55
          1       1.00      1.00      1.00        85

avg / total       1.00      1.00      1.00       140



In [14]:
from sklearn.metrics import precision_score,recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

y_pred_GMB=GBM_model.predict(x_test_GMB)


print ('Accuracy:', accuracy_score(y_test_GMB, y_pred_GMB))
print ('F1 score:', f1_score(y_test_GMB, y_pred_GMB))
print ('Recall:', recall_score(y_test_GMB, y_pred_GMB))
print ('Precision:', precision_score(y_test_GMB, y_pred_GMB))
print ('\n clasification report:\n', classification_report(y_test_GMB,y_pred_GMB))
print ('\n confussion matrix:\n',confusion_matrix(y_test_GMB, y_pred_GMB))

Accuracy: 0.983333333333
F1 score: 0.985507246377
Recall: 1.0
Precision: 0.971428571429

 clasification report:
              precision    recall  f1-score   support

          0       1.00      0.96      0.98        26
          1       0.97      1.00      0.99        34

avg / total       0.98      0.98      0.98        60


 confussion matrix:
 [[25  1]
 [ 0 34]]
