In [16]:
from sklearn.ensemble import BaggingClassifier , BaggingRegressor , RandomForestClassifier , RandomForestRegressor , AdaBoostClassifier , AdaBoostRegressor , GradientBoostingClassifier , GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier , DecisionTreeRegressor
from xgboost import XGBClassifier , XGBRegressor
from lightgbm import LGBMClassifier , LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve , auc , f1_score , mean_squared_error
from sklearn.datasets import load_breast_cancer , load_iris , load_diabetes

In [14]:
# Load the breast-cancer dataset and hold out 20% as a test split,
# stratified on the target.
breast = load_breast_cancer()
breast_data, breast_target = breast.data, breast.target
train_x_b, test_x_b, train_y_b, test_y_b = train_test_split(
    breast_data,
    breast_target,
    test_size=0.2,
    random_state=42,
    stratify=breast_target,
)

In [13]:
# Load the iris dataset and hold out 20% as a test split,
# stratified on the target.
iris = load_iris()
iris_data, iris_target = iris.data, iris.target
train_x_i, test_x_i, train_y_i, test_y_i = train_test_split(
    iris_data,
    iris_target,
    test_size=0.2,
    random_state=42,
    stratify=iris_target,
)

In [15]:
# Load the diabetes regression dataset and hold out 20% as a test split
# (no stratification is requested for this continuous target).
diabetes = load_diabetes()
diabetes_data, diabetes_target = diabetes.data, diabetes.target
train_x_d, test_x_d, train_y_d, test_y_d = train_test_split(
    diabetes_data,
    diabetes_target,
    test_size=0.2,
    random_state=42,
)

In [19]:
## Bagging: a voting-based scheme, so it needs a base model (estimator) to aggregate.
dtr = DecisionTreeClassifier(max_depth=3, min_samples_leaf=10)
# The base model is passed positionally on purpose: its keyword was
# `base_estimator` before scikit-learn 1.2 and is `estimator` afterwards
# (the old keyword was removed in 1.4), while the first positional slot is
# the base model in every version.
model = BaggingClassifier(dtr, n_estimators=500, random_state=42)

In [20]:
# Train the ensemble built in the previous cell on the breast-cancer training split.
model.fit(train_x_b, train_y_b)



In [22]:
# Keep only column 1 of the predicted probabilities as the ranking score.
proba_b = model.predict_proba(test_x_b)
y_score = proba_b[:, 1]

In [24]:
# ROC curve points for the breast-cancer test split.
fpr, tpr, thresholds = roc_curve(test_y_b, y_score)

In [25]:
# Area under the ROC curve computed above.
AUC = auc(fpr, tpr)

In [26]:
# Display the test AUC computed in the previous cell.
AUC

0.9910714285714285

In [27]:
# Fresh bagging ensemble of shallow decision trees, to be fitted on iris.
dtr = DecisionTreeClassifier(max_depth=3, min_samples_leaf=10)
# The base model is passed positionally on purpose: its keyword was
# `base_estimator` before scikit-learn 1.2 and is `estimator` afterwards
# (the old keyword was removed in 1.4), while the first positional slot is
# the base model in every version.
model = BaggingClassifier(dtr, n_estimators=500, random_state=42)

In [29]:
# Train the ensemble built in the previous cell on the iris training split.
model.fit(train_x_i, train_y_i)



In [31]:
# Predict iris test labels and score them with macro-averaged F1.
y_pred = model.predict(test_x_i)
macro_f1 = f1_score(
    test_y_i,
    y_pred,
    average="macro",
)

In [32]:
# Display the macro-F1 computed in the previous cell.
macro_f1

0.9333333333333332

In [33]:
# Bagging ensemble of shallow regression trees, to be fitted on diabetes.
dtr = DecisionTreeRegressor(max_depth=3, min_samples_leaf=10)
# The base model is passed positionally on purpose: its keyword was
# `base_estimator` before scikit-learn 1.2 and is `estimator` afterwards
# (the old keyword was removed in 1.4), while the first positional slot is
# the base model in every version.
model = BaggingRegressor(dtr, n_estimators=500, random_state=42)

In [34]:
# Train the ensemble built in the previous cell on the diabetes training split.
model.fit(train_x_d, train_y_d)



In [35]:
# Predictions for the diabetes test split.
y_pred = model.predict(test_x_d)

In [36]:
# RMSE on the diabetes test split.
# Taking the square root of the plain MSE is equivalent to the older
# `mean_squared_error(..., squared=False)` call, but keeps working on
# scikit-learn releases where the `squared` argument has been removed
# (deprecated in 1.4, removed in 1.6).
rmse = mean_squared_error(test_y_d, y_pred) ** 0.5
print(rmse)


52.771937798612036
52.771937798612036


In [37]:
## RandomForest

# Depth-limited forest fitted on the breast-cancer training split.
rf_params = dict(
    n_estimators=500,
    max_depth=3,
    min_samples_leaf=10,
    max_features="sqrt",
    random_state=42,
)
model = RandomForestClassifier(**rf_params)
model.fit(train_x_b, train_y_b)

In [38]:
# Keep only column 1 of the predicted probabilities as the ranking score.
proba_b = model.predict_proba(test_x_b)
y_score = proba_b[:, 1]

In [39]:
# ROC curve points for the breast-cancer test split.
fpr, tpr, thresholds = roc_curve(test_y_b, y_score)

In [40]:
# Area under the ROC curve computed above.
AUC = auc(fpr, tpr)

In [41]:
# Display the test AUC computed in the previous cell.
AUC

0.9917328042328042

In [42]:
# Depth-limited forest fitted on the iris training split.
rf_params = dict(
    n_estimators=500,
    max_depth=3,
    min_samples_leaf=10,
    max_features="sqrt",
    random_state=42,
)
model = RandomForestClassifier(**rf_params)
model.fit(train_x_i, train_y_i)

In [43]:
# Predicted labels for the iris test split.
y_pred = model.predict(test_x_i)

In [44]:
# Macro-averaged F1 of the iris test predictions.
macro_f1 = f1_score(
    test_y_i,
    y_pred,
    average="macro",
)

In [45]:
# Display the macro-F1 computed in the previous cell.
macro_f1

0.9333333333333332

In [46]:
# Depth-limited regression forest fitted on the diabetes training split.
rf_params = dict(
    n_estimators=500,
    max_depth=3,
    min_samples_leaf=10,
    random_state=42,
)
model = RandomForestRegressor(**rf_params)
model.fit(train_x_d, train_y_d)

In [47]:
# Predictions for the diabetes test split.
y_pred = model.predict(test_x_d)

In [48]:
# RMSE on the diabetes test split.
# Taking the square root of the plain MSE is equivalent to the older
# `mean_squared_error(..., squared=False)` call, but keeps working on
# scikit-learn releases where the `squared` argument has been removed
# (deprecated in 1.4, removed in 1.6).
rmse = mean_squared_error(test_y_d, y_pred) ** 0.5
print(rmse)

52.77122907287036
52.77122907287036


In [50]:
## AdaBoost: introduces the notion of a learning rate.

ada_params = dict(n_estimators=100, learning_rate=0.5, random_state=42)
model = AdaBoostClassifier(**ada_params)

# Fit on the breast-cancer training split.
model.fit(train_x_b, train_y_b)


In [52]:
# Score the test split with column 1 of the predicted probabilities,
# then derive the ROC curve points from those scores.
proba_b = model.predict_proba(test_x_b)
y_score = proba_b[:, 1]

fpr, tpr, thresholds = roc_curve(test_y_b, y_score)

In [53]:
# Area under the ROC curve; the bare name on the last line displays it.
AUC = auc(fpr, tpr)
AUC

0.980489417989418

In [54]:
# Same AdaBoost configuration, fitted on the iris training split.
ada_params = dict(n_estimators=100, learning_rate=0.5, random_state=42)
model = AdaBoostClassifier(**ada_params)
model.fit(train_x_i, train_y_i)

In [55]:
# Predict iris test labels and score them with macro-averaged F1.
y_pred = model.predict(test_x_i)

macro_f1 = f1_score(
    test_y_i,
    y_pred,
    average="macro",
)

In [56]:
# Display the macro-F1 computed in the previous cell.
macro_f1

0.8653198653198654

In [57]:
# AdaBoost regressor for the diabetes split; the loss function is configurable.
ada_params = dict(
    n_estimators=500,
    learning_rate=0.01,
    loss="square",
    random_state=42,
)
model = AdaBoostRegressor(**ada_params)
model.fit(train_x_d, train_y_d)

In [58]:
# Predict the diabetes test split and report RMSE.
y_pred = model.predict(test_x_d)
# Taking the square root of the plain MSE is equivalent to the older
# `mean_squared_error(..., squared=False)` call, but keeps working on
# scikit-learn releases where the `squared` argument has been removed
# (deprecated in 1.4, removed in 1.6).
rmse = mean_squared_error(test_y_d, y_pred) ** 0.5
print(rmse)

52.87237462832626
52.87237462832626


In [59]:
## GradientBoosting

# Gradient-boosted trees classifier; fitted in the next cell.
gb_params = dict(
    n_estimators=100,
    max_depth=4,
    min_samples_leaf=10,
    learning_rate=0.1,
    random_state=42,
)
model = GradientBoostingClassifier(**gb_params)

In [60]:
# Train the model built in the previous cell on the breast-cancer training split.
model.fit(train_x_b, train_y_b)

In [61]:
# Keep only column 1 of the predicted probabilities as the ranking score.
proba_b = model.predict_proba(test_x_b)
y_score = proba_b[:, 1]

In [62]:
# ROC curve points for the breast-cancer test split.
fpr, tpr, thresholds = roc_curve(test_y_b, y_score)

In [63]:
# Area under the ROC curve computed above.
AUC = auc(fpr, tpr)

In [64]:
# Display the test AUC computed in the previous cell.
AUC

0.9947089947089948

In [65]:
# Gradient boosting on iris: fit, predict the test split, report macro-F1.
gb_params = dict(
    n_estimators=100,
    max_depth=4,
    min_samples_leaf=10,
    learning_rate=0.1,
    random_state=42,
)
model = GradientBoostingClassifier(**gb_params)
model.fit(train_x_i, train_y_i)

y_pred = model.predict(test_x_i)
macro_f1 = f1_score(test_y_i, y_pred, average="macro")
macro_f1

0.8653198653198654

In [69]:
# Gradient boosting regressor on the diabetes split: fit, predict, report RMSE.
model = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=4,
    min_samples_leaf=10,
    learning_rate=0.1,
    random_state=42,
)
model.fit(train_x_d, train_y_d)

y_pred = model.predict(test_x_d)
# Root of the plain MSE instead of `squared=False`: that argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form
# gives the same RMSE on every release.
print(mean_squared_error(test_y_d, y_pred) ** 0.5)

56.39672185880616


In [72]:
## XGBoost
# Fit on the breast-cancer training split, then report the test ROC AUC.
xgb_params = dict(n_estimators=100, max_depth=8, random_state=42)
model = XGBClassifier(**xgb_params)
model.fit(train_x_b, train_y_b)

proba_b = model.predict_proba(test_x_b)
y_score = proba_b[:, 1]
fpr, tpr, thresholds = roc_curve(test_y_b, y_score)
AUC = auc(fpr, tpr)
AUC

0.9917328042328042

In [73]:
# XGBoost on iris: fit, predict the test split, report macro-F1.
xgb_params = dict(n_estimators=100, max_depth=8, random_state=42)
model = XGBClassifier(**xgb_params)
model.fit(train_x_i, train_y_i)

y_pred = model.predict(test_x_i)
macro_f1 = f1_score(test_y_i, y_pred, average="macro")
macro_f1

0.9333333333333332

In [76]:
# XGBoost regressor on the diabetes split: fit, predict, report RMSE.
model = XGBRegressor(n_estimators=100, max_depth=8, random_state=42)
model.fit(train_x_d, train_y_d)
y_pred = model.predict(test_x_d)

# Root of the plain MSE instead of `squared=False`: that argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form
# gives the same RMSE on every release.
print(mean_squared_error(test_y_d, y_pred) ** 0.5)

59.7424277840816


In [77]:
## LightGBM
# Fit on the breast-cancer training split, then report the test ROC AUC.
lgbm_params = dict(n_estimators=100, max_depth=8, random_state=42)
model = LGBMClassifier(**lgbm_params)
model.fit(train_x_b, train_y_b)

proba_b = model.predict_proba(test_x_b)
y_score = proba_b[:, 1]
fpr, tpr, thresholds = roc_curve(test_y_b, y_score)
AUC = auc(fpr, tpr)
AUC

[LightGBM] [Info] Number of positive: 285, number of negative: 170
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000281 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4542
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.626374 -> initscore=0.516691
[LightGBM] [Info] Start training from score 0.516691


0.9887566137566137

In [78]:
# LightGBM on iris: fit, predict the test split, report macro-F1.
lgbm_params = dict(n_estimators=100, max_depth=8, random_state=42)
model = LGBMClassifier(**lgbm_params)
model.fit(train_x_i, train_y_i)

y_pred = model.predict(test_x_i)
macro_f1 = f1_score(test_y_i, y_pred, average="macro")
macro_f1

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000037 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 86
[LightGBM] [Info] Number of data points in the train set: 120, number of used features: 4
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


0.899749373433584

In [79]:
# LightGBM regressor on the diabetes split: fit, predict, report RMSE.
model = LGBMRegressor(n_estimators=100, max_depth=8, random_state=42)
model.fit(train_x_d, train_y_d)
y_pred = model.predict(test_x_d)

# Root of the plain MSE instead of `squared=False`: that argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form
# gives the same RMSE on every release.
print(mean_squared_error(test_y_d, y_pred) ** 0.5)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000088 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 595
[LightGBM] [Info] Number of data points in the train set: 353, number of used features: 10
[LightGBM] [Info] Start training from score 153.736544
56.89678928938987
