### PART 04) 머신러닝

## 1장. 지도학습모형

### 8절. 의사결정나무

In [1]:
# Binary classification with a single decision tree (breast-cancer data)
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the dataset and pull out the feature matrix and the label vector
breast_cancer = load_breast_cancer()
data, target = breast_cancer.data, breast_cancer.target

# Hold out 20% of the rows for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2205,
    stratify=target,
)

# Configure the tree
dtr_bin = DecisionTreeClassifier(
    max_depth=3,
    min_samples_leaf=10,
    random_state=2022,
)

# Train on the training split
model_dtr_bin = dtr_bin.fit(X_train, y_train)

In [2]:
# ROC curve on the test split, scored with the probability column for label 1
from sklearn.metrics import auc, roc_curve

y_score = model_dtr_bin.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_score)

# Area under the curve: fpr from roc_curve() is the x-axis, tpr the y-axis
AUC = auc(x=fpr, y=tpr)
print(AUC)

0.999834656084656


In [3]:
# Multiclass classification with a single decision tree (iris data)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the dataset and pull out the feature matrix and the label vector
iris = load_iris()
data, target = iris.data, iris.target

# Hold out 20% of the rows for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2205,
    stratify=target,
)

# Configure the tree
dtr_multi = DecisionTreeClassifier(
    max_depth=3,
    min_samples_leaf=10,
    random_state=2022,
)

# Train on the training split
model_dtr_multi = dtr_multi.fit(X_train, y_train)

In [4]:
# Macro-averaged F1 score of the decision tree on the test split
from sklearn.metrics import f1_score

y_pred = model_dtr_multi.predict(X_test)
macro_f1 = f1_score(y_true=y_test, y_pred=y_pred, average="macro")
print(macro_f1)

0.9665831244778613


In [5]:
# Regression with a single decision tree (diabetes data)
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Load the dataset and pull out the feature matrix and the target vector
diabetes = load_diabetes()
data, target = diabetes.data, diabetes.target

# Hold out 20% of the rows for testing (no stratification for a continuous target)
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2205,
)

# Configure the tree
dtr_conti = DecisionTreeRegressor(
    max_depth=3,
    min_samples_leaf=10,
    random_state=2022,
)

# Train on the training split
model_dtr_conti = dtr_conti.fit(X_train, y_train)

In [6]:
# RMSE of the decision-tree regressor on the test split
from sklearn.metrics import mean_squared_error

y_pred = model_dtr_conti.predict(X_test)
# The `squared=False` option is deprecated and absent from newer scikit-learn
# releases; taking the square root of the MSE works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(rmse)

68.43615007635243


---

### 9절. 앙상블

#### 1. 배깅

In [1]:
# Binary classification with bagged decision trees (breast-cancer data)
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the dataset and pull out the feature matrix and the label vector
breast_cancer = load_breast_cancer()
data = breast_cancer.data
target = breast_cancer.target

# Hold out 20% of the rows for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=2205, stratify=target
)

# Base learner and the bagging ensemble built on top of it.
# The base learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` across scikit-learn releases; the
# positional form is accepted under either name.
dtr = DecisionTreeClassifier(max_depth=3, min_samples_leaf=10)
bag_bin = BaggingClassifier(dtr, n_estimators=500, random_state=2022)

# Train on the training split
model_bag_bin = bag_bin.fit(X_train, y_train)

In [2]:
# ROC curve on the test split, scored with the probability column for label 1
from sklearn.metrics import auc, roc_curve

y_score = model_bag_bin.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_score)

# Area under the curve: fpr from roc_curve() is the x-axis, tpr the y-axis
AUC = auc(x=fpr, y=tpr)
print(AUC)

0.9973544973544974


In [3]:
# Multiclass classification with bagged decision trees (iris data)
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the dataset and pull out the feature matrix and the label vector
iris = load_iris()
data = iris.data
target = iris.target

# Hold out 20% of the rows for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=2205, stratify=target
)

# Base learner and the bagging ensemble built on top of it.
# The base learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` across scikit-learn releases; the
# positional form is accepted under either name.
dtr = DecisionTreeClassifier(max_depth=3, min_samples_leaf=10)
bag_multi = BaggingClassifier(dtr, n_estimators=500, random_state=2022)

# Train on the training split
model_bag_multi = bag_multi.fit(X_train, y_train)

In [4]:
# Macro-averaged F1 score of the bagging classifier on the test split
from sklearn.metrics import f1_score

y_pred = model_bag_multi.predict(X_test)
macro_f1 = f1_score(y_true=y_test, y_pred=y_pred, average="macro")
print(macro_f1)

0.9665831244778613


In [5]:
# Regression with bagged decision trees (diabetes data)
from sklearn.datasets import load_diabetes
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Load the dataset and pull out the feature matrix and the target vector
diabetes = load_diabetes()
data = diabetes.data
target = diabetes.target

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=2205
)

# Base learner and the bagging ensemble built on top of it.
# The base learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` across scikit-learn releases; the
# positional form is accepted under either name.
dtr = DecisionTreeRegressor(max_depth=3, min_samples_leaf=10)
bag_conti = BaggingRegressor(dtr, n_estimators=500, random_state=2022)

# Train on the training split
model_bag_conti = bag_conti.fit(X_train, y_train)

In [6]:
# RMSE of the bagging regressor on the test split
from sklearn.metrics import mean_squared_error

y_pred = model_bag_conti.predict(X_test)
# The `squared=False` option is deprecated and absent from newer scikit-learn
# releases; taking the square root of the MSE works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(rmse)

62.445688892722956


#### 2. 랜덤포레스트

In [7]:
# Binary classification with a random forest (breast-cancer data)
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load the dataset and pull out the feature matrix and the label vector
breast_cancer = load_breast_cancer()
data, target = breast_cancer.data, breast_cancer.target

# Hold out 20% of the rows for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2205,
    stratify=target,
)

# Configure the forest
rf_bin = RandomForestClassifier(
    n_estimators=500,
    max_depth=3,
    min_samples_leaf=10,
    max_features='sqrt',
    random_state=2022,
)

# Train on the training split
model_rf_bin = rf_bin.fit(X_train, y_train)

In [8]:
# ROC curve on the test split, scored with the probability column for label 1
from sklearn.metrics import auc, roc_curve

y_score = model_rf_bin.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_score)

# Area under the curve: fpr from roc_curve() is the x-axis, tpr the y-axis
AUC = auc(x=fpr, y=tpr)
print(AUC)

0.9996693121693121


In [9]:
# Multiclass classification with a random forest (iris data)
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load the dataset and pull out the feature matrix and the label vector
iris = load_iris()
data, target = iris.data, iris.target

# Hold out 20% of the rows for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2205,
    stratify=target,
)

# Configure the forest
rf_multi = RandomForestClassifier(
    n_estimators=500,
    max_depth=3,
    min_samples_leaf=15,
    max_features='sqrt',
    random_state=2022,
)

# Train on the training split
model_rf_multi = rf_multi.fit(X_train, y_train)

In [10]:
# Macro-averaged F1 score of the random forest on the test split
from sklearn.metrics import f1_score

y_pred = model_rf_multi.predict(X_test)
macro_f1 = f1_score(y_true=y_test, y_pred=y_pred, average="macro")
print(macro_f1)

0.9665831244778613


In [11]:
# Regression with a random forest (diabetes data)
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load the dataset and pull out the feature matrix and the target vector
diabetes = load_diabetes()
data, target = diabetes.data, diabetes.target

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2205,
)

# Configure the forest
rf_conti = RandomForestRegressor(
    n_estimators=500,
    max_depth=3,
    min_samples_leaf=10,
    max_features=3,
    random_state=2022,
)

# Train on the training split
model_rf_conti = rf_conti.fit(X_train, y_train)

In [12]:
# RMSE of the random-forest regressor on the test split
from sklearn.metrics import mean_squared_error

y_pred = model_rf_conti.predict(X_test)
# The `squared=False` option is deprecated and absent from newer scikit-learn
# releases; taking the square root of the MSE works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(rmse)

63.49825173792558


#### 3. AdaBoost

In [18]:
# Binary classification with AdaBoost (breast-cancer data)
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

# Load the dataset and pull out the feature matrix and the label vector
breast_cancer = load_breast_cancer()
data, target = breast_cancer.data, breast_cancer.target

# Hold out 20% of the rows for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2205,
    stratify=target,
)

# Configure the booster
ada_bin = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=0.5,
    random_state=2022,
)

# Train on the training split
model_ada_bin = ada_bin.fit(X_train, y_train)

In [19]:
# ROC curve on the test split, scored with the probability column for label 1
from sklearn.metrics import auc, roc_curve

y_score = model_ada_bin.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_score)

# Area under the curve: fpr from roc_curve() is the x-axis, tpr the y-axis
AUC = auc(x=fpr, y=tpr)
print(AUC)

0.9957010582010583


In [20]:
# Multiclass classification with AdaBoost (iris data)
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

# Load the dataset and pull out the feature matrix and the label vector
iris = load_iris()
data, target = iris.data, iris.target

# Hold out 20% of the rows for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2205,
    stratify=target,
)

# Configure the booster
ada_multi = AdaBoostClassifier(
    n_estimators=500,
    learning_rate=0.01,
    random_state=2022,
)

# Train on the training split
model_ada_multi = ada_multi.fit(X_train, y_train)

In [21]:
# Macro-averaged F1 score of the AdaBoost classifier on the test split
from sklearn.metrics import f1_score

y_pred = model_ada_multi.predict(X_test)
macro_f1 = f1_score(y_true=y_test, y_pred=y_pred, average="macro")
print(macro_f1)

0.9665831244778613


In [22]:
# Regression with AdaBoost (diabetes data)
from sklearn.datasets import load_diabetes
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split

# Load the dataset and pull out the feature matrix and the target vector
diabetes = load_diabetes()
data, target = diabetes.data, diabetes.target

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2205,
)

# Configure the booster
ada_conti = AdaBoostRegressor(
    n_estimators=500,
    learning_rate=0.01,
    loss='square',
    random_state=2022,
)

# Train on the training split
model_ada_conti = ada_conti.fit(X_train, y_train)

In [23]:
# RMSE of the AdaBoost regressor on the test split
from sklearn.metrics import mean_squared_error

y_pred = model_ada_conti.predict(X_test)
# The `squared=False` option is deprecated and absent from newer scikit-learn
# releases; taking the square root of the MSE works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(rmse)

62.077241273777446


#### 4. Gradient Boosting

In [24]:
# Binary classification with gradient boosting (breast-cancer data)
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Load the dataset and pull out the feature matrix and the label vector
breast_cancer = load_breast_cancer()
data, target = breast_cancer.data, breast_cancer.target

# Hold out 20% of the rows for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2205,
    stratify=target,
)

# Configure the booster
gbm_bin = GradientBoostingClassifier(
    n_estimators=500,
    max_depth=4,
    min_samples_leaf=10,
    learning_rate=0.1,
    random_state=2022,
)

# Train on the training split
model_gbm_bin = gbm_bin.fit(X_train, y_train)

In [25]:
# ROC curve on the test split, scored with the probability column for label 1
from sklearn.metrics import auc, roc_curve

y_score = model_gbm_bin.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_score)

# Area under the curve: fpr from roc_curve() is the x-axis, tpr the y-axis
AUC = auc(x=fpr, y=tpr)
print(AUC)

0.9983465608465608


In [26]:
# Multiclass classification with gradient boosting (iris data)
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Load the dataset and pull out the feature matrix and the label vector
iris = load_iris()
data, target = iris.data, iris.target

# Hold out 20% of the rows for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2205,
    stratify=target,
)

# Configure the booster
gbm_multi = GradientBoostingClassifier(
    n_estimators=500,
    max_depth=8,
    min_samples_leaf=5,
    learning_rate=0.5,
    random_state=2022,
)

# Train on the training split
model_gbm_multi = gbm_multi.fit(X_train, y_train)

In [27]:
# Macro-averaged F1 score of the gradient-boosting classifier on the test split
from sklearn.metrics import f1_score

y_pred = model_gbm_multi.predict(X_test)
macro_f1 = f1_score(y_true=y_test, y_pred=y_pred, average="macro")
print(macro_f1)

0.9665831244778613


In [28]:
# Regression with gradient boosting (diabetes data)
from sklearn.datasets import load_diabetes
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# Load the dataset and pull out the feature matrix and the target vector
diabetes = load_diabetes()
data, target = diabetes.data, diabetes.target

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2205,
)

# Configure the booster
gbm_conti = GradientBoostingRegressor(
    n_estimators=500,
    max_depth=2,
    min_samples_leaf=5,
    learning_rate=0.5,
    random_state=2022,
)

# Train on the training split
model_gbm_conti = gbm_conti.fit(X_train, y_train)

In [29]:
# RMSE of the gradient-boosting regressor on the test split
from sklearn.metrics import mean_squared_error

y_pred = model_gbm_conti.predict(X_test)
# The `squared=False` option is deprecated and absent from newer scikit-learn
# releases; taking the square root of the MSE works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(rmse)

75.25181596869564


#### 5. XGBoost

In [30]:
# Binary classification with XGBoost's scikit-learn wrapper (breast-cancer data)
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Load the dataset and pull out the feature matrix and the label vector
breast_cancer = load_breast_cancer()
data = breast_cancer.data
target = breast_cancer.target

# Hold out 20% of the rows for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=2205, stratify=target
)

# Configure the booster.
# `eval_metric` is set here rather than in fit(): passing it to fit() is
# deprecated and rejected by newer xgboost releases. The metric is also
# switched from the multiclass 'mlogloss' to 'logloss' so that it matches the
# binary objective. No eval_set is supplied, so the metric is not expected to
# influence the fitted model.
# NOTE(review): `use_label_encoder` is kept for older xgboost versions; newer
# ones appear to ignore it with a warning - confirm against the pinned version.
xgb_wrap_bin = XGBClassifier(
    max_depth=8,
    n_estimators=500,
    nthread=5,
    min_child_weight=20,
    gamma=0.5,
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=2022,
)

# Train on the training split
model_xgb_wrap_bin = xgb_wrap_bin.fit(X_train, y_train)

  from pandas import MultiIndex, Int64Index


In [31]:
# ROC curve on the test split, scored with the probability column for label 1
from sklearn.metrics import auc, roc_curve

y_score = model_xgb_wrap_bin.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_score)

# Area under the curve: fpr from roc_curve() is the x-axis, tpr the y-axis
AUC = auc(x=fpr, y=tpr)
print(AUC)

0.9952050264550264


In [32]:
# Multiclass classification with XGBoost's scikit-learn wrapper (iris data)
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Load the dataset and pull out the feature matrix and the label vector
iris = load_iris()
data = iris.data
target = iris.target

# Hold out 20% of the rows for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=2205, stratify=target
)

# Configure the booster.
# `eval_metric` is set here rather than in fit(): passing it to fit() is
# deprecated and rejected by newer xgboost releases.
# NOTE(review): `use_label_encoder` is kept for older xgboost versions; newer
# ones appear to ignore it with a warning - confirm against the pinned version.
xgb_wrap_multi = XGBClassifier(
    max_depth=8,
    n_estimators=500,
    nthread=5,
    min_child_weight=10,
    gamma=0.5,
    objective='multi:softmax',
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=2022,
)

# Train on the training split
model_xgb_wrap_multi = xgb_wrap_multi.fit(X_train, y_train)

In [33]:
# Macro-averaged F1 score of the XGBoost classifier on the test split
from sklearn.metrics import f1_score

y_pred = model_xgb_wrap_multi.predict(X_test)
macro_f1 = f1_score(y_true=y_test, y_pred=y_pred, average="macro")
print(macro_f1)

0.9665831244778613


In [34]:
# Regression with XGBoost's scikit-learn wrapper (diabetes data)
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Load the dataset and pull out the feature matrix and the target vector
diabetes = load_diabetes()
data = diabetes.data
target = diabetes.target

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=2205
)

# Configure the booster.
# `eval_metric` is set here rather than in fit(): passing it to fit() is
# deprecated and rejected by newer xgboost releases.
# `use_label_encoder` has been dropped: label encoding only concerns the
# classifier, so on a regressor the flag is just an unused parameter.
xgb_wrap_conti = XGBRegressor(
    max_depth=8,
    n_estimators=500,
    nthread=5,
    min_child_weight=10,
    gamma=0.5,
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=2022,
)

# Train on the training split
model_xgb_wrap_conti = xgb_wrap_conti.fit(X_train, y_train)

In [35]:
# RMSE of the XGBoost regressor on the test split
from sklearn.metrics import mean_squared_error

y_pred = model_xgb_wrap_conti.predict(X_test)
# The `squared=False` option is deprecated and absent from newer scikit-learn
# releases; taking the square root of the MSE works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(rmse)

67.9309022195871


#### 6. LightGBM

In [36]:
# Binary classification with LightGBM's scikit-learn wrapper (breast-cancer data)
from lightgbm import LGBMClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the dataset and pull out the feature matrix and the label vector
breast_cancer = load_breast_cancer()
data, target = breast_cancer.data, breast_cancer.target

# Hold out 20% of the rows for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2205,
    stratify=target,
)

# Configure the booster
lgb_wrap_bin = LGBMClassifier(
    max_depth=8,
    n_estimators=500,
    n_jobs=30,
    min_child_weight=10,
    learning_rate=0.2,
    objective='binary',
    random_state=2022,
)

# Train on the training split
model_lgb_wrap_bin = lgb_wrap_bin.fit(X_train, y_train)

In [37]:
# ROC curve on the test split, scored with the probability column for label 1
from sklearn.metrics import auc, roc_curve

y_score = model_lgb_wrap_bin.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_score)

# Area under the curve: fpr from roc_curve() is the x-axis, tpr the y-axis
AUC = auc(x=fpr, y=tpr)
print(AUC)

0.9996693121693121


In [38]:
# Multiclass classification with LightGBM's scikit-learn wrapper (iris data)
from lightgbm import LGBMClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the dataset and pull out the feature matrix and the label vector
iris = load_iris()
data, target = iris.data, iris.target

# Hold out 20% of the rows for testing, stratified on the class labels
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=2205,
    stratify=target,
)

# Configure the booster
lgb_wrap_multi = LGBMClassifier(
    num_leaves=31,
    max_depth=8,
    n_estimators=500,
    n_jobs=5,
    min_child_weight=10,
    learning_rate=0.5,
    objective='multiclass',
    random_state=2022,
)

# Train on the training split
model_lgb_wrap_multi = lgb_wrap_multi.fit(X_train, y_train)

In [39]:
# Macro-averaged F1 score of the LightGBM classifier on the test split
from sklearn.metrics import f1_score

y_pred = model_lgb_wrap_multi.predict(X_test)
macro_f1 = f1_score(y_true=y_test, y_pred=y_pred, average="macro")
print(macro_f1)

0.9665831244778613


In [40]:
# Regression with LightGBM's scikit-learn wrapper (diabetes data)
from lightgbm import LGBMRegressor
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Example data with a continuous target:
# load the dataset and pull out the feature matrix and the target vector
diabetes = load_diabetes()
data, target = diabetes.data, diabetes.target

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=2205
)

# Configure the booster
lgb_wrap_conti = LGBMRegressor(
    max_depth=8,
    n_estimators=500,
    n_jobs=5,
    min_child_weight=10,
    learning_rate=0.5,
    objective='regression',
    random_state=2022,
)

# Train on the training split
model_lgb_wrap_conti = lgb_wrap_conti.fit(X_train, y_train)

In [41]:
# RMSE of the LightGBM regressor on the test split
from sklearn.metrics import mean_squared_error

y_pred = model_lgb_wrap_conti.predict(X_test)
# The `squared=False` option is deprecated and absent from newer scikit-learn
# releases; taking the square root of the MSE works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(rmse)

63.63732148056437
