# Bagging(1)

- Bagging이란 bootstrap aggregating을 의미
- bootstrap이란 복원추출을 의미
- 즉 복원추출한 것들을 합친다는 것이다.

## #01. 패키지

In [21]:
import warnings
warnings.filterwarnings('ignore')

from pandas import read_excel
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score

## #02. 분류 문제

### 1. 데이터

In [22]:
origin = read_excel("https://data.hossam.kr/G02/breast_cancer.xlsx")
print(origin.info())
origin.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


### 데이터 전처리

#### 독립/종속 변수 구분

In [23]:
origin.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')

In [24]:
x = origin.drop("target", axis=1)
y = origin['target']
x.shape, y.shape

((569, 30), (569,))

### 데이터 표준화

In [25]:
scaler = StandardScaler()
std_x = scaler.fit_transform(x)
std_x[:5]

array([[ 1.09706398e+00, -2.07333501e+00,  1.26993369e+00,
         9.84374905e-01,  1.56846633e+00,  3.28351467e+00,
         2.65287398e+00,  2.53247522e+00,  2.21751501e+00,
         2.25574689e+00,  2.48973393e+00, -5.65265059e-01,
         2.83303087e+00,  2.48757756e+00, -2.14001647e-01,
         1.31686157e+00,  7.24026158e-01,  6.60819941e-01,
         1.14875667e+00,  9.07083081e-01,  1.88668963e+00,
        -1.35929347e+00,  2.30360062e+00,  2.00123749e+00,
         1.30768627e+00,  2.61666502e+00,  2.10952635e+00,
         2.29607613e+00,  2.75062224e+00,  1.93701461e+00],
       [ 1.82982061e+00, -3.53632408e-01,  1.68595471e+00,
         1.90870825e+00, -8.26962447e-01, -4.87071673e-01,
        -2.38458552e-02,  5.48144156e-01,  1.39236330e-03,
        -8.68652457e-01,  4.99254601e-01, -8.76243603e-01,
         2.63326966e-01,  7.42401948e-01, -6.05350847e-01,
        -6.92926270e-01, -4.40780058e-01,  2.60162067e-01,
        -8.05450380e-01, -9.94437403e-02,  1.80592744e+

### 훈련/검증 데이터 분리

In [26]:
x_train, x_test, y_train, y_test = train_test_split(std_x, y, test_size=0.3, random_state=777)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((398, 30), (171, 30), (398,), (171,))

### 3. 분류 모델 구현

#### 분류 알고리즘 객체 정의

In [27]:
# KNN, DTREE 등 모든 분류 알고리즘 적용 가능
lr = LogisticRegression()

#### Bagging 모델 구현

In [29]:
clf = BaggingClassifier(
    base_estimator=lr,
    n_estimators=50,    # 부트스트랩 샘플 개수
    max_samples=1,  # 부트스트랩 샘플 비율 => 1이면 학습데이터를 모두 샘플링
    bootstrap=True, # 복원 추출, False이면 비복원 추출
    random_state=777,
    # 하나의 예측기에 들어가는 샘플에 대해 컬럼의 중복 사용 여부를 결정
    bootstrap_features=False,
    n_jobs=-1
)

# 모델 학습
clf.fit(x_train, y_train)
print("BaggingClassifier 훈련 정확도 : {:.3f}".format(clf.score(x_train,y_train)))

# 예측값
y_pred = clf.predict(x_test)
print("BaggingClassifier 테스트 정확도 : {:.3f}".format(accuracy_score(y_test,y_pred)))


BaggingClassifier 훈련 정확도 : 0.611
BaggingClassifier 테스트 정확도 : 0.667


## #03. 회귀문제

### 1. 데이터

In [30]:
origin = read_excel("https://data.hossam.kr/E04/boston.xlsx")
print(origin.info())
origin.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   CRIM       506 non-null    float64
 1   ZN         506 non-null    float64
 2   INDUS      506 non-null    float64
 3   CHAS       506 non-null    int64  
 4   NOX        506 non-null    float64
 5   RM         506 non-null    float64
 6   AGE        506 non-null    float64
 7   DIS        506 non-null    float64
 8   RAD        506 non-null    int64  
 9   TAX        506 non-null    int64  
 10  PTRATIO    506 non-null    float64
 11  B          506 non-null    float64
 12  LSTAT      506 non-null    float64
 13  MEDV       506 non-null    float64
 14  CAT. MEDV  506 non-null    int64  
dtypes: float64(11), int64(4)
memory usage: 59.4 KB
None


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,CAT. MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,0
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,1
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,1
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,1


### 2. 데이터 전처리

#### 독립/종속변수 분할

In [31]:
x = origin.drop("MEDV", axis=1)
y = origin['MEDV']
x.shape, y.shape

((506, 14), (506,))

#### 훈련/검증 데이터 분리

In [32]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=777)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((354, 14), (152, 14), (354,), (152,))

### 3. 회귀모델 구현

#### 회귀 알고리즘 객체 정의

In [34]:
rg = LinearRegression()

#### Bagging 모델 구현

In [35]:
reg = BaggingRegressor(
    base_estimator=rg,
    n_estimators=50,    # 부트스트랩 샘플 개수
    max_samples=1,  # 부트스트랩 샘플 비율 => 1이면 학습데이터를 모두 샘플링
    bootstrap=True, # 복원 추출, False이면 비복원 추출
    random_state=777,
    # 하나의 예측기에 들어가는 샘플에 대해 컬럼의 중복 사용 여부를 결정
    bootstrap_features=False,
    n_jobs=-1
)

# 모델 학습
reg.fit(x_train, y_train)
print("BaggingRegressor 훈련 R2 : {:.3f}".format(reg.score(x_train,y_train)))

# 예측값
y_pred = reg.predict(x_test)
print("BaggingRegressor 테스트 R2 : {:.3f}".format(r2_score(y_test,y_pred)))

BaggingRegressor 훈련 R2 : -0.017
BaggingRegressor 테스트 R2 : -0.010


이후 GridSearchCV, 하이퍼 파라미터 튜닝 작업 실행