## 5장-1절. Bagging(Bootstrap aggregating)
- Bootstrapping(부트스트래핑) : 중복을 허용한 리샘플링(복원추출)
- 정의
    - Bootstrapping을 통해 여러 학습 데이터를 만들고, 각 데이터로 학습한 모델들의 예측값을 평균(averaging prediction)하여 합친다!
    
    
- 프로세스 예시
    1. 1~12까지 12개의 데이터가 있다면 복원추출로 12개를 뽑는다.(안뽑히는 숫자가 존재할 수 있다.)(이때 전체데이터의 약 63%가 추출된다.)
    2. 1번을 k번 반복해서 k개의 tree를 만든다.
    3. k개의 tree로 각각 예측을 한다.
    4. 그 예측값의 평균을 계산
    5. 이 때 k개의 tree를 뽑을 때 학습데이터에서 추출되지 않는 데이터들을 활용하기위해 각각의 tree가 미추출된 데이터들을 예측하고, 이에대한 에러율을 구한다. 그 에러율의 평균은 Out-Of-Bag error(OOB error)라고 한다.
        - 즉, 이 학습데이터 내에서도 검증데이터에 대한 성능지표를 계산할 수 있게된다. (랜덤포레스트에서도 이 OOB error 사용)    
        
        
- 장점 : Tree들의 편향 유지, 분산 감소, 학습데이터의 noise에 강건해진다.
- 단점 : 모형해석이 어렵다.(Tree는 직관적이기 때문에 해석이 쉽지만, bagging은 어려워 진다.)

In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import numpy as np

## 데이터 불러오기

In [2]:
# Show the current working directory; the relative data path used below resolves against it.
os.getcwd()

'C:\\Users\\rbtkd\\ADP_codingbook\\앙상블'

In [3]:
# Load the house-price CSV into a DataFrame and preview its first rows.
csv_path = "./data/kc_house_data.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900.0,3,1.0,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,3,7,1951,1991,98125,47.721,-122.319
2,5631500400,20150225T000000,180000.0,2,1.0,1.0,0,3,6,1933,0,98028,47.7379,-122.233
3,2487200875,20141209T000000,604000.0,4,3.0,1.0,0,5,7,1965,0,98136,47.5208,-122.393
4,1954400510,20150218T000000,510000.0,3,2.0,1.0,0,3,8,1987,0,98074,47.6168,-122.045


In [4]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(21613, 14)

In [5]:
# Remove the variables judged to be meaningless for this exercise.
unused_columns = ['id', 'date', 'zipcode', 'lat', 'long']
data = data.drop(columns=unused_columns)

In [6]:
from sklearn.model_selection import train_test_split

# Every column except the target ('price') is used as a feature.
target_column = 'price'
feature_columns = data.columns.difference([target_column]).tolist()

y = data[target_column]
X = data[feature_columns]

# Hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15129, 8), (6484, 8), (15129,), (6484,))

In [7]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# that same transformation to both the training and the test features.
ss_scaler = StandardScaler()
ss_scaler.fit(X_train)

X_train_scaled = ss_scaler.transform(X_train)
X_test_scaled = ss_scaler.transform(X_test)

---
---

## 01. 선형회귀분석 - 베이스모델

In [8]:
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [9]:
# Build the OLS design matrix by adding a constant (intercept) column to the
# scaled training features, then set up the model with price as the response.
sm_X_train = sm.add_constant(X_train_scaled, has_constant='add')
sm_model = sm.OLS(endog=y_train, exog=sm_X_train)

In [10]:
# Fit the OLS model and display its regression summary table.
fitted_sm_model = sm_model.fit()
fitted_sm_model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.59
Model:,OLS,Adj. R-squared:,0.59
Method:,Least Squares,F-statistic:,2724.0
Date:,"Fri, 10 Sep 2021",Prob (F-statistic):,0.0
Time:,16:07:55,Log-Likelihood:,-208750.0
No. Observations:,15129,AIC:,417500.0
Df Residuals:,15120,BIC:,417600.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.392e+05,1934.240,278.786,0.000,5.35e+05,5.43e+05
x1,1.068e+05,3129.532,34.140,0.000,1.01e+05,1.13e+05
x2,-4188.3511,2289.717,-1.829,0.067,-8676.473,299.770
x3,1.285e+04,2130.565,6.032,0.000,8674.840,1.7e+04
x4,-139.0375,2408.550,-0.058,0.954,-4860.087,4582.012
x5,2.294e+05,2663.317,86.120,0.000,2.24e+05,2.35e+05
x6,7.127e+04,1954.443,36.464,0.000,6.74e+04,7.51e+04
x7,-1.279e+05,2656.843,-48.122,0.000,-1.33e+05,-1.23e+05
x8,4808.4637,2063.579,2.330,0.020,763.600,8853.328

0,1,2,3
Omnibus:,13677.883,Durbin-Watson:,1.961
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1681500.452
Skew:,3.883,Prob(JB):,0.0
Kurtosis:,54.06,Cond. No.,3.26


In [11]:
# Add the constant column to the scaled test features (mirroring the training
# design matrix) and predict prices with the fitted OLS model.
sm_X_test = sm.add_constant(X_test_scaled, has_constant='add')
sm_model_predict = fitted_sm_model.predict(sm_X_test)

In [12]:
# Model evaluation: RMSE of the baseline OLS predictions on the test set.
np.sqrt(mean_squared_error(y_test,sm_model_predict))

221736.04649882956

## 02. 선형회귀모델을 배깅에 이용 - for문이용 => 나중 앙상블의 앙상블에 필요

In [13]:
# Required step 1 for the hand-written bagging loop below: wrap the training
# design matrix in a DataFrame that carries the same index as y_train, so the
# loop can select rows from it with .iloc.
sm_X_train = pd.DataFrame(sm_X_train,index=y_train.index)
sm_X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
20992,1.0,0.504246,-0.400536,-0.628343,1.87462,0.294623,-0.089416,1.397736,-0.209477
2378,1.0,1.154496,0.673118,0.908883,-0.916463,0.294623,-0.089416,0.10282,-0.209477
7863,1.0,0.504246,-0.400536,0.908883,-0.916463,0.294623,-0.089416,0.307281,-0.209477
6887,1.0,1.804746,-0.400536,-0.628343,0.944259,0.294623,-0.089416,0.886585,-0.209477
20369,1.0,1.804746,0.673118,-0.628343,0.944259,1.151149,-0.089416,1.499966,-0.209477


In [14]:
# Required step 2 for the hand-written bagging loop below: wrap the test
# design matrix in a DataFrame that carries the same index as y_test.
sm_X_test = pd.DataFrame(sm_X_test, index=y_test.index)
sm_X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
11531,1.0,0.504246,0.673118,-0.628343,0.944259,2.007674,-0.089416,0.545818,-0.209477
6531,1.0,0.504246,0.673118,0.908883,-0.916463,0.294623,-0.089416,-0.169793,-0.209477
13505,1.0,-0.146004,-1.47419,-0.628343,-0.916463,0.294623,-0.089416,-0.749098,-0.209477
10858,1.0,0.179121,-0.400536,0.908883,0.944259,0.294623,-0.089416,0.409511,-0.209477
5687,1.0,-1.446504,-1.47419,-0.628343,-0.916463,-1.418427,-0.089416,-0.612791,-0.209477


In [15]:
# Bagging written out as a for loop: fit one OLS model per bootstrap resample
# of the training rows and keep each model's test-set predictions so they can
# be averaged afterwards.
# NOTE: the loop rebinds sm_model / fitted_sm_model, so after it runs those
# names refer to the last bootstrap model rather than the baseline fit.

n_train = X_train.shape[0]  # constant across rounds, so computed once

bagging_predict_result = []
for _ in range(10):
    # Bootstrap sample: n_train row positions drawn with replacement.
    # Passing an int makes np.random.choice sample from range(n_train).
    random_data_index = np.random.choice(n_train, n_train)
    # How many distinct training rows ended up in this resample.
    print(np.unique(random_data_index).size)
    bg_X_train = sm_X_train.iloc[random_data_index]
    bg_y_train = y_train.iloc[random_data_index]
    sm_model = sm.OLS(bg_y_train, bg_X_train)
    fitted_sm_model = sm_model.fit()
    pred = fitted_sm_model.predict(sm_X_test)
    bagging_predict_result.append(pred)
    # Test-set RMSE of this single bootstrap model.
    print(np.sqrt(mean_squared_error(y_test, pred)))

9513
221561.73914839167
9589
221785.73708073085
9548
221678.84691074723
9609
222729.88932666025
9564
221461.80270488796
9585
221735.68255435672
9585
222001.3461592786
9550
222077.7291782319
9533
221881.1537224147
9558
221970.60672460884


In [16]:
# Average the bagging results: for every test row, take the mean of the
# predictions produced by all bootstrap models (10 in the loop above).
# Stacking the per-model predictions gives an array of shape
# (number of models, number of test rows); averaging over axis 0 replaces the
# row-by-row Python loop. The values match the loop up to floating-point
# rounding, and the result stays a list with one entry per test row.
stacked_predictions = np.asarray(bagging_predict_result)
bagging_predict = list(stacked_predictions.mean(axis=0))

In [17]:
# Model evaluation: RMSE of the averaged (bagged) predictions on the test set.
# Arguments are given in (y_true, y_pred) order like the other RMSE cells;
# the value is unchanged because squared error is symmetric.
np.sqrt(mean_squared_error(y_test, bagging_predict))

221692.8115700165

---
---

## 03. 선형회귀모델을 배깅에 이용 - 패키지이용

In [18]:
from sklearn.linear_model import LinearRegression

# Single (non-bagged) linear regression, used as the reference point for the
# bagged version in the next cell; prints its test-set RMSE.
regression_model = LinearRegression()
fitted_linear_model = regression_model.fit(X_train_scaled, y_train)
pred_test_linear = fitted_linear_model.predict(X_test_scaled)
linear_mse = mean_squared_error(y_test, pred_test_linear)
print(np.sqrt(linear_mse))

221736.04649882956


In [19]:
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression

# Bag 10 linear regressions with scikit-learn and print the test-set RMSE.
regression_model = LinearRegression()
# The base learner is passed positionally: its keyword was `base_estimator`
# in older scikit-learn releases and is `estimator` in newer ones, so the
# positional form runs on both.
bagging_model = BaggingRegressor(regression_model, n_estimators=10)
fitted_bagging_model = bagging_model.fit(X_train_scaled, y_train)
pred_test_bg = fitted_bagging_model.predict(X_test_scaled)

print(np.sqrt(mean_squared_error(y_test, pred_test_bg)))

221787.6776777772


In [30]:
# When train_test_split is not used, evaluate with cross-validation like this.

from sklearn.model_selection import cross_val_score

X = data[feature_columns]
y = data['price']

regression_model = LinearRegression()
# The base learner is passed positionally: its keyword was `base_estimator`
# in older scikit-learn releases and is `estimator` in newer ones, so the
# positional form runs on both.
bagging_model = BaggingRegressor(regression_model, n_estimators=10)
# 5-fold CV; the scorer returns negated MSE, one value per fold.
scores = cross_val_score(bagging_model, X, y, scoring='neg_mean_squared_error', cv=5)
# The printed number is the mean of the per-fold RMSE values (lower is
# better), even though the label text reads "accuracy".
print('평균 검증 정확도:', np.mean(np.sqrt(-1 * scores)))

평균 검증 정확도: 233584.29527415632


---
---

## 04. 트리모델을 배깅에 이용 - for문이용
- bagging은 분산이 큰(불안정한) 모델일수록 효과가 크므로, 선형회귀보다 트리모델을 배깅에 이용할 때 단일 모델 대비 성능 개선 폭이 더 크다.

In [329]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree as the reference point for the bagged trees below;
# prints its test-set RMSE.
dtr = DecisionTreeRegressor()
dtr_model1 = dtr.fit(X_train_scaled, y_train)
pred_test_dtr = dtr_model1.predict(X_test_scaled)

tree_mse = mean_squared_error(y_test, pred_test_dtr)
print(np.sqrt(tree_mse))

267441.81575594534


In [330]:
# Wrap the scaled training features in a DataFrame that carries y_train's
# index, so the bagging loop below can select rows from it with .iloc.
X_train_scaled = pd.DataFrame(X_train_scaled, index=y_train.index)
X_train_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7
1586,1.479993,0.690018,-0.633445,0.933554,2.851289,-0.092008,0.645544,-0.208961
17068,-0.147743,1.787389,2.438648,-0.915466,-0.555598,-0.092008,-0.877653,-0.208961
21524,0.503351,0.690018,-0.633445,0.933554,1.999567,-0.092008,1.254823,-0.208961
4989,-0.47329,-0.407352,0.902601,-0.915466,-0.555598,-0.092008,0.239358,-0.208961
4188,-1.449932,-0.407352,-0.633445,-0.915466,-0.555598,-0.092008,-1.080746,-0.208961


In [331]:
# Wrap the scaled test features in a DataFrame that carries y_test's index.
X_test_scaled = pd.DataFrame(X_test_scaled, index=y_test.index)
X_test_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7
17527,-0.47329,-0.407352,0.902601,-0.915466,0.296124,-0.092008,-0.945351,-0.208961
12273,0.828898,1.787389,2.438648,0.009044,-0.555598,-0.092008,0.611695,-0.208961
12742,-1.449932,-0.407352,-2.169492,-0.915466,-1.40732,-0.092008,-0.336072,-0.208961
6343,0.503351,-0.407352,-0.633445,0.933554,-0.555598,-0.092008,1.153277,-0.208961
5894,0.503351,0.690018,-0.633445,0.933554,-0.555598,-0.092008,0.747091,-0.208961


In [334]:
# Bagging written out as a for loop with decision trees: fit one tree per
# bootstrap resample of the training rows and keep each tree's test-set
# predictions so they can be averaged afterwards.
n_train = X_train.shape[0]  # constant across rounds, so computed once

bagging_predict_result = []
for _ in range(10):
    # Bootstrap sample: n_train row positions drawn with replacement.
    # Passing an int makes np.random.choice sample from range(n_train).
    random_data_index = np.random.choice(n_train, n_train)
    # How many distinct training rows ended up in this resample.
    print(np.unique(random_data_index).size)
    bg_X_train = X_train_scaled.iloc[random_data_index]
    bg_y_train = y_train.iloc[random_data_index]
    dtr = DecisionTreeRegressor()
    dtr.fit(bg_X_train, bg_y_train)
    pred_tree = dtr.predict(X_test_scaled)
    bagging_predict_result.append(pred_tree)
    # Test-set RMSE of this single bootstrap tree.
    print(np.sqrt(mean_squared_error(y_test, pred_tree)))

9538
265025.67182190384
9476
289055.7504580901
9612
280866.52658801945
9502
269646.7801176419
9542
278810.7718611196
9555
282686.30726748984
9504
287126.8055049103
9517
283674.04448530613
9596
284506.5975839485
9623
278492.96912895364


In [335]:
# Average the bagging results: for every test row, take the mean of the
# predictions produced by all bootstrap trees (10 in the loop above).
# np.asarray stacks the per-tree predictions into shape
# (number of trees, number of test rows), so no `.values` access is needed
# here; averaging over axis 0 replaces the row-by-row Python loop. The values
# match the loop up to floating-point rounding, and the result stays a list
# with one entry per test row.
stacked_predictions = np.asarray(bagging_predict_result)
bagging_predict = list(stacked_predictions.mean(axis=0))

In [336]:
# Model evaluation: RMSE of the averaged (bagged) tree predictions on the test set.
np.sqrt(mean_squared_error(y_test,bagging_predict))

228917.3976470912

---
---


## 05. 트리모델을 배깅에 이용 - 패키지이용

In [337]:
# Bag 10 decision trees with scikit-learn and report the test-set RMSE.
dtr = DecisionTreeRegressor()
# The base learner is passed positionally: its keyword was `base_estimator`
# in older scikit-learn releases and is `estimator` in newer ones, so the
# positional form runs on both.
bagging_decision_tree_model = BaggingRegressor(dtr,
                                               n_estimators=10,
                                               verbose=1)  # show training progress
bagging_decision_tree_model.fit(X_train_scaled, y_train)
pred_test_tree_bagging = bagging_decision_tree_model.predict(X_test_scaled)
np.sqrt(mean_squared_error(y_test, pred_test_tree_bagging))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


236682.62166123808