# Bagging(Bootstrap aggregating)
- Bootstrapping(부트스트래핑) : 원본 데이터에서 중복을 허용하여 표본을 다시 뽑는 리샘플링(복원추출)
- 정의
    - Bootstrapping을 통해 여러 학습 데이터를 만들어 각각 모델을 학습시키고, 이 모델들의 예측을 averaging prediction(평균)으로 합친다!
    
    
- 프로세스 예시
    1. 1~12까지 12개의 데이터가 있다면 복원추출로 12개를 뽑는다.(안 뽑히는 숫자가 존재할 수 있다.)(데이터가 충분히 많으면 서로 다른(고유한) 데이터 기준으로 전체 데이터의 약 63%가 추출된다.)
    2. 1번을 k번 반복해서 k개의 tree를 만든다.
    3. k개의 tree로 각각 예측을 한다.
    4. 그 예측값의 평균을 계산
    5. 이 때 k개의 tree를 뽑을 때 학습데이터에서 추출되지 않는 데이터들을 활용하기위해 각각의 tree가 미추출된 데이터들을 예측하고, 이에대한 에러율을 구한다. 그 에러율의 평균은 Out-Of-Bag error(OOB error)라고 한다.
        - 즉, 이 학습데이터 내에서도 검증데이터에 대한 성능지표를 계산할 수 있게된다. (랜덤포레스트에서도 이 OOB error 사용)    
        
        
- 장점 : Tree들의 편향 유지, 분산 감소, 학습데이터의 noise에 강건해진다.
- 단점 : 모형해석이 어렵다.(Tree는 직관적이기 때문에 해석이 쉽지만, bagging은 어려워 진다.)

In [308]:
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import numpy as np

In [309]:
# Show the current working directory; relative paths are resolved against it.
os.getcwd()

'C:\\Users\\rbtkd\\ADP_codingbook\\앙상블'

In [310]:
# Load the house-price dataset from a path relative to the working directory.
data = pd.read_csv("./data/kc_house_data.csv")
# Preview the first rows to check the columns and value formats.
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900.0,3,1.0,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,6414100192,20141209T000000,538000.0,3,2.25,2.0,0,3,7,1951,1991,98125,47.721,-122.319
2,5631500400,20150225T000000,180000.0,2,1.0,1.0,0,3,6,1933,0,98028,47.7379,-122.233
3,2487200875,20141209T000000,604000.0,4,3.0,1.0,0,5,7,1965,0,98136,47.5208,-122.393
4,1954400510,20150218T000000,510000.0,3,2.0,1.0,0,3,8,1987,0,98074,47.6168,-122.045


In [311]:
# (number of rows, number of columns) of the loaded DataFrame.
data.shape

(21613, 14)

In [312]:
# Drop the columns judged to be uninformative for this exercise.
columns_to_drop = ['id', 'date', 'zipcode', 'lat', 'long']
data = data.drop(columns=columns_to_drop)

In [313]:
from sklearn.model_selection import train_test_split

# Every column except the target 'price' is used as a feature.
feature_columns = list(data.columns.difference(['price']))

X = data[feature_columns]
y = data['price']

# Hold out 30% of the rows for testing. The fixed random_state makes the
# split, and therefore every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Display the resulting shapes as a sanity check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15129, 8), (6484, 8), (15129,), (6484,))

In [314]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same transformation to both the training and the test features.
ss_scaler = StandardScaler().fit(X_train)

X_train_scaled = ss_scaler.transform(X_train)
X_test_scaled = ss_scaler.transform(X_test)

In [315]:
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [316]:
# Add an intercept column to the scaled training features and set up
# (without fitting yet) an OLS regression of price on them.
sm_X_train = sm.add_constant(data=X_train_scaled, has_constant='add')
sm_model = sm.OLS(endog=y_train, exog=sm_X_train)

In [317]:
# Fit the OLS model on the training data and display its summary table.
fitted_sm_model = sm_model.fit()
fitted_sm_model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.603
Model:,OLS,Adj. R-squared:,0.603
Method:,Least Squares,F-statistic:,2868.0
Date:,"Tue, 07 Sep 2021",Prob (F-statistic):,0.0
Time:,18:22:43,Log-Likelihood:,-208010.0
No. Observations:,15129,AIC:,416000.0
Df Residuals:,15120,BIC:,416100.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.398e+05,1841.492,293.148,0.000,5.36e+05,5.43e+05
x1,9.355e+04,2988.685,31.303,0.000,8.77e+04,9.94e+04
x2,-946.2382,2209.214,-0.428,0.668,-5276.565,3384.088
x3,1.178e+04,2033.873,5.790,0.000,7790.045,1.58e+04
x4,4062.3005,2293.635,1.771,0.077,-433.502,8558.103
x5,2.29e+05,2544.010,90.016,0.000,2.24e+05,2.34e+05
x6,7.332e+04,1864.049,39.333,0.000,6.97e+04,7.7e+04
x7,-1.259e+05,2545.537,-49.461,0.000,-1.31e+05,-1.21e+05
x8,3163.5864,1962.209,1.612,0.107,-682.580,7009.753

0,1,2,3
Omnibus:,10707.726,Durbin-Watson:,2.009
Prob(Omnibus):,0.0,Jarque-Bera (JB):,554452.207
Skew:,2.836,Prob(JB):,0.0
Kurtosis:,32.11,Cond. No.,3.29


In [318]:
# Add the same intercept column to the scaled test features, then predict
# test-set prices with the fitted OLS model.
sm_X_test = sm.add_constant(data=X_test_scaled, has_constant='add')
sm_model_predict = fitted_sm_model.predict(exog=sm_X_test)

In [319]:
# Test-set RMSE of the single OLS model.
ols_test_mse = mean_squared_error(y_true=y_test, y_pred=sm_model_predict)
np.sqrt(ols_test_mse)

248141.50969649773

In [320]:
# Wrap the training design matrix in a DataFrame that carries y_train's
# index, so both can be row-selected together.
sm_X_train = pd.DataFrame(data=sm_X_train, index=y_train.index)
sm_X_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
1586,1.0,1.479993,0.690018,-0.633445,0.933554,2.851289,-0.092008,0.645544,-0.208961
17068,1.0,-0.147743,1.787389,2.438648,-0.915466,-0.555598,-0.092008,-0.877653,-0.208961
21524,1.0,0.503351,0.690018,-0.633445,0.933554,1.999567,-0.092008,1.254823,-0.208961
4989,1.0,-0.47329,-0.407352,0.902601,-0.915466,-0.555598,-0.092008,0.239358,-0.208961
4188,1.0,-1.449932,-0.407352,-0.633445,-0.915466,-0.555598,-0.092008,-1.080746,-0.208961


In [323]:
# Wrap the test design matrix in a DataFrame that carries y_test's index.
sm_X_test = pd.DataFrame(data=sm_X_test, index=y_test.index)
sm_X_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
17527,1.0,-0.47329,-0.407352,0.902601,-0.915466,0.296124,-0.092008,-0.945351,-0.208961
12273,1.0,0.828898,1.787389,2.438648,0.009044,-0.555598,-0.092008,0.611695,-0.208961
12742,1.0,-1.449932,-0.407352,-2.169492,-0.915466,-1.40732,-0.092008,-0.336072,-0.208961
6343,1.0,0.503351,-0.407352,-0.633445,0.933554,-0.555598,-0.092008,1.153277,-0.208961
5894,1.0,0.503351,0.690018,-0.633445,0.933554,-0.555598,-0.092008,0.747091,-0.208961


In [324]:
# Implement bagging by hand with a for loop, using OLS as the base learner.

n_estimators = 10             # number of bootstrap models to fit
n_train = X_train.shape[0]    # loop-invariant, so computed once up front

bagging_predict_result = []
for _ in range(n_estimators):
    # Bootstrap sample: draw n_train row positions with replacement.
    random_data_index = np.random.choice(n_train, size=n_train, replace=True)
    # Number of distinct training rows that made it into this sample.
    print(len(set(random_data_index)))
    bg_X_train = sm_X_train.iloc[random_data_index]
    bg_y_train = y_train.iloc[random_data_index]
    # Use loop-local names so the baseline OLS model fitted on the full
    # training set (sm_model / fitted_sm_model) is not overwritten.
    bg_fitted_model = sm.OLS(bg_y_train, bg_X_train).fit()
    pred = bg_fitted_model.predict(sm_X_test)
    bagging_predict_result.append(pred)
    # Test-set RMSE of this individual bootstrap model.
    print(np.sqrt(mean_squared_error(y_test, pred)))

9514
248542.96528828936
9504
248236.4165701166
9514
248075.38223490448
9548
248878.57433757745
9572
247750.99680667382
9623
248832.80486046406
9622
248273.5934018418
9570
248293.4935494066
9600
247690.156770017
9522
248235.49053694136


In [325]:
# Average the bagging predictions.
# Stack the per-model predictions into a 2-D array with one row per model and
# one column per test sample, then average over the models in a single
# vectorized call instead of a Python double loop. The result is a NumPy
# array holding one averaged prediction per test sample.
bagging_predict = np.mean(np.asarray(bagging_predict_result), axis=0)

In [326]:
# Test-set RMSE of the averaged (bagged) OLS predictions. Arguments are given
# in (y_true, y_pred) order, matching the other RMSE computations in this file.
np.sqrt(mean_squared_error(y_test, bagging_predict))

248148.71394643662

In [327]:
from sklearn.linear_model import LinearRegression

# Baseline: a single scikit-learn linear regression on the scaled features.
regression_model = LinearRegression()
fitted_linear_model = regression_model.fit(X_train_scaled, y_train)
pred_test_linear = fitted_linear_model.predict(X_test_scaled)

# Print the test-set RMSE of the baseline.
linear_test_mse = mean_squared_error(y_true=y_test, y_pred=pred_test_linear)
print(np.sqrt(linear_test_mse))

248141.50969649773


In [328]:
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression

# Bagging with scikit-learn: 10 linear regressions, each fitted on a
# bootstrap sample, with their predictions averaged.
regression_model = LinearRegression()
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, so the
# positional form works on both old and new versions.
bagging_model = BaggingRegressor(regression_model, n_estimators=10)
fitted_bagging_model = bagging_model.fit(X_train_scaled, y_train)
pred_test_bg = fitted_bagging_model.predict(X_test_scaled)

# Print the test-set RMSE of the bagged linear model.
print(np.sqrt(mean_squared_error(y_test, pred_test_bg)))

248130.15468965747


In [329]:
from sklearn.tree import DecisionTreeRegressor

# Baseline: a single decision tree with default settings.
dtr = DecisionTreeRegressor()
dtr_model1 = dtr.fit(X_train_scaled, y_train)
pred_test_dtr = dtr.predict(X_test_scaled)

# Print the test-set RMSE of the single tree.
tree_test_mse = mean_squared_error(y_true=y_test, y_pred=pred_test_dtr)
print(np.sqrt(tree_test_mse))

267441.81575594534


In [330]:
# Wrap the scaled training features in a DataFrame that carries y_train's
# index, so both can be row-selected together.
X_train_scaled = pd.DataFrame(data=X_train_scaled, index=y_train.index)
X_train_scaled.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7
1586,1.479993,0.690018,-0.633445,0.933554,2.851289,-0.092008,0.645544,-0.208961
17068,-0.147743,1.787389,2.438648,-0.915466,-0.555598,-0.092008,-0.877653,-0.208961
21524,0.503351,0.690018,-0.633445,0.933554,1.999567,-0.092008,1.254823,-0.208961
4989,-0.47329,-0.407352,0.902601,-0.915466,-0.555598,-0.092008,0.239358,-0.208961
4188,-1.449932,-0.407352,-0.633445,-0.915466,-0.555598,-0.092008,-1.080746,-0.208961


In [331]:
# Wrap the scaled test features in a DataFrame that carries y_test's index.
X_test_scaled = pd.DataFrame(data=X_test_scaled, index=y_test.index)
X_test_scaled.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7
17527,-0.47329,-0.407352,0.902601,-0.915466,0.296124,-0.092008,-0.945351,-0.208961
12273,0.828898,1.787389,2.438648,0.009044,-0.555598,-0.092008,0.611695,-0.208961
12742,-1.449932,-0.407352,-2.169492,-0.915466,-1.40732,-0.092008,-0.336072,-0.208961
6343,0.503351,-0.407352,-0.633445,0.933554,-0.555598,-0.092008,1.153277,-0.208961
5894,0.503351,0.690018,-0.633445,0.933554,-0.555598,-0.092008,0.747091,-0.208961


In [334]:
# Hand-rolled bagging with decision trees as the base learner.
n_estimators = 10             # number of bootstrap trees to fit
n_train = X_train.shape[0]    # loop-invariant, so computed once up front

bagging_predict_result = []
for _ in range(n_estimators):
    # Bootstrap sample: draw n_train row positions with replacement.
    random_data_index = np.random.choice(n_train, size=n_train, replace=True)
    # Number of distinct training rows that made it into this sample.
    print(len(set(random_data_index)))
    bg_X_train = X_train_scaled.iloc[random_data_index]
    bg_y_train = y_train.iloc[random_data_index]
    # A fresh tree is created for every bootstrap sample.
    dtr = DecisionTreeRegressor()
    dtr.fit(bg_X_train, bg_y_train)
    pred_tree = dtr.predict(X_test_scaled)
    bagging_predict_result.append(pred_tree)
    # Test-set RMSE of this individual tree.
    print(np.sqrt(mean_squared_error(y_test, pred_tree)))

9538
265025.67182190384
9476
289055.7504580901
9612
280866.52658801945
9502
269646.7801176419
9542
278810.7718611196
9555
282686.30726748984
9504
287126.8055049103
9517
283674.04448530613
9596
284506.5975839485
9623
278492.96912895364


In [335]:
# Average the predictions produced by the bagged trees.
# NOTE: the original warned that `.values` must not be used on these
# predictions; np.asarray below needs no such accessor.
# Stack the per-model predictions into a 2-D array with one row per model and
# one column per test sample, then average over the models in a single
# vectorized call instead of a Python double loop. The result is a NumPy
# array holding one averaged prediction per test sample.
bagging_predict = np.mean(np.asarray(bagging_predict_result), axis=0)

In [336]:
# Test-set RMSE of the averaged (bagged) tree predictions.
bagged_tree_mse = mean_squared_error(y_true=y_test, y_pred=bagging_predict)
np.sqrt(bagged_tree_mse)

228917.3976470912

In [337]:
# Bagging with scikit-learn: 10 decision trees, each fitted on a bootstrap
# sample, with their predictions averaged.
dtr = DecisionTreeRegressor()
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, so the
# positional form works on both old and new versions.
bagging_decision_tree_model = BaggingRegressor(dtr,
                                               n_estimators=10,
                                               verbose=1)  # show training progress
bagging_decision_tree_model.fit(X_train_scaled, y_train)
pred_test_tree_bagging = bagging_decision_tree_model.predict(X_test_scaled)
# Test-set RMSE of the bagged decision trees.
np.sqrt(mean_squared_error(y_test, pred_test_tree_bagging))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


236682.62166123808