# Sprint8_アンサンブル学習

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('drive/My Drive/data/house-prices-advanced-regression-techniques_train.csv')

X = df[['GrLivArea', 'YearBuilt']].values
y = df['SalePrice'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

# 【問題1】ブレンディングのスクラッチ実装
ブレンディング をスクラッチ実装し、単一モデルより精度があがる例を 最低3つ 示してください。精度があがるとは、検証データに対する平均二乗誤差（MSE）が小さくなることを指します。

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

In [38]:
#デフォルト（基準）

print('----Default models MSE----')

linear = LinearRegression().fit(X_train, y_train)
linear_Y_pred = linear.predict(X_test)
print(f'LinearRegression; {mean_squared_error(y_test, linear_Y_pred)/ 100000000}') #best（単位：億）

svr = SVR().fit(X_train, y_train)
svr_Y_pred = svr.predict(X_test)
print(f'SVR; {mean_squared_error(y_test, svr_Y_pred)/ 100000000}')

tree = DecisionTreeRegressor().fit(X_train, y_train)
tree_Y_pred = tree.predict(X_test)
print(f'DecisionTreeRegressor; {mean_squared_error(y_test, tree_Y_pred)/ 100000000}')

----Default models MSE----
LinearRegression; 23.09870747753036
SVR; 80.80337267890563
DecisionTreeRegressor; 29.96875189784627


In [39]:
#手法のブレンディング

Y_pred = np.mean([linear_Y_pred, svr_Y_pred, tree_Y_pred], axis=0)
print(f'linear, svr, tree; {mean_squared_error(y_test, Y_pred)/ 100000000}')

Y_pred = np.mean([linear_Y_pred, svr_Y_pred], axis=0)
print(f'linear, svr; {mean_squared_error(y_test, Y_pred)/ 100000000}')

Y_pred = np.mean([svr_Y_pred, tree_Y_pred], axis=0)
print(f'svr, tree; {mean_squared_error(y_test, Y_pred)/ 100000000}')

Y_pred = np.mean([linear_Y_pred, tree_Y_pred], axis=0)
print(f'linear, tree; {mean_squared_error(y_test, Y_pred)/ 100000000}') #best 単位：億

linear, svr, tree; 29.819944209022392
linear, svr; 41.22424759331846
svr, tree; 37.79910231171434
linear, tree; 21.539232578838142


In [40]:
#標準化のブレンディング

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linear = LinearRegression().fit(X_train_scaled, y_train)
linear_Y_pred = linear.predict(X_test_scaled)
print(f'LinearRegression; {mean_squared_error(y_test, linear_Y_pred) / 100000000}')

tree = DecisionTreeRegressor(max_depth=5, min_samples_leaf=10).fit(X_train_scaled, y_train)
tree_Y_pred = tree.predict(X_test_scaled)
print(f'DecisionTreeRegressor; {mean_squared_error(y_test, tree_Y_pred) / 100000000}')

svr = SVR(kernel='poly', C=400, epsilon=25, coef0=15).fit(X_train_scaled, y_train)
svr_Y_pred = svr.predict(X_test_scaled)
print(f'SVR; {mean_squared_error(y_test, svr_Y_pred) / 100000000}') #good

print()

Y_pred = np.mean([linear_Y_pred, svr_Y_pred, tree_Y_pred], axis=0)
print(f'linear, svr, tree; {mean_squared_error(y_test, Y_pred) / 100000000}')

Y_pred = np.mean([linear_Y_pred, svr_Y_pred], axis=0)
print(f'linear, svr; {mean_squared_error(y_test, Y_pred) / 100000000}')

Y_pred = np.mean([svr_Y_pred, tree_Y_pred], axis=0)
print(f'svr, tree; {mean_squared_error(y_test, Y_pred) / 100000000}') #best

Y_pred = np.mean([linear_Y_pred, tree_Y_pred], axis=0)
print(f'linear, tree; {mean_squared_error(y_test, Y_pred) / 100000000}') 

LinearRegression; 23.098707477530354
DecisionTreeRegressor; 20.56496764724979
SVR; 24.76449909334306

linear, svr, tree; 21.047850671527886
linear, svr; 23.269784707437626
svr, tree; 21.21656208803024
linear, tree; 19.978360770000677


In [41]:
#次元削減（1列目削除）

linear = LinearRegression().fit(X_train_scaled[:,0].reshape(-1,1), y_train)
linear_Y_pred = linear.predict(X_test_scaled[:,0].reshape(-1,1))
print(f'LinearRegression; {mean_squared_error(y_test, linear_Y_pred) / 100000000}')

tree = DecisionTreeRegressor(max_depth=5, min_samples_leaf=10).fit(X_train_scaled[:,0].reshape(-1,1), y_train)
tree_Y_pred = tree.predict(X_test_scaled[:,0].reshape(-1,1))
print(f'DecisionTreeRegressor; {mean_squared_error(y_test, tree_Y_pred) / 100000000}')

svr = SVR(kernel='poly', C=400, epsilon=25, coef0=15).fit(X_train_scaled[:,0].reshape(-1,1), y_train) #best (単位：億)
svr_Y_pred = svr.predict(X_test_scaled[:,0].reshape(-1,1))
print(f'SVR; {mean_squared_error(y_test, svr_Y_pred) / 100000000}')

print()

Y_pred = np.mean([linear_Y_pred, svr_Y_pred, tree_Y_pred], axis=0)
print(f'linear, svr, tree; {mean_squared_error(y_test, Y_pred)/ 100000000}')

Y_pred = np.mean([linear_Y_pred, svr_Y_pred], axis=0)
print(f'linear, svr; {mean_squared_error(y_test, Y_pred)/ 100000000}')

Y_pred = np.mean([svr_Y_pred, tree_Y_pred], axis=0)
print(f'svr, tree; {mean_squared_error(y_test, Y_pred)/ 100000000}')

Y_pred = np.mean([linear_Y_pred, tree_Y_pred], axis=0)
print(f'linear, tree; {mean_squared_error(y_test, Y_pred)/ 100000000}') 

LinearRegression; 34.78476223193823
DecisionTreeRegressor; 37.45350854184568
SVR; 38.62927186329794

linear, svr, tree; 36.42667167244415
linear, svr; 36.504338710096356
svr, tree; 37.5432562926157
linear, tree; 35.62930191955775


In [42]:
#次元削減（0列目削除） ※良い結果得られず

linear = LinearRegression().fit(X_train_scaled[:,1].reshape(-1,1), y_train)
linear_Y_pred = linear.predict(X_test_scaled[:,1].reshape(-1,1))
print(f'LinearRegression; {mean_squared_error(y_test, linear_Y_pred) / 100000000}')

tree = DecisionTreeRegressor().fit(X_train_scaled[:,1].reshape(-1,1), y_train)
tree_Y_pred = tree.predict(X_test_scaled[:,1].reshape(-1,1))
print(f'DecisionTreeRegressor; {mean_squared_error(y_test, tree_Y_pred) / 100000000}')

svr = SVR(C=300).fit(X_train_scaled[:,1].reshape(-1,1), y_train)
svr_Y_pred = svr.predict(X_test_scaled[:,1].reshape(-1,1))
print(f'SVR; {mean_squared_error(y_test, svr_Y_pred) / 100000000}')

print()

Y_pred = np.mean([linear_Y_pred, svr_Y_pred, tree_Y_pred], axis=0)
print(f'linear, svr, tree; {mean_squared_error(y_test, Y_pred)/ 100000000}')

Y_pred = np.mean([linear_Y_pred, svr_Y_pred], axis=0)
print(f'linear, svr; {mean_squared_error(y_test, Y_pred)/ 100000000}')

Y_pred = np.mean([svr_Y_pred, tree_Y_pred], axis=0)
print(f'svr, tree; {mean_squared_error(y_test, Y_pred)/ 100000000}')

Y_pred = np.mean([linear_Y_pred, tree_Y_pred], axis=0)
print(f'linear, tree; {mean_squared_error(y_test, Y_pred)/ 100000000}')

LinearRegression; 53.78966293259382
DecisionTreeRegressor; 47.86023914334054
SVR; 61.13540481769159

linear, svr, tree; 50.50274469991106
linear, svr; 55.73078171791606
svr, tree; 50.51105236202921
linear, tree; 48.08566821826108


In [43]:
#GrLivAreaカラムの外れ値除外
limit_low=df['GrLivArea'].quantile(.5)
limit_high=df['GrLivArea'].quantile(.95)

df_selected = df.query('not @limit_low < GrLivArea < @limit_high')

X = df_selected[['GrLivArea', 'YearBuilt']].values
y = df_selected['SalePrice'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [44]:
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linear = LinearRegression().fit(X_train_scaled, y_train)
linear_Y_pred = linear.predict(X_test_scaled)
print(f'LinearRegression; {mean_squared_error(y_test, linear_Y_pred) / 100000000}')

tree = DecisionTreeRegressor(max_depth=5, min_samples_leaf=10).fit(X_train_scaled, y_train)
tree_Y_pred = tree.predict(X_test_scaled)
print(f'DecisionTreeRegressor; {mean_squared_error(y_test, tree_Y_pred) / 100000000}')

svr = SVR(kernel='poly', C=400, epsilon=25, coef0=15).fit(X_train_scaled, y_train)
svr_Y_pred = svr.predict(X_test_scaled)
print(f'SVR; {mean_squared_error(y_test, svr_Y_pred) / 100000000}') #good

print()

Y_pred = np.mean([linear_Y_pred, svr_Y_pred, tree_Y_pred], axis=0)
print(f'linear, svr, tree; {mean_squared_error(y_test, Y_pred) / 100000000}')

Y_pred = np.mean([linear_Y_pred, svr_Y_pred], axis=0)
print(f'linear, svr; {mean_squared_error(y_test, Y_pred) / 100000000}')

Y_pred = np.mean([svr_Y_pred, tree_Y_pred], axis=0)
print(f'svr, tree; {mean_squared_error(y_test, Y_pred) / 100000000}') #best

Y_pred = np.mean([linear_Y_pred, tree_Y_pred], axis=0)
print(f'linear, tree; {mean_squared_error(y_test, Y_pred) / 100000000}') 

LinearRegression; 27.064227625751375
DecisionTreeRegressor; 29.638022391294385
SVR; 27.67105633409051

linear, svr, tree; 26.64429462308935
linear, svr; 26.54873539334852
svr, tree; 27.692373625078
linear, tree; 26.801880471308593


# 【問題2】バギングのスクラッチ実装
バギング をスクラッチ実装し、単一モデルより精度があがる例を 最低1つ 示してください。


バギングとは
バギングは入力データの選び方を多様化する方法です。訓練データから重複を許した上でランダムに抜き出すことで、N種類のサブセット（ ブートストラップサンプル ）を作り出します。それらによってモデルをN個学習し、推定結果の平均をとります。ブレンディングと異なり、それぞれの重み付けを変えることはありません。


推定結果の平均をとる部分はブースティングと同様の実装になります。

In [45]:
import random

temp = df[['GrLivArea', 'YearBuilt', 'SalePrice']].values.reshape(-1,3)

def make_samples(temp=temp):
  temp_array = np.array([])
  samples = 10000

  for i in range(samples):
    v = random.randint(0, temp.shape[0] -1)
    temp_array = np.append(temp_array, temp[v,:])

  temp_array = temp_array.reshape(-1,3)

  X = temp_array[:,:2]
  y = temp_array[:,2]

  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

  return X_train, X_test, y_train, y_test

In [46]:
#10000サンプル作成
X_train, X_test, y_train, y_test = make_samples()

linear = LinearRegression().fit(X_train, y_train)
linear_Y_pred = linear.predict(X_test)
print(f'LinearRegression; {mean_squared_error(y_test, linear_Y_pred) / 100000000}')

tree = DecisionTreeRegressor().fit(X_train, y_train)
tree_Y_pred = tree.predict(X_test)
print(f'DecisionTreeRegressor; {mean_squared_error(y_test, tree_Y_pred) / 100000000}')

svr = SVR().fit(X_train, y_train)
svr_Y_pred = svr.predict(X_test)
print(f'SVR; {mean_squared_error(y_test, svr_Y_pred) / 100000000}') 

#決定木単独の結果が最良

LinearRegression; 21.468095301308658
DecisionTreeRegressor; 0.27247049287199027
SVR; 61.93585873471752


In [47]:
#平均を算出

Y_pred = np.mean([linear_Y_pred, svr_Y_pred, tree_Y_pred], axis=0)
print(f'linear, svr, tree; {mean_squared_error(y_test, Y_pred) / 100000000}')

Y_pred = np.mean([linear_Y_pred, svr_Y_pred], axis=0)
print(f'linear, svr; {mean_squared_error(y_test, Y_pred) / 100000000}')

Y_pred = np.mean([svr_Y_pred, tree_Y_pred], axis=0)
print(f'svr, tree; {mean_squared_error(y_test, Y_pred) / 100000000}')

Y_pred = np.mean([linear_Y_pred, tree_Y_pred], axis=0)
print(f'linear, tree; {mean_squared_error(y_test, Y_pred) / 100000000}') #best

linear, svr, tree; 13.792764589047465
linear, svr; 30.73057966815517
svr, tree; 15.649758459634432
linear, tree; 5.572488329791733


# 【問題3】スタッキングのスクラッチ実装
スタッキング をスクラッチ実装し、単一モデルより精度があがる例を 最低1つ 示してください。


スタッキングとは
スタッキングの手順は以下の通りです。最低限ステージ0とステージ1があればスタッキングは成立するため、それを実装してください。まずは 
K
0
=
3
,
M
0
=
2
 程度にします。

In [4]:
df = pd.read_csv('drive/My Drive/data/house-prices-advanced-regression-techniques_train.csv')

X = df[['GrLivArea', 'YearBuilt']].values
y = df['SalePrice'].values

X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.2, random_state=0) #訓練データとテストデータ

In [20]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold

model_1 = LinearRegression()
model_5 = SVR(kernel='poly', C=400, epsilon=25, coef0=15)
model_3 = RandomForestRegressor(max_depth=5, min_samples_leaf=10) 
model_6 = DecisionTreeRegressor(max_depth=5, min_samples_leaf=10) 
model_4 = Ridge() 
model_2 = Lasso() 


Fold = 8

preds_array = np.zeros((1022,1)) #ブレンドデータ
test_preds_array = np.zeros((292,1)) #テストデータ（推定結果）

kf = KFold(n_splits=Fold, random_state=0, shuffle=True)

for train_id,test_id in kf.split(X_train_valid):
  #訓練データ、テストデータ作成
  X_train ,y_train = X_train_valid[train_id], y_train_valid[train_id]
  X_test_kford ,y_test_kford = X_train_valid[train_id], y_train_valid[train_id]

  #モデル1、検証データにて推定、データ格納
  reg = model_1.fit(X_train, y_train)
  Y_pred_model_1 = reg.predict(X_test_kford).reshape(-1,1)
  preds_array = np.append(preds_array, Y_pred_model_1, axis=1)
  
  #テストデータにて推定、データ格納  
  Y_pred_model_1 = reg.predict(X_test).reshape(-1,1)
  test_preds_array = np.append(test_preds_array, Y_pred_model_1, axis=1)

  #モデル2、検証データにて推定、データ格納
  reg = model_2.fit(X_train, y_train)
  Y_pred_model_2 = reg.predict(X_test_kford).reshape(-1,1)
  preds_array = np.append(preds_array, Y_pred_model_2, axis=1)

  #テストデータにて推定、データ格納  
  Y_pred_model_2 = reg.predict(X_test).reshape(-1,1)
  test_preds_array = np.append(test_preds_array, Y_pred_model_2, axis=1)

  #モデル3、検証データにて推定、データ格納
  reg = model_3.fit(X_train, y_train)
  Y_pred_model_3 = reg.predict(X_test_kford).reshape(-1,1)
  preds_array = np.append(preds_array, Y_pred_model_3, axis=1)

  #テストデータにて推定、データ格納  
  Y_pred_model_3 = reg.predict(X_test).reshape(-1,1)
  test_preds_array = np.append(test_preds_array, Y_pred_model_3, axis=1)

  #モデル4、検証データにて推定、データ格納
  reg = model_4.fit(X_train, y_train)
  Y_pred_model_4 = reg.predict(X_test_kford).reshape(-1,1)
  preds_array = np.append(preds_array, Y_pred_model_4, axis=1)

  #テストデータにて推定、データ格納  
  Y_pred_model_4 = reg.predict(X_test).reshape(-1,1)
  test_preds_array = np.append(test_preds_array, Y_pred_model_4, axis=1)

  #モデル5、検証データにて推定、データ格納
  reg = model_5.fit(X_train, y_train)
  Y_pred_model_5 = reg.predict(X_test_kford).reshape(-1,1)
  preds_array = np.append(preds_array, Y_pred_model_5, axis=1)

  #テストデータにて推定、データ格納  
  Y_pred_model_5 = reg.predict(X_test).reshape(-1,1)
  test_preds_array = np.append(test_preds_array, Y_pred_model_5, axis=1)

meta_model = model_6
meta_model.fit(preds_array[:,1:],  y_test_kford)

# スタッキングの検証
meta_test_pred = meta_model.predict(test_preds_array[:,1:])
print(f'MSE; {mean_squared_error(y_test, meta_test_pred) / 100000000}') #best

MSE; 18.91176634058408
