# Regression-2: ridge vs ols

最小２乗回帰とリッジ回帰モデルを構築しモデル性能とその中身を比較してみましょう。<br>データはボストン・ハウジングデータを使います。

In [1]:
# import the data for regression
import pandas as pd
from IPython.core.display import display
from sklearn.datasets import load_boston

# Load the Boston housing data and wrap features / target in DataFrames.
# `dataset`, `X` and `y` stay as notebook-level names for the later cells.
dataset = load_boston()
X = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
y = pd.DataFrame(data=dataset.target, columns=['y'])

# Report the (rows, columns) shape of both frames between two rules.
rule = '--------------------------------------------'
print(rule)
print('X shape: (%i,%i)' % X.shape)
print('y shape: (%i,%i)' % y.shape)
print(rule)

# Preview the first rows of features and target side by side.
preview = X.join(y).head()
display(preview)

--------------------------------------------
X shape: (506,13)
y shape: (506,1)
--------------------------------------------


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


OLSとRidgeのどちらが良い予測モデルかを、ホールドアウト法（データを訓練用とテスト用に分割する検証）により確認してみましょう。また、リッジ回帰のalphaを変化させた時（デフォルトの1.0から10.0などへ）のモデルパフォーマンスや、偏回帰係数の絶対値の総和の変化を見てみましょう。このデータでは、OLSとリッジ回帰に大きな性能差は見られないと思います。ただリッジ回帰のalphaを大きくすると、係数の絶対値の総和が減少していく様子が確認できるはずです。

In [7]:
# import libraries
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Split the data into training and test sets for hold-out validation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

# Flatten the single-column target frames into 1-D arrays.
# DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
# .values is available on both old and current pandas versions.
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Make pipelines: standardize the features, then fit the estimator.
pipe_ols = Pipeline([('scl', StandardScaler()), ('est', LinearRegression())])
pipe_ridge = Pipeline([('scl', StandardScaler()), ('est', Ridge(alpha=1.0))])

# Build models on the training split only.
pipe_ols.fit(X_train, y_train)
pipe_ridge.fit(X_train, y_train)

# Show the performance metric (R^2 on the held-out test split).
print('-----------------------------------------------------')
print('Test Score of OLS:%.6f'% r2_score(y_test, pipe_ols.predict(X_test)))
print('Test Score of Ridge:%.6f'% r2_score(y_test, pipe_ridge.predict(X_test)))

# Compare the sum of absolute regression coefficients.
# This illustrates the effect of the ridge regularization term; it is not
# a model performance evaluation.
ols_coef = pipe_ols.named_steps['est'].coef_
ridge_coef = pipe_ridge.named_steps['est'].coef_
print('-----------------------------------------------------')
print('Absolute Sum of coefficient of OLS:%.6f'%(np.absolute(ols_coef).sum()))
print('Absolute Sum of coefficient of Ridge:%.6f'%(np.absolute(ridge_coef).sum()))

-----------------------------------------------------
Test Score of OLS:0.763481
Test Score of Ridge:0.763468
-----------------------------------------------------
Absolute Sum of coefficient of OLS:22.070732
Absolute Sum of coefficient of Ridge:21.717317


  from ipykernel import kernelapp as app
  app.launch_new_instance()
