## 練習時間
試著使用 sklearn datasets 的其他資料集 (boston, ...)，來訓練自己的線性迴歸模型，並加上適當的正則話來觀察訓練情形。

In [1]:
from sklearn import datasets, linear_model
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
import pandas as pd

In [2]:
boston = datasets.load_boston()

In [3]:
bos = pd.DataFrame(boston.data)
bos.columns = boston.feature_names
bos['PRICE'] = boston.target
bos.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [4]:
# 將所有變數都放進去train
X = bos.drop('PRICE', axis = 1)

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(X, bos.PRICE, test_size=0.1, random_state=4)

# 建立一個線性回歸模型
regr = linear_model.LinearRegression()

# 將訓練資料丟進去模型訓練
regr.fit(x_train, y_train)

# 將測試資料丟進模型得到預測結果
y_pred = regr.predict(x_test)

In [5]:
# 看回歸模型的參數值
print('Coefficients: ', regr.coef_)

# 預測值與實際值的差距，使用 MSE
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))

Coefficients:  [ -1.24793110e-01   4.83961673e-02   1.88111508e-02   3.08800922e+00
  -1.73655165e+01   3.60982405e+00   2.27233321e-03  -1.49381500e+00
   3.19455416e-01  -1.27236845e-02  -9.28369630e-01   9.60925451e-03
  -5.34508193e-01]
Mean squared error: 17.03


### LASSO

In [6]:
# 將所有變數都放進去train
X = bos.drop('PRICE', axis = 1)

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(X, bos.PRICE, test_size=0.1, random_state=4)

# 建立一個線性回歸模型
lasso = linear_model.Lasso(alpha=1.0)

# 將訓練資料丟進去模型訓練
lasso.fit(x_train, y_train)

# 將測試資料丟進模型得到預測結果
y_pred = lasso.predict(x_test)

In [7]:
# 印出各特徵對應的係數，可以看到許多係數都變成 0，Lasso Regression 的確可以做特徵選取
print('Coefficients: ', lasso.coef_)

# 預測值與實際值的差距，使用 MSE
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))

Coefficients:  [-0.07256167  0.049677   -0.          0.         -0.          0.80504721
  0.02330318 -0.68471274  0.26857502 -0.01526236 -0.71722423  0.00834102
 -0.77160917]
Mean squared error: 23.25


### Ridge

In [8]:
# 將所有變數都放進去train
X = bos.drop('PRICE', axis = 1)

# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(X, bos.PRICE, test_size=0.1, random_state=4)

# 建立一個線性回歸模型
ridge = linear_model.Ridge(alpha=1.0)

# 將訓練資料丟進去模型訓練
ridge.fit(x_train, y_train)

# 將測試資料丟進模型得到預測結果
y_pred = regr.predict(x_test)

In [9]:
# 印出 Ridge 的參數，可以很明顯看到比起 Linear Regression，參數的數值都明顯小了許多
print(ridge.coef_)

# 預測值與實際值的差距，使用 MSE
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))

[ -1.21252383e-01   4.95144936e-02  -1.12401718e-02   2.89351315e+00
  -1.00608832e+01   3.66023694e+00  -4.37201416e-03  -1.38975681e+00
   3.01632305e-01  -1.32209825e-02  -8.52842040e-01   9.96596180e-03
  -5.44637977e-01]
Mean squared error: 17.03
