In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import cross_val_score

import warnings
# Silence every warning for the rest of the session so cell output stays short.
# NOTE(review): this blanket filter would also hide any deprecation or
# convergence warnings raised by the cells below — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Load the diabetes dataset and separate the feature matrix from the target.
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4
)

# Baseline: plain linear regression (no regularisation), fitted on the
# training split only.
reg = linear_model.LinearRegression().fit(X_train, y_train)

# Predictions for the held-out samples.
y_pred = reg.predict(X_test)

# Report the learned coefficients, then the test-set mean squared error.
print(reg.coef_)
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % mse)

[  33.40877011 -292.24672884  481.07153405  369.06269614 -966.37849405
  589.81383056  232.61924401  288.3263166   802.72704593   37.81285219]
Mean squared error: 2939.42


## LASSO

In [3]:
# Reload the diabetes data so this cell can run on its own.
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# Same 80/20 split and seed as the baseline cell, so results are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4
)

# Lasso model with regularisation strength alpha=1.0, fitted on the
# training split.
lasso = linear_model.Lasso(alpha=1.0).fit(X_train, y_train)

# Predictions for the held-out samples.
y_pred = lasso.predict(X_test)

# Report the learned coefficients, then the test-set mean squared error.
print(lasso.coef_)
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % mse)

[  0.          -0.         321.203877    57.74744332   0.
   0.          -0.           0.         332.41817196   0.        ]
Mean squared error: 3505.84


## Ridge

In [4]:
# Reload the diabetes data so this cell can run on its own.
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# Same 80/20 split and seed as the baseline cell, so results are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4
)

# Ridge model with regularisation strength alpha=1.0, fitted on the
# training split.
ridge = linear_model.Ridge(alpha=1.0).fit(X_train, y_train)

# Predictions for the held-out samples.
y_pred = ridge.predict(X_test)

# Report the learned coefficients, then the test-set mean squared error.
print(ridge.coef_)
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % mse)

[  48.8125786   -85.49511577  270.22532535  201.91767903   17.41308665
  -19.04346706 -136.47737574  122.26503311  247.60074795   95.59855598]
Mean squared error: 3221.42


### 可以看見 LASSO 與 Ridge 的結果並沒有比原本的線性回歸來得好，這是因為目標函數被加上了正規化項，讓模型不能過於複雜，相當於限制模型擬合資料的能力。因此若沒有發現 Over-fitting 的情況，是可以不需要一開始就加上太強的正規化的。

## [作業重點]

使用 Sklearn 中的 Lasso, Ridge 模型，來訓練各種資料集，務必了解送進去模型訓練的資料型態為何，也請了解模型中各項參數的意義。

機器學習的模型非常多種，但要訓練的資料多半有固定的格式，確保你了解訓練資料的格式為何，這樣在應用新模型時，就能夠最快的上手開始訓練！

## 練習時間

試著使用 sklearn datasets 的其他資料集 (boston, ...)，來訓練自己的線性迴歸模型，並調整不同的 alpha (正規化參數) 來觀察模型訓練的情形。

## boston

In [5]:
# Load the Boston housing dataset.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed version still provides it before re-running.
boston = datasets.load_boston()

# Use every feature column (no single-feature slice for plotting here).
X = boston.data
y = boston.target

# Quick look at the data: first five targets, matrix shape, feature names.
print('target y : %s \n' % y[:5])
print('shape:\n', X.shape)
print('\nfeature name:\n', boston.feature_names)

# Hold out 30% of the samples for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=87
)

target y : [24.  21.6 34.7 33.4 36.2] 

shape:
 (506, 13)

feature name:
 ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [6]:
# Baseline model: plain linear regression, fitted on the training split.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

# 5-fold cross-validation on the training split; report the mean fold score
# (computed with the estimator's default scorer).
cv_scores = cross_val_score(reg, X_train, y_train, cv=5)
print(f'\n c.v. score : {cv_scores.mean()}\n')

# Predictions for the held-out samples.
y_pred = reg.predict(X_test)

# Learned coefficients, followed by the test-set mean squared error.
print(reg.coef_)
mse = mean_squared_error(y_test, y_pred)
print("MSE : %.2f" % mse)


 c.v. score : 0.6796744375539973

[-1.39098133e-01  3.97316638e-02  1.56281579e-02  2.23420723e+00
 -1.73650287e+01  3.81367677e+00  4.13297396e-06 -1.32562522e+00
  3.00934394e-01 -1.04274363e-02 -9.69146963e-01  8.59336770e-03
 -5.28261221e-01]
MSE : 26.57


In [7]:
# Lasso model with a very small regularisation strength (alpha=0.001),
# fitted on the training split.
lasso = linear_model.Lasso(alpha=0.001)
lasso.fit(X_train, y_train)

# 5-fold cross-validation on the training split; report the mean fold score
# (computed with the estimator's default scorer).
cv_scores = cross_val_score(lasso, X_train, y_train, cv=5)
print(f'\n c.v. score : {cv_scores.mean()}\n')

# Predictions for the held-out samples.
y_pred = lasso.predict(X_test)

# Learned coefficients, followed by the test-set mean squared error.
print(lasso.coef_)
mse = mean_squared_error(y_test, y_pred)
print("MSE : %.2f" % mse)


 c.v. score : 0.679859678061506

[-1.38867043e-01  3.97653707e-02  1.42925929e-02  2.21642968e+00
 -1.70477891e+01  3.81471723e+00 -2.46258642e-04 -1.32093461e+00
  3.00137218e-01 -1.04560694e-02 -9.65564144e-01  8.62094784e-03
 -5.28675787e-01]
MSE : 26.58


In [8]:
# Ridge model with a small regularisation strength (alpha=0.01),
# fitted on the training split.
ridge = linear_model.Ridge(alpha=0.01)
ridge.fit(X_train, y_train)

# 5-fold cross-validation on the training split; report the mean fold score
# (computed with the estimator's default scorer).
cv_scores = cross_val_score(ridge, X_train, y_train, cv=5)
print(f'\n c.v. score : {cv_scores.mean()}\n')

# Predictions for the held-out samples.
y_pred = ridge.predict(X_test)

# Learned coefficients, followed by the test-set mean squared error.
print(ridge.coef_)
mse = mean_squared_error(y_test, y_pred)
print("MSE : %.2f" % mse)


 c.v. score : 0.67972737145291

[-1.38990807e-01  3.97489489e-02  1.49915639e-02  2.23189097e+00
 -1.72149884e+01  3.81515912e+00 -1.37906718e-04 -1.32355812e+00
  3.00537430e-01 -1.04400411e-02 -9.67402512e-01  8.60619254e-03
 -5.28363616e-01]
MSE : 26.57


## breast_cancer

In [9]:
# Load the breast-cancer dataset.
breast_cancer = datasets.load_breast_cancer()

# Use every feature column (no single-feature slice for plotting here).
X = breast_cancer.data
y = breast_cancer.target

# Quick look at the data: first five targets, matrix shape, feature names and
# the class names behind the integer labels.
print('target y : %s \n' % y[:5])
print('shape:\n', X.shape)
print('\nfeature name:\n', breast_cancer.feature_names)
print('\ntarget names:\n', breast_cancer.target_names)

# Hold out 30% of the samples for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=87
)

target y : [0 0 0 0 0] 

shape:
 (569, 30)

feature name:
 ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']

target names:
 ['malignant' 'benign']


In [10]:
# Logistic-regression classifier with an L2 penalty and C=3,
# fitted on the training split.
logistic = linear_model.LogisticRegression(penalty='l2', C=3)
logistic.fit(X_train, y_train)

# 5-fold cross-validation on the training split; report the mean fold score
# (computed with the estimator's default scorer).
cv_scores = cross_val_score(logistic, X_train, y_train, cv=5)
print(f'\n c.v. score : {cv_scores.mean()}\n')

# Predicted class labels for the held-out samples.
y_pred = logistic.predict(X_test)

# Learned coefficients, followed by the test-set MSE. Both the labels and the
# predictions are the class values of a two-class target, so this MSE is the
# fraction of misclassified test samples.
print(logistic.coef_)
mse = mean_squared_error(y_test, y_pred)
print("MSE : %.2f" % mse)


 c.v. score : 0.9572435897435898

[[ 3.16827771e+00  1.12453942e-01 -4.34139616e-01  3.00125078e-02
  -1.98044056e-01 -6.48121787e-01 -9.43354336e-01 -4.69343500e-01
  -3.85536326e-01 -4.56262392e-02  1.49809629e-03  2.50543793e-01
   5.63424081e-01 -5.67725023e-02 -2.36742727e-02 -5.09932642e-02
  -1.00641715e-01 -4.70169637e-02 -1.27031587e-01 -5.03053313e-04
   1.95723365e+00 -2.51917256e-01 -7.97381654e-02 -5.04585602e-02
  -3.24692870e-01 -1.33672162e+00 -1.76207024e+00 -7.69479707e-01
  -1.03706326e+00 -1.23606900e-01]]
MSE : 0.05


In [11]:
# Lasso regression on the class label, with a tiny regularisation strength
# (alpha=0.00001), fitted on the training split.
lasso = linear_model.Lasso(alpha=0.00001)
lasso.fit(X_train, y_train)

# 5-fold cross-validation on the training split; report the mean fold score
# (computed with the estimator's default scorer).
cv_scores = cross_val_score(lasso, X_train, y_train, cv=5)
print(f'\n c.v. score : {cv_scores.mean()}\n')

# Predictions for the held-out samples (regression outputs, not class labels).
y_pred = lasso.predict(X_test)

# Learned coefficients, followed by the test-set mean squared error.
print(lasso.coef_)
mse = mean_squared_error(y_test, y_pred)
print("MSE : %.2f" % mse)


 c.v. score : 0.7408237815245793

[ 1.20553660e-01 -1.14784762e-03 -4.63939812e-03 -3.99130681e-04
  1.52608994e+00  3.25641330e+00 -2.13807946e+00 -0.00000000e+00
 -1.55253507e+00 -0.00000000e+00 -4.32083183e-01  0.00000000e+00
  6.30641734e-02 -4.63093256e-04 -8.68528800e+00  8.52394865e-01
  3.93782656e+00 -3.39746324e+00 -8.78622485e+00 -0.00000000e+00
 -1.92321112e-01 -8.80263185e-03 -2.73561625e-03  1.14283378e-03
 -1.94817644e+00 -1.20428561e-01 -3.60071000e-01 -1.48160700e+00
  5.98256098e-01 -2.94987573e+00]
MSE : 0.07


In [12]:
# Ridge regression on the class label, with a tiny regularisation strength
# (alpha=0.00001), fitted on the training split.
ridge = linear_model.Ridge(alpha=0.00001)
ridge.fit(X_train, y_train)

# 5-fold cross-validation on the training split; report the mean fold score
# (computed with the estimator's default scorer).
cv_scores = cross_val_score(ridge, X_train, y_train, cv=5)
print(f'\n c.v. score : {cv_scores.mean()}\n')

# Predictions for the held-out samples (regression outputs, not class labels).
y_pred = ridge.predict(X_test)

# Learned coefficients, followed by the test-set mean squared error.
print(ridge.coef_)
mse = mean_squared_error(y_test, y_pred)
print("MSE : %.2f" % mse)


 c.v. score : 0.7318954196904071

[ 3.26938874e-01  9.53124753e-04 -1.85952136e-02 -1.17469035e-03
  1.34958589e+00  3.62487948e+00 -2.10534213e+00 -1.10713037e-01
 -1.89910584e+00  1.94433255e-01 -2.68422314e-01  2.02930004e-03
  9.56930639e-02 -2.11531847e-03 -1.27801277e+01  2.22602918e+00
  4.33521512e+00 -8.49544379e+00 -1.09432588e+01 -3.98610795e+00
 -2.71338162e-01 -1.01242856e-02 -5.73205444e-03  1.70942054e-03
 -1.33204538e+00 -2.64061315e-01 -3.98542520e-01 -9.68460515e-01
  9.49367897e-01 -2.64288992e+00]
MSE : 0.08
