## 紅酒品質預測 Part2

### Use Lasso to do feature selection first and then do Polynomial Regression

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,PolynomialFeatures

# 使用pandas讀取 winequality-red.csv
data = pd.read_csv("dataset/winequality-red.csv")

# 定義特徵向量X以及label y
y = data["quality"]
X = data.drop("quality", axis=1)
print("X shape: {}".format(X.shape))
print("y shape: {}".format(y.shape))

# 標準化
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X = pd.DataFrame(X)

# 1.建立Lasso模型(假設alpha=0.1)
model = Lasso(alpha=0.1)

# Train the model using the training sets
model.fit(X, y)

# The coefficients
print("\nCoefficients: \n{}".format(model.coef_))

# 2.顯示出有多少個係數不為 0
print("Number of model coefficients: {}".format(np.sum(model.coef_ != 0)))

X shape: (1599, 11)
y shape: (1599,)

Coefficients: 
[ 0.         -0.15459205  0.          0.         -0.         -0.
 -0.         -0.         -0.          0.03926141  0.24947033]
Number of model coefficients: 3


In [2]:
print("Before feature extraction\nX shape: {}".format(X.shape))
X.head()

Before feature extraction
X shape: (1599, 11)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246
1,-0.298547,1.967442,-1.391472,0.043416,0.223875,0.872638,0.624363,0.028261,-0.719933,0.12895,-0.584777
2,-0.298547,1.297065,-1.18607,-0.169427,0.096353,-0.083669,0.229047,0.134264,-0.331177,-0.048089,-0.584777
3,1.654856,-1.384443,1.484154,-0.453218,-0.26496,0.107592,0.4115,0.664277,-0.979104,-0.46118,-0.584777
4,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246


In [3]:
mask = model.coef_ != 0
print(mask)

# 3.使用mask將係數為0對應的特徵剔除
X = X.loc[:, mask]
print("\nAfter feature extraction\nX shape: {}".format(X.shape))
X.head()

[False  True False False False False False False False  True  True]

After feature extraction
X shape: (1599, 3)


Unnamed: 0,1,9,10
0,0.961877,-0.579207,-0.960246
1,1.967442,0.12895,-0.584777
2,1.297065,-0.048089,-0.584777
3,-1.384443,-0.46118,-0.584777
4,0.961877,-0.579207,-0.960246


In [4]:
# 4.將抽取過後的特徵做PolynomialFeatures提升維度(假設degree=2)
poly = PolynomialFeatures(degree=2).fit(X)
X_poly = poly.transform(X)

# 分成訓練集與測試集
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.3, random_state=1) #random_state種子值

# 標準化訓練集 & 建立模型
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

model = LinearRegression()
model.fit(X_train, y_train)

# 標準化測試集 & 預測
X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)

# The coefficients
print("Coefficients: \n{}\n".format(model.coef_))
# The mean squared error
print("Mean squared error: {:.3f}".format(mean_squared_error(y_test, y_pred)))
# Explained variance score: 1 is perfect prediction
print("R2 score: {:.3f}".format(r2_score(y_test, y_pred)))

Coefficients: 
[ 0.         -0.1979168   0.22973068  0.33944012 -0.02937768 -0.02456795
 -0.0200283  -0.14557193  0.05641426 -0.04929108]

Mean squared error: 0.406
R2 score: 0.333
