# Lasso Regression (Polynomial features) on wine quality dataset

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model, preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

df = pd.read_csv('./dataset/winequality-red.csv')

y = df['quality']
X = df.drop(['quality'], axis=1)


print("X shape: {}".format(X.shape))
print("y shape: {}".format(y.shape))

# Standardize dataset
standardizer = preprocessing.StandardScaler()
X = standardizer.fit_transform(X)
X = pd.DataFrame(X)

# Create a Lasso model (assuming alpha=0.1)
model = linear_model.Lasso(alpha=0.1)

# Train the model using the training sets
model.fit(X, y)

# The coefficients
print('Coefficients: {}\n'.format(model.coef_))

# Display how many coefficients are not equal to 0
print('Number of model coef: {}'.format(np.sum(model.coef_ != 0)))

X shape: (1599, 11)
y shape: (1599,)
Coefficients: [ 0.         -0.15459205  0.          0.         -0.         -0.
 -0.         -0.         -0.          0.03926141  0.24947033]

Number of model coef: 3


In [2]:
print('before feature extraction\nX shape: {}'.format(X.shape))
print(X.head())

before feature extraction
X shape: (1599, 11)
         0         1         2         3         4         5         6   \
0 -0.528360  0.961877 -1.391472 -0.453218 -0.243707 -0.466193 -0.379133   
1 -0.298547  1.967442 -1.391472  0.043416  0.223875  0.872638  0.624363   
2 -0.298547  1.297065 -1.186070 -0.169427  0.096353 -0.083669  0.229047   
3  1.654856 -1.384443  1.484154 -0.453218 -0.264960  0.107592  0.411500   
4 -0.528360  0.961877 -1.391472 -0.453218 -0.243707 -0.466193 -0.379133   

         7         8         9         10  
0  0.558274  1.288643 -0.579207 -0.960246  
1  0.028261 -0.719933  0.128950 -0.584777  
2  0.134264 -0.331177 -0.048089 -0.584777  
3  0.664277 -0.979104 -0.461180 -0.584777  
4  0.558274  1.288643 -0.579207 -0.960246  


In [3]:
mask = model.coef_ != 0
print(mask)

# Use a mask to eliminate features corresponding to coefficients that are 0
X = X.iloc[:, mask]

print('after feature extraction\nX shape: {}'.format(X.shape))
print(X.head())

[False  True False False False False False False False  True  True]
after feature extraction
X shape: (1599, 3)
         1         9         10
0  0.961877 -0.579207 -0.960246
1  1.967442  0.128950 -0.584777
2  1.297065 -0.048089 -0.584777
3 -1.384443 -0.461180 -0.584777
4  0.961877 -0.579207 -0.960246


In [4]:
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures

# Perform PolynomialFeatures on the extracted features to increase the dimension (assuming degree=2)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

X_train,X_test,y_train,y_test=train_test_split(X_poly, y, test_size=0.3, random_state=1)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)


model = linear_model.LinearRegression()
model.fit(X_train, y_train)

X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)

# The coefficients
print('Coefficients: {}\n'.format(model.coef_))
# The mean squared error
print("Mean squared error: {}".format(mean_squared_error(y_test, y_pred)))
# Explained variance score: 1 is perfect prediction
print('R2 score: {}'.format(r2_score(y_test, y_pred)))

Coefficients: [ 0.         -0.1979168   0.22973068  0.33944012 -0.02937768 -0.02456795
 -0.0200283  -0.14557193  0.05641426 -0.04929108]

Mean squared error: 0.40597004977398327
R2 score: 0.3329898562520003
