In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

# Adjusting for Data Frame Output
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)

In [2]:
# Read Dataset
# Dataset Source:
#   https://www.kaggle.com/datasets/gauravduttakiit/white-wine-quality?select=Training_set_whitewine.csv
df_origin = pd.read_csv('dataset/white_wine.csv')

In [3]:
# Check data information
df_origin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3428 entries, 0 to 3427
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         3428 non-null   float64
 1   volatile acidity      3428 non-null   float64
 2   citric acid           3428 non-null   float64
 3   residual sugar        3428 non-null   float64
 4   chlorides             3428 non-null   float64
 5   free sulfur dioxide   3428 non-null   float64
 6   total sulfur dioxide  3428 non-null   float64
 7   density               3428 non-null   float64
 8   pH                    3428 non-null   float64
 9   sulphates             3428 non-null   float64
 10  alcohol               3428 non-null   float64
 11  quality               3428 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 321.5 KB


In [4]:
# Check Dataset
display(df_origin)

# Copying a data set from the original dataset
df = df_origin.copy()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,6.2,0.47,0.19,8.3,0.029,24.0,142.0,0.99200,3.22,0.45,12.30,6
1,6.8,0.38,0.29,9.9,0.037,40.0,146.0,0.99326,3.11,0.37,11.50,6
2,7.9,0.17,0.32,1.6,0.053,47.0,150.0,0.99480,3.29,0.76,9.60,6
3,7.6,0.30,0.27,10.6,0.039,31.0,119.0,0.99815,3.27,0.30,9.30,6
4,5.8,0.24,0.39,1.5,0.054,37.0,158.0,0.99320,3.21,0.52,9.30,6
...,...,...,...,...,...,...,...,...,...,...,...,...
3423,8.0,0.25,0.13,17.2,0.036,49.0,219.0,0.99960,2.96,0.46,9.70,5
3424,6.1,0.16,0.34,6.1,0.034,31.0,114.0,0.99365,3.32,0.58,10.15,7
3425,6.0,0.24,0.33,2.5,0.026,31.0,85.0,0.99014,3.13,0.50,11.30,7
3426,6.5,0.22,0.45,8.0,0.053,52.0,196.0,0.99590,3.23,0.48,9.10,6


In [5]:
# Data Scaling
scalier = StandardScaler()
df = pd.DataFrame(scalier.fit_transform(df), columns=df.columns)

In [6]:
# Check Dataset
display(df)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,-0.796240,1.936948,-1.211251,0.376915,-0.795101,-0.689063,0.070611,-0.702363,0.206011,-0.359132,1.456615,0.137625
1,-0.072808,1.032794,-0.368582,0.697658,-0.412062,0.272340,0.164509,-0.270603,-0.521899,-1.061467,0.807887,0.137625
2,1.253485,-1.076900,-0.115781,-0.966198,0.354014,0.692954,0.258406,0.257105,0.669225,2.362415,-0.732841,0.137625
3,0.891769,0.229101,-0.537116,0.837984,-0.316303,-0.268449,-0.469299,1.405040,0.536878,-1.676010,-0.976114,0.137625
4,-1.278528,-0.373668,0.474088,-0.986244,0.401894,0.092077,0.446201,-0.291163,0.139837,0.255411,-0.976114,0.137625
...,...,...,...,...,...,...,...,...,...,...,...,...
3423,1.374057,-0.273207,-1.716853,2.161050,-0.459942,0.813129,1.878137,1.901907,-1.514502,-0.271340,-0.651750,-0.991030
3424,-0.916812,-1.177361,0.052753,-0.064107,-0.555702,-0.268449,-0.586671,-0.136962,0.867746,0.782162,-0.286841,1.266280
3425,-1.037384,-0.373668,-0.031514,-0.785780,-0.938740,-0.268449,-1.267427,-1.339724,-0.389551,0.079827,0.645705,1.266280
3426,-0.434524,-0.574592,0.979689,0.316776,0.354014,0.993392,1.338227,0.634039,0.272184,-0.095757,-1.138296,0.137625


In [7]:
# Separating datasets by x & y
x = df.drop(['quality'], axis=1)
y = df['quality'].values.reshape(-1, 1)
result = pd.DataFrame(None, columns=['Model', 'MSE'])

In [8]:
# OLS (Linear Regression)
linear = LinearRegression()
MSE5 = cross_val_score(linear, x, y, scoring='neg_mean_squared_error', cv=5)
mean_MSE = np.mean(MSE5)
print('Linear Regression mean of MSE', mean_MSE)

Linear Regression mean of MSE -0.7127251554485273


In [9]:
# Ridge Regression
ridge = Ridge()
ridge_parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
ridge_regression = GridSearchCV(ridge, ridge_parameters, scoring='neg_mean_squared_error', cv=5)
ridge_regression.fit(x, y)

print('Ridge Regression Best parameter:', ridge_regression.best_params_)
print('Ridge Regression Best MSE:', ridge_regression.best_score_)

Ridge Regression Best parameter: {'alpha': 1e-15}
Ridge Regression Best MSE: -0.7127251554485274


In [10]:
# LASSO Regression
lasso = Lasso()
lasso_parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
lasso_regression = GridSearchCV(lasso, lasso_parameters, scoring='neg_mean_squared_error', cv=5)
lasso_regression.fit(x, y)

print('LASSO Regression Best parameter:', lasso_regression.best_params_)
print('LASSO Regression Best MSE:', lasso_regression.best_score_)

LASSO Regression Best parameter: {'alpha': 0.001}
LASSO Regression Best MSE: -0.7126218242267213


In [11]:
print('Compare all models')
print('Linear Regression mean MSE:\t', mean_MSE)
print('Ridge Regression Best MSE:\t', ridge_regression.best_score_)
print('LASSO Regression Best MSE:\t', lasso_regression.best_score_)

Compare all models
Linear Regression mean MSE:	 -0.7127251554485273
Ridge Regression Best MSE:	 -0.7127251554485274
LASSO Regression Best MSE:	 -0.7126218242267213
