In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

# Adjusting for Data Frame Output
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [2]:
# Read Dataset
# Dataset Source:
#   https://www.kaggle.com/datasets/gauravduttakiit/white-wine-quality?select=Training_set_whitewine.csv
df_origin = pd.read_csv('dataset/white_wine.csv')

In [3]:
# Check data information
df_origin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3428 entries, 0 to 3427
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         3428 non-null   float64
 1   volatile acidity      3428 non-null   float64
 2   citric acid           3428 non-null   float64
 3   residual sugar        3428 non-null   float64
 4   chlorides             3428 non-null   float64
 5   free sulfur dioxide   3428 non-null   float64
 6   total sulfur dioxide  3428 non-null   float64
 7   density               3428 non-null   float64
 8   pH                    3428 non-null   float64
 9   sulphates             3428 non-null   float64
 10  alcohol               3428 non-null   float64
 11  quality               3428 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 321.5 KB


In [4]:
# Check Dataset
display(df_origin)

# Copying a data set from the original dataset
df = df_origin.copy()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,6.2,0.47,0.19,8.3,0.029,24.0,142.0,0.992,3.22,0.45,12.3,6
1,6.8,0.38,0.29,9.9,0.037,40.0,146.0,0.99326,3.11,0.37,11.5,6
2,7.9,0.17,0.32,1.6,0.053,47.0,150.0,0.9948,3.29,0.76,9.6,6
3,7.6,0.3,0.27,10.6,0.039,31.0,119.0,0.99815,3.27,0.3,9.3,6
4,5.8,0.24,0.39,1.5,0.054,37.0,158.0,0.9932,3.21,0.52,9.3,6
5,7.0,0.21,0.42,5.3,0.037,36.0,123.0,0.99321,3.14,0.52,10.9,6
6,6.8,0.14,0.18,1.4,0.047,30.0,90.0,0.99164,3.27,0.54,11.2,6
7,7.0,0.29,0.49,3.8,0.047,37.0,136.0,0.9938,2.95,0.4,9.4,6
8,7.2,0.31,0.46,5.0,0.04,3.0,29.0,0.9906,3.04,0.53,12.5,4
9,5.8,0.28,0.66,9.1,0.039,26.0,159.0,0.9965,3.66,0.55,10.8,5


In [5]:
# Data Scaling
scalier = StandardScaler()
df = pd.DataFrame(scalier.fit_transform(df), columns=df.columns)

In [6]:
# Check Dataset
display(df)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,-0.79624,1.936948,-1.211251,0.376915,-0.795101,-0.689063,0.070611,-0.702363,0.206011,-0.359132,1.456615,0.137625
1,-0.072808,1.032794,-0.368582,0.697658,-0.412062,0.27234,0.164509,-0.270603,-0.521899,-1.061467,0.807887,0.137625
2,1.253485,-1.0769,-0.115781,-0.966198,0.354014,0.692954,0.258406,0.257105,0.669225,2.362415,-0.732841,0.137625
3,0.891769,0.229101,-0.537116,0.837984,-0.316303,-0.268449,-0.469299,1.40504,0.536878,-1.67601,-0.976114,0.137625
4,-1.278528,-0.373668,0.474088,-0.986244,0.401894,0.092077,0.446201,-0.291163,0.139837,0.255411,-0.976114,0.137625
5,0.168337,-0.675053,0.726888,-0.224479,-0.412062,0.03199,-0.375401,-0.287736,-0.323378,0.255411,0.321341,0.137625
6,-0.072808,-1.378284,-1.295518,-1.006291,0.066736,-0.328537,-1.150055,-0.825724,0.536878,0.430995,0.564614,0.137625
7,0.168337,0.128639,1.316757,-0.525176,0.066736,0.092077,-0.070235,-0.085562,-1.580675,-0.798091,-0.895023,0.137625
8,0.409481,0.329563,1.063956,-0.284618,-0.268423,-1.950904,-2.581991,-1.182097,-0.985113,0.343203,1.618797,-2.119685
9,-1.278528,0.028178,2.749295,0.537287,-0.316303,-0.568887,0.469675,0.839639,3.117647,0.518786,0.24025,-0.99103


In [7]:
# Separating datasets by x & y
x = df.drop(['quality'], axis=1)
y = df['quality'].values.reshape(-1, 1)
result = pd.DataFrame(None, columns=['Model', 'MSE'])

In [8]:
# OLS (Linear Regression)
lin_reg = LinearRegression()
MSE5 = cross_val_score(lin_reg, x, y, scoring='neg_mean_squared_error', cv=5)
mean_MSE = np.mean(MSE5)
print('Linear Regression mean of MSE', mean_MSE)

Linear Regression mean of MSE -0.7127251554485273


In [9]:
# Ridge Regression
ridge = Ridge()
ridge_parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
ridge_regression = GridSearchCV(ridge, ridge_parameters, scoring='neg_mean_squared_error', cv=5)
ridge_regression.fit(x, y)

print('Ridge Regression Best parameter:', ridge_regression.best_params_)
print('Ridge Regression Best MSE:', ridge_regression.best_score_)

Ridge Regression Best parameter: {'alpha': 1e-15}
Ridge Regression Best MSE: -0.7127251554485274


In [10]:
# LASSO Regression
lasso = Lasso()
lasso_parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
lasso_regression = GridSearchCV(lasso, lasso_parameters, scoring='neg_mean_squared_error', cv=5)
lasso_regression.fit(x, y)

print('LASSO Regression Best parameter:', lasso_regression.best_params_)
print('LASSO Regression Best MSE:', lasso_regression.best_score_)

LASSO Regression Best parameter: {'alpha': 0.001}
LASSO Regression Best MSE: -0.7126218242267213


In [11]:
print('Compare all models')
print('Linear Regression mean MSE:\t', mean_MSE)
print('Ridge Regression Best MSE:\t', ridge_regression.best_score_)
print('LASSO Regression Best MSE:\t', lasso_regression.best_score_)

Compare all models
Linear Regression mean MSE:	 -0.7127251554485273
Ridge Regression Best MSE:	 -0.7127251554485274
LASSO Regression Best MSE:	 -0.7126218242267213
