In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import StandardScaler

In [7]:
# Remote folder of the mlcourse.ai datasets; the white-wine file is semicolon-separated.
DATA_PATH = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/master/data/"
data = pd.read_csv(f"{DATA_PATH}winequality-white.csv", delimiter=";")

In [6]:
# Preview rows 10..149 by position.
data.iloc[10:150]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
10,8.1,0.270,0.41,1.45,0.033,11.0,63.0,0.9908,2.99,0.56,12.0,5
11,8.6,0.230,0.40,4.20,0.035,17.0,109.0,0.9947,3.14,0.53,9.7,5
12,7.9,0.180,0.37,1.20,0.040,16.0,75.0,0.9920,3.18,0.63,10.8,5
13,6.6,0.160,0.40,1.50,0.044,48.0,143.0,0.9912,3.54,0.52,12.4,7
14,8.3,0.420,0.62,19.25,0.040,41.0,172.0,1.0002,2.98,0.67,9.7,5
...,...,...,...,...,...,...,...,...,...,...,...,...
145,6.3,0.255,0.37,1.10,0.040,37.0,114.0,0.9905,3.00,0.39,10.9,6
146,5.6,0.160,0.27,1.40,0.044,53.0,168.0,0.9918,3.28,0.37,10.1,6
147,6.4,0.595,0.14,5.20,0.058,15.0,97.0,0.9951,3.38,0.36,9.0,4
148,6.3,0.340,0.33,4.60,0.034,19.0,80.0,0.9917,3.38,0.58,12.0,7


In [8]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [16]:
# Predictors are every column except the target; the target is the "quality" score.
X = data.drop(columns="quality")
y = data["quality"]

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=17
)

# Learn the scaling statistics from the training part only and reuse them
# unchanged on the holdout part.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_holdout_scaled = scaler.transform(X_holdout)

In [18]:
# Ordinary least squares baseline trained on the scaled training features.
linreg = LinearRegression()
linreg.fit(X=X_train_scaled, y=y_train)

LinearRegression()

In [28]:
# Report the linear-regression error on both partitions.
# The second line scores the holdout set, so it is labelled "(test)" to match
# the labels used by the later Lasso and random-forest cells.
print(
    "Mean squared error (train): %.3f"
    % mean_squared_error(y_train, linreg.predict(X_train_scaled))
)
print(
    "Mean squared error (test): %.3f"
    % mean_squared_error(y_holdout, linreg.predict(X_holdout_scaled))
)

Mean squared error (train): 0.558
Mean squared error (train): 0.584


In [34]:
# Fitted attributes of LinearRegression are documented at
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
# coef_ holds one fitted weight per input feature.
linreg.coef_

array([ 9.78219223e-02, -1.92259947e-01, -1.83224449e-04,  5.38164096e-01,
        8.12724353e-03,  4.21804406e-02,  1.43040227e-02, -6.65720472e-01,
        1.50036006e-01,  6.20533605e-02,  1.29533447e-01])

In [35]:
# Rank of the training matrix as reported by the fitted estimator.
linreg.rank_

11

In [36]:
# Singular values of the training matrix as reported by the fitted estimator.
linreg.singular_

array([105.02943578,  74.37879206,  64.81841936,  59.04973256,
        57.72025778,  57.11203382,  49.02342135,  44.91146082,
        37.30295321,  31.59526557,   7.26606247])

In [37]:
# Fitted intercept (bias) term of the linear model.
linreg.intercept_

5.876896149358229

In [38]:
# Number of input features the estimator saw during fit.
linreg.n_features_in_

11

In [49]:
# Which feature does the linear regression treat as most influential on wine quality?
# Features are already standardised, so weights are ranked by absolute value.
linreg_coef = pd.DataFrame(
    linreg.coef_, index=data.columns.drop("quality"), columns=["coef"]
).assign(coef_abs=lambda frame: frame["coef"].abs())

linreg_coef.sort_values("coef_abs", ascending=False)

Unnamed: 0,coef,coef_abs
density,-0.665720,0.665720
residual sugar,0.538164,0.538164
volatile acidity,-0.192260,0.192260
pH,0.150036,0.150036
alcohol,0.129533,0.129533
...,...,...
sulphates,0.062053,0.062053
free sulfur dioxide,0.042180,0.042180
total sulfur dioxide,0.014304,0.014304
chlorides,0.008127,0.008127


In [57]:
# Lasso with a fixed regularisation strength of 0.01.
lasso1 = Lasso(random_state=17, alpha=0.01)
lasso1.fit(X=X_train_scaled, y=y_train)

Lasso(alpha=0.01, random_state=17)

In [58]:
# Tabulate the Lasso weights per feature and rank them by magnitude;
# weights shrunk to exactly zero end up at the bottom.
lasso1_coef = pd.DataFrame(
    lasso1.coef_, index=data.columns.drop("quality"), columns=["coef"]
).assign(coef_abs=lambda frame: frame["coef"].abs())
lasso1_coef.sort_values("coef_abs", ascending=False)

Unnamed: 0,coef,coef_abs
alcohol,0.322425,0.322425
residual sugar,0.256363,0.256363
density,-0.235492,0.235492
volatile acidity,-0.188479,0.188479
pH,0.067277,0.067277
...,...,...
sulphates,0.029722,0.029722
chlorides,-0.002747,0.002747
fixed acidity,-0.000000,0.000000
citric acid,-0.000000,0.000000


In [52]:
# 200 candidate alphas, log-spaced from 1e-6 to 1e2, scored with 5-fold CV.
alphas = np.logspace(start=-6, stop=2, num=200)
lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=17)
lasso_cv.fit(X=X_train_scaled, y=y_train)

LassoCV(alphas=array([1.00000000e-06, 1.09698580e-06, 1.20337784e-06, 1.32008840e-06,
       1.44811823e-06, 1.58856513e-06, 1.74263339e-06, 1.91164408e-06,
       2.09704640e-06, 2.30043012e-06, 2.52353917e-06, 2.76828663e-06,
       3.03677112e-06, 3.33129479e-06, 3.65438307e-06, 4.00880633e-06,
       4.39760361e-06, 4.82410870e-06, 5.29197874e-06, 5.80522552e-06,
       6.36824994e-06, 6.98587975e-0...
       1.18953407e+01, 1.30490198e+01, 1.43145894e+01, 1.57029012e+01,
       1.72258597e+01, 1.88965234e+01, 2.07292178e+01, 2.27396575e+01,
       2.49450814e+01, 2.73644000e+01, 3.00183581e+01, 3.29297126e+01,
       3.61234270e+01, 3.96268864e+01, 4.34701316e+01, 4.76861170e+01,
       5.23109931e+01, 5.73844165e+01, 6.29498899e+01, 6.90551352e+01,
       7.57525026e+01, 8.30994195e+01, 9.11588830e+01, 1.00000000e+02]),
        cv=5, random_state=17)

In [59]:
# Best alpha for Lasso: the regularisation strength selected by LassoCV's cross-validation.
lasso_cv.alpha_

0.0002833096101839324

In [56]:
# NOTE(review): alpha is hard-coded to 0.002 instead of being read from
# lasso_cv.alpha_, and this rebinds `lasso1` from an earlier cell -- confirm
# both are intended.
lasso1 = Lasso(alpha=0.002, random_state=17).fit(X_train_scaled, y_train)

# Per-feature weights of the refitted model, ranked by magnitude.
lasso1_coef = pd.DataFrame(index=data.columns.drop("quality"))
lasso1_coef["coef"] = lasso1.coef_
lasso1_coef["coef_abs"] = np.abs(lasso1.coef_)
lasso1_coef.sort_values(by="coef_abs", ascending=False)

Unnamed: 0,coef,coef_abs
density,-0.541786,0.541786
residual sugar,0.458631,0.458631
volatile acidity,-0.190986,0.190986
alcohol,0.183205,0.183205
pH,0.125366,0.125366
...,...,...
sulphates,0.054257,0.054257
free sulfur dioxide,0.045842,0.045842
total sulfur dioxide,0.004970,0.004970
citric acid,0.000000,0.000000


In [60]:
# Score the cross-validated Lasso on the training and holdout partitions.
lasso_train_mse = mean_squared_error(y_train, lasso_cv.predict(X_train_scaled))
lasso_test_mse = mean_squared_error(y_holdout, lasso_cv.predict(X_holdout_scaled))

print("Mean squared error (train): %.3f" % lasso_train_mse)
print("Mean squared error (test): %.3f" % lasso_test_mse)

Mean squared error (train): 0.558
Mean squared error (test): 0.583


In [63]:
# Random Forest with default hyper-parameters and a fixed seed.
forest = RandomForestRegressor(random_state=17)
forest.fit(X_train_scaled, y_train)

forest_train_mse = mean_squared_error(y_train, forest.predict(X_train_scaled))

# "neg_mean_squared_error" returns negated MSE per fold; abs() flips the sign back.
forest_cv_scores = cross_val_score(
    forest, X_train_scaled, y_train, scoring="neg_mean_squared_error"
)
forest_cv_mse = np.abs(forest_cv_scores).mean()

forest_test_mse = mean_squared_error(y_holdout, forest.predict(X_holdout_scaled))

print("Mean squared error (train): %.3f" % forest_train_mse)
print("Mean squared error (cv): %.3f" % forest_cv_mse)
print("Mean squared error (test): %.3f" % forest_test_mse)

Mean squared error (train): 0.053
Mean squared error (cv): 0.414
Mean squared error (test): 0.372


In [64]:
# Search grid: tree depth 10..24 crossed with 6..11 features per split (90 combinations).
forest_params = {
    "max_depth": list(range(10, 25)),
    "max_features": list(range(6, 12)),
}

# Exhaustive 5-fold search, scored by negated MSE (higher is better).
locally_best_forest = GridSearchCV(
    estimator=RandomForestRegressor(n_jobs=-1, random_state=17),
    param_grid=forest_params,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=True,
)
locally_best_forest.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(n_jobs=-1, random_state=17),
             n_jobs=-1,
             param_grid={'max_depth': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                       20, 21, 22, 23, 24],
                         'max_features': [6, 7, 8, 9, 10, 11]},
             scoring='neg_mean_squared_error', verbose=True)

In [65]:
# Winning hyper-parameter combination and its mean CV score (negated MSE, per the scoring above).
locally_best_forest.best_params_, locally_best_forest.best_score_

({'max_depth': 21, 'max_features': 6}, -0.39773288191505934)

In [66]:
# The estimator configured with the best parameters found by the grid search.
locally_best_forest.best_estimator_

RandomForestRegressor(max_depth=21, max_features=6, n_jobs=-1, random_state=17)

In [68]:
# Cross-validated error of the tuned forest on the training partition.
tuned_cv_scores = cross_val_score(
    locally_best_forest.best_estimator_,
    X_train_scaled,
    y_train,
    scoring="neg_mean_squared_error",
)
# Scores are negated MSE values; abs() restores the positive error.
print("Mean squared error (cv): %.3f" % np.abs(tuned_cv_scores).mean())

# Holdout error, predicted through the fitted grid-search object.
tuned_test_mse = mean_squared_error(
    y_holdout, locally_best_forest.predict(X_holdout_scaled)
)
print("Mean squared error (test): %.3f" % tuned_test_mse)

Mean squared error (cv): 0.398
Mean squared error (test): 0.366


In [69]:
# Feature importances of the tuned random forest, one row per predictor.
# The index is built by dropping the target by name, as the coefficient tables
# above do, instead of assuming "quality" is the last column of `data`.
# The column keeps the name "coef", although the values are importances
# rather than regression coefficients.
rf_importance = pd.DataFrame(
    locally_best_forest.best_estimator_.feature_importances_,
    columns=["coef"],
    index=data.columns.drop("quality"),
)
rf_importance.sort_values(by="coef", ascending=False)

Unnamed: 0,coef
alcohol,0.206056
volatile acidity,0.117578
free sulfur dioxide,0.111556
density,0.088549
pH,0.073659
...,...
chlorides,0.073366
residual sugar,0.072072
citric acid,0.062601
fixed acidity,0.061813


In [None]:
# Result: the dependence of wine quality on the available features is presumably non-linear, so Random Forest works better on this task.