In [211]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (GridSearchCV, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import StandardScaler

In [212]:
DATA_PATH = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/master/data/"

In [213]:
data = pd.read_csv(DATA_PATH + "winequality-white.csv", sep=";")

In [214]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [215]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [216]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [217]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=17)

In [218]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_holdout_scaled = scaler.transform(X_test)

In [243]:
#Train a simple linear regression model (Ordinary Least Squares).
regressor = LinearRegression()
regressor.fit(X_train_scaled, y_train)

LinearRegression()

In [280]:
#1. What are mean squared errors of model predictions on train and holdout sets?
y_pred_train = regressor.predict(X_train_scaled)
y_pred_test = regressor.predict(X_holdout_scaled)
mse_train = round(mean_squared_error(y_pred_train, y_train), 3)
mse_hold = round(mean_squared_error(y_pred_test, y_test), 3)

print('Mean squared error (train): {}'.format(mse_train))
print('Mean squared error (test): {}'.format(mse_hold))

Mean squared error (train): 0.558
Mean squared error (test): 0.584


In [255]:
#Sort features by their influence on the target feature (wine quality)
features = pd.DataFrame(
{'coeff': regressor.coef_, 'absolute_coeff': abs(regressor.coef_)},
index = data.columns.drop('quality'))
features.sort_values(by='absolute_coeff', ascending=False)

Unnamed: 0,coeff,absolute_coeff
density,-0.66572,0.66572
residual sugar,0.538164,0.538164
volatile acidity,-0.19226,0.19226
pH,0.150036,0.150036
alcohol,0.129533,0.129533
fixed acidity,0.097822,0.097822
sulphates,0.062053,0.062053
free sulfur dioxide,0.04218,0.04218
total sulfur dioxide,0.014304,0.014304
chlorides,0.008127,0.008127


In [223]:
#2 Which feature this linear regression model treats as the most influential on wine quality?
features['Name'][7]

'density'

In [241]:
#Lasso regression
lasso = Lasso(alpha=0.01, random_state=17)
lasso.fit(X_train_scaled, y_train)

Lasso(alpha=0.01, random_state=17)

In [263]:
#Which feature is the least informative in predicting wine quality, according to this LASSO model?
features = pd.DataFrame(
{'coeff': lasso.coef_, 'abs_coeff': abs(lasso.coef_)},
index = data.columns.drop('quality'))
features.sort_values('abs_coeff', ascending=False)

Unnamed: 0,coeff,abs_coeff
alcohol,0.322425,0.322425
residual sugar,0.256363,0.256363
density,-0.235492,0.235492
volatile acidity,-0.188479,0.188479
pH,0.067277,0.067277
free sulfur dioxide,0.043088,0.043088
sulphates,0.029722,0.029722
chlorides,-0.002747,0.002747
fixed acidity,-0.0,0.0
citric acid,-0.0,0.0


In [264]:
#Train LassoCV with random_state=17 to choose the best value of alpha
# in 5-fold cross-validation.

alphas = np.logspace(-6, 2, 200)
lasso_cv = LassoCV(random_state=17, cv=5, alphas=alphas)
lasso_cv.fit(X_train_scaled, y_train)

lasso_cv.alpha_

0.0002833096101839324

In [265]:
#3: Which feature is the least informative in predicting wine quality, according to the tuned LASSO model?

features = pd.DataFrame(
{'coeff': lasso_cv.coef_, 'abs_coeff': abs(lasso_cv.coef_)},
index = data.columns.drop('quality'))
features.sort_values('abs_coeff', ascending=False)

#citric acid

Unnamed: 0,coeff,abs_coeff
density,-0.648161,0.648161
residual sugar,0.526883,0.526883
volatile acidity,-0.192049,0.192049
pH,0.146549,0.146549
alcohol,0.137115,0.137115
fixed acidity,0.093295,0.093295
sulphates,0.060939,0.060939
free sulfur dioxide,0.042698,0.042698
total sulfur dioxide,0.012969,0.012969
chlorides,0.006933,0.006933


In [279]:
#4: What are mean squared errors of tuned LASSO predictions on train and holdout sets?

y_pred_train = lasso_cv.predict(X_train_scaled)
y_pred_test = lasso_cv.predict(X_holdout_scaled)
mse_train = round(mean_squared_error(y_pred_train, y_train), 3)
mse_hold = round(mean_squared_error(y_pred_test, y_test), 3)

print('Mean squared error (train): {}'.format(mse_train))
print('Mean squared error (test): {}'.format(mse_hold))

Mean squared error (train): 0.558
Mean squared error (test): 0.583


In [281]:
#Random Forest

forest = RandomForestRegressor(random_state=17)
forest.fit(X_train_scaled, y_train)

RandomForestRegressor(random_state=17)

In [293]:
#5. What are mean squared errors of RF model on the training set, in cross-validation and on holdout set?

y_pred_train = forest.predict(X_train_scaled)
y_pred_test = forest.predict(X_holdout_scaled)
mse_train = round(mean_squared_error(y_pred_train, y_train), 3)
mse_hold = round(mean_squared_error(y_pred_test, y_test), 3)
mse_cv = round(abs(cross_val_score(forest, X_train_scaled, y_train, scoring='neg_mean_squared_error').mean()), 3)

print('Mean squared error (train): {}'.format(mse_train))
print('Mean squared error (cv): {}'.format(mse_cv))
print('Mean squared error (test): {}'.format(mse_hold))

Mean squared error (train): 0.053
Mean squared error (cv): 0.414
Mean squared error (test): 0.372


In [303]:
#tune
forest_params = {'max_depth': list(range(10, 25)),
                  'max_features': list(range(6,12))}

locally_best_forest = GridSearchCV(estimator=forest ,param_grid=forest_params)
locally_best_forest.fit(X_train_scaled, y_train)

GridSearchCV(estimator=RandomForestRegressor(random_state=17),
             param_grid={'max_depth': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                       20, 21, 22, 23, 24],
                         'max_features': [6, 7, 8, 9, 10, 11]})

In [304]:
locally_best_forest.best_params_, locally_best_forest.best_score_

({'max_depth': 21, 'max_features': 6}, 0.4939384585901731)

In [310]:
#6: What are mean squared errors of tuned RF model in cross-validation 
# and on holdout set?
y_pred_test = locally_best_forest.predict(X_holdout_scaled)

mse_hold = round(mean_squared_error(y_pred_test, y_test), 3)
mse_cv = round(abs(cross_val_score(locally_best_forest, X_train_scaled, y_train, scoring='neg_mean_squared_error').mean()), 3)
print('Mean squared error (cv): {}'.format(mse_cv))
print('Mean squared error (test): {}'.format(mse_hold))

Mean squared error (cv): 0.402
Mean squared error (test): 0.366


In [308]:
#7: What is the most important feature, according to the Random Forest model?

rf_importance = pd.DataFrame(
    locally_best_forest.best_estimator_.feature_importances_,
    columns=["coef"],
    index=data.columns[:-1],
)
rf_importance.sort_values(by="coef", ascending=False)

Unnamed: 0,coef
alcohol,0.206056
volatile acidity,0.117578
free sulfur dioxide,0.111556
density,0.088549
pH,0.073659
total sulfur dioxide,0.07364
chlorides,0.073366
residual sugar,0.072072
citric acid,0.062601
fixed acidity,0.061813
