In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Prepare Data

In [3]:
wine_quality_red = pd.read_csv('../data/winequality-red.csv', ';')
wine_quality_white = pd.read_csv('../data/winequality-white.csv', ';')


  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
wine_quality_red['red wine'] = 1
wine_quality_red['white wine'] = 0
wine_quality_white['red wine'] = 1
wine_quality_white['white wine'] = 0

wine = pd.concat([wine_quality_red, wine_quality_white])
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red wine,white wine
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1,0


In [5]:
# Shuffle Data
wine = wine.sample(frac=1, replace=True)
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red wine,white wine
220,7.8,0.34,0.37,2.00,0.082,24.0,58.0,0.99640,3.34,0.59,9.400000,6,1,0
1514,6.9,0.84,0.21,4.10,0.074,16.0,65.0,0.99842,3.53,0.72,9.233333,6,1,0
765,7.2,0.27,0.42,1.60,0.050,35.0,135.0,0.99200,2.94,0.46,11.000000,6,1,0
2995,6.6,0.28,0.34,0.80,0.037,42.0,119.0,0.98880,3.03,0.37,12.500000,6,1,0
3193,5.5,0.12,0.33,1.00,0.038,23.0,131.0,0.99164,3.25,0.45,9.800000,5,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1650,7.6,0.31,0.49,3.95,0.044,27.0,131.0,0.99120,3.08,0.67,12.800000,7,1,0
1504,7.0,0.17,0.74,12.80,0.045,24.0,126.0,0.99420,3.26,0.38,12.200000,8,1,0
1403,7.2,0.33,0.33,1.70,0.061,3.0,13.0,0.99600,3.23,1.10,10.000000,8,1,0
123,8.0,0.71,0.00,2.60,0.080,11.0,34.0,0.99760,3.44,0.53,9.500000,5,1,0


In [6]:
# White + Red
wine_feature = wine.drop('quality', axis=1)
wine_quality_label = wine['quality']

# Red Only
X_red = wine_quality_red.drop('quality', axis=1)
y_red = wine_quality_red['quality']

# White Only
X_white = wine_quality_white.drop('quality', axis=1)
y_white = wine_quality_white['quality']

In [7]:
wine_quality_label.unique()

array([6, 5, 4, 7, 8, 3, 9], dtype=int64)

Wine prepared dataframe
* wine_feature
* wine_quality

Red wine prepared dataframe

White wine prepared dataframe

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(wine_feature, wine_quality_label, test_size=0.2)

# Create Model
### Random Forest Regressor

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def regressor_evaluation(y_true:pd.Series, y_preds:pd.Series) -> dict:
    mae = mean_absolute_error(y_true, y_preds)
    mse = mean_squared_error(y_true, y_preds)
    r2 = r2_score(y_true, y_preds)
    print(f"Mae: {mae} ")
    print(f"mse: {mse}")
    print(f"R^2: {r2}")
    return {"mae": mae,
            "mse": mse,
            "r2": r2}

In [10]:
np.random.seed(42)

baseline_rfr = RandomForestRegressor()
baseline_rfr.fit(X_train, y_train)
baseline_preds = baseline_rfr.predict(X_test)
baseline_score = regressor_evaluation(y_test, baseline_preds)
baseline_cv_score = cross_val_score(baseline_rfr, wine_feature, wine_quality_label, cv=5,)
print(f"Baseline CV score =  {np.mean(baseline_cv_score)}")

Mae: 0.26957692307692305 
mse: 0.19729315384615387
R^2: 0.7421495155812097
Baseline CV score =  0.7317021283288889


## Hyperparameter Tuning

In [14]:
np.random.seed(42)

est_option = [50, 300, 500, 700, 1000, 1200, 1400, 1800, 2000]
handtuned_cv_scores = []
for x in est_option:
    handtuned_rfr = RandomForestRegressor(n_estimators=500)
    handtuned_cv_score =  cross_val_score(handtuned_rfr, wine_feature, wine_quality_label)
    handtuned_cv_scores.append(np.mean(handtuned_cv_score))

fig, ax = plt.subplots()
ax.plot(est_option, handtuned_cv_scores)
ax.set_xlabel("Estimators")
ax.set_ylabel("R-Squared Score CV")
plt.show()
plt.savefig("R-Squared_vs_Estimators.svg",format="svg")

KeyboardInterrupt: 

## RandomizedSearchCV for Random Forest Regressor

In [24]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)

grid = {
    "n_estimators": [10, 100, 300, 500, 700, 1000],
    "max_depth": [None, 5, 10, 20, 30, 50],
    "min_samples_split": [2, 4, 6, 8, 10],
    "min_samples_leaf": [1, 2, 4, 5, 6],
    "max_features": ["auto", "sqrt"],
}

wine_model_rfr = RandomForestRegressor(n_jobs=1)

rs_rfr = RandomizedSearchCV(estimator=wine_model_rfr,
                            param_distributions=grid,
                            n_iter=10,
                            cv=5,
                            verbose=2,
                            return_train_score=True
                            )
rs_rfr.fit(X_train, y_train)
rs_preds = rs_rfr.predict(X_test)
rs_score = regressor_evaluation(y_test, rs_preds)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=6, n_estimators=700; total time=   5.7s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=6, n_estimators=700; total time=   5.3s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=6, n_estimators=700; total time=   5.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=6, n_estimators=700; total time=   4.9s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=6, n_estimators=700; total time=   4.9s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=5, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=5, min_samples_split=8, n_estimators=100

In [35]:
rs_rfr.best_params_

{'n_estimators': 500,
 'min_samples_split': 6,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 20}