In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

(c) Commonly, we instead split the data into a training set, a test set, and a validation set used to select hyperparameters. Use the dataset on wine quality provided `winequality-white.csv` and then divide it randomly into 70% train, 10% validation, and 20% test. The last column “quality” is the target, an integer from 1-10. We’ll view this as a regression problem with mean squared error as the desired loss function. Pick two model classes from the set { RandomForestRegressor, KNeighborsRegressor, MLPRegressor } and read their documentation in Scikit-learn to understand the hyperparameters of these models and what they do. Do grid search over a reasonably-large set of parameters (number of parameter sets > the number of points in the validation set): fit the model on the train set for each set of parameters, and select the “best parameters” using the score on the validation set. Finally, report the performance on the test set for each of the model classes you chose. What’s the relationship between the empirical risk on the validation set and the empirical risk on the test set?

In [2]:
# Read the semicolon-delimited white-wine CSV from the local data/ folder;
# the bare name on the last line makes the notebook render the frame.
fname = os.path.join('data', 'winequality-white.csv')
dset = pd.read_csv(fname, delimiter=';')
dset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [3]:
# Regression target is the "quality" column; the predictors are every
# column except the last one.
X = dset.iloc[:, :-1]
y = dset['quality']
print(y.shape, X.shape, sep='\n')

(4898,)
(4898, 11)


In [4]:
# split in training, test and validations
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of the rows as the test split.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)
# Step 2: take 1/8 of the remaining 80% (10% of all rows) as the validation
# split; what is left is the training split.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.125, random_state=1)

# Report the shape of each split and its share of the full dataset.
n_samples = len(y)
print('shapes:\n- - - -')
print(f'x_train: {X_train.shape}\t y_train: {y_train.shape}\t frac: {len(y_train)/n_samples*100:.2f}%')
print(f'x_test: {X_test.shape}\t y_test: {y_test.shape}\t\t frac: {len(y_test)/n_samples*100:.2f}%')
print(f'x_val: {X_val.shape}\t y_val: {y_val.shape}\t\t frac: {len(y_val)/n_samples*100:.2f}%')

shapes:
- - - -
x_train: (3428, 11)	 y_train: (3428,)	 frac: 69.99%
x_test: (980, 11)	 y_test: (980,)		 frac: 20.01%
x_val: (490, 11)	 y_val: (490,)		 frac: 10.00%


I'll start with the multi-layer perceptron regressor, using the Adam solver and the ReLU activation function. It has many hyperparameters, but the most relevant ones are the number of hidden layers, the number of neurons per layer, the initial learning rate and the batch size. I'll keep the number of neurons the same in every hidden layer, but still scan over this number. I'll leave the other hyperparameters at their defaults, including the beta values for the Adam optimizer.

In [5]:
# Axes of the MLP hyperparameter grid.
n_hidden_layers = np.arange(1, 10)        # 1, 2, ..., 9 hidden layers
n_neurons = np.arange(10, 100, 10)        # 10, 20, ..., 90 neurons per layer
learning_rate = np.array([1e-3, 1e-2, 1e-1])
batch_size = np.array([32, 64, 128])

# The grid is the Cartesian product of the four axes.
n_total = n_hidden_layers.size * n_neurons.size * learning_rate.size * batch_size.size
print(f'total number of grid points: {n_total}')

total number of grid points: 729


In [6]:
from sklearn.neural_network import MLPRegressor

In [7]:
# Validation score of every grid point. Axis order:
# (hidden layers, neurons per layer, batch size, initial learning rate).
score = np.zeros(
    (len(n_hidden_layers), len(n_neurons), len(batch_size), len(learning_rate))
)

# np.ndindex yields the index tuples in row-major order (last axis fastest),
# which is the same visiting order as four nested loops over the axes.
for i_hl, i_nn, i_bs, i_lr in np.ndindex(*score.shape):
    reg = MLPRegressor(
        # every hidden layer gets the same width
        hidden_layer_sizes=n_neurons[i_nn] * np.ones(n_hidden_layers[i_hl], dtype=int),
        batch_size=batch_size[i_bs],
        learning_rate_init=learning_rate[i_lr],
        random_state=0,  # fixed seed: each grid point is reproducible
    ).fit(X_train, y_train)
    # .score() is the coefficient of determination R^2. On a fixed validation
    # set R^2 = 1 - MSE / Var(y_val), so the grid point with the largest R^2
    # is also the one with the smallest mean squared error.
    score[i_hl, i_nn, i_bs, i_lr] = reg.score(X_val, y_val)



In [8]:
# Convert the flat position of the highest validation score into one index per
# grid axis, then read the hyperparameter value off each axis.
max_index = np.unravel_index(np.argmax(score), score.shape)
mlp_axes = (n_hidden_layers, n_neurons, batch_size, learning_rate)
n_hidden_layers_opt, n_neurons_opt, batch_size_opt, learning_rate_opt = (
    axis[pos] for axis, pos in zip(mlp_axes, max_index)
)

print(
    'optimal hyperparameters based on grid scan:',
    f'hidden layers: {n_hidden_layers_opt}',
    f'neurons per layer: {n_neurons_opt}',
    f'batch size: {batch_size_opt}',
    f'initial learning rate: {learning_rate_opt}',
    f'Optimized model score:  {score.max()}',
    sep='\n',
)

optimal hyperparameters based on grid scan:
hidden layers: 7
neurons per layer: 30
batch size: 128
initial learning rate: 0.001
Optimized model score:  0.3464109209402946


In [9]:
# 'optimized' model: rebuild the regressor at the selected grid point and
# train it on the training split.
mlp_best_params = {
    'hidden_layer_sizes': np.ones(n_hidden_layers_opt, dtype=int) * n_neurons_opt,
    'learning_rate_init': learning_rate_opt,
    'batch_size': batch_size_opt,
    'random_state': 0,
}
mlp_reg_opt = MLPRegressor(**mlp_best_params).fit(X_train, y_train)

In [10]:
from sklearn.metrics import mean_squared_error

# Held-out splits to report on, in print order.
eval_splits = (('validation', X_val, y_val), ('test', X_test, y_test))

# The estimator's built-in score first ...
for split_name, X_split, y_split in eval_splits:
    print(f'Score on {split_name} dataset: {mlp_reg_opt.score(X_split, y_split)}')

# ... then the mean squared error of the predictions on the same splits.
for split_name, X_split, y_split in eval_splits:
    split_mse = mean_squared_error(y_split, mlp_reg_opt.predict(X_split))
    print(f'Mean squared error on {split_name} dataset: {split_mse}')

Score on validation dataset: 0.3464109209402946
Score on test dataset: 0.27692386326911866
Mean squared error on validation dataset: 0.5178870000084546
Mean squared error on test dataset: 0.6373030745931618


Lastly, I'll use the random forest regressor. The hyperparameters I'm going to scan over are the number of trees, the maximum depth, the minimum number of samples required to split a node, and the minimum number of samples at a leaf. The rest will be left at their default values.

In [11]:
# Axes of the random-forest hyperparameter grid.
n_estimators = [10 * 2 ** k for k in range(8)]    # 10, 20, 40, ..., 1280 trees
max_depth = [*range(10, 101, 10), None]           # 10, 20, ..., 100, then None
min_samples_split = [2, 4, 8]
min_samples_leaf = [1, 2, 4]

# The grid is the Cartesian product of the four axes.
n_total = 1
for axis in (n_estimators, max_depth, min_samples_split, min_samples_leaf):
    n_total *= len(axis)
print(f'total number of grid points: {n_total}')

total number of grid points: 792


In [12]:
from sklearn.ensemble import RandomForestRegressor

# Validation score of every grid point. Axis order:
# (n_estimators, max_depth, min_samples_split, min_samples_leaf).
# NOTE: this rebinds `score`, so the scores of the MLP grid scan are no longer
# available once this cell has run.
score = np.zeros(
    (len(n_estimators), len(max_depth), len(min_samples_split), len(min_samples_leaf))
)

# np.ndindex yields the index tuples in row-major order (last axis fastest),
# which is the same visiting order as four nested loops over the axes.
for i_ne, i_md, i_ss, i_sl in np.ndindex(*score.shape):
    reg = RandomForestRegressor(
        n_estimators=n_estimators[i_ne],
        max_depth=max_depth[i_md],  # the last grid entry is None
        min_samples_split=min_samples_split[i_ss],
        min_samples_leaf=min_samples_leaf[i_sl],
        random_state=0,  # fixed seed: each grid point is reproducible
    ).fit(X_train, y_train)
    # .score() is the coefficient of determination R^2. On a fixed validation
    # set R^2 = 1 - MSE / Var(y_val), so the grid point with the largest R^2
    # is also the one with the smallest mean squared error.
    score[i_ne, i_md, i_ss, i_sl] = reg.score(X_val, y_val)

In [13]:
# Convert the flat position of the highest validation score into one index per
# grid axis, then read the hyperparameter value off each axis.
max_index = np.unravel_index(np.argmax(score), score.shape)
rf_axes = (n_estimators, max_depth, min_samples_split, min_samples_leaf)
n_estimators_opt, max_depth_opt, min_samples_split_opt, min_samples_leaf_opt = (
    axis[pos] for axis, pos in zip(rf_axes, max_index)
)

print(
    'optimal hyperparameters based on grid scan:',
    f'number of trees: {n_estimators_opt}',
    f'max depth: {max_depth_opt}',
    f'min samples to split: {min_samples_split_opt}',
    f'min samples at leaf: {min_samples_leaf_opt}',
    f'Optimized model score: {score.max()}',
    sep='\n',
)

optimal hyperparameters based on grid scan:
number of trees: 160
max depth: 20
min samples to split: 2
min samples at leaf: 1
Optimized model score: 0.5171789817202856


In [14]:
# Rebuild the forest at the selected grid point and train it on the training split.
rf_best_params = {
    'n_estimators': n_estimators_opt,
    'max_depth': max_depth_opt,
    'min_samples_split': min_samples_split_opt,
    'min_samples_leaf': min_samples_leaf_opt,
    'random_state': 0,
}
rf_reg_opt = RandomForestRegressor(**rf_best_params).fit(X_train, y_train)

In [15]:
# Held-out splits to report on, in print order.
eval_splits = (('validation', X_val, y_val), ('test', X_test, y_test))

# The estimator's built-in score first ...
for split_name, X_split, y_split in eval_splits:
    print(f'Score on {split_name} dataset: {rf_reg_opt.score(X_split, y_split)}')

# ... then the mean squared error of the predictions on the same splits.
for split_name, X_split, y_split in eval_splits:
    split_mse = mean_squared_error(y_split, rf_reg_opt.predict(X_split))
    print(f'Mean squared error on {split_name} dataset: {split_mse}')

Score on validation dataset: 0.5171789817202856
Score on test dataset: 0.4346579982043318
Mean squared error on validation dataset: 0.382574826766753
Mean squared error on test dataset: 0.4982797490316411
