In [1]:
import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

(c) Commonly, we instead split the data into a training set, a test set, and a validation set used to select hyperparameters. Use the dataset on wine quality provided `winequality-white.csv` and then divide it randomly into 70% train, 10% validation, and 20% test. The last column “quality” is the target, an integer from 1-10. We’ll view this as a regression problem with mean squared error as the desired loss function. Pick two model classes from the set { RandomForestRegressor, KNeighborsRegressor, MLPRegressor } and read their documentation in Scikit-learn to understand the hyperparameters of these models and what they do. Do grid search over a reasonably-large set of parameters (number of parameter sets > the number of points in the validation set): fit the model on the train set for each set of parameters, and select the “best parameters” using the score on the validation set. Finally, report the performance on the test set for each of the model classes you chose. What’s the relationship between the empirical risk on the validation set and the empirical risk on the test set?

In [2]:
# Read the semicolon-delimited white-wine table from the data/ folder and
# preview it as the cell output.
fname = os.path.join('data', 'winequality-white.csv')
dset = pd.read_csv(fname, delimiter=';')
dset

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [3]:
# Separate the regression target (the "quality" column) from the feature
# matrix (every column except the last), then report both shapes.
X, y = dset.iloc[:, :-1], dset.quality
print(y.shape, X.shape, sep='\n')

(4898,)
(4898, 11)


In [4]:
# Hold out 20% of the rows for testing first, then take 10/80 of the remaining
# 80% as the validation set, leaving 70% / 10% / 20% train / validation / test.
from sklearn.model_selection import train_test_split

X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=10.0/80.0, random_state=1)

# Report each subset's shape and its share of the full dataset.
n_all = y.shape[0]
print('shapes:\n- - - -')
print(f'x_train: {X_train.shape}\t y_train: {y_train.shape}\t frac: {y_train.shape[0]/n_all*100:.2f}%')
print(f'x_test: {X_test.shape}\t y_test: {y_test.shape}\t\t frac: {y_test.shape[0]/n_all*100:.2f}%')
print(f'x_val: {X_val.shape}\t y_val: {y_val.shape}\t\t frac: {y_val.shape[0]/n_all*100:.2f}%')

shapes:
- - - -
x_train: (3428, 11)	 y_train: (3428,)	 frac: 69.99%
x_test: (980, 11)	 y_test: (980,)		 frac: 20.01%
x_val: (490, 11)	 y_val: (490,)		 frac: 10.00%


I'll start with the multi-layer perceptron regressor using the Adam solver and the ReLU activation function. It has many hyperparameters, but the most relevant ones are the number of hidden layers, the number of neurons per layer, the learning rate, and the batch size. I'll keep the number of neurons the same in every layer, but still scan over this number. I'll keep the other hyperparameters at their defaults, including the beta values for the Adam optimizer.

In [5]:
# Hyperparameter grid for the MLP search: 9 depths x 9 widths x 3 learning
# rates x 3 batch sizes.
# Depths are the values np.linspace(1, 10, 9, dtype=int) produces: the nine
# evenly spaced points are truncated to integers, so 9 itself is absent.
n_hidden_layers = np.array([1, 2, 3, 4, 5, 6, 7, 8, 10], dtype=int)
# Widths: 10 to 50 neurons per hidden layer in steps of 5.
n_neurons = np.arange(10, 51, 5, dtype=int)
# Initial learning rates and mini-batch sizes.
learning_rate = np.array([1e-3, 1e-2, 1e-1])
batch_size = np.array([32, 64, 128])
n_total = n_hidden_layers.size * n_neurons.size * learning_rate.size * batch_size.size
print(f'total number of grid points: {n_total}')

total number of grid points: 729


In [6]:
from sklearn.neural_network import MLPRegressor

In [7]:
# Exhaustive grid search for the MLP: fit one model per hyperparameter
# combination on the training set and record its score on the validation set.
#
# `score[i]` is the value returned by `MLPRegressor.score`, which scikit-learn
# documents as the coefficient of determination R^2 (higher is better). On a
# fixed validation set R^2 = 1 - MSE / Var(y_val), so the grid point with the
# largest R^2 is also the one with the smallest mean squared error.
#
# `param_grid[i]` is the (n_hl, n_n, lr, bs) tuple that produced `score[i]`.
# itertools.product varies its last argument fastest, so the ordering is the
# same as four nested loops with n_hl outermost and bs innermost.
param_grid = list(itertools.product(
    n_hidden_layers, n_neurons, learning_rate, batch_size))
score = np.zeros(n_total)
for i, (n_hl, n_n, lr, bs) in enumerate(param_grid):
    # One progress line per fit keeps the cell output readable.
    print(f'{i}/{n_total}: n_hl = {n_hl}, n_n = {n_n}, lr = {lr}, bs = {bs}')
    reg = MLPRegressor(
        # Same width in every hidden layer, e.g. (20, 20, 20) for n_hl = 3.
        hidden_layer_sizes=(int(n_n),) * int(n_hl),
        batch_size=int(bs),
        learning_rate_init=float(lr),
        # Fixed seed: weight initialisation and mini-batch sampling are
        # otherwise random, so scores would change from run to run.
        random_state=0,
    ).fit(X_train, y_train)
    score[i] = reg.score(X_val, y_val)

0/729
n_hl = 1
n_n = 10
lr = 0.001
bs = 32


1/729
n_hl = 1
n_n = 10
lr = 0.001
bs = 64
2/729
n_hl = 1
n_n = 10
lr = 0.001
bs = 128
3/729
n_hl = 1
n_n = 10
lr = 0.01
bs = 32
4/729
n_hl = 1
n_n = 10
lr = 0.01
bs = 64
5/729
n_hl = 1
n_n = 10
lr = 0.01
bs = 128
6/729
n_hl = 1
n_n = 10
lr = 0.1
bs = 32
7/729
n_hl = 1
n_n = 10
lr = 0.1
bs = 64
8/729
n_hl = 1
n_n = 10
lr = 0.1
bs = 128
9/729
n_hl = 1
n_n = 15
lr = 0.001
bs = 32
10/729
n_hl = 1
n_n = 15
lr = 0.001
bs = 64
11/729
n_hl = 1
n_n = 15
lr = 0.001
bs = 128
12/729
n_hl = 1
n_n = 15
lr = 0.01
bs = 32
13/729
n_hl = 1
n_n = 15
lr = 0.01
bs = 64
14/729
n_hl = 1
n_n = 15
lr = 0.01
bs = 128
15/729
n_hl = 1
n_n = 15
lr = 0.1
bs = 32
16/729
n_hl = 1
n_n = 15
lr = 0.1
bs = 64
17/729
n_hl = 1
n_n = 15
lr = 0.1
bs = 128
18/729
n_hl = 1
n_n = 20
lr = 0.001
bs = 32
19/729
n_hl = 1
n_n = 20
lr = 0.001
bs = 64
20/729
n_hl = 1
n_n = 20
lr = 0.001
bs = 128
21/729
n_hl = 1
n_n = 20
lr = 0.01
bs = 32
22/729
n_hl = 1
n_n = 20
lr = 0.01
bs = 64
23/729
n_hl = 1
n_n = 20
lr = 0.01
bs = 128
24/729
n_h



558/729
n_hl = 7
n_n = 50
lr = 0.001
bs = 32
559/729
n_hl = 7
n_n = 50
lr = 0.001
bs = 64
560/729
n_hl = 7
n_n = 50
lr = 0.001
bs = 128
561/729
n_hl = 7
n_n = 50
lr = 0.01
bs = 32
562/729
n_hl = 7
n_n = 50
lr = 0.01
bs = 64
563/729
n_hl = 7
n_n = 50
lr = 0.01
bs = 128
564/729
n_hl = 7
n_n = 50
lr = 0.1
bs = 32
565/729
n_hl = 7
n_n = 50
lr = 0.1
bs = 64
566/729
n_hl = 7
n_n = 50
lr = 0.1
bs = 128
567/729
n_hl = 8
n_n = 10
lr = 0.001
bs = 32
568/729
n_hl = 8
n_n = 10
lr = 0.001
bs = 64
569/729
n_hl = 8
n_n = 10
lr = 0.001
bs = 128
570/729
n_hl = 8
n_n = 10
lr = 0.01
bs = 32
571/729
n_hl = 8
n_n = 10
lr = 0.01
bs = 64
572/729
n_hl = 8
n_n = 10
lr = 0.01
bs = 128
573/729
n_hl = 8
n_n = 10
lr = 0.1
bs = 32
574/729
n_hl = 8
n_n = 10
lr = 0.1
bs = 64
575/729
n_hl = 8
n_n = 10
lr = 0.1
bs = 128
576/729
n_hl = 8
n_n = 15
lr = 0.001
bs = 32
577/729
n_hl = 8
n_n = 15
lr = 0.001
bs = 64
578/729
n_hl = 8
n_n = 15
lr = 0.001
bs = 128
579/729
n_hl = 8
n_n = 15
lr = 0.01
bs = 32
580/729
n_hl = 8
n_n =



720/729
n_hl = 10
n_n = 50
lr = 0.001
bs = 32
721/729
n_hl = 10
n_n = 50
lr = 0.001
bs = 64
722/729
n_hl = 10
n_n = 50
lr = 0.001
bs = 128
723/729
n_hl = 10
n_n = 50
lr = 0.01
bs = 32
724/729
n_hl = 10
n_n = 50
lr = 0.01
bs = 64
725/729
n_hl = 10
n_n = 50
lr = 0.01
bs = 128
726/729
n_hl = 10
n_n = 50
lr = 0.1
bs = 32
727/729
n_hl = 10
n_n = 50
lr = 0.1
bs = 64
728/729
n_hl = 10
n_n = 50
lr = 0.1
bs = 128


In [21]:
# Validation-set scores from the grid search, one entry per grid point in loop
# order (batch size varies fastest, number of hidden layers slowest).
score

array([ 3.01499242e-01,  2.82255249e-01,  2.77019579e-01,  1.37390288e-01,
        3.06030228e-01,  3.02569743e-01,  2.21483856e-01,  2.59514005e-01,
        2.92958360e-01,  1.21425718e-01,  2.98421983e-01,  3.02047284e-01,
        8.12667148e-02,  3.00938856e-01,  3.11389116e-01, -1.13212495e-01,
        2.21283646e-01,  2.80570037e-01, -1.87491101e-01,  3.26685575e-01,
        3.20884168e-01,  2.76618421e-01, -1.04717051e-01,  3.02280036e-01,
        1.47904855e-01,  2.81800818e-01,  1.84172482e-01,  2.99527988e-01,
        1.85859773e-01,  2.90040834e-01,  2.15796789e-01,  2.97717420e-01,
        2.04211732e-01,  2.38545047e-01,  3.08539077e-01, -7.58934403e-03,
        3.07948466e-01,  3.02632547e-01,  3.07105812e-01, -8.40296464e-02,
        2.24160635e-01,  3.08360193e-01,  1.53832905e-01,  3.23798411e-01,
        1.37283252e-01,  2.73301515e-01,  3.24646980e-01,  1.24331835e-01,
       -1.83277404e-01,  2.81812465e-01,  2.58387113e-01,  2.99673129e-01,
        2.50174763e-01,  