In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.model_selection
import time
import pickle
#import antigravity
import random
random.seed()
import sklearn.neural_network
from collections import namedtuple
from handydict import dict as hd
from tqdm.notebook import tqdm

In [2]:
data_train_path = 'data\\train.csv'
data_test_path = 'data\\test.csv'
src_data_train = pd.read_csv(data_train_path)
src_data_test = pd.read_csv(data_test_path)
src_data_test['SalePrice']=np.NaN # training target is denoted by NaN
source_data = pd.concat((src_data_train,src_data_test))
source_data=source_data.set_index('Id')
rows_test=np.where(np.isnan(source_data['SalePrice']))[0] # these are used for my own training and validation of the model, and must suffice for all validation actions
rows_train=np.where(~np.isnan(source_data['SalePrice']))[0] # Note: These have no target label. These are only used for final submission

# get the target transformer so that I can un-transform the target for final error evaluation
p_target_basic = pd.DataFrame(source_data['SalePrice'])
with open('target_transformer.pickle','rb') as f:
    pr_target_pipe = pickle.load(f)

# get the cleaned, augumented data
p_data = pd.read_pickle('p_data.pickle')
pr_target_log = pd.read_pickle('p_target_transformed.pickle')
p_data.shape

(2919, 91)

In [3]:
# additionally rescale the data - NNs are supposedely sensitive to data scaling
p_data_scaler = sklearn.preprocessing.RobustScaler()
training_data = p_data_scaler.fit_transform(p_data)[rows_train,:]

# training_target is already transformed
training_target = pr_target_log.iloc[rows_train].values.ravel()

In [4]:
def layers_function(nn_width,nn_depth):
    '''
    create a tuple of sizes for layers out of two parameters: initial layer size, and count of layers
    '''
    return tuple(np.int32(np.logspace(np.log10(nn_width),np.log10(5.0),nn_depth)))

In [5]:
layers_function(120,5)

(119, 54, 24, 11, 5)

* Concept: grid search for grid_size, grid_nn_depth, alpha, learning_rate

In [6]:
validation_errors=[]
t_learning_costs=[]
worst_error_for_this_config=999999
best_config_validation_error=np.NaN
this_config_errors=[]
configs=[]



In [7]:
grid_alphas = np.logspace(-6,-3,4); print(f'alpha (L2 regularizer) rates: {grid_alphas}');
grid_nn_widths=[25,50,75,125,200]
grid_nn_depths=[2,4,6,8,10]
grid_learningrates=np.logspace(-5,-1,5); print(f'learning rates: {grid_learningrates}');
grid_activations=['logistic','tanh','relu']

alpha (L2 regularizer) rates: [1.e-06 1.e-05 1.e-04 1.e-03]
learning rates: [1.e-05 1.e-04 1.e-03 1.e-02 1.e-01]


In [8]:
configs=[]
for idxa, learningrate in enumerate(grid_learningrates):
    for idxb, alpha in enumerate(grid_alphas):
        for idxc, nn_width in enumerate(grid_nn_widths):
            for idxd, nn_depth in enumerate(grid_nn_depths):            
                for idxe, activation in enumerate(grid_activations):
                    this_config=hd({'idx':(idxa,idxb,idxc,idxd,idxe), 'alpha':alpha, 'learningrate':learningrate, 'nn_width':nn_width,'nn_depth':nn_depth,'activation':activation})
                    configs.append(this_config)

In [None]:
best_alpha=-1
best_nn_width=-1
best_depth=-1
best_learningrate=-1
best_activation=''
best_validation_error=9999999

with tqdm(configs,ncols='100%') as pbar:
    pbar.set_description(f'e: {best_validation_error:0.1f} for alpha:{best_alpha}, lr:{best_learningrate}, width:{best_nn_width}, depth:{best_depth}, {best_activation}')
    for config in pbar:
        # prepare as per config
        basic_nn = sklearn.neural_network.MLPRegressor(
            hidden_layer_sizes=layers_function(config.nn_width,config.nn_depth),
            activation=config.activation,  
            alpha=config.alpha,
            learning_rate_init=config.learningrate,
            random_state=None,
            verbose=False,
            early_stopping=True,
            n_iter_no_change =20,
            tol = (pr_target_pipe.transform([[180050]])-pr_target_pipe.transform([[180000]]))[0][0], # tolerance of 50 USD on the price of the home
            validation_fraction=0.03,
            max_iter=20000)
        # execute
        # OK, I do not believe the SKLearn's internal validator all that much. I'd rather split the dataset by hand anyway.
        t0=time.perf_counter_ns()
        rs = sklearn.model_selection.ShuffleSplit(n_splits=3, test_size=0.03, random_state=random.randint(1,1024))   
        local_validation_errors=[]
        for train_index, validation_index in rs.split(rows_train):
            basic_nn.fit(training_data[train_index,:],training_target[train_index])            
            # evaluate 
            # note: since this regressor is self-validating AND early-stopping, check final error on entire dataset.
            validation_prediction=basic_nn.predict(training_data[validation_index,:]).reshape(1, -1)
            validation_target_values = training_target[validation_index].reshape(1, -1)
            local_validation_error=np.sqrt(np.mean(np.square(pr_target_pipe.inverse_transform(validation_target_values)-pr_target_pipe.inverse_transform(validation_prediction))))
            local_validation_errors.append(local_validation_error)
        validation_error=np.max(local_validation_errors)  # only remember the worst validation error. I do not want an overfitter!
        t_learning_cost=time.perf_counter_ns()-t0
        t_learning_costs.append(t_learning_cost)
        validation_errors.append(validation_error)
        if validation_error<best_validation_error:
            best_validation_error=validation_error
            best_alpha=config.alpha
            best_learningrate = config.learningrate
            best_depth=config.nn_depth
            best_nn_width=config.nn_width
            best_activation=config.activation
            pbar.set_description(f'e: {best_validation_error:0.1f} for alpha:{best_alpha}, lr:{best_learningrate}, width:{best_nn_width}, depth:{best_depth}, {best_activation}')          
            # save the state of the nn, can be loaded later from file
            with open('best_nn.pickle','wb') as file_handle: # save the pr_target_pipe state
                pickle.dump(basic_nn,file_handle)            

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=1500.0), HTML(value='')), layout=Layout(d…

In [None]:
best_validation_error

In [None]:
validation_error