# Toy modelling demonstration

This notebook demonstrates the use of several other scripts contained in this directory:
 - **[toy_data_generation.py](./toy_data_generation.py)**: generates "toy" datasets using known, specified generative processes
 - **[param_dict_generation.py](./param_dict_generation.py)**: generates parameter dicts specifying candidate models to train on the toy (or other, real) datasets
 - **[model_wrappers.py](./model_wrappers.py)**: presents a common interface for scikit-learn, statsmodels and XGBoost models, allowing multiple model types to be specified using the parameter dicts generated by param_dict_generation.py
 
The output is a CSV file (`results.csv`) that records the performance of several different models on each of several toy datasets.

# Imports and settings

In [2]:
import multiprocessing
import pandas as pd
from sklearn.model_selection import train_test_split

from model_wrappers import try_models
from toy_data_generation import generate_linear_data
from param_dict_generation import generate_ols_list, generate_mlp_list, generate_xgb_list

In [3]:
# Display settings for this notebook: show up to 1000 rows and never truncate
# columns, so the full results table renders in one piece.
display_overrides = {
    'display.max_rows': 1000,
    'display.max_columns': None,
}
for option_name, option_value in display_overrides.items():
    pd.set_option(option_name, option_value)

# Dataset generation

In [4]:
# Build one toy dataset per (row count, variable count) combination.
# Each entry of `datasets` is whatever generate_linear_data returns, unpacked,
# followed by a descriptive name; the training loop further down unpacks
# entries as (X, y, dataset_name).
row_counts = [1000, 3000]
var_counts = [10, 30]

datasets = []
for n_rows in row_counts:
    for n_vars in var_counts:
        generated = generate_linear_data(n_rows, n_vars)
        dataset_label = f'linear_{n_vars}vars_{n_rows}rows'
        datasets.append((*generated, dataset_label))
        print(f'Datasets with {n_rows} rows and {n_vars} vars successfully appended')

Datasets with 1000 rows and 10 vars successfully appended
Datasets with 1000 rows and 30 vars successfully appended
Datasets with 3000 rows and 10 vars successfully appended
Datasets with 3000 rows and 30 vars successfully appended


# Experiment list generation

## Statsmodels list

In [5]:
# Candidate grid for the OLS models: every alpha is combined with every L1 weight
# by generate_ols_list (presumably L1 weight 0 = pure ridge and 1 = pure lasso,
# as in statsmodels' fit_regularized -- confirm in param_dict_generation.py).
ols_alphas = [0.01, 1]
ols_l1_weights = [0, 0.5, 1]
ols_list = generate_ols_list(alpha_list=ols_alphas, L1_wt_list=ols_l1_weights)

# Only OLS candidates are run through the statsmodels wrapper in this demo.
statsmodels_param_list = ols_list

## Scikit and XGBoost lists

In [6]:
# Number of CPUs reported by the system; used below to size XGBoost's n_jobs.
n_cpus = multiprocessing.cpu_count()

In [7]:
# MLP candidates: one single-hidden-layer layout and one two-layer layout.
mlp_list = generate_mlp_list(hidden_layout_list=[(200,), (50, 50)])

# Leave one CPU free for the rest of the system, but never request fewer than one worker.
xgb_worker_count = max(1, n_cpus - 1)

# XGBoost candidates: every depth is paired with every learning rate.
# 'Linear' is forwarded to generate_xgb_list unchanged; how it maps onto an
# actual XGBoost objective is decided there, not in this notebook.
xgb_list = generate_xgb_list(
    depth_list=[3, 5, 7],
    learning_rate_list=[0.03, 0.1, 0.3],
    n_jobs=xgb_worker_count,
    objective='Linear',
    early_stopping_rounds=10,
)

# Parameter lists consumed by the training loop, one per model family.
scikit_param_list = mlp_list
xgb_param_list = xgb_list

# Model training and evaluation

In [8]:
# Maps (dataset_name, model_name) -> whatever try_models reports for that model.
results = {}

# Model families are evaluated in this fixed order for every dataset; each
# pair is (parameter list, `model` tag understood by try_models).
model_families = [
    (scikit_param_list, 'scikit'),
    (xgb_param_list, 'xgb'),
    (statsmodels_param_list, 'statsmodels'),
]

for X, y, dataset_name in datasets:
    print(f'\n\n{dataset_name}\n')
    # Fixed random_state so every model family sees the same 70/30 split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=303)

    for family_params, family_tag in model_families:
        family_results = try_models(
            X_train, X_test, y_train, y_test,
            param_list=family_params, model=family_tag
        )
        # Prefix each model name with the dataset so entries from different datasets do not collide.
        for model_name in family_results:
            results[(dataset_name, model_name)] = family_results[model_name]

# One row per (dataset, model) pair.
display(pd.DataFrame(results).T)



linear_10vars_1000rows

MLP_200 trained in 0.65 seconds
MLP_50_50 trained in 0.83 seconds
XGB_Linear_maxd_3_lr_0.03 trained in 0.19 seconds
XGB_Linear_maxd_3_lr_0.1 trained in 0.03 seconds
XGB_Linear_maxd_3_lr_0.3 trained in 0.01 seconds
XGB_Linear_maxd_5_lr_0.03 trained in 0.07 seconds
XGB_Linear_maxd_5_lr_0.1 trained in 0.03 seconds
XGB_Linear_maxd_5_lr_0.3 trained in 0.01 seconds
XGB_Linear_maxd_7_lr_0.03 trained in 0.08 seconds
XGB_Linear_maxd_7_lr_0.1 trained in 0.03 seconds
XGB_Linear_maxd_7_lr_0.3 trained in 0.01 seconds
OLS_no_reg trained in 0.24 seconds
OLS_alpha_0.01_L1_0 trained in 0.04 seconds
OLS_alpha_0.01_L1_0.5 trained in 0.08 seconds
OLS_alpha_0.01_L1_1 trained in 0.06 seconds
OLS_alpha_1_L1_0 trained in 0.01 seconds
OLS_alpha_1_L1_0.5 trained in 0.02 seconds
OLS_alpha_1_L1_1 trained in 0.01 seconds


linear_30vars_1000rows

MLP_200 trained in 3.78 seconds
MLP_50_50 trained in 1.44 seconds
XGB_Linear_maxd_3_lr_0.03 trained in 0.29 seconds
XGB_Linear_maxd_3_lr_0.1 tra

Unnamed: 0,Unnamed: 1,mse_test,prediction_time,training_time
linear_10vars_1000rows,MLP_200,1.043281,0.001665,0.651705
linear_10vars_1000rows,MLP_50_50,1.062724,0.001359,0.827991
linear_10vars_1000rows,XGB_Linear_maxd_3_lr_0.03,1.07045,0.002043,0.189507
linear_10vars_1000rows,XGB_Linear_maxd_3_lr_0.1,1.059702,0.001249,0.034852
linear_10vars_1000rows,XGB_Linear_maxd_3_lr_0.3,1.064197,0.001048,0.011868
linear_10vars_1000rows,XGB_Linear_maxd_5_lr_0.03,1.062643,0.001737,0.07393
linear_10vars_1000rows,XGB_Linear_maxd_5_lr_0.1,1.067359,0.001206,0.027147
linear_10vars_1000rows,XGB_Linear_maxd_5_lr_0.3,1.061659,0.000954,0.011511
linear_10vars_1000rows,XGB_Linear_maxd_7_lr_0.03,1.09814,0.001999,0.082734
linear_10vars_1000rows,XGB_Linear_maxd_7_lr_0.1,1.101398,0.001122,0.029861


# Output results

In [8]:
# Persist the results table, one row per (dataset, model) pair, to the working directory.
results_table = pd.DataFrame(results).T
results_table.to_csv('results.csv')