# Toy modelling demonstration

This notebook demonstrates the use of several other scripts contained in this directory:
 - **toy_data_generation.py**: generates "toy" datasets using known, specified generative processes
 - **param_dict_generation.py**: generates parameter dicts specifying candidate models to train on the toy (or other, real) datasets
 - **model_wrappers.py**: presents a common interface for a) scikit-learn and b) statsmodels models, allowing multiple model types to be specified using the parameter dicts generated by param_dict_generation.py
 
The output is a CSV file (`results.csv`) evaluating the performance of several (25) different models on several (12) toy datasets.

# Imports and settings

In [13]:
import multiprocessing
import pandas as pd
from sklearn.model_selection import train_test_split

from model_wrappers import try_models
from toy_data_generation import generate_linear_data, generate_interaction_data
from param_dict_generation import generate_ols_list, generate_mlp_list, generate_xgb_list

In [14]:
# Raise pandas' display limits so the wide results table shown later in the
# notebook is rendered in full rather than truncated.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 500

# Dataset generation

In [15]:
# Build one "linear" and one "interaction" toy dataset for every combination
# of row count and variable count. Each entry of `datasets` is the generator's
# output with a descriptive name appended as the last element.
# NOTE(review): no random seed is set in this notebook; whether the generators
# seed themselves is not visible from here -- confirm in toy_data_generation.py.
datasets = []
for nrows in (1000, 10000):
    for nvars in (10, 30, 100):
        linear_name = f'linear_{nvars}vars_{nrows}rows'
        interaction_name = f'interaction_{nvars}vars_{nrows}rows'

        # Linear data is generated first, then interaction data.
        datasets.append((*generate_linear_data(nrows, nvars), linear_name))
        datasets.append((*generate_interaction_data(nrows, nvars), interaction_name))

        print(f'Datasets with {nrows} rows and {nvars} vars successfully appended')

Datasets with 1000 rows and 10 vars successfully appended
Datasets with 1000 rows and 30 vars successfully appended
Datasets with 1000 rows and 100 vars successfully appended
Datasets with 10000 rows and 10 vars successfully appended
Datasets with 10000 rows and 30 vars successfully appended
Datasets with 10000 rows and 100 vars successfully appended


# Experiment list generation

## Statsmodels list

In [16]:
# Candidate OLS configurations built from the alpha and L1_wt values below.
# NOTE(review): presumably these are forwarded to statsmodels' regularized
# fit (penalty weight / L1-vs-L2 mix) -- confirm in param_dict_generation.py.
ols_list = generate_ols_list(
    alpha_list=[0.001, 0.01, 0.1, 1],
    L1_wt_list=[0, 0.5, 1],
)

# OLS is the only statsmodels family tried here; this name refers to the same
# list object as `ols_list`.
statsmodels_param_list = ols_list

## Scikit list

In [17]:
# Number of CPUs reported by the system; used below to size xgboost's n_jobs.
n_cpus = multiprocessing.cpu_count()

In [18]:
# MLP candidates: one entry per hidden-layer layout tuple.
mlp_list = generate_mlp_list(hidden_layout_list=[(100,), (500,), (50, 50)])

# XGBoost candidates over the given tree depths and learning rates.
# n_jobs leaves one CPU free, but never drops below 1.
# NOTE(review): 'Linear' is not a native xgboost objective string (those look
# like 'reg:squarederror'); presumably generate_xgb_list translates it --
# confirm in param_dict_generation.py.
xgb_list = generate_xgb_list(
    depth_list=[3, 5, 7],
    learning_rate_list=[0.03, 0.1, 0.3],
    n_jobs=max(1, n_cpus - 1),
    objective='Linear',
    early_stopping_rounds=10,
)

# Everything handled through the scikit-style wrapper, MLPs first.
scikit_param_list = mlp_list + xgb_list

# Model training and evaluation

In [None]:
# Collected evaluation output, keyed by (dataset name, model key).
results = {}

for X, y, dataset_name in datasets:
    print(f'\n\n{dataset_name}\n')

    # NOTE(review): an earlier comment here mentioned updating
    # scikit_param_list with xgboost early-stopping eval_sets, but no such
    # update happens in this cell -- confirm whether try_models handles it.
    # The fixed random_state keeps the 70/30 train/test split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=303,
    )

    # Fit and evaluate every candidate model on this dataset.
    model_results = try_models(
        X_train, X_test, y_train, y_test,
        statsmodels_list=statsmodels_param_list,
        scikit_list=scikit_param_list,
    )

    # Prefix each model key with the dataset name so entries from different
    # datasets do not overwrite one another.
    results.update(
        {(dataset_name, model_key): model_results[model_key]
         for model_key in model_results}
    )

# One row per (dataset, model) pair.
display(pd.DataFrame(results).T)

# Output results

In [12]:
# Persist the results table (one row per (dataset, model) pair) to
# results.csv in the current working directory.
pd.DataFrame(results).transpose().to_csv(path_or_buf='results.csv')