TO DO:
 - add simple mean evaluation back in
 - add datasets with interactions

# Multi-model comparison (initially on toy data)
 - Data is provided/generated in a scikit-learn-friendly format (not including a constant column?)
 - Implements a train/test split or similar
 - Tries prediction using a list of models that is provided
 - Outputs metrics
  - RMSE, MAE, training and prediction time

# Imports

In [11]:
# Load IPython's autoreload extension so that edits to local helper modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 1: before each cell runs, reload only the modules registered via %aimport.
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
import os
import time

import graphviz
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pygam import LinearGAM, PoissonGAM, s, f
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse
from xgboost import XGBRegressor

%aimport model_wrappers, toy_data_generation, param_dict_generation
from model_wrappers import try_scikit_model, try_statsmodels_model
from toy_data_generation import generate_linear_data, generate_poisson_data
from param_dict_generation import generate_ols_list, generate_glm_list, generate_mlp_list, generate_xgb_list

%matplotlib inline

In [13]:
# Raise pandas' display limits so the wide/long results tables are shown
# in full instead of being truncated.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 500

# Data generation

In [14]:
# Build one toy dataset per (row count, variable count) combination.
# Each entry is the generator's outputs followed by a descriptive name; the
# experiment loop later unpacks entries as (X, y, dataset_name).
datasets = []
for nrows in [10000, 30000]:
    for nvars in [30, 100]:
        # Linear-data variant, currently disabled:
        #   datasets.append((*generate_linear_data(nrows, nvars), f'linear_{nvars}vars_{nrows}rows'))
        generated = generate_poisson_data(
            nrows,
            nvars,
            binary_fraction=0.5,
            binary_imbalance=10,
        )
        label = f'poisson_{nvars}vars_{nrows}rows'
        datasets.append((*generated, label))

# Experiment lists

## Statsmodels list

In [15]:
# Hyper-parameter grids for the statsmodels experiments.
# NOTE(review): alpha / L1_wt are presumably the regularisation strength and
# L1 mixing weight handed to statsmodels -- confirm in param_dict_generation.
ols_grid = {'alpha_list': [1, 2, 3], 'L1_wt_list': [0, 0.5, 1]}
glm_grid = {'alpha_list': [0.001, 0.01, 0.1]}

ols_list = generate_ols_list(**ols_grid)
glm_list = generate_glm_list('Poisson', **glm_grid)

# OLS trials run first, followed by the Poisson GLM trials.
statsmodels_param_list = ols_list + glm_list

## Scikit list

In [16]:
# Hyper-parameter grids for the scikit-learn-style experiments.
mlp_grid = {'hidden_layout_list': [(50,), (20, 20)]}
xgb_grid = {
    'depth_list': [2, 4, 6],
    'learning_rate_list': [0.01, 0.3],
    'objective': 'Poisson',
    'early_stopping_rounds': 5,
}

mlp_list = generate_mlp_list(**mlp_grid)
xgb_list = generate_xgb_list(**xgb_grid)

# MLP trials run first, followed by the XGBoost trials.
scikit_param_list = mlp_list + xgb_list

# Run experiments on a single dataset

In [17]:
def try_models(X_train, X_test, y_train, y_test, statsmodels_list=None, scikit_list=None):
    """Fit and evaluate every configured model on a single train/test split.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Passed through unchanged to ``try_statsmodels_model`` and
        ``try_scikit_model``.
    statsmodels_list : iterable of dict, optional
        Trial configurations for ``try_statsmodels_model``. Each trial must
        have a ``'model_name'`` key. ``None`` (the default) means "no
        statsmodels trials".
    scikit_list : iterable of dict, optional
        Trial configurations for ``try_scikit_model``, with the same
        ``'model_name'`` requirement. ``None`` means "no scikit trials".

    Returns
    -------
    dict
        Maps each trial's model name to whatever the corresponding wrapper
        returned. Statsmodels trials that raise are reported on stdout and
        left out of the result; scikit trial errors propagate to the caller.
    """
    # The defaults are None, so normalise them before iterating; the original
    # code raised TypeError when either list was omitted.
    statsmodels_trials = () if statsmodels_list is None else statsmodels_list
    scikit_trials = () if scikit_list is None else scikit_list

    results = {}
    for trial in statsmodels_trials:
        name = trial['model_name']
        try:
            results[f"{name}"] = try_statsmodels_model(X_train, X_test, y_train, y_test, trial)
        except Exception as exc:
            # Best-effort: one failing configuration must not abort the whole
            # sweep. Catch Exception (not a bare except) so Ctrl-C still
            # interrupts the run, and report the cause instead of hiding it.
            print(f'An error occured with {name}: {exc!r}')
    for trial in scikit_trials:
        results[f"{trial['model_name']}"] = try_scikit_model(X_train, X_test, y_train, y_test, trial)
    return results

# Loop over datasets

In [18]:
# Collect the metrics of every (dataset, model) pair; starting from an empty
# dict keeps this cell re-runnable.
results = {}

for X, y, dataset_name in datasets:
    print(f'\n\n{dataset_name}\n')
    # NOTE(review): the original comment here read "update scikit_param_list
    # for xgboost early stopping eval_sets if present", but no code in this
    # cell does so -- presumably a TODO or handled inside try_scikit_model; confirm.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=303
    )
    model_results = try_models(
        X_train, X_test, y_train, y_test,
        statsmodels_list=statsmodels_param_list,
        scikit_list=scikit_param_list,
    )
    # Key by (dataset, model) so each pair becomes one row of the final table.
    for key, outcome in model_results.items():
        results[(dataset_name, key)] = outcome

results_table = pd.DataFrame(results).T
display(results_table)



poisson_30vars_10000rows

OLS_no_reg trained in 0.03 seconds
OLS_alpha_1_L1_0 trained in 0.03 seconds
OLS_alpha_1_L1_0.5 trained in 0.04 seconds
OLS_alpha_1_L1_1 trained in 0.04 seconds
OLS_alpha_2_L1_0 trained in 0.03 seconds
OLS_alpha_2_L1_0.5 trained in 0.04 seconds
OLS_alpha_2_L1_1 trained in 0.04 seconds
OLS_alpha_3_L1_0 trained in 0.03 seconds
OLS_alpha_3_L1_0.5 trained in 0.04 seconds
OLS_alpha_3_L1_1 trained in 0.04 seconds
GLM_Poisson_no_reg trained in 0.17 seconds
GLM_Poisson_alpha_0.001 trained in 0.41 seconds
GLM_Poisson_alpha_0.01 trained in 0.25 seconds
GLM_Poisson_alpha_0.1 trained in 0.22 seconds
MLP_50 trained in 0.64 seconds
MLP_20_20 trained in 0.36 seconds
XGB_Poisson_maxd_2_lr_0.01 trained in 0.51 seconds
XGB_Poisson_maxd_2_lr_0.3 trained in 0.16 seconds
XGB_Poisson_maxd_4_lr_0.01 trained in 0.65 seconds
XGB_Poisson_maxd_4_lr_0.3 trained in 0.22 seconds
XGB_Poisson_maxd_6_lr_0.01 trained in 0.73 seconds
XGB_Poisson_maxd_6_lr_0.3 trained in 0.40 seconds


poisson_

Unnamed: 0,Unnamed: 1,mse_test,prediction_time,training_time
poisson_30vars_10000rows,OLS_no_reg,0.010586,0.001746,0.032323
poisson_30vars_10000rows,OLS_alpha_1_L1_0,0.01057,0.001529,0.02882
poisson_30vars_10000rows,OLS_alpha_1_L1_0.5,0.010667,0.001709,0.035631
poisson_30vars_10000rows,OLS_alpha_1_L1_1,0.010667,0.001563,0.035967
poisson_30vars_10000rows,OLS_alpha_2_L1_0,0.010591,0.00147,0.029215
poisson_30vars_10000rows,OLS_alpha_2_L1_0.5,0.010667,0.001565,0.039356
poisson_30vars_10000rows,OLS_alpha_2_L1_1,0.010667,0.001404,0.035228
poisson_30vars_10000rows,OLS_alpha_3_L1_0,0.010605,0.001427,0.029328
poisson_30vars_10000rows,OLS_alpha_3_L1_0.5,0.010667,0.001469,0.036009
poisson_30vars_10000rows,OLS_alpha_3_L1_1,0.010667,0.001501,0.037489


# Output results

In [19]:
# Persist the (dataset, model) x metric table next to the notebook.
results_frame = pd.DataFrame(results).T
results_frame.to_csv('results.csv')