TO DO:
 - add simple mean evaluation back in
 - create the model_params dict programmatically (in a script)
 - add datasets with interactions

# Multi-model comparison (initially on toy data)
 - Data is provided/generated in a scikit-learn-friendly format (not including a constant?)
 - Implements a train/test split or similar
 - Tries prediction using a list of models that is provided
 - Outputs metrics
  - RMSE, MAE, training and prediction time

# Imports

In [None]:
# Enable IPython's autoreload extension. Mode 1 reloads only the modules
# registered with %aimport (see the imports cell) before each cell runs.
%load_ext autoreload
%autoreload 1

In [12]:
import os
import time

import graphviz
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pygam import LinearGAM, PoissonGAM, s, f
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse
from xgboost import XGBRegressor

%aimport model_wrappers, toy_data_generation
from model_wrappers import try_scikit_model, try_statsmodels_model
from toy_data_generation import generate_linear_data, generate_poisson_data

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Widen the pandas display limits so the full results table is shown
# rather than being truncated.
for option_name, option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 500),
):
    pd.set_option(option_name, option_value)

# Data generation

In [7]:
# Build the benchmark datasets: one linear and one Poisson dataset for every
# (rows, variables) combination. Each entry is the generator's output with a
# descriptive dataset name appended as the final element.
# NOTE(review): the extra positional arguments 0.5 and 10 are defined by
# generate_poisson_data in toy_data_generation -- confirm their meaning there.
datasets = []
for nrows in (300, 1000):
    for nvars in (10, 30):
        size_tag = f'{nvars}vars_{nrows}rows'
        datasets.extend([
            (*generate_linear_data(nrows, nvars), f'linear_{size_tag}'),
            (*generate_poisson_data(nrows, nvars, 0.5, 10), f'poisson_{size_tag}'),
        ])

# Experiment lists

## Statsmodels list

In [8]:
# Trial configurations for statsmodels models. Each dict is passed as-is to
# try_statsmodels_model (from model_wrappers), which defines how the keys
# are interpreted; 'model_name' is also used as the key in the results.
statsmodels_param_list = [
    # OLS with regularization switched on.
    dict(
        model=sm.OLS,
        model_name='OLS_reg',
        regularize=True,
        reg_params=dict(alpha=0.01, L1_wt=0.5),
    ),
    # Plain OLS, no regularization.
    dict(
        model=sm.OLS,
        model_name='OLS',
        regularize=False,
    ),
    # Disabled trial: Poisson GLM.
    # dict(
    #     model=sm.GLM,
    #     model_name='Poisson GLM',
    #     regularize=False,
    #     model_params=dict(family=sm.families.Poisson()),
    # ),
]

## Scikit list

In [9]:
# Trial configurations for scikit-learn-style estimators. Each dict is passed
# as-is to try_scikit_model (from model_wrappers), which defines how the keys
# are interpreted; 'model_name' is also used as the key in the results.
scikit_param_list = [
    # Multi-layer perceptron, one hidden layer of 50 units.
    dict(
        model=MLPRegressor,
        model_name='MLP_50',
        model_params=dict(hidden_layer_sizes=(50,)),
    ),
    # Multi-layer perceptron, two hidden layers of 20 units each.
    dict(
        model=MLPRegressor,
        model_name='MLP_20_20',
        model_params=dict(hidden_layer_sizes=(20, 20)),
    ),
    # Disabled trial: XGBoost with a Poisson count objective.
    # dict(
    #     model=XGBRegressor,
    #     model_name='XGB_poisson',
    #     model_params=dict(
    #         max_depth=5,
    #         learning_rate=0.1,
    #         objective='count:poisson',
    #         base_score=0.05,
    #         n_estimators=200,
    #         silent=True,
    #     ),
    #     fit_params=dict(early_stopping_rounds=5, verbose=False),
    # ),
    # Disabled trial: Poisson-objective XGBoost with reg_alpha/reg_lambda set.
    # dict(
    #     model=XGBRegressor,
    #     model_name='XGB_poisson_reg',
    #     model_params=dict(
    #         max_depth=5,
    #         learning_rate=0.1,
    #         objective='count:poisson',
    #         base_score=0.05,
    #         reg_alpha=2,
    #         reg_lambda=2,
    #         n_estimators=200,
    #     ),
    #     fit_params=dict(early_stopping_rounds=5, verbose=False),
    # ),
    # XGBoost with its default objective.
    dict(
        model=XGBRegressor,
        model_name='XGB_linear',
        model_params=dict(
            max_depth=5,
            learning_rate=0.1,
            base_score=0.05,
        ),
        fit_params=dict(
            early_stopping_rounds=5,
            verbose=False,
        ),
    ),
    # Disabled trial: pygam LinearGAM with default parameters.
    # dict(
    #     model=LinearGAM,
    #     model_name='LinearGAM',
    #     model_params={},
    # ),
    # Disabled trial: pygam PoissonGAM with default parameters.
    # dict(
    #     model=PoissonGAM,
    #     model_name='PoissonGAM',
    #     model_params={},
    # ),
]

# Minimal Interesting Product

In [10]:
def mip(X_train, X_test, y_train, y_test, statsmodels_list=None, scikit_list=None):
    """Run every configured model on one train/test split and collect results.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        The train/test split; passed through unchanged to the model wrappers.
    statsmodels_list : iterable of dict, optional
        Trial configurations handed one by one to ``try_statsmodels_model``.
        Each must contain a ``'model_name'`` key. ``None`` (the default) is
        treated as "no statsmodels trials".
    scikit_list : iterable of dict, optional
        Trial configurations handed one by one to ``try_scikit_model``.
        Each must contain a ``'model_name'`` key. ``None`` (the default) is
        treated as "no scikit trials".

    Returns
    -------
    dict
        Maps each trial's model name (as a string) to the value returned by
        the corresponding wrapper. If two trials share a name, the later one
        overwrites the earlier one (scikit trials run after statsmodels ones).
    """
    results = {}
    # `or ()` lets callers omit either list: iterating over the default None
    # would otherwise raise TypeError.
    for trial in statsmodels_list or ():
        results[str(trial['model_name'])] = try_statsmodels_model(
            X_train, X_test, y_train, y_test, trial
        )
    for trial in scikit_list or ():
        results[str(trial['model_name'])] = try_scikit_model(
            X_train, X_test, y_train, y_test, trial
        )
    return results

# Loop over datasets

In [13]:
# Evaluate every configured model on every dataset. Results are keyed by
# (dataset_name, model_name) so the final frame has one row per pair.
results = {}

for X, y, dataset_name in datasets:
    print(f'\n\n{dataset_name}\n')
    # TODO: update scikit_param_list for xgboost early stopping eval_sets if present
    # NOTE(review): nothing in this cell wires an eval_set into fit_params;
    # confirm whether try_scikit_model takes care of it.
    # Fixed random_state keeps the 70/30 split identical across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=303
    )
    model_results = mip(
        X_train,
        X_test,
        y_train,
        y_test,
        statsmodels_list=statsmodels_param_list,
        scikit_list=scikit_param_list,
    )
    for key, model_metrics in model_results.items():
        results[dataset_name, key] = model_metrics

# Transpose so each (dataset, model) pair is a row and each metric a column.
display(pd.DataFrame(results).T)



linear_10vars_300rows

OLS_reg trained in 0.10 seconds
OLS trained in 0.01 seconds
MLP_50 trained in 0.13 seconds
MLP_20_20 trained in 0.16 seconds
XGB_linear trained in 0.05 seconds


poisson_10vars_300rows

OLS_reg trained in 0.02 seconds
OLS trained in 0.01 seconds
MLP_50 trained in 0.05 seconds
MLP_20_20 trained in 0.03 seconds
XGB_linear trained in 0.00 seconds


linear_30vars_300rows

OLS_reg trained in 0.27 seconds
OLS trained in 0.01 seconds
MLP_50 trained in 0.04 seconds
MLP_20_20 trained in 0.17 seconds
XGB_linear trained in 0.09 seconds


poisson_30vars_300rows

OLS_reg trained in 0.02 seconds
OLS trained in 0.01 seconds
MLP_50 trained in 0.10 seconds
MLP_20_20 trained in 0.08 seconds
XGB_linear trained in 0.01 seconds


linear_10vars_1000rows

OLS_reg trained in 0.09 seconds
OLS trained in 0.01 seconds
MLP_50 trained in 0.38 seconds
MLP_20_20 trained in 0.33 seconds
XGB_linear trained in 0.11 seconds


poisson_10vars_1000rows

OLS_reg trained in 0.01 seconds
OLS trained i

Unnamed: 0,Unnamed: 1,mse_test,prediction_time,training_time
linear_10vars_300rows,OLS_reg,0.111662,0.001054,0.10052
linear_10vars_300rows,OLS,0.102532,0.000579,0.007711
linear_10vars_300rows,MLP_50,0.134248,0.00112,0.13189
linear_10vars_300rows,MLP_20_20,0.136781,0.000554,0.157334
linear_10vars_300rows,XGB_linear,0.223465,0.001309,0.050699
poisson_10vars_300rows,OLS_reg,0.000264,0.000981,0.017624
poisson_10vars_300rows,OLS,0.001076,0.00076,0.006786
poisson_10vars_300rows,MLP_50,0.007676,0.000979,0.051269
poisson_10vars_300rows,MLP_20_20,0.005174,0.001004,0.029562
poisson_10vars_300rows,XGB_linear,0.00207,0.000813,0.004445


In [40]:
# Re-seed NumPy's global legacy generator with seed=None, i.e. without a
# fixed seed, so random draws after this cell are not reproducible.
np.random.seed(seed=None)