In [1]:
import numpy as np
import matplotlib.pyplot as plt
from gplearn.gplearn.genetic import SymbolicRegressor
from load_data import load_data
from benchmarks import run_experiment, categorical_variables_per_dataset, create_categorical_variable_dict
from interpret.glassbox import ExplainableBoostingClassifier, ExplainableBoostingRegressor
from sklearn.linear_model import LinearRegression
from collections import defaultdict
import pandas as pd
from xgboost import XGBClassifier, XGBRegressor
import time

In [2]:
def get_model(model,task):
    """Create a fresh, default-configured estimator for a model key and task.

    Args:
        model: One of 'xgb', 'ebm' or 'ebm_no_interactions'.
        task: Either 'regression' or 'classification'.

    Returns:
        A new, unfitted estimator instance:
        - 'xgb' -> XGBRegressor / XGBClassifier
        - 'ebm' -> ExplainableBoostingRegressor / ExplainableBoostingClassifier
        - 'ebm_no_interactions' -> the same EBM classes built with
          ``interactions=0``

    Raises:
        ValueError: If ``model`` or ``task`` is not one of the supported
            values. Without this check an unknown key fell through every
            branch and returned ``None``, which only failed later, far from
            the actual mistake.
    """
    supported_models = ('xgb', 'ebm', 'ebm_no_interactions')
    supported_tasks = ('regression', 'classification')
    # Validate up front so a typo fails here with a clear message.
    if model not in supported_models:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {supported_models}"
        )
    if task not in supported_tasks:
        raise ValueError(
            f"Unknown task {task!r}; expected one of {supported_tasks}"
        )

    is_regression = task == 'regression'
    if model == 'xgb':
        return XGBRegressor() if is_regression else XGBClassifier()
    if model == 'ebm':
        if is_regression:
            return ExplainableBoostingRegressor()
        return ExplainableBoostingClassifier()
    # Only 'ebm_no_interactions' remains after the validation above.
    if is_regression:
        return ExplainableBoostingRegressor(interactions=0)
    return ExplainableBoostingClassifier(interactions=0)

In [3]:
# ---- Experiment configuration -------------------------------------------
# Task type handed to get_model() and run_experiment().
task = 'regression'
# Passed as random_state to every run_experiment() call.
global_seed = 0
# Model keys understood by get_model(); each becomes a
# <key>_mean / <key>_std / <key>_time column group in the results table.
model_names = ['xgb','ebm_no_interactions','ebm']

# Feynman benchmark datasets to evaluate. The order here is the row order of
# the results table, so it is kept exactly as originally listed (it is not
# strictly sorted, e.g. I_34_8 precedes I_34_1).
dataset_names = [
    # Volume I
    'feynman_I_6_2b', 'feynman_I_8_14', 'feynman_I_9_18', 'feynman_I_10_7', 'feynman_I_12_2',
    'feynman_I_12_4', 'feynman_I_12_11', 'feynman_I_13_12', 'feynman_I_14_3', 'feynman_I_18_12',
    'feynman_I_24_6', 'feynman_I_29_16', 'feynman_I_30_5', 'feynman_I_32_5', 'feynman_I_34_8',
    'feynman_I_34_1', 'feynman_I_37_4', 'feynman_I_38_12', 'feynman_I_39_11', 'feynman_I_39_22',
    'feynman_I_40_1', 'feynman_I_43_16', 'feynman_I_43_31', 'feynman_I_44_4', 'feynman_I_47_23',
    # Volume II
    'feynman_II_2_42', 'feynman_II_3_24', 'feynman_II_4_23', 'feynman_II_6_11', 'feynman_II_6_15a',
    'feynman_II_6_15b', 'feynman_II_8_7', 'feynman_II_10_9', 'feynman_II_11_3', 'feynman_II_11_20',
    'feynman_II_13_17', 'feynman_II_13_23', 'feynman_II_15_4', 'feynman_II_15_5', 'feynman_II_21_32',
    'feynman_II_24_17', 'feynman_II_27_16', 'feynman_II_27_18', 'feynman_II_34_2a', 'feynman_II_34_2',
    'feynman_II_34_11', 'feynman_II_34_29a', 'feynman_II_34_29b', 'feynman_II_36_38', 'feynman_II_37_1',
    'feynman_II_38_3',
    # Volume III
    'feynman_III_4_32', 'feynman_III_7_38', 'feynman_III_8_54', 'feynman_III_10_19', 'feynman_III_13_18',
    'feynman_III_14_14', 'feynman_III_15_12', 'feynman_III_15_14', 'feynman_III_15_27', 'feynman_III_17_37',
    'feynman_III_19_51', 'feynman_III_21_20',
]

# First test the datasets with a cheap LinearRegression run (disabled; note the `i < 30` guard below skips the first 30)
# for i, dataset_name in enumerate(dataset_names):
#     if i < 30:
#         continue
#     model = LinearRegression()
#     run_experiment(dataset_name, model, None, task, random_state=global_seed)

# Benchmark every model on every dataset.
# `results` maps a column name to a list holding one value per dataset, so it
# converts directly into a DataFrame with one row per dataset and, for each
# model key, the columns <key>_mean, <key>_std and <key>_time (seconds).
results = defaultdict(list)
for dataset_name in dataset_names:
    results['dataset_name'].append(dataset_name)
    for model_name in model_names:
        # Fresh estimator per (dataset, model) pair so nothing carries over.
        model = get_model(model_name,task)
        # perf_counter() is monotonic and high-resolution, so the measured
        # duration cannot be skewed by system clock adjustments the way a
        # difference of time.time() readings can.
        t1 = time.perf_counter()
        # NOTE(review): the third positional argument is passed as None; its
        # meaning is defined in benchmarks.run_experiment — confirm there.
        score_mean, score_std = run_experiment(dataset_name, model, None, task, random_state=global_seed)
        t2 = time.perf_counter()
        results[f'{model_name}_mean'].append(score_mean)
        results[f'{model_name}_std'].append(score_std)
        results[f'{model_name}_time'].append(t2-t1)

df = pd.DataFrame(results)
    

    

10it [00:24,  2.49s/it]
10it [00:13,  1.33s/it]
10it [00:26,  2.66s/it]
10it [00:29,  2.99s/it]
10it [00:12,  1.23s/it]
10it [00:44,  4.41s/it]
10it [00:33,  3.39s/it]
10it [00:24,  2.44s/it]
10it [00:55,  5.53s/it]
10it [00:28,  2.87s/it]
10it [00:09,  1.08it/s]
10it [00:15,  1.57s/it]
10it [00:28,  2.88s/it]
10it [00:11,  1.12s/it]
10it [00:41,  4.13s/it]
10it [00:28,  2.82s/it]
10it [00:09,  1.11it/s]
10it [00:24,  2.45s/it]
10it [00:30,  3.09s/it]
10it [00:13,  1.35s/it]
10it [00:55,  5.59s/it]
10it [00:30,  3.09s/it]
10it [00:13,  1.36s/it]
10it [01:04,  6.47s/it]
10it [00:28,  2.88s/it]
10it [00:09,  1.09it/s]
10it [00:23,  2.30s/it]
10it [00:29,  2.99s/it]
10it [00:09,  1.00it/s]
10it [00:32,  3.22s/it]
10it [00:29,  2.99s/it]
10it [00:11,  1.17s/it]
10it [00:37,  3.78s/it]
10it [00:30,  3.00s/it]
10it [00:11,  1.13s/it]
10it [01:20,  8.08s/it]
10it [00:29,  2.94s/it]
10it [00:08,  1.17it/s]
10it [00:22,  2.22s/it]
10it [00:28,  2.86s/it]
10it [00:10,  1.08s/it]
10it [00:39,  3.

In [4]:
# Display the benchmark results table (one row per dataset).
df

Unnamed: 0,dataset_name,xgb_mean,xgb_std,xgb_time,ebm_no_interactions_mean,ebm_no_interactions_std,ebm_no_interactions_time,ebm_mean,ebm_std,ebm_time
0,feynman_I_6_2b,0.996671,0.000261,25.995540,0.730570,0.010095,14.045011,0.895938,0.003686,27.377342
1,feynman_I_8_14,0.989142,0.000397,31.512335,0.228597,0.011032,13.208467,0.966189,0.000272,45.140059
2,feynman_I_9_18,0.976900,0.000559,36.031110,0.909925,0.002275,25.836309,0.945126,0.001475,56.847583
3,feynman_I_10_7,0.999834,0.000005,30.198332,0.996535,0.000187,10.022070,0.999398,0.000047,16.572589
4,feynman_I_12_2,0.993360,0.000553,30.406873,0.675868,0.010803,12.147127,0.949692,0.003315,42.164870
...,...,...,...,...,...,...,...,...,...,...
58,feynman_III_15_14,0.997416,0.000280,30.632430,0.646541,0.004812,9.220850,0.953085,0.002092,24.905773
59,feynman_III_15_27,0.998038,0.000078,31.007622,0.820968,0.004969,9.123715,0.986825,0.000527,24.210151
60,feynman_III_17_37,0.998832,0.000051,31.584229,0.770900,0.002905,10.615647,0.981329,0.000532,39.784034
61,feynman_III_19_51,0.937512,0.011110,32.268627,0.220957,0.031657,11.942916,0.575101,0.044063,53.566086


In [None]:
# Unexecuted duplicate of the results display cell above; it shows the same table.
df

In [5]:
# Persist the results table to the current working directory. No index=False
# is passed, so the DataFrame index is written as an unnamed first column.
# NOTE(review): 'faills' in the filename looks like a typo for 'fails'; it is
# left unchanged in case other code reads this exact path — confirm before renaming.
df.to_csv('EBM_faills_results.csv')

In [116]:
# Rank datasets from the lowest to the highest mean score of the default
# 'ebm' model and show the first 40 rows, i.e. where it performs worst.
(
    df
    .sort_values(by='ebm_mean', ascending=True)
    .head(n=40)
)

Unnamed: 0,dataset_name,xgb_mean,xgb_std,xgb_time,ebm_no_interactions_mean,ebm_no_interactions_std,ebm_no_interactions_time,ebm_mean,ebm_std,ebm_time
53,feynman_III_8_54,0.61101,0.01039,30.365184,0.020237,0.003095,7.734269,0.074298,0.003723,34.660742
61,feynman_III_19_51,0.937512,0.01111,32.268627,0.220957,0.031657,11.942916,0.575101,0.044063,53.566086
57,feynman_III_15_12,0.977261,0.001028,31.717607,0.252208,0.007971,11.166734,0.583533,0.003792,67.205912
13,feynman_I_32_5,0.988378,0.001017,30.570184,0.443666,0.014897,11.814825,0.835101,0.008541,40.087921
0,feynman_I_6_2b,0.996671,0.000261,25.99554,0.73057,0.010095,14.045011,0.895938,0.003686,27.377342
20,feynman_I_40_1,0.980615,0.000809,32.959921,0.736362,0.002952,16.916815,0.898646,0.002932,59.841904
11,feynman_I_29_16,0.983273,0.000888,31.651039,0.297988,0.00706,12.392869,0.902395,0.002095,81.819468
17,feynman_I_38_12,0.992669,0.000457,30.699962,0.578779,0.015849,12.52253,0.912578,0.006011,43.826944
56,feynman_III_14_14,0.985917,0.00135,33.078608,0.670101,0.014187,13.593013,0.917309,0.007047,63.893136
30,feynman_II_6_15b,0.995605,0.000251,31.303676,0.477612,0.00767,13.098211,0.919119,0.003125,60.59266


In [43]:
def extract_equation_number(dataset_name):
    """Turn a dataset key into a dotted equation label.

    Everything up to and including the first underscore is discarded and the
    remaining underscores become dots, e.g. 'feynman_I_6_2b' -> 'I.6.2b'.
    A name that contains no underscore yields an empty string.
    """
    _prefix, _separator, remainder = dataset_name.partition("_")
    return remainder.replace("_", ".")

In [112]:
def generate_latex_table_1(df,equations):
    """Render a LaTeX table of model scores for the selected equations.

    Args:
        df: Results DataFrame with a 'dataset_name' column plus the columns
            'ebm_no_interactions_mean', 'ebm_no_interactions_std', 'ebm_mean',
            'ebm_std', 'xgb_mean' and 'xgb_std'.
        equations: Mapping whose keys are dataset names; one table row is
            emitted per key, in iteration order. The values are not used here.

    Returns:
        The complete LaTeX source of the table as a string. Each cell is
        formatted as "mean (std)" with three decimals; columns are, in order,
        GAM ('ebm_no_interactions'), GA2M ('ebm') and XGB ('xgb').

    Raises:
        KeyError: If a key of ``equations`` is not present in
            ``df['dataset_name']``.
    """
    def fmt_std(value):
        # Print standard deviations as ".010" instead of "0.010". Only a
        # literal leading "0" before the decimal point is dropped: blindly
        # slicing off the first character turned 1.500 into ".500".
        text = f"{value:.3f}"
        return text[1:] if text.startswith("0.") else text

    # Local rebinding only; the caller's DataFrame is not modified.
    df = df.set_index('dataset_name')
    # NOTE(review): the column spec declares five 'l' columns while only four
    # are used. LaTeX accepts this, so it is kept to leave the output unchanged.
    res = r"""\begin{table}[]
\begin{tabular}{lllll}
\toprule
Eq. Num. & GAM & GA${^2}$M & XGB \\
\midrule
"""
    for equation in equations:
        row = df.loc[equation,:]
        eq_num = extract_equation_number(equation)
        # The ": .3f" (space flag) and the double space before the GA2M cell
        # reproduce the original layout exactly.
        res += (
            f"{eq_num} & {row['ebm_no_interactions_mean']: .3f} ({fmt_std(row['ebm_no_interactions_std'])})"
            f" &  {row['ebm_mean']:.3f} ({fmt_std(row['ebm_std'])})"
            f" & {row['xgb_mean']:.3f} ({fmt_std(row['xgb_std'])}) \\\\ \n"
        )
    res+= r"""\bottomrule
\end{tabular}
\end{table}"""
    return res
    
    

In [113]:
# Equations highlighted in the paper tables: dataset name -> LaTeX source of
# the formula (math-mode content, without the surrounding $...$).
equations_to_print = {
    'feynman_I_6_2b':r'f=e^{-\frac{(\theta-\theta_1)^2}{2\sigma^2}}/\sqrt{2\pi\sigma^2}',
    'feynman_I_8_14':r'd=\sqrt{(x_2-x_1)^2+(y_2-y_1)^2}',
    'feynman_I_12_2':r'F=\frac{q_1 q_2}{4 \pi \epsilon r^2}',
    'feynman_I_12_11':r'F=q(E_f+Bv\sin(\theta))',
    'feynman_I_18_12':r'\tau=rF\sin(\theta)',
    'feynman_I_29_16':r'x=\sqrt{x_1^2+x_2^2-2x_1 x_2 \cos(\theta_1 - \theta_2)}',
    'feynman_I_32_5':r'P=\frac{q^2 a^2}{6\pi\epsilon c^3}',
    # Exponent is m*g*x / (k_b*T); the previous 'magx' had a stray 'a'.
    'feynman_I_40_1':r'n=n_0 e^{-\frac{mgx}{k_b T}}',
    'feynman_II_2_42':r'P=\frac{\kappa(T_2-T_1)A}{d}'
}
# Print the score table for the selected equations.
print(generate_latex_table_1(df, equations_to_print))

\begin{table}[]
\begin{tabular}{lllll}
\toprule
Eq. Num. & GAM & GA${^2}$M & XGB \\
\midrule
I.6.2b &  0.731 (.010) &  0.896 (.004) & 0.997 (.000) \\ 
I.8.14 &  0.229 (.011) &  0.966 (.000) & 0.989 (.000) \\ 
I.12.2 &  0.676 (.011) &  0.950 (.003) & 0.993 (.001) \\ 
I.12.11 &  0.675 (.004) &  0.955 (.001) & 0.996 (.000) \\ 
I.18.12 &  0.760 (.002) &  0.981 (.000) & 0.999 (.000) \\ 
I.29.16 &  0.298 (.007) &  0.902 (.002) & 0.983 (.001) \\ 
I.32.5 &  0.444 (.015) &  0.835 (.009) & 0.988 (.001) \\ 
I.40.1 &  0.736 (.003) &  0.899 (.003) & 0.981 (.001) \\ 
II.2.42 &  0.615 (.006) &  0.937 (.002) & 0.990 (.000) \\ 
\bottomrule
\end{tabular}
\end{table}


In [114]:
def generate_latex_table_2(equations):
    """Render a two-column LaTeX table listing equation numbers and formulas.

    Args:
        equations: Mapping of dataset name -> LaTeX math source for the
            formula. One row is emitted per entry, in iteration order, with
            the formula wrapped in inline math delimiters.

    Returns:
        The complete LaTeX source of the table as a string.
    """
    header = r"""\begin{table}[]
\begin{tabular}{lllll}
\toprule
Eq. Num. & Equation \\
\midrule
"""
    footer = r"""\bottomrule
\end{tabular}
\end{table}"""
    # One "<number> & $<formula>$ \\" line per equation.
    body_rows = [
        f"{extract_equation_number(name)} & ${formula}$ \\\\ \n"
        for name, formula in equations.items()
    ]
    return header + "".join(body_rows) + footer
    

In [115]:
# Print the equation-number / formula table for the same selected equations.
print(generate_latex_table_2(equations_to_print))

\begin{table}[]
\begin{tabular}{lllll}
\toprule
Eq. Num. & Equation \\
\midrule
I.6.2b & $f=e^{-\frac{(\theta-\theta_1)^2}{2\sigma^2}}/\sqrt{2\pi\sigma^2}$ \\ 
I.8.14 & $d=\sqrt{(x_2-x_1)^2+(y_2-y_1)^2}$ \\ 
I.12.2 & $F=\frac{q_1 q_2}{4 \pi \epsilon r^2}$ \\ 
I.12.11 & $F=q(E_f+Bv\sin(\theta))$ \\ 
I.18.12 & $\tau=rF\sin(\theta)$ \\ 
I.29.16 & $x=\sqrt{x_1^2+x_2^2-2x_1 x_2 \cos(\theta_1 - \theta_2)}$ \\ 
I.32.5 & $P=\frac{q^2 a^2}{6\pi\epsilon c^3}$ \\ 
I.40.1 & $n=n_0 e^{-\frac{magx}{k_b T}}$ \\ 
II.2.42 & $P=\frac{\kappa(T_2-T_1)A}{d}$ \\ 
\bottomrule
\end{tabular}
\end{table}


In [118]:
# Re-run a single dataset with a default-configured EBM and keep the model
# object for inspection in the following cells.
# Note: this rebinds the notebook-level names `dataset_name` and `model`,
# which the benchmark loop also assigns.
dataset_name = 'feynman_I_18_12'
model = ExplainableBoostingRegressor()
# With return_model=True, run_experiment returns a third value that replaces `model`.
# NOTE(review): presumably a fitted model from one of the runs — confirm in
# benchmarks.run_experiment which one is returned.
score_mean, score_std, model = run_experiment(dataset_name, model, None, task, random_state=global_seed, return_model=True)
        

10it [00:34,  3.45s/it]


In [119]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from interpret import show

# Build the global explanation of the model returned by the single-dataset
# run and hand it to interpret's viewer.
ebm_global = model.explain_global()
show(ebm_global)

The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html
The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_table package is deprecated. Please replace
`import dash_table` with `from dash import dash_table`

Also, if you're using any of the table format helpers (e.g. Group), replace 
`from dash_table.Format import Group` with 
`from dash.dash_table.Format import Group`
  import dash_table as dt


In [120]:
# Display the repr of the model object returned by the single-dataset run.
model