In [1]:
import os
import sys
sys.path.append("../")

import pandas as pd
import numpy as np
import glob
import pickle
import matplotlib.pyplot as plt
from scipy.stats import norm
import scipy
from utils._bootstrap import bootstrap, CONF
import sympy

# Name strings for the two compared methods. Neither constant is referenced
# in the cells visible here; the result tables below hard-code their own
# column labels instead.
VAR_NAME = "D-CIPHER"
MSE_NAME = "Abl. D-CIPHER"

In [2]:
def combine(equation, var_or_mse):
    """Gather every run stored in ``<equation>/<var_or_mse>/`` into one DataFrame.

    Each run consists of a ``<prefix>_meta.p`` pickle and a matching
    ``<prefix>_table.csv``. The CSV rows are loaded and the run settings
    found in the pickle are broadcast onto them as extra columns.

    Parameters
    ----------
    equation : str
        Directory that contains the per-method result folders.
    var_or_mse : str
        Name of the sub-folder to read. ``'var'`` additionally copies
        ``full_grid_samples``, ``max_ind_basis`` and ``basis``; ``'mse'``
        additionally copies ``diff_engine``. Any other value copies only the
        shared settings.

    Returns
    -------
    pandas.DataFrame
        All runs concatenated with a fresh 0..n-1 index. The
        ``'Unnamed: 0'`` column is removed when present.

    Raises
    ------
    ValueError
        If the folder contains no ``*_meta.p`` file.
    """
    meta_suffix = '_meta.p'
    run_dir = os.path.join(equation, var_or_mse)
    # Match only the metadata pickles: any other ``.p`` file in the folder
    # has no companion table and must not be treated as a run. Sorting makes
    # the row order independent of the file-system listing order.
    meta_files = sorted(glob.glob(os.path.join(run_dir, '*' + meta_suffix)))
    if not meta_files:
        raise ValueError(f"No '*{meta_suffix}' files found in {run_dir!r}")

    dfs = []
    for meta_file in meta_files:
        # Strip the suffix from the end only, so a '_meta.p' fragment
        # elsewhere in the path cannot truncate the prefix.
        csv_file = meta_file[:-len(meta_suffix)] + '_table.csv'
        df = pd.read_csv(csv_file)
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this function at result folders you trust.
        with open(meta_file, 'rb') as f:
            setting = pickle.load(f)
        args = setting['arguments']
        gp_config = setting['gp_config']

        for attr in ('name', 'equation_number', 'width',
                     'frequency_per_dim', 'noise_ratio'):
            df[attr] = getattr(args, attr)

        if var_or_mse == 'var':
            for attr in ('full_grid_samples', 'max_ind_basis', 'basis'):
                df[attr] = getattr(args, attr)
        elif var_or_mse == 'mse':
            df['diff_engine'] = args.diff_engine

        for attr in ('conditions_set', 'num_trials', 'normalization', 'solver'):
            df[attr] = getattr(args, attr)
        df['global_seed'] = args.seed
        df['num_samples'] = args.num_samples
        df['source'] = setting['table']

        # Copy every GP setting except the function set onto the rows.
        for key, value in gp_config.items():
            if key != 'function_set':
                df[key] = value
        dfs.append(df)

    full_df = pd.concat(dfs, ignore_index=True)
    # Presumably the index column written by DataFrame.to_csv; tolerate
    # tables that were saved without it.
    return full_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [3]:
# Most equations are checked for correctness by the program but some may be miscategorized.
# This function allows for checking the correctness of equations according to the definition in Appendix E.8

def evaluate_correct(df, exprs, verbose=False):
    """Return a copy of ``df`` whose ``is_correct`` column is recomputed.

    A row is marked correct exactly when its ``eqC`` value equals one of the
    entries in ``exprs``; the previous ``is_correct`` value is overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``eqC`` and ``is_correct``. It is not
        modified.
    exprs : iterable
        Accepted values for ``eqC`` (compared by equality).
    verbose : bool, default False
        When true, print one line for every row whose flag flips.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a boolean ``is_correct`` column.
    """
    new_df = df.copy()
    previous = new_df['is_correct'].tolist()
    # Whole-column membership test: unlike per-row ``.loc[index, ...]``
    # access this does not depend on the index labels being unique.
    is_match = new_df['eqC'].isin(list(exprs)).to_numpy()

    if verbose:
        for eqC, was_correct, now_correct in zip(new_df['eqC'], previous, is_match):
            # Explicit comparisons so that a missing previous value (NaN)
            # counts as neither True nor False and is not reported.
            if now_correct and was_correct == False:  # noqa: E712
                print(f"Changed to true: {eqC}")
            elif not now_correct and was_correct == True:  # noqa: E712
                print(f"Changed to false: {eqC}")

    new_df['is_correct'] = is_match
    return new_df

In [4]:
# This function generates all equivalent functional forms of the given equation based on the given substitution dictionary as in Appendix E.8

# Symbols available for writing equation templates: X0..X3 are declared
# real-valued, while C and C0..C5 are created without any assumptions.
X0,X1,X2,X3 = sympy.symbols('X0,X1,X2,X3',real=True)
C,C0,C1,C2,C3,C4,C5 = sympy.symbols('C,C0,C1,C2,C3,C4,C5')

import itertools
def generate_expr_list(f,sub_dict):
    """List the string form of ``f`` under every combination of substitutions.

    Parameters
    ----------
    f : object with a ``subs(old, new)`` method
        The template expression.
    sub_dict : dict
        Maps each placeholder to the list of values it may be replaced by.

    Returns
    -------
    list of str
        ``str()`` of the substituted expression, one entry per element of the
        Cartesian product of the value lists (in ``itertools.product``
        order). An empty ``sub_dict`` yields ``[str(f)]``.
    """
    keys = list(sub_dict)
    expr_list = []
    # product() over zero value lists yields a single empty combination, so
    # the empty-dict case needs no special handling.
    for choice in itertools.product(*(sub_dict[key] for key in keys)):
        g = f
        # Replacements are applied one placeholder at a time in dict order;
        # a replacement value that contains a later placeholder would
        # therefore be substituted again.
        for key, value in zip(keys, choice):
            g = g.subs(key, value)
        expr_list.append(str(g))
    return expr_list

In [5]:
# Load the SLM runs from the 'var' and 'mse' result folders (reported below
# as "D-CIPHER" and "Ablated" respectively).
slm_var_df = combine('../results/SLM','var')
slm_mse_df = combine('../results/SLM','mse')

# Template of the accepted equation; C0..C4 are placeholders that are
# replaced by the alternatives listed in sub_dict.
f = -(C0*X2 + C1) * sympy.exp(C2*X1+C3) + C4
# Alternatives per placeholder: a fixed value (1 or 0) or the generic
# constant C, with -C also allowed for the additive terms.
sub_dict = {
    C0:[1,C],
    C1:[0,C,-C],
    C2:[1,C],
    C3:[0,C,-C],
    C4:[0,C,-C]
}

# One string per combination of alternatives (2*3*2*3*3 = 108 variants).
expr_list = generate_expr_list(f,sub_dict)

# Recompute is_correct against the accepted forms; verbose=True prints every
# row whose flag changes.
slm_var_df = evaluate_correct(slm_var_df,expr_list,verbose=True)
slm_mse_df = evaluate_correct(slm_mse_df,expr_list,verbose=True)

Changed to true: -X2*exp(C*X1)
Changed to true: -X2*exp(C*X1)
Changed to true: -X2*exp(C*X1)
Changed to true: -X2*exp(C*X1)
Changed to true: -X2*exp(C*X1)
Changed to true: -X2*exp(C*X1)
Changed to true: -X2*exp(C*X1)
Changed to true: -X2*exp(C*X1)
Changed to true: -X2*exp(C*X1)
Changed to true: -X2*exp(C*X1)
Changed to true: -X2*exp(C*X1)


In [6]:
# Average RMSE between the recovered operator weights and the target weights,
# per noise ratio, with the bootstrap standard error of that average.
num_operators = 5
noise_ratios = [0.001,0.01]
conf = CONF
means = {}
stds_res = {}

# Columns holding the recovered weights and the weights they are compared to.
operator_cols = [f'operator_{i}' for i in range(num_operators)]
target_cols = [f'target_weights_{i}' for i in range(num_operators)]
# Labels used to attach the bootstrap results to the right noise ratio.
ratio_index = pd.Index(noise_ratios, name='noise_ratio')

for label, df in (('var', slm_var_df), ('mse', slm_mse_df)):

    # The target weights are negated for rows whose first recovered weight is
    # negative (presumably because the operator is only determined up to an
    # overall sign -- confirm against the paper).
    signs = np.where(df['operator_0'].to_numpy(float) < 0, -1.0, 1.0)
    recovered = df[operator_cols].to_numpy(float)
    target = df[target_cols].to_numpy(float)
    # Root-mean-square difference over the num_operators weights; stored on
    # the frame itself as the 'error' column.
    df['error'] = np.sqrt(((recovered - signs[:, None] * target) ** 2).sum(axis=1) / num_operators)

    # One bootstrap per noise ratio; the confidence interval and the standard
    # error are read from the same resampling run.
    boot = [
        bootstrap(
            df.loc[df['noise_ratio'] == noise_ratio, 'error'].to_numpy(float).reshape(1,-1),
            np.mean,
            vectorized=True,
            confidence_level=conf,
        )
        for noise_ratio in noise_ratios
    ]
    ints = [res.confidence_interval for res in boot]
    stds = [res.standard_error for res in boot]

    # Interval bounds are not shown in the table; kept for inspection.
    lows = [i.low for i in ints]
    highs = [i.high for i in ints]

    means[label] = df.groupby('noise_ratio')['error'].mean()
    stds_res[label] = stds

# Build the table by noise_ratio label so the standard errors (ordered like
# noise_ratios) cannot be paired with the wrong group mean.
result_df = pd.DataFrame({
    'D-CIPHER': means['var'].reindex(ratio_index),
    'D-CIPHER std': pd.Series(stds_res['var'], index=ratio_index),
    'Ablated': means['mse'].reindex(ratio_index),
    'Ablated std': pd.Series(stds_res['mse'], index=ratio_index),
})

print("Average RMSE")
print(result_df)

Sucess Probability
             D-CIPHER  D-CIPHER std   Ablated  Ablated std
noise_ratio                                               
0.001        0.007047      0.000757  0.016721     0.000922
0.010        0.007595      0.001180  0.016523     0.000787


In [7]:
# Fraction of runs flagged is_correct per noise ratio, with the
# normal-approximation half-width z*sqrt(p*(1-p)/n) reported in the 'std'
# columns.
z = norm.ppf(1 - (1-CONF)/2)
var = slm_var_df.groupby('noise_ratio')['is_correct']
mse = slm_mse_df.groupby('noise_ratio')['is_correct']
var_rate = var.mean()
mse_rate = mse.mean()
var_half_width = z*np.sqrt((var_rate * (1-var_rate))/var.count())
mse_half_width = z*np.sqrt((mse_rate * (1-mse_rate))/mse.count())
var_int = list(var_half_width)
mse_int = list(mse_half_width)

# Align all four columns on the noise_ratio label (inner join, like the
# original merge) instead of attaching the half-widths by position.
result_df = pd.concat(
    [var_rate, var_half_width, mse_rate, mse_half_width],
    axis=1,
    join='inner',
    keys=['D-CIPHER','D-CIPHER std','Ablated','Ablated std'],
)

print("Success Probability")
print(result_df)


Average RMSE
             D-CIPHER  D-CIPHER std  Ablated  Ablated std
noise_ratio                                              
0.001             0.6      0.154919      0.2     0.126491
0.010             0.5      0.158114      0.2     0.126491
