# Sample Dataset

In [None]:
from sklearn import metrics
from src.utils import find_k2, find_k4, PRIMITIVES, plot_pair_histograms, sample_structure, replace_xs, replace_functions, str_to_sympy, sympy_to_dataset, SamplingError, wrap_expression, sample_and_replace
import numpy as np
from pathlib import Path
from IPython.display import display
import pandas as pd
import pickle

# Number of primitive functions == number of rows in every combination matrix.
n_functions = len(PRIMITIVES)

# primitives combinations
## k=1: one column holding the primitive indices 0..5
matrix1 = np.arange(6)[:, np.newaxis]

## k=3: row i is (i, i+1, i-1) modulo 6, i.e.
##   [0, 1, 5], [1, 2, 0], [2, 3, 1], [3, 4, 2], [4, 5, 3], [5, 0, 4]
matrix3 = np.array([[i, (i + 1) % 6, (i - 1) % 6] for i in range(6)])

## k=2: derived from the k=3 matrix
matrix2 = np.array(find_k2(matrix3))

## remaining unordered pairs
pairs = [
    (0, 3),
    (0, 5),
    (1, 0),
    (1, 4),
    (2, 1),
    (2, 4),
    (3, 2),
    (3, 5),
    (4, 0),
    (4, 3),
    (5, 1),
    (5, 2),
]

## k=4: the search takes a bit, so the matrix is written out instead of calling
##   matrix4 = find_k4(pairs)
## (presumably the first four rows are its stored output -- TODO confirm);
## the last two rows were found by hand.
matrix4 = np.array([
    [0, 3, 2, 1],
    [1, 0, 5, 2],
    [2, 4, 3, 5],
    [5, 1, 4, 0],
    [3, 2, 0, 4],
    [4, 5, 1, 3],
])

# combinations[k-1] is the index matrix used for equations with k functions
combinations = [matrix1, matrix2, matrix3, matrix4]

## plot distribution of pairs and ordered pairs
plot_pair_histograms([matrix2, matrix3, matrix4], n_functions)

# random mapping index -> primitive (fixed seed keeps the mapping reproducible)
primitives = np.array(PRIMITIVES)
np.random.seed(0)
np.random.shuffle(primitives)

# for now, there is no k>4 implementation
# Generate, for every (k, n_v, mult, lin) condition, one equations text file and
# one pickled dataset per equation under datasets/mult_{mult}-lin_{lin}/.

# Sample sizes: training points per equation, and extrapolation points
# (a fifth of the training size, split evenly between both sides when possible).
n_train = 200
n_extrapolation = n_train // 5

# for now, there is no k>4 implementation
for k in range(5):
    ## up to three variables
    for n_v in range(1, 4):
        # equation structure
        if k == 0:
            # no functions at all: the plain sum of the variables, repeated per row
            equations = [" + ".join([f"x_{i+1}" for i in range(n_v)]) for _ in range(n_functions)]
        else:
            equations = [sample_structure(k, n_v, seed=i) for i in range(n_functions)]

            # fill in actual variables
            equations = [replace_xs(equations[i], n_v, seed=i+1) for i in range(n_functions)]

        # needs arbitrary but distinct and replicable seeds
        seed_0 = k*100 + n_v*10
        seeds = np.arange(seed_0, seed_0 + len(equations))
        linearized_equations = [wrap_expression(equation, seed) for equation, seed in zip(equations, seeds)]
        mult_equations = [sample_and_replace(equation, seed) for equation, seed in zip(equations, seeds)]
        linearized_mult_equations = [wrap_expression(sample_and_replace(equation, seed), seed) for equation, seed in zip(equations, seeds)]

        # equation variant for every (mult, lin) condition
        variants = {
            (False, False): equations,
            (False, True): linearized_equations,
            (True, False): mult_equations,
            (True, True): linearized_mult_equations,
        }

        for mult in (False, True):

            # no mult of the same var
            if mult and (n_v == 1):
                continue

            for lin in (False, True):
                equations_ = variants[(mult, lin)]

                # fill in actual functions
                # NOTE(review): for k == 0 the index k-1 wraps around to the last
                # matrix (matrix4); presumably harmless because k == 0 equations
                # contain no function placeholders -- confirm against replace_functions.
                equations_ = [replace_functions(equations_[i], primitives[combinations[k-1][i]]) for i in range(n_functions)]

                # serialize
                file_path = Path(f"datasets/mult_{mult}-lin_{lin}") / f"equations_k{k}_nv{n_v}.txt"

                file_path.parent.mkdir(parents=True, exist_ok=True)
                with open(file_path, "wt") as f:
                    for eq in equations_:
                        # ugly syntax, but apparently intended pythonic usage
                        print(eq, file=f)

                # keep the string form for diagnostics
                equations_debug = equations_
                # sample datasets
                equations_ = [str_to_sympy(eq) for eq in equations_]
                try:
                    datasets = [sympy_to_dataset(equations_[i], n=n_train, domain=(1e-10, 5.0), seed=i+2)[0] for i in range(n_functions)]
                except SamplingError:
                    print(*equations_debug, sep="\n")
                    print("\n")
                    print(*equations_, sep="\n")
                    # Re-raise: continuing would either hit a NameError on
                    # `datasets` or pickle the previous condition's datasets
                    # into this condition's folder.
                    raise

                for i, dataset in enumerate(datasets):
                    try:
                        extrapolation_dataset_neg = sympy_to_dataset(equations_[i], n=n_extrapolation // 2, domain=(-2.5, -1e-10), seed=i+3)[0]
                    except SamplingError:
                        # negative side could not be sampled: take all
                        # extrapolation points from the positive side
                        extrapolation_dataset_pos = sympy_to_dataset(equations_[i], n=n_extrapolation, domain=(5.0, 7.5), seed=i+3)[0]
                        dataset["extrapolation_input"] = extrapolation_dataset_pos["train_input"]
                        dataset["extrapolation_label"] = extrapolation_dataset_pos["train_label"]
                    else:
                        # both sides available: half the points from each
                        extrapolation_dataset_pos = sympy_to_dataset(equations_[i], n=n_extrapolation // 2, domain=(5.0, 7.5), seed=i+3)[0]
                        dataset["extrapolation_input"] = np.concatenate(
                            (extrapolation_dataset_neg["train_input"], extrapolation_dataset_pos["train_input"]))
                        dataset["extrapolation_label"] = np.concatenate(
                            (extrapolation_dataset_neg["train_label"], extrapolation_dataset_pos["train_label"]))

                # datasets to pickle
                ## no csv, because we'll need a dict for kans, anyways
                folder = Path(f"datasets/mult_{mult}-lin_{lin}/datasets_k{k}_nv{n_v}")
                folder.mkdir(parents=True, exist_ok=True)

                for i, dataset in enumerate(datasets):
                    path = folder/f"{i}.pkl"
                    with open(path, 'wb') as f:
                        pickle.dump(dataset, f)


# sanity check

In [None]:
import pickle
from pathlib import Path
import matplotlib.pyplot as plt
# Visual sanity check: for one condition, print an equation and plot the
# training and extrapolation points of the dataset generated from it.

# row of the equations file and index of the pickled dataset to inspect
seed = 0

# condition to inspect
mult = False
lin = True

for nv in (1, 2, 3):
    for k in (1, 2, 3, 4):
        path = Path(f"datasets/mult_{mult}-lin_{lin}/equations_k{k}_nv{nv}.txt")
        with open(path, 'rt') as f:
            equation = f.read().splitlines()[seed]
        print(equation)
        # Load the dataset from the same condition folder as the equation, so
        # the printed equation and the plotted data belong together (the folder
        # was previously hard-coded to mult_False-lin_False).
        path = Path(f"datasets/mult_{mult}-lin_{lin}/datasets_k{k}_nv{nv}/{seed}.pkl")
        with open(path, 'rb') as f:
            # pickle is only safe here because the files are produced locally
            dataset = pickle.load(f)
        print(dataset["extrapolation_input"].shape)
        print(dataset["extrapolation_label"].shape)

        # plot the label against the first input variable only
        plt.scatter([x[0] for x in dataset["train_input"]], dataset["train_label"])
        plt.scatter([x[0] for x in dataset["extrapolation_input"]], dataset["extrapolation_label"])
        plt.show()

# datasets df
Quick and dirty: run the loops of the experiment script (without any training) and collect one row per run in a DataFrame.

In [None]:
import json
import time
import numpy as np
import traceback
import threading
import matplotlib.pyplot as plt
from src.utils import PRIMITIVES, rmse, nrmse, round_and_simplify, sl_rmse, rmspe, r2, make_vectorized, NumericTensorJSONEncoder
from pathlib import Path
import pickle
import sympy as sp
import re

import os
import sys

'''import warnings
warnings.simplefilter("error")'''
#os.environ["PYTHONWARNINGS"] = "error"

# sys.path.append(os.path.normpath(os.getcwd() + "/autora-theorist-darts/src"))


#print(train)
import src.utils

# Evaluation settings for a run.
success_metric = nrmse  # error function imported from src.utils
metric_norm = "std"
scale_steps = 1.0  # /100

# The first value is immediately replaced by the lower threshold below.
success_threshold = 0.2
success_threshold = 0.01

# free-text label for this configuration
comment = "lower th (0.01)"

'''# List the variable names we want to record
keys = ('success_metric', 'metric_norm', 'success_threshold',
        'scale_steps', 'comment')

# Grab their current values in *this* scope.  
# If any name is missing, Python will raise a KeyError/NameError as requested.
vals = {k: locals()[k] for k in keys}'''

import os

#os.environ["WANDB_DEBUG"] = "0"

#from autora.theorist.darts.model_search import Network

'''
???
def create_int_bins(start, end, n_edges):
    bin_edges = np.linspace(start, end, n_edges)
    # Round edges to integers
    bin_edges = np.round(bin_edges).astype(int)
    return bin_edges
'''

def stop_on_enter():
    """Wait for the user to press Enter, then terminate the whole process.

    Blocks on ``input`` until a line is entered, prints a short notice and
    leaves via ``os._exit`` with status 1. Presumably meant to run in a
    background thread next to a sweep -- TODO confirm against the caller.
    """
    prompt = "Press [Enter] at any time to stop the sweep...\n"
    input(prompt)
    print("Exiting on user request.")
    # os._exit(1) exits the whole process, works even in wandb agent
    exit_status = 1
    os._exit(exit_status)

# random mapping index -> primitive
# (same seed and shuffle as in the dataset-generation cell)
primitives=np.array(PRIMITIVES)
np.random.seed(0)
np.random.shuffle(primitives)


# darts hps
# NOTE: this first config2 is replaced by the second config2 definition below.
config2={
  'arch_discretization': 'softmax',
  'arch_learning_rate_max': 0.6473364090755143,
  'arch_momentum': 0.0017282364618274,
  'batch_size': 20,
  'coeff_discretization': 'max',
  'finetune_epochs': 10,
  'param_learning_rate_max': 0.0006232415860704,
  'param_momentum': 1.9581913155e-06,
  'primitives': ["none","power_two","power_three","exp","ln","reciprocal","sin"],
  'ratio_train_val': 1.0,
  'safety': 'ramped',
  'steps': 720,
  'train_output_layer': False,}

# Effective config2; the trailing inline comments hold values that were replaced
# (presumably the original sweep values -- TODO confirm).
config2={'Name': 'fallen-sweep-100',
 'arch_discretization': 'softmax',
 'arch_learning_rate_max': 0.8893474836112625,
 'arch_momentum': 0.0066695070207013,
 'arch_weight_decay': 0.0002014087677275,
 'batch_size': 7,
 'coeff_discretization': 'max',
 'coeff_lr_min_scale': 1.0,#0.0009878495458311,
 'finetune_epochs': 60,#8
 'init_range': 1.0120530236610816,
 'param_learning_rate_max': 5e-4,#3.373269557527869e-09,
 'param_momentum': 1.509044847784784e-09,
 'param_weight_decay': 7.2956488739e-06,
 'pruning': 'none',
 'ratio_train_val': 4.0,#2.0,
 'safety': 'safe',
 'size': 2,
 'train_output_layer': True,
 'primitives': ["none","power_two","power_three","exp","ln","reciprocal","sin","id"],
 'loss_fn':"mse"}

# NOTE: config4 and config6 are rebound to config2 further down, so the two
# literals below are never used as written.
config4={
  'arch_discretization': 'softmax',
  'arch_learning_rate_max': 1.6284305674366029,
  'arch_momentum': 0.0022416242214032,
  'batch_size': 20,
  'coeff_discretization': 'max',
  'finetune_epochs': 10,
  'param_learning_rate_max': 0.0012008188033979,
  'param_momentum': 0.0027466356242729,
  'primitives': ["none","power_two","power_three","exp","ln","reciprocal","sin"],
  'ratio_train_val': 1.0,
  'safety': 'ramped',
  'steps': 720,
  'train_output_layer': False,
}

config6={
  'arch_discretization': 'softmax',
  'arch_learning_rate_max': 9.928562713532394,
  'arch_momentum': 8.784944800579377e-09,
  'batch_size': 20,
  'coeff_discretization': 'max',
  'finetune_epochs': 10,
  'param_learning_rate_max': 0.0654276023875558,
  'param_momentum': 5.318695973457925e-10,
  'primitives': ["none","power_two","power_three","exp","ln","reciprocal","sin"],
  'ratio_train_val': 1.0,
  'safety': 'ramped',
  'steps': 720,
  'train_output_layer': False,}

# All three names now refer to the same dict object (the second config2).
config4=config2
config6=config2

configs=(config2, config4, config6)

sizes=[1,2,3,4,5]

# conditions
c_multiplication = (False, True)
c_prior_knowledge = (True, False)
c_linear_transformations = (False, True)

# for simulating results for debugging
test_rng=np.random.default_rng(42)

def main():
    """Enumerate all experiment runs and collect their settings in a DataFrame.

    Walks the same condition loops as the experiment script (multiplication,
    prior knowledge, linear transformations, k, n_v, seed, combination, size
    factor) without training anything. For every run it reads the matching
    equation from ``datasets/mult_{..}-lin_{..}/equations_k{k}_nv{n_v}.txt``
    and records which primitive tokens occur in it.

    Each equation that is visited is also printed.

    Returns:
        A pandas DataFrame with one row per run and the columns "mult",
        "linear", "prior knowledge", "k", "n_v", "structure", "seed" and
        "size factor", or None if no run was produced.

    Raises:
        OSError: if an expected equations file cannot be opened.
        IndexError: if an equations file has fewer than six lines.
    """
    # tokens that identify the primitives inside an equation string
    substrings = ['ln', 'exp', '1/', "**2", "**3", "sin"]
    # compiled once; the escaped alternation contains no groups, so findall
    # returns the full matches
    pattern = re.compile('|'.join(map(re.escape, substrings)))

    # rows are gathered in a list and turned into a DataFrame once at the end,
    # which avoids growing the frame one row at a time
    rows = []
    for multiplication in c_multiplication:
        for prior_knowledge in c_prior_knowledge:
            for linear_transformations in c_linear_transformations:
                if (not linear_transformations) and (not multiplication):
                    continue
                dataset_folder = Path(f"datasets/mult_{multiplication}-lin_{linear_transformations}")
                # k=0 should be trivial
                for k in range(0, 5):
                    for n_v in range(1, 4):
                        # no mult between at least two independent variables
                        if multiplication and (n_v == 1):
                            continue
                        # the file depends only on (condition, k, n_v), so it is
                        # read once here rather than once per seed
                        with open(dataset_folder/f"equations_k{k}_nv{n_v}.txt", "rt") as f:
                            equations = [line.rstrip() for line in f]
                        for seed in range(3):
                            # for now we consider each seed an individual trial (iid problem!)
                            for combination in range(6):
                                text = equations[combination]
                                actual_prims = pattern.findall(text)

                                print(text)

                                # for k=0, all structures are the same
                                if k == 0 and (combination > 0) and (not linear_transformations):
                                    continue
                                for size in (1, 2):  # range(max(k-1,1),k+3):
                                    rows.append({
                                        "mult": multiplication,
                                        "linear": linear_transformations,
                                        "prior knowledge": prior_knowledge,
                                        "k": k,
                                        "n_v": n_v,
                                        "structure": actual_prims,
                                        "seed": seed,
                                        "size factor": size,
                                    })

    # keep the original contract: None when nothing was collected
    if not rows:
        return None
    return pd.DataFrame(rows)


df=main()

In [None]:
df