Skip to content

Commit

Permalink
update to operon rev. 382b68d8
Browse files (browse the repository at this point in the history)
  • Loading branch information
foolnotion committed Aug 11, 2023
1 parent 8b7595d commit 4ee56f9
Show file tree
Hide file tree
Showing 13 changed files with 284 additions and 221 deletions.
7 changes: 4 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ find_package(Ceres CONFIG)
find_package(Eigen3 REQUIRED)
find_package(FastFloat REQUIRED)
find_package(Threads REQUIRED)
find_package(lbfgs REQUIRED)
find_package(operon REQUIRED)
find_package(outcome REQUIRED)
find_package(pratt-parser REQUIRED)
find_package(pybind11 REQUIRED)
find_package(unordered_dense REQUIRED)
Expand All @@ -43,7 +45,6 @@ pybind11_add_module(
pyoperon_pyoperon
MODULE
source/algorithm.cpp
source/autodiff.cpp
source/benchmark.cpp
source/creator.cpp
source/crossover.cpp
Expand Down Expand Up @@ -129,8 +130,8 @@ target_link_libraries(pyoperon_pyoperon PRIVATE
if (MSVC)
target_compile_options(pyoperon_pyoperon PRIVATE "/std:c++latest")
else ()
if (UNIX AND NOT MAC)
target_link_options(pyoperon_pyoperon PRIVATE "-Wl,--no-undefined")
if (UNIX AND NOT APPLE)
target_link_options(pyoperon_pyoperon PUBLIC "-Wl,--no-undefined")
endif()
endif()

Expand Down
7 changes: 4 additions & 3 deletions example/operon-bindings.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,11 +77,12 @@
crossover = Operon.SubtreeCrossover(crossover_internal_probability, maxD, maxL)

# define fitness evaluation
interpreter = Operon.Interpreter() # tree interpreter
dtable = Operon.DispatchTable()
error_metric = Operon.R2() # use the coefficient of determination as fitness
evaluator = Operon.Evaluator(problem, interpreter, error_metric, True) # initialize evaluator, use linear scaling = True
evaluator = Operon.Evaluator(problem, dtable, error_metric, True) # initialize evaluator, use linear scaling = True
evaluator.Budget = 1000 * 1000 # computational budget
evaluator.LocalOptimizationIterations = 0 # number of local optimization iterations (coefficient tuning using gradient descent)

optimizer = Operon.Optimizer(dtable, problem, optimizer="lbfgs", likelihood="gaussian", iterations=10, batchsize=50)

# define how new offspring are created
generator = Operon.BasicOffspringGenerator(evaluator, crossover, mutation, selector, selector)
Expand Down
46 changes: 21 additions & 25 deletions example/operon-sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,30 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, make_scorer
from sklearn.metrics import r2_score, make_scorer, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

from pyoperon.sklearn import SymbolicRegressor
from pyoperon import R2, MSE, InfixFormatter, FitLeastSquares, Interpreter

df_train = pd.read_csv('/home/bogdb/src/poetryenv/notebooks/sr-workshop/postprocessing/data/stage1/data/3946_extrapolation_easy_data_train.csv')
df_test = pd.read_csv('/home/bogdb/src/poetryenv/notebooks/sr-workshop/postprocessing/data/stage1/data/3946_extrapolation_easy_data_train.csv')
df = pd.read_csv('/home/bogdb/src/operon/data/Poly-10.csv')
X = df.iloc[:,:-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)

print(df_train.columns)
y_pred = RandomForestRegressor(n_estimators=100).fit(X_train, y_train).predict(X_train)
sErr = np.sqrt(mean_squared_error(y_train, y_pred))

D_train = np.asarray(df_train)
D_test = np.asarray(df_test)
# print(df_train.columns)

X_train, y_train = D_train[:,:-1], D_train[:,-1]
X_test, y_test = D_train[:,:-1], D_train[:,-1]
# D_train = np.asarray(df_train)
# D_test = np.asarray(df_test)

# X_train, y_train = D_train[:,:-1], D_train[:,-1]
# X_test, y_test = D_train[:,:-1], D_train[:,-1]

from sympy import parse_expr
import matplotlib.pyplot as plt
import seaborn as sns

reg = SymbolicRegressor(
allowed_symbols= "add,sub,mul,div,constant,variable",
Expand All @@ -37,7 +42,8 @@
initialization_max_length= 10,
initialization_method= "btc",
irregularity_bias= 0.0,
local_iterations= 0,
local_iterations= 5,
optimizer='lm',
male_selector= "tournament",
max_depth= 10,
max_evaluations= 1000000,
Expand All @@ -54,26 +60,16 @@
reinserter= "keep-best",
time_limit= 900,
tournament_size= 3,
uncertainty= [sErr]
)

print(X_train.shape, y_train.shape)

reg.fit(X_train, y_train)
values = [s['objective_values'] for s in reg.pareto_front_]
for v in values:
print(v)
res = [(s['objective_values'], s['tree'], s['minimum_description_length']) for s in reg.pareto_front_]
for obj, expr, mdl in res:
print(obj, mdl, reg.get_model_string(expr, 16))

m = reg.model_
s = reg.get_model_string(m, 3, ['a'])
s = reg.get_model_string(m, 3)
print(s)


fig, ax = plt.subplots(figsize=(18,8))
ax.grid(True, linestyle='dotted')
ax.set(xlabel='Obj 1', ylabel='Obj 2')
sns.scatterplot(ax=ax, x=[x[1] for x in values], y=[x[0] for x in values])

from pyoperon import RankSorter
rs = RankSorter()
fronts = rs.Sort(values)
print(fronts)
20 changes: 10 additions & 10 deletions flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
stdenv_ = pkgs.overrideCC pkgs.llvmPackages_16.stdenv (
pkgs.clang_16.override { gccForLibs = pkgs.gcc13.cc; }
);
python = pkgs.python310;
python_ = pkgs.python310;

operon = pkgs.callPackage ./nix/operon {
enableShared = enableShared;
Expand All @@ -49,14 +49,14 @@
cmake
ninja
pkg-config
python
python.pkgs.pybind11
python_
python_.pkgs.pybind11
];

buildInputs = with pkgs; [
python.pkgs.setuptools
python.pkgs.wheel
python.pkgs.requests
python_.pkgs.setuptools
python_.pkgs.wheel
python_.pkgs.requests
operon
] ++ operon.buildInputs;
};
Expand All @@ -78,10 +78,10 @@
devShells.default = stdenv_.mkDerivation {
name = "pyoperon-dev";
nativeBuildInputs = pyoperon.nativeBuildInputs;
buildInputs = pyoperon.buildInputs ++ (with pkgs; [ gdb valgrind ])
++ (with python.pkgs; [ scikit-build ] ) # cmake integration and release preparation
++ (with python.pkgs; [ numpy scikit-learn pandas ipdb sympy requests ])
++ (with pkgs; [ (pmlb.override { pythonPackages = python.pkgs; }) ]);
buildInputs = pyoperon.buildInputs ++ (with pkgs; [ gdb valgrind gcc13 ])
++ (with python_.pkgs; [ scikit-build ] ) # cmake integration and release preparation
++ (with python_.pkgs; [ numpy scikit-learn pandas ipdb sympy requests matplotlib ])
++ (with pkgs; [ (pmlb.override { pythonPackages = python_.pkgs; }) ]);
};

# backwards compatibility
Expand Down
12 changes: 9 additions & 3 deletions nix/operon/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,13 @@ stdenv.mkDerivation rec {
pname = "operon";
version = "0.3.1";

#src = /home/bogdb/src/operon-mdl-fix;

src = fetchFromGitHub {
owner = "heal-research";
repo = "operon";
rev = "0e359494b0239f4427d9518097bb304641ea7990";
sha256 = "sha256-WXV295CF1fqG7KrOS+uSi7S7+yLAQ6r44rU+RTGivxA=";
rev = "382b68d83a6c693dca5852d251e7991250f09b3f";
hash = "sha256-zUlQ5bPjHWim7XA3JXYERiwM1YsA97dSPbRBBy2XD5Y=";
};

nativeBuildInputs = [ cmake git ];
Expand All @@ -52,10 +54,14 @@ stdenv.mkDerivation rec {
eve
fast_float
git
lbfgs
ned14-outcome
ned14-quickcpplib
ned14-status-code
pkg-config
pratt-parser
unordered_dense
taskflow
unordered_dense
vstat
xxHash
(scnlib.override { enableShared = enableShared; })
Expand Down
62 changes: 44 additions & 18 deletions pyoperon/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ def __init__(self,
offspring_generator = 'basic',
reinserter = 'replace-worst',
objectives = ['r2'],
optimizer = 'lbfgs',
optimizer_likelihood = 'gaussian',
optimizer_likelihood_loginput = False,
optimizer_batch_size = 0,
max_length = 50,
max_depth = 10,
initialization_method = 'btc',
Expand All @@ -57,6 +61,7 @@ def __init__(self,
irregularity_bias = 0.0,
epsilon = 1e-5,
model_selection_criterion = 'minimum_description_length',
uncertainty = [1],
n_threads = 1,
time_limit = None,
random_state = None
Expand All @@ -72,6 +77,10 @@ def __init__(self,
self.offspring_generator = offspring_generator
self.reinserter = reinserter
self.objectives = objectives
self.optimizer = optimizer
self.optimizer_likelihood = optimizer_likelihood
self.optimizer_likelihood_loginput = optimizer_likelihood_loginput
self.optimizer_batch_size = optimizer_batch_size
self.max_length = max_length
self.max_depth = max_depth
self.initialization_method = initialization_method
Expand All @@ -92,6 +101,7 @@ def __init__(self,
self.epsilon = epsilon
self.n_threads = n_threads
self.model_selection_criterion = model_selection_criterion
self.uncertainty = uncertainty
self.time_limit = time_limit
self.random_state = random_state

Expand All @@ -107,6 +117,10 @@ def __check_parameters(self):
self.offspring_generator = check(self.offspring_generator, 'basic')
self.reinserter = check(self.reinserter, 'replace-worst')
self.objectives = check(self.objectives, [ 'r2' ])
self.optimizer = check(self.optimizer, 'lbfgs')
self.optimizer_likelihood = check(self.optimizer_likelihood, 'gaussian')
self.optimizer_likelihood_loginput = check(self.optimizer_likelihood_loginput, False)
self.optimizer_batch_size = check(self.optimizer_batch_size, 0)
self.max_length = check(self.max_length, 50)
self.max_depth = check(self.max_depth, 10)
self.initialization_method = check(self.initialization_method, 'btc')
Expand All @@ -126,6 +140,7 @@ def __check_parameters(self):
self.irregularity_bias = check(self.irregularity_bias, 0.0)
self.epsilon = check(self.epsilon, 1e-5)
self.model_selection_criterion = check(self.model_selection_criterion, 'minimum_description_length')
self.uncertainty = check(self.uncertainty, [1])
self.n_threads = check(self.n_threads, 1)
self.time_limit = check(self.time_limit, sys.maxsize)
self.random_state = check(self.random_state, random.getrandbits(64))
Expand Down Expand Up @@ -207,30 +222,30 @@ def __init_selector(self, selection_method, comp):
raise ValueError('Unknown selection method {}'.format(selection_method))


def __init_evaluator(self, objective, problem, interpreter):
def __init_evaluator(self, objective, problem, dtable):
if objective == 'r2':
err = op.R2()
return op.Evaluator(problem, interpreter, err, True), err
return op.Evaluator(problem, dtable, err, True), err

elif objective == 'c2':
err = op.C2()
return op.Evaluator(problem, interpreter, err, False), err
return op.Evaluator(problem, dtable, err, False), err

elif objective == 'nmse':
err = op.NMSE()
return op.Evaluator(problem, interpreter, err, True), err
return op.Evaluator(problem, dtable, err, True), err

elif objective == 'rmse':
err = op.RMSE()
return op.Evaluator(problem, interpreter, err, True), err
return op.Evaluator(problem, dtable, err, True), err

elif objective == 'mse':
err = op.MSE()
return op.Evaluator(problem, interpreter, err, True), err
return op.Evaluator(problem, dtable, err, True), err

elif objective == 'mae':
err = op.MAE()
return op.Evaluator(problem, interpreter, err, True), err
return op.Evaluator(problem, dtable, err, True), err

elif objective == 'length':
return op.LengthEvaluator(problem), None
Expand All @@ -241,7 +256,7 @@ def __init_evaluator(self, objective, problem, interpreter):
elif objective == 'diversity':
return op.DiversityEvaluator(problem), None

raise ValueError('Unknown objective {}'.format(objectives))
raise ValueError('Unknown objective {}'.format(objective))


def __init_generator(self, generator_name, evaluator, crossover, mutator, female_selector, male_selector):
Expand Down Expand Up @@ -375,31 +390,38 @@ def fit(self, X, y):

single_objective = True if len(self.objectives) == 1 else False

interpreter = op.Interpreter()
dtable = op.DispatchTable()

# these lists are used as placeholders in order to extend the lifetimes of the objects
error_metrics = [] # placeholder for the error metric
evaluators = [] # placeholder for the evaluator(s)

optimizer = op.Optimizer(dtable=dtable, problem=problem, optimizer=self.optimizer, likelihood=self.optimizer_likelihood, iterations=self.local_iterations, batchsize=self.optimizer_batch_size, loginput=self.optimizer_likelihood_loginput)

# evaluators for minimum description length and information criteria
mld_eval = op.MinimumDescriptionLengthEvaluator(problem, interpreter)
bic_eval = op.BayesianInformationCriterionEvaluator(problem, interpreter)
aik_eval = op.AkaikeInformationCriterionEvaluator(problem, interpreter)
mdl_eval = op.MinimumDescriptionLengthEvaluator(problem, dtable)
mdl_eval.Sigma = self.uncertainty

bic_eval = op.BayesianInformationCriterionEvaluator(problem, dtable)
aik_eval = op.AkaikeInformationCriterionEvaluator(problem, dtable)

for eval in [mdl_eval, bic_eval, aik_eval]:
eval.Optimizer = optimizer

for obj in self.objectives:
eval_, err_ = self.__init_evaluator(obj, problem, interpreter)
eval_, err_ = self.__init_evaluator(obj, problem, dtable)
eval_.Budget = self.max_evaluations
eval_.LocalOptimizationIterations = self.local_iterations
evaluators.append(eval_)
error_metrics.append(err_)

evaluators[0].Optimizer = optimizer

if single_objective:
evaluator = evaluators[0]
else:
evaluator = op.MultiEvaluator(problem)
for eval_ in evaluators:
evaluator.Add(eval_)
evaluator.LocalOptimizationIterations = self.local_iterations
evaluator.Budget = self.max_evaluations

comparison = op.SingleObjectiveComparison(0) if single_objective else op.CrowdedComparison()
Expand Down Expand Up @@ -453,9 +475,13 @@ def fit(self, X, y):
def get_solution_stats(solution):
"""Takes a solution (operon individual) and computes a set of stats"""
# perform linear scaling
y_pred = op.Evaluate(interpreter, solution.Genotype, ds, training_range)
y_pred = op.Evaluate(dtable, solution.Genotype, ds, training_range)
scale, offset = op.FitLeastSquares(y_pred, y)
nodes = solution.Genotype.Nodes + [ op.Node.Constant(scale), op.Node.Mul(), op.Node.Constant(offset), op.Node.Add() ]
nodes = solution.Genotype.Nodes
if scale != 1:
nodes += [ op.Node.Constant(scale), op.Node.Mul() ]
if offset != 0:
nodes += [ op.Node.Constant(offset), op.Node.Add() ]
solution.Genotype = op.Tree(nodes).UpdateNodes()

# get solution variables
Expand All @@ -467,7 +493,7 @@ def get_solution_stats(solution):
'tree' : solution.Genotype,
'objective_values' : evaluator(rng, solution),
'mean_squared_error' : mean_squared_error(y, scale * y_pred + offset),
'minimum_description_length' : mld_eval(rng, solution)[0],
'minimum_description_length' : mdl_eval(rng, solution)[0],
'bayesian_information_criterion' : bic_eval(rng, solution)[0],
'akaike_information_criterion' : aik_eval(rng, solution)[0],
}
Expand Down
Loading

0 comments on commit 4ee56f9

Please sign in to comment.