In [2]:
from gplearn_sgpa._program import _Program
import numpy as np
from gplearn_sgpa.sgpa import *
from gplearn_sgpa.genetic import _convert_to_sympy, _check_if_finite
from gplearn_sgpa.functions import _function_map
import sympy as sp
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import pearsonr, spearmanr

In [3]:
# Number of random programs to generate.
n_equations = 10000

# Names of the primitives to use, resolved to function objects via _function_map.
FUNCTION_SET = ('add', 'sub', 'mul', 'div', 'sqrt', 'log', 'sin', 'exp')
function_set = [_function_map[name] for name in FUNCTION_SET]

# Group the primitives by their arity: {arity: [functions with that arity]}.
arities = {}
for function in function_set:
    arities.setdefault(function.arity, []).append(function)

# Settings forwarded positionally to every _Program constructor call below.
init_depth = (2, 6)
init_method = 'half and half'
const_range = (-1, 1)
metric = 'mean absolute error'
p_point_replace = 0.05
parsimony_coefficient = 0.001

# A single seeded generator drives everything so the run is reproducible.
gen = np.random.default_rng(seed=0)

# Draw order matters for reproducibility: per-program seeds first, then the
# per-program feature counts (upper bound is exclusive, so values are 1..9).
seeds = gen.integers(0, 1e9, size=n_equations)
n_features_list = gen.integers(1, 10, size=n_equations)
states = [np.random.RandomState(seed) for seed in seeds]

# Build one _Program per (random state, feature count) pair.
programs = [
    _Program(
        function_set,
        arities,
        init_depth,
        init_method,
        n_features,
        const_range,
        metric,
        p_point_replace,
        parsimony_coefficient,
        state,
    )
    for state, n_features in zip(states, n_features_list)
]

In [4]:
# Parallel lists: entry i of each list refers to the same retained program.
c1_list = []
new_programs = []
sympy_exp_list = []
for program in programs:
    sympy_exp = _convert_to_sympy(program.program)

    # Drop every program whose expression is rejected by _check_if_finite.
    if not _check_if_finite(sympy_exp):
        continue

    new_programs.append(program)
    c1_list.append(complexity_1(sympy_exp, empirical=True))
    sympy_exp_list.append(sympy_exp)

  return -sqrt(exp(1.94924425764298*exp(X1)/(log(abs(X0))*log(abs(X1))))/(sqrt(abs(X1))*sqrt(abs(log(abs(X1))))))*exp((1/2)*X0)*abs(X1 + sin(X0) - 0.788703945680358)**(1/4) + sqrt(exp(1.94924425764298*exp(X1 + l)/(log(abs(X0))*log(abs(X1 + l))))/(sqrt(abs(X1 + l))*sqrt(abs(log(abs(X1 + l))))))*exp((1/2)*X0)*abs(X1 + l + sin(X0) - 0.788703945680358)**(1/4)
  return -sqrt(exp(1.94924425764298*exp(X1)/(log(abs(X0))*log(abs(X1))))/(sqrt(abs(X1))*sqrt(abs(log(abs(X1))))))*exp((1/2)*X0)*abs(X1 + sin(X0) - 0.788703945680358)**(1/4) + sqrt(exp(1.94924425764298*exp(X1)/(log(abs(X1))*log(abs(X0 + l))))/(sqrt(abs(X1))*sqrt(abs(log(abs(X1))))))*exp((1/2)*X0 + (1/2)*l)*abs(X1 + sin(X0 + l) - 0.788703945680358)**(1/4)
  return -exp((log(abs(X0)) + 0.903266733865905)*exp(-X0) - 0.493431189718617/abs(X0)) + exp((log(abs(X0 + l)) + 0.903266733865905)*exp(-X0 - l) - 0.493431189718617/abs(X0 + l))
  return -log(exp(sqrt(abs((X5 + 0.300106711724445)*sin(X5)))/log(abs((X0 - X3)*(X1 - X2))))) + log(exp(sqrt

In [5]:
# Each c1 is a mapping; its values are aggregated per program.
# An empty mapping contributes 0 to every aggregate.
c1_max_list = [max(c1.values()) if len(c1) != 0 else 0 for c1 in c1_list]

c1_mean_list = [np.mean(list(c1.values())) if len(c1) != 0 else 0 for c1 in c1_list]

# Length of the `program` sequence of every retained program.
length_list = [len(program.program) for program in new_programs]

c1_sum_list = [np.sum(list(c1.values())) if len(c1) != 0 else 0 for c1 in c1_list]



In [6]:
from gplearn_sgpa.functions import _Function


def count_ops(expr):
    """Count operator/function tokens in the string form of ``expr``.

    ``str(expr)`` is taken, every ``**`` is rewritten to ``^``, and the
    non-overlapping occurrences of each of the following substrings are
    summed: ``+ - * / ^ sin cos exp log sqrt``.

    This is a plain substring count, so e.g. the minus sign of a negative
    constant is counted as a ``-`` as well.
    """
    tokens = ('+', '-', '*', '/', '^', 'sin', 'cos', 'exp', 'log', 'sqrt')
    text = str(expr).replace('**', '^')
    return sum(text.count(token) for token in tokens)

def count_na_ops(expr):
    """Count non-arithmetic tokens in the string form of ``expr``.

    ``str(expr)`` is taken, every ``**`` is rewritten to ``^``, and the
    non-overlapping occurrences of each of the following substrings are
    summed: ``^ sin cos exp log sqrt``.
    """
    tokens = ('^', 'sin', 'cos', 'exp', 'log', 'sqrt')
    text = str(expr).replace('**', '^')
    return sum(text.count(token) for token in tokens)

# Token-based operation counts for every retained expression.
n_o = [count_ops(expr) for expr in sympy_exp_list]

# Same, restricted to the non-arithmetic tokens.
n_nao = [count_na_ops(expr) for expr in sympy_exp_list]

def count_nodes(expr, n_nodes=None):
    """Count the nodes of the sympy tree representation of the formula.

    Parameters
    ----------
    expr : expression object exposing ``args``, ``is_Mul`` and ``is_Pow``.
    n_nodes : running total used by the recursion; leave as ``None`` on the
        top-level call, in which case counting starts at 1 (the root).

    Returns
    -------
    The running total after visiting ``expr`` and all of its descendants.

    The raw tree size is corrected so that some internal rewrites are not
    counted as extra nodes:

    * a node whose type prints as ``Abs`` does not add a node for itself;
    * ``a/x`` is stored as ``a*(1/x)``: every factor of a product that is a
      power with exponent -1 removes two nodes;
    * ``-x`` is stored as ``-1*x``: a product whose first factor is -1
      removes one node.
    """
    total = 1 if n_nodes is None else n_nodes
    children = expr.args
    if len(children) == 0:
        return total

    total += len(children)

    if str(type(expr)) == 'Abs':
        total -= 1

    if expr.is_Mul:
        # a/x is read as a*(1/x)
        reciprocal_factors = sum(
            1 for child in children if child.is_Pow and child.args[1] == -1
        )
        total -= 2 * reciprocal_factors
        # -x is read as -1*x
        if children[0] == -1:
            total -= 1

    for child in children:
        total = count_nodes(child, total)
    return total

# Corrected tree-node count for every retained expression.
n_l = [count_nodes(expr) for expr in sympy_exp_list]


def count_na_comp(expr, count=None):
    """Return the longest chain of nested non-arithmetic operations.

    For example, for ``log(sin(exp(x)))`` the function returns 3.

    Parameters
    ----------
    expr : expression object exposing ``args`` and ``is_Pow``.
    count : chain length accumulated on the path from the root down to
        ``expr``; leave as ``None`` on the top-level call (starts at 0).

    A node extends the chain by one when it is a power whose exponent is not
    -1, or when its type prints as ``sin``, ``log`` or ``exp``. A node whose
    type prints as ``Abs`` is traversed without extending the chain. Every
    other node just forwards the current chain length to its children, and
    the maximum over all children is returned.
    """
    depth = 0 if count is None else count
    kind = str(type(expr))
    non_arithmetic = (expr.is_Pow and expr.args[1] != -1) or kind in ('sin', 'log', 'exp', 'Abs')

    if non_arithmetic and kind != 'Abs':
        depth += 1

    # A plain node without children ends the path: report what was accumulated.
    if not (non_arithmetic or expr.args):
        return depth

    return max(count_na_comp(child, depth) for child in expr.args)

# Longest non-arithmetic chain for every retained expression.
n_naoc = [count_na_comp(expr) for expr in sympy_exp_list]

In [7]:
# One row per retained expression. Note that 'length' is filled from n_l
# (the count_nodes result), not from length_list.
df = pd.DataFrame({
    'expr': list(map(str, sympy_exp_list)),
    'length': n_l,
    'c1_max': c1_max_list,
    'c1_mean': c1_mean_list,
    'c1_sum': c1_sum_list,
    'n_o': n_o,
    'n_nao': n_nao,
    'n_naoc': n_naoc,
})

In [8]:
# Preview the first rows of the per-expression metrics table.
df.head()

Unnamed: 0,expr,length,c1_max,c1_mean,c1_sum,n_o,n_nao,n_naoc
0,sqrt(exp(1.94924425764298*exp(X1)/(log(Abs(X0)...,31,2,2.0,4,22,11,3
1,exp((log(Abs(X0)) + 0.903266733865905)*exp(-X0...,13,1,1.0,1,8,3,2
2,0.965234885451848,1,0,0.0,0,0,0,0
3,0.518070542841815*X0*X1**2*(X0 + 0.11613157053...,9,1,0.5,1,5,1,1
4,exp(Abs(X1*sin(X0))**(1/4)),7,2,2.0,4,5,3,3


In [9]:
# Linear proxy score (shown as M^phi in the correlation figure):
# an intercept minus a weighted sum of the four structural counts.
proxy_intercept = 79.1
proxy_weights = {'length': 0.2, 'n_o': 0.5, 'n_nao': 3.4, 'n_naoc': 4.5}

proxy = proxy_intercept
for column, weight in proxy_weights.items():
    # Subtract term by term, left to right, in the order listed above.
    proxy = proxy - weight * df[column]
df['proxy'] = proxy

In [11]:
import os

import matplotlib as mpl

# Reset matplotlib to its default settings and keep external LaTeX rendering
# switched off; the $...$ fragments in the labels are rendered by
# matplotlib's built-in math text support.
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams['text.usetex'] = False

proxy_label = r'$\mathcal{M}^{\phi}$'
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# One panel per c1 aggregate: (DataFrame column, name used in the labels).
panels = [('c1_max', 'max'), ('c1_mean', 'mean'), ('c1_sum', 'sum')]

for ax, (column, stat) in zip(axes, panels):
    stat_label = f'{stat}($c_1$)'

    ax.scatter(df[column], df['proxy'], alpha=0.5)
    ax.set_title(f'Correlation between {stat_label} and {proxy_label}')
    ax.set_xlabel(stat_label)
    ax.set_ylabel(proxy_label)

    # Annotate the panel with both correlation coefficients (p-values unused).
    pearson_corr, _ = pearsonr(df[column], df['proxy'])
    spearman_corr, _ = spearmanr(df[column], df['proxy'])
    ax.text(0.6, 0.95, f'Pearson: {pearson_corr:.2f}\nSpearman: {spearman_corr:.2f}',
            transform=ax.transAxes, fontsize=12, verticalalignment='top')

fig.tight_layout()

# Create the output folder if needed; exist_ok avoids the check-then-create race.
os.makedirs('results', exist_ok=True)

# Save the figure
fig.savefig('results/correlation.png')