In [1]:
import ast
import pandas as pd

from plotly.subplots import make_subplots

from gptree import GPTree

In [5]:
base_path = '/home/ines/Documents/tese/tiny_gp'
dataset = 'LD50'
method = 'Double Tournament (Prob=1)'

MAPPTING_METHODS = {
    'StdGP': 'results_elitism/results',
    'Double Tournament (Prob=1)': 'results_elitism/results_nested',
    'Inverted Double Tournament (Prob=1)': 'results_elitism/results_inverted_tournament',
    'Double Tournament (Prob=0.7)': 'results_elitism/results_nested_prob_0.7',
    'Inverted Double Tournament (Prob=0.7)': 'results_elitism/results_inverted_nested_prob_0.7',
    'Double Tournament (Prob=0.5)': 'results_elitism/results_nested_prob_0.5',
    'Inverted Double Tournament (Prob=0.5)': 'results_elitism/results_inverted_nested_prob_0.5',
    'MMOTS': 'results_elitism/mmots',
    'Subsampled': 'results_elitism/results_nested_subsampled',
    'Oversampled': 'results_elitism/results_nested_oversampled',
    'DT_2_2': 'results_elitism/results_nested_2_2',
    'DT_2_4': 'results_elitism/results_nested_2_4',
    'DT_4_2': 'results_elitism/results_nested',
    'DT_4_4': 'results_elitism/results_nested_4_4',
    'DT_10_2': 'results_elitism/results_nested_10_2',
    'DT_10_4': 'results_elitism/results_nested_10_4',
    'Double Tournament Complexity Limit': 'results_elitism/results_nested_limit',
}

In [7]:
from sympy import symbols, lambdify, div

In [20]:
def custom_division(a, b):
    return div(a, b) if b != 0 else 1

In [17]:
terminals = ['x1', 'x2']

variables = symbols(", ".join(terminals))

variables

(x1, x2)

In [26]:
expr = '( ( x1 + x2 ) + x1 ) / x1'
expr = expr.replace('/', 'custom_division')
fn = lambdify(variables, expr, modules={"custom_division": custom_division})


input = [1, 3]

fn(*input)

SyntaxError: invalid syntax (<lambdifygenerated-19>, line 2)

In [15]:
GENS = [100, 300, 500]

stdgp_path = base_path + f'/{MAPPTING_METHODS["StdGP"]}/{dataset}/'
method_path = base_path + f'/{MAPPTING_METHODS[method]}/{dataset}/'

# Read dataset
df = pd.read_csv(base_path + f'/data/{dataset}/train_1.csv', index_col=0)
X = df.drop('Target', axis=1)
Y = df['Target']

terminals = X.columns
print(terminals)

# Read and plot function
best_of_run = pd.read_csv(stdgp_path + 'best_in_run1.csv', index_col = 0)
best_of_runs = best_of_run.iloc[0, GENS]
print(best_of_runs)

# Define symbols for the lambda functions
variables = symbols(", ".join(terminals))

# Create lambdas using custom division
tree_lambdas = [lambdify(variables, best_of_runs[i], modules={"/": custom_division}) for i in range(len(GENS))]

outputs = [[], [], []]

for idx, tree_lambda in enumerate(tree_lambdas):
    for obs in X.values:
        outputs[idx].append(tree_lambda(*obs))

# Read and plot slopes

fig = make_subplots(rows=1, cols=3,
                    subplot_titles=[f'Generation {gen}' for gen in GENS])

Index(['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
       ...
       'x617', 'x618', 'x619', 'x620', 'x621', 'x622', 'x623', 'x624', 'x625',
       'x626'],
      dtype='object', length=626)
100    ((((((((x221) + (x453)) / ((x321) * (x518))) *...
300    ((((((x558) / (x105)) * ((((x209) * (x249)) / ...
500    ((((((x558) / (x105)) * ((((x453) + (x453)) / ...
Name: 0, dtype: object



Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


invalid value encountered in scalar divide


divide by zero encountered in scalar divide


invalid value encountered in scalar multiply


divide by zero encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar multiply


invalid value encountered in scalar add


divide by zero encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar multiply


invalid value encountered in scalar subtract


invalid value encountered in scalar add



In [103]:
source = "x1+ x2 / ((x1 - x2))"

terminals = ['x1', 'x2']

def tokenize(s : str) -> list[str]:
    tokens = []
    lexeme = ''
    for ch in s:
        if ch == ' ':
            if lexeme:
                tokens.append(lexeme)
                lexeme = ''
            continue
        elif not (ch.isalpha() or ch.isdigit()):
            if lexeme:
                tokens.append(lexeme)
                lexeme = ''
            tokens.append(ch)
        else:
            lexeme += ch
    return tokens                

In [104]:
def parse_operator(op: str) -> int:
    from ops import add, sub, mul, div
    match op:
        case '+':
            return (add, 1)
        case '-':
            return (sub, 1)
        case '*':
            return (mul, 2)
        case '/':
            return (div, 2)
        case _:
            return (None, 0)

def parse(tokens: list[str], precedence= 0) -> GPTree:
    lhs = parse_prefix(tokens)
    while True:
        if not tokens:
            break
        op, prec = parse_operator(tokens[0])
        if op is None or prec < precedence:
            break
        tokens.pop(0)
        lhs = GPTree(op, lhs, parse(tokens, prec), terminals=terminals)
    return lhs

def parse_prefix(tokens: list[str]) -> GPTree:
    match tokens[0]:
        case '(':
            tokens.pop(0)
            tree = parse(tokens, 0)
            assert tokens.pop(0) == ')'
            return tree
        case _:
            assert tokens[0].isidentifier()
            return GPTree(tokens.pop(0), terminals=terminals)


In [105]:
tokens = tokenize(source)

tokens

['x1', '+', 'x2', '/', '(', '(', 'x1', '-', 'x2', ')', ')']

In [106]:
gptree = parse(tokens)

gptree.print_tree()

add
   x1
   div
      x2
      sub
         x1
         x2


In [107]:
gptree.create_expression()

'(x1) + ((x2) / ((x1) - (x2)))'

In [108]:
gptree.create_lambda_function()

In [109]:
gptree.compute_tree([0, 0])

1

In [112]:
GENS = [100, 300, 500]

stdgp_path = base_path + f'/{MAPPTING_METHODS["StdGP"]}/{dataset}/'
method_path = base_path + f'/{MAPPTING_METHODS[method]}/{dataset}/'

# Read dataset
df = pd.read_csv(base_path + f'/data/{dataset}/train_1.csv', index_col=0)
X = df.drop('Target', axis=1)
Y = df['Target']

terminals = X.columns
print(terminals)

# Read and plot function
best_of_run = pd.read_csv(stdgp_path + 'best_in_run1.csv', index_col = 0)
best_of_runs = best_of_run.iloc[0, GENS]
print(best_of_runs)

outputs = []

# Define symbols for the lambda functions
for idx, gen in enumerate(GENS):
    tokens = tokenize(best_of_runs[idx])
    gptree = parse(tokens)
    gptree.create_lambda_function()
    outputs.append([gptree.compute_tree(obs) for obs in X.values])

fig = make_subplots(rows=1, cols=3,
                    subplot_titles=[f'Generation {gen}' for gen in GENS])

Index(['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
       ...
       'x617', 'x618', 'x619', 'x620', 'x621', 'x622', 'x623', 'x624', 'x625',
       'x626'],
      dtype='object', length=626)
100    ((((((((x221) + (x453)) / ((x321) * (x518))) *...
300    ((((((x558) / (x105)) * ((((x209) * (x249)) / ...
500    ((((((x558) / (x105)) * ((((x453) + (x453)) / ...
Name: 0, dtype: object



Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

