<a href="https://colab.research.google.com/github/janoPig/HROCH/blob/main/examples/Symbolic_Regression_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Symbolic Regression Demo


1.   Setup
2.   Basic example ground-truth problem
3.   Basic example blackbox problem
4.   Use feature importances from bbox model
5.   Custom instructions set
6.   Simple binary clasification with lt/gt
7.   Fuzzy regression
8.   Classification with fuzzy logic - parity dataset



## Setup

In [61]:
%%capture
%pip install -U HROCH
#Penn Machine Learning Benchmarks
%pip install -U git+https://github.com/EpistasisLab/pmlb



In [62]:
import pandas as pd
import numpy as np
import sympy as sp
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from pmlb import fetch_data
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from HROCH import SymbolicRegressor, FuzzyRegressor

## Basic example ground-truth problem

feynman_III_7_38 dataset from pmlb

Formula: omega = 2 * mom * B/(h/(2 * pi))


In [63]:
def get_eq(X : pd.DataFrame, expr : str):
    model_str = str(sp.parse_expr(expr))
    mapping = {'x'+str(i+1): k for i, k in enumerate(X.columns)}
    new_model = model_str
    for k, v in reversed(mapping.items()):
        new_model = new_model.replace(k, v)

    return new_model

dataset = fetch_data('feynman_III_7_38')
Y = np.ravel(pd.DataFrame(dataset, columns=['target']).to_numpy())
X = dataset.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), Y, train_size=0.75, test_size=0.25, random_state=42)

In [64]:
reg = SymbolicRegressor(num_threads=1, time_limit = 0.0, iter_limit=100000, random_state=42)
reg.fit(X_train, y_train)

yp_train = reg.predict(X_train)
r2_train = r2_score(y_train, yp_train)
rms_train = np.sqrt(mean_squared_error(y_train, yp_train))

yp = reg.predict(X_test)
r2 = r2_score(y_test, yp)
rms = np.sqrt(mean_squared_error(y_test, yp))

print(f'train: r2={r2_train} rms={rms_train} test: r2={r2} rms={rms}')
print(f'eq: {get_eq(X, reg.sexpr_)}')

train: r2=0.999999999999998 rms=1.6125012744918447e-06 test: r2=0.999999999999998 rms=1.6204839497481079e-06
eq: 12.566370964050293*x0*mom/B


## Basic example blackbox problem

588_fri_c4_1000_100 dataset from pmlb

In [65]:
dataset = fetch_data('588_fri_c4_1000_100')
Y = np.ravel(pd.DataFrame(dataset, columns=['target']).to_numpy())
X = dataset.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), Y, train_size=0.75, test_size=0.25, random_state=42)

In [66]:
reg = SymbolicRegressor(num_threads=1, time_limit = 0.0, iter_limit=100000, random_state=42)
reg.fit(X_train, y_train)

yp_train = reg.predict(X_train)
r2_train = r2_score(y_train, yp_train)
rms_train = np.sqrt(mean_squared_error(y_train, yp_train))

yp = reg.predict(X_test)
r2 = r2_score(y_test, yp)
rms = np.sqrt(mean_squared_error(y_test, yp))

print(f'SymbolicRegressor train: r2={r2_train} rms={rms_train} test: r2={r2} rms={rms}')
print(f'eq: {get_eq(X, reg.sexpr_)}')

SymbolicRegressor train: r2=0.7569770202772456 rms=0.49166305989611125 test: r2=0.6983115730729911 rms=0.552502089377683
eq: 0.3931830823421478*oz3 + 0.3931830823421478*(x0 + 2.346113681793213)*(0.12121981414771422*oz2*oz4 - sin(2.176326115047859*oz1))


## Use feature importances from bbox model

For example, we can use the feature importances from RandomForestRegressor to try to speed up the search process. During mutation, the SymbolicSolver will select the most important features with higher probability. 

In [67]:
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

yp_train = rf.predict(X_train)
r2_train = r2_score(y_train, yp_train)
rms_train = np.sqrt(mean_squared_error(y_train, yp_train))

yp = rf.predict(X_test)
r2 = r2_score(y_test, yp)
rms = np.sqrt(mean_squared_error(y_test, yp))

print(f'RandomForestRegressor train: r2={r2_train} rms={rms_train} test: r2={r2} rms={rms}')

RandomForestRegressor train: r2=0.9830541104179017 rms=0.12983031029807188 test: r2=0.8303892163473732 rms=0.4142679448325175


In [68]:
probs = np.power(rf.feature_importances_, 2.0)
reg = SymbolicRegressor(num_threads=1, time_limit = 0.0, iter_limit=100000, random_state=42, feature_probs=probs)

reg.fit(X_train, y_train)
yp_train = reg.predict(X_train)
r2_train = r2_score(y_train, yp_train)
rms_train = np.sqrt(mean_squared_error(y_train, yp_train))

yp = reg.predict(X_test)
r2 = r2_score(y_test, yp)
rms = np.sqrt(mean_squared_error(y_test, yp))

print(f'SymbolicRegressor train: r2={r2_train} rms={rms_train} test: r2={r2} rms={rms}')
print(f'eq: {get_eq(X, reg.sexpr_)}')

SymbolicRegressor train: r2=0.8919199732812224 rms=0.3278812866387748 test: r2=0.8700743468791842 rms=0.3625785048723392
eq: 0.2813378870487213*oz3 + 0.3380877375602722*oz4 - 0.2813378870487213*(x0 + oz1)*(sin(x0 + oz1)**2 - 0.5105604529380798) - sin(x0 + oz1)**2 - sin(x0 + oz1) + 0.5105604529380798


## Custom instructions set

Limit the search to specific mathematical operations. Each math instruction has a defined probability used by the mutation operator.

|**Supported instructions**||
| ----------- | ----------- |
|**math**|add, sub, mul, div, inv, minv, sq2, pow, exp, log, sqrt, cbrt, aq|
|**goniometric**|sin, cos, tan, asin, acos, atan, sinh, cosh, tanh|
|**other**|nop, max, min, abs, floor, ceil, lt, gt, lte, gte|
|**fuzzy**|f_and, f_or, f_xor, f_impl, f_not, f_nand, f_nor, f_nxor, f_nimpl|

In [69]:
instr_set={'add': 1.0, 'mul': 1.0, 'div':0.01, 'sin':0.1}
reg = SymbolicRegressor(num_threads=1, time_limit = 0.0, iter_limit=100000, random_state=42, feature_probs=probs, problem=instr_set)

reg.fit(X_train, y_train)
yp_train = reg.predict(X_train)
r2_train = r2_score(y_train, yp_train)
rms_train = np.sqrt(mean_squared_error(y_train, yp_train))

yp = reg.predict(X_test)
r2 = r2_score(y_test, yp)
rms = np.sqrt(mean_squared_error(y_test, yp))

print(f'train: r2={r2_train} rms={rms_train} test: r2={r2} rms={rms}')
print(f'eq: {get_eq(X, reg.sexpr_)}')

train: r2=0.7719326794473848 rms=0.47629436022682353 test: r2=0.7184290551818527 rms=0.5337630650788203
eq: 0.9881536364555359*(0.26769205927848816*oz1*(x0 + oz4 + 0.10410416126251221) - 0.80549142255765815)*(x0 + sin(2*oz1) + sin(oz1*(x0 + 0.10410416126251221)))


## Simple binary clasification with lt/gt

In [70]:
X = np.random.normal(loc=0.0, scale=10.0, size=(4000, 100))
y = (0.5*X[:, 0]**2 >= 1.5*X[:, 1])*1.0

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_predicted = dtc.predict(X_test)
test_mse = mean_squared_error(y_predicted, y_test)
test_r2 = r2_score(y_predicted, y_test)
print(f'DecisionTreeClassifier: mse= {test_mse} r2= {test_r2}')

probs = np.power(dtc.feature_importances_, 2.0)
instr_set={'add': 1.0, 'sub': 1.0, 'mul': 1.0, 'lt':0.1, 'gt':0.1, 'lte':0.1, 'gte':0.1}
reg = SymbolicRegressor(num_threads=1, time_limit = 0.0, iter_limit=100000, random_state=42,feature_probs=probs, problem=instr_set)
reg.fit(X_train, y_train)

# predict
y_predicted = reg.predict(X_test)
y_predicted = (y_predicted > 0.5)*1.0
test_mse = mean_squared_error(y_predicted, y_test)
test_r2 = r2_score(y_predicted, y_test)

print(f'SymbolicRegressor: mse= {test_mse} r2= {test_r2} eq= {str(reg.sexpr_)} ')

DecisionTreeClassifier: mse= 0.022 r2= 0.8528536361873039
SymbolicRegressor: mse= 0.0 r2= 1.0 eq= (((-0.5468273758888245)+(x0*x0))>((x1-0.5520243644714355)+(x1+x1))) 


## Classification with fuzzy logic - parity dataset

A good simple example is the parity5 and parity5+5 dataset from pmlb. 
FuzzyRegressor will find equations (((x5^(x4^x3))^x1)^x2) or similar that can be simplified to this form. The equation Xor fits the parity calculation perfectly. The DecisionTreeClassifier and RandomForestClassifier fit the training data with an r2 score of 1.0, but absolutely not the test data.

Because the parity5 dataset is very small we repeat the experiment 10 times

In [71]:
datasets = [(fetch_data('parity5'), 'parity5'), (fetch_data('parity5+5'), 'parity5+5')]
random_states = [42, 1083, 20133, 35879, 45688, 211565, 1212248, 58985945, 48994485, 5454544]
classifiers = {FuzzyRegressor: {'iter_limit':100000, 'num_threads':1}, DecisionTreeClassifier: {}, RandomForestClassifier: {}}

for classifier, params in classifiers.items():
  print(classifier.__name__)
  print('='*20)
  for dataset, dataset_name in datasets:
    print(dataset_name)
    print('-'*20)
    Y = np.ravel(pd.DataFrame(dataset, columns=['target']).to_numpy())
    X = dataset.drop(columns=['target']).to_numpy()
    for rs in random_states:
      X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.75, test_size=0.25, random_state=rs)
      clf = classifier(random_state=rs, **params)
      clf.fit(X_train, y_train)
      yp_train = clf.predict(X_train)
      if classifier is FuzzyRegressor:
        yp_train = (yp_train > 0.5)*1.0
      r2_train = r2_score(y_train, yp_train)
      rms_train = np.sqrt(mean_squared_error(y_train, yp_train))
      yp = clf.predict(X_test)
      if classifier is FuzzyRegressor:
        yp = (yp > 0.5)*1.0
      r2 = r2_score(y_test, yp)
      rms = np.sqrt(mean_squared_error(y_test, yp))
      print(f'train: r2={r2_train} rms={rms_train} test: r2={r2} rms={rms}')
      if classifier is FuzzyRegressor:
        print(f'eq: {clf.sexpr_}')

FuzzyRegressor
parity5
--------------------
train: r2=1.0 rms=0.0 test: r2=-1.6666666666666665 rms=0.7071067811865476
eq: (((x0&x2)^(~(x4^(x1^x3))))|(~(x2|x0)))
train: r2=1.0 rms=0.0 test: r2=1.0 rms=0.0
eq: ((x0^(((x3^x2)^x1)&((x3^x2)^x1)))^(x4&x4))
train: r2=1.0 rms=0.0 test: r2=1.0 rms=0.0
eq: ((0.0052818614058196545|(~(x1&(~x1))))^((~(x1&(~x1)))^((~((~x1)))^((x4^x2)^(~(~(x3^x0)))))))
train: r2=1.0 rms=0.0 test: r2=1.0 rms=0.0
eq: ((~((~((x1^x0)^x2))^x3))^x4)
train: r2=1.0 rms=0.0 test: r2=1.0 rms=0.0
eq: (((x3^x2)^((x4^x1)&(x4^x1)))^x0)
train: r2=1.0 rms=0.0 test: r2=1.0 rms=0.0
eq: (~((1.0&(((~x2)&(~x2))^(x3^x0)))^(x4^x1)))
train: r2=1.0 rms=0.0 test: r2=1.0 rms=0.0
eq: ((x4^(~((x0|x0)|(x0|x0))))^((((~x3)|(0.9993218183517456|x2))|x1)&(x2^((~x3)^x1))))
train: r2=1.0 rms=0.0 test: r2=1.0 rms=0.0
eq: (~(1.0&((x2^(~(x1^(x3^x0))))^(((x1^(x3^x0))&(~(x1^(x3^x0))))|x4))))
train: r2=1.0 rms=0.0 test: r2=1.0 rms=0.0
eq: ((x2^((((x3|x4)&x4)^x3)^x1))^x0)
train: r2=1.0 rms=0.0 test: r2=1.0 rms