In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_diabetes

from itertools import combinations
from collections import deque

In [2]:
# Load the diabetes regression data as a (features, target) pair; as_frame=True
# asks scikit-learn for pandas objects rather than plain arrays.
X, y = load_diabetes(
    as_frame=True,
    return_X_y=True,
)

In [3]:
# Element-wise feature transforms available to the search. A pipeline step
# refers to a transform by its position in ``ts``.
t1 = np.square
t2 = np.sin


def t3(x):
    """Natural log of ``x`` after flooring every value at 1e-4."""
    floored = np.clip(x, 1e-4, None)
    return np.log(floored)


ts = (t1, t2, t3)

In [4]:
# Hold out 20% of the rows for the final evaluation; the fixed random_state
# keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [5]:
# Baseline: plain linear regression on the untransformed features.
# Fit once and reuse the fitted model for both scores instead of refitting an
# identical model for each call to ``score``.
baseline = LinearRegression().fit(X_train, y_train)
print(
    baseline.score(X_train, y_train),
    baseline.score(X_test, y_test),
    sep="\n")

0.5015516188475683
0.5675895725793205


In [6]:
def softmax(w):
    """Turn a sequence of scores into sampling probabilities.

    Each score is exponentiated and a constant 1e-4 is added before
    normalising, so an entry whose exponential underflows to zero still keeps
    a small non-zero probability.

    Parameters
    ----------
    w : array-like of float
        Raw scores.

    Returns
    -------
    numpy.ndarray
        ``(exp(w) + 1e-4)`` divided by its own sum.
    """
    weights = np.add(np.exp(w), 1e-4)
    total = weights.sum()
    return weights / total

In [7]:
class Node:
    """One state in the feature-engineering search tree.

    A node is identified by its ``pipeline``: the ordered list of operations
    applied to the raw feature matrix to obtain this node's features. Each
    pipeline entry is either an ``int`` (an index into the module-level ``ts``
    tuple of element-wise transforms) or a ``(SELECT_OP, cols)`` tuple meaning
    "keep only the columns ``cols``".

    Children live in a dict keyed by the operation that produced them: the
    transform index for transform children, the column tuple for
    column-selection children.
    """

    # Operation id meaning "select a subset of columns"; every smaller id is a
    # transform index.
    SELECT_OP = 3

    def __init__(self, level, pipeline, n_features, children=None, parent=None):
        """Create a node.

        Parameters
        ----------
        level : int
            Depth of the node in the tree (the root is created with 0).
        pipeline : list
            Operations leading from the raw data to this node.
        n_features : int
            Number of columns this node's data has.
        children : dict, optional
            Existing children; a new empty dict is created when omitted.
        parent : Node, optional
            Node this one was expanded from.
        """
        self.level = level
        self.pipeline = pipeline
        # A fresh dict per node: a ``{}`` default in the signature would be a
        # single object shared by every node built without ``children``.
        self.children = {} if children is None else children
        self.parent = parent
        self.n_columns = n_features
        self.score = None

    def get_columns(self):
        """Draw a random strict subset of this node's column indices.

        Between 1 and ``n_columns - 1`` distinct indices are drawn, so the
        node must have at least two columns (``randint(1, 1)`` raises).

        Returns
        -------
        numpy.ndarray
            The sampled column indices, in sampled order.
        """
        n_cols = np.random.randint(1, self.n_columns)
        cols = np.random.choice(
            list(range(self.n_columns)), size=n_cols, replace=False
        )

        return cols

    def get_data(self, X):
        """Apply this node's pipeline to ``X`` and return the result.

        ``X`` is copied first and is indexed as a 2-D array (``[:, cols]``)
        by column-selection steps.
        """
        temp = X.copy()
        for step in self.pipeline:
            if isinstance(step, tuple):
                # (SELECT_OP, cols): keep only the listed columns.
                temp = temp[:, list(step[1])]
            else:
                temp = ts[step](temp)
        return temp

    def evaluate(self, model, X, y):
        """Return this node's score, computing and caching it on first use.

        The score is the mean R² of a 5-fold ``cross_val_score`` of ``model``
        on the node's transformed features. Later calls return the cached
        value without touching ``model``.
        """
        if self.score is None:
            tempX = self.get_data(X)
            # Fit on the full data so the estimator passed in is left fitted.
            model = model.fit(tempX, y)
            self.score = cross_val_score(
                model,
                tempX,
                y,
                scoring="r2",
                cv=5,
                n_jobs=-1
            ).mean()
        return self.score

    def get_child(self, ϵ, model, X, y):
        """Move one step down the tree.

        A node without children is always expanded. Otherwise an existing
        child is selected with probability ``1 - ϵ`` and the node is expanded
        with probability ``ϵ``.
        """
        if len(self.children) == 0:
            return self.expand(model, X, y)
        if (1 - ϵ) > np.random.rand():
            return self.select()
        return self.expand(model, X, y)

    def expand(self, model, X, y):
        """Sample a random operation and return the child it leads to.

        A uniformly random transform is drawn; column selection is also a
        candidate when the node has more than one column. If the child for
        the sampled operation (for column selection: the sampled column
        tuple) already exists it is returned unchanged, otherwise a new child
        is created, scored with ``model`` and registered.
        """
        # Column selection is only offered when there is something to drop.
        n_ops = self.SELECT_OP + 1 if self.n_columns > 1 else self.SELECT_OP
        op = np.random.randint(0, n_ops)

        if op == self.SELECT_OP:
            cols = tuple(int(c) for c in self.get_columns())
            key, step, n_features = cols, (op, cols), len(cols)
        else:
            key, step, n_features = op, op, self.n_columns

        # Look the child up under the same key it is stored with, so an
        # existing subtree is reused instead of being overwritten.
        existing = self.children.get(key)
        if existing is not None:
            return existing

        node = Node(
            self.level + 1,
            self.pipeline + [step],
            n_features,
            children={},
            parent=self
        )
        node.evaluate(model, X, y)
        self.add_child(key, node)
        return node

    def select(self):
        """Sample an existing child with probability ``softmax`` of its score.

        Returns the node itself when sampling is impossible, i.e. when
        ``np.random.choice`` rejects the inputs with ``ValueError`` (no
        children, or probabilities it considers invalid).
        """
        candidates = list(self.children.values())
        probs = softmax([child.score for child in candidates])

        try:
            # Sample a position rather than the objects themselves.
            idx = np.random.choice(len(candidates), p=probs)
        except ValueError:
            return self

        return candidates[idx]

    def add_child(self, operation, child_node):
        """Register ``child_node`` as the child reached via ``operation``."""
        self.children[operation] = child_node

In [8]:
def rl_fe(X, y, n_iters, max_depth, random_state=None):
    """Search for a feature pipeline that improves a linear model's CV score.

    Starting from the raw features, each iteration walks down the tree of
    pipelines from the root, one ``Node.get_child`` step at a time, until
    ``max_depth`` is reached. The exploration rate passed to ``get_child``
    decays linearly from 1 on the first iteration to ``1 / n_iters`` on the
    last.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; ``X.shape[1]`` is used as the number of columns.
    y : array-like
        Regression target.
    n_iters : int
        Number of root-to-leaf walks.
    max_depth : int
        Maximum node level reached in one walk.
    random_state : int, optional
        When given, seeds NumPy's global random generator before the search.

    Returns
    -------
    tuple
        ``(best_score, best_node)``: the highest score seen and the node that
        achieved it.
    """
    if random_state is not None:
        np.random.seed(random_state)

    # Pass an explicit dict so separate runs never share a root's children.
    root = Node(0, [], X.shape[1], children={})
    root.evaluate(LinearRegression(), X, y)
    best_score, best_node = root.score, root

    for i in tqdm(range(n_iters)):
        node = root
        depth = 0
        epsilon = (n_iters - i) / n_iters
        while depth < max_depth:
            node = node.get_child(epsilon, LinearRegression(), X, y)

            if best_score < node.score:
                best_score, best_node = node.score, node

            if node.level == depth:
                # No progress (``select`` handed back the same node): give up
                # on this walk but keep searching, so the function always
                # returns the documented tuple.
                break
            depth = node.level

    return best_score, best_node

In [9]:
# Run the search on the training split only; ``res`` is what rl_fe returns.
res = rl_fe(
    X_train.to_numpy(),
    y_train,
    n_iters=10000,
    max_depth=15,
)

In [10]:
# Follow parent links from the best node back up to the top of the tree.
root = res[1]
while root.parent is not None:
    root = root.parent

In [11]:
# Breadth-first walk over the search tree, counting every node reachable
# from the root.
n_nodes = 0
queue = deque([root])

while queue:
    node = queue.popleft()
    n_nodes += 1
    queue.extend(node.children.values())

n_nodes

100681

In [12]:
# Operations of the best node found: ints index into ``ts``; a tuple is
# (operation id, selected columns).
res[1].pipeline

[(3, (3, 4, 2, 8, 1, 9, 5)), 1, 1, 1]

In [13]:
# Training features after applying the best node's pipeline.
res[1].get_data(X_train.to_numpy())

array([[ 0.01154306,  0.06434306, -0.02559818, ...,  0.05061513,
         0.01962905,  0.04841984],
       [ 0.06648191, -0.05103663, -0.01805894, ...,  0.05061513,
        -0.01350279, -0.01665584],
       [ 0.00121528, -0.03731772, -0.02236754, ...,  0.05061513,
        -0.01764338, -0.02635659],
       ...,
       [ 0.06193125,  0.02456673,  0.06157903, ...,  0.05061513,
         0.08149213, -0.0360499 ],
       [-0.02287869, -0.02357765,  0.06050725, ..., -0.0445972 ,
         0.03617756, -0.07252007],
       [-0.01255513,  0.00943824, -0.03851172, ..., -0.0445972 ,
         0.09785998,  0.00526217]])

In [14]:
# Refit on the engineered training features, then score on the held-out split
# pushed through the same pipeline.
best_node = res[1]
X_train_fe = best_node.get_data(X_train.to_numpy())
X_test_fe = best_node.get_data(X_test.to_numpy())

lr = LinearRegression().fit(X_train_fe, y_train)
lr.score(X_test_fe, y_test)

0.5751092434324445