In [1]:
import numpy as np
import random
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression as LR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.svm import SVR 
from sklearn.ensemble import AdaBoostRegressor as ADA
from sklearn.ensemble import BaggingRegressor as BAG
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.linear_model import RANSACRegressor as RAN
from sklearn.linear_model import PassiveAggressiveRegressor as PAR
from sklearn.linear_model import SGDRegressor as SGD
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingRegressor
from sklearn.datasets import make_regression
from sklearn import preprocessing

In [11]:
class ensemble_search:
    """Evolutionary search for a good scikit-learn ``VotingRegressor``.

    Data model
    ----------
    * A *gene* is a 3-item list ``[name, estimator, params]``.
    * An *individual* is a 4-item list
      ``[n_regressors, genes, voting_regressor, fitness]``. Until it has been
      evaluated, ``voting_regressor`` is the placeholder string
      ``'voting_regressor'`` and ``fitness`` is ``np.inf``.
    * A *population* is a list of individuals.

    Fitness is the mean absolute error on ``X_test``/``y_test`` (lower is
    better). Because the same data drives the selection, the final score on
    that data is optimistically biased; pass a validation split here if an
    unbiased estimate is needed.

    Parameters
    ----------
    X_train, y_train : data every candidate ensemble is fitted on.
    X_test, y_test : data the fitness (MAE) is computed on.
    size_pop : number of individuals in the population.
    epochs : maximum number of generations run by ``search_best``.
    verbose : when true, ``early_stop`` prints its diagnostics.

    Attributes
    ----------
    fitness_array_ : np.ndarray with the best fitness of each generation
        (the initial population first).
    best_of_all_ : fitted ``VotingRegressor`` with the lowest fitness seen by
        the last ``search_best`` call, or ``None`` before any search.
    """

    def __init__(self, X_train, y_train, X_test, y_test,
                 size_pop=20, epochs=5, verbose=True):

        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.size_pop = size_pop
        self.epochs = epochs
        self.fitness_array_ = np.array([])
        self.best_of_all_ = None
        self.verbose_ = verbose

    @staticmethod
    def _random_candidates():
        """Return one fresh gene per supported regressor type, with randomly drawn hyper-parameters."""
        return [
            ['LR', LR(), {}],
            ['RFR', RFR(),
             {'n_estimators': np.random.randint(1, 100),
              'max_depth': np.random.randint(1, 20),
              'min_samples_split': np.random.randint(2, 5),
              'min_samples_leaf': np.random.randint(2, 10),
              'min_weight_fraction_leaf': np.random.rand(1)[0]/2}],
            ['SVR', SVR(),
             {'kernel': random.choice(['linear', 'rbf', 'poly', 'sigmoid']),
              'epsilon': np.random.rand(1)[0]/4,
              'C': random.choice([1, 10, 100, 1000]), 'gamma': 'auto'}],
            ['ADA', ADA(),
             {'n_estimators': np.random.randint(1, 50)}],
            ['BAG', BAG(),
             {'n_estimators': np.random.randint(1, 50),
              'max_samples': np.random.randint(1, 20)}],
            ['GBR', GBR(),
             {'n_estimators': np.random.randint(1, 100),
              'max_depth': np.random.randint(1, 20),
              'min_samples_split': np.random.randint(2, 5),
              'min_samples_leaf': np.random.randint(2, 10),
              'min_weight_fraction_leaf': np.random.rand(1)[0]/2}],
            ['RAN', RAN(), {}],
            ['PAR', PAR(),
             {'C': np.random.randint(1, 10), 'early_stopping': True,
              'n_iter_no_change': np.random.randint(1, 10)}],
            ['SGD', SGD(), {}],
        ]

    @staticmethod
    def _dedupe_names(genes):
        """Make gene names unique in place by appending 'X' to repeated names."""
        seen = set()
        for gene in genes:
            # VotingRegressor needs distinct estimator names; keep suffixing
            # until the name no longer collides with an earlier gene.
            while gene[0] in seen:
                gene[0] = gene[0] + 'X'
            seen.add(gene[0])

    def gen_population(self):
        """Create ``size_pop`` random, not yet evaluated individuals.

        Each individual receives between 2 and 8 regressors picked at random
        (without repetition) from the nine candidate types.

        Returns
        -------
        list of individuals, each with fitness ``np.inf``.
        """
        # Build the list by appending: ``[[]] * n`` would alias one inner list.
        population = []

        for _ in range(self.size_pop):
            qt_regressor = np.random.randint(2, 9)

            candidates = self._random_candidates()
            random.shuffle(candidates)
            genes = candidates[:qt_regressor]

            # Apply the sampled hyper-parameters to the selected estimators.
            for gene in genes:
                gene[1] = gene[1].set_params(**gene[2])

            population.append([qt_regressor, genes, 'voting_regressor', np.inf])

        return population

    def set_fitness(self, population):
        """Fit one ``VotingRegressor`` per individual and store model and MAE.

        The population is updated in place: slot ``-2`` of every individual
        receives the fitted ``VotingRegressor`` and slot ``-1`` its mean
        absolute error on ``X_test``/``y_test``. Duplicate gene names are
        made unique (also in place) before the ensemble is built.

        Returns
        -------
        The same ``population`` list.
        """
        for individual in population:
            genes = individual[1]
            self._dedupe_names(genes)

            # VotingRegressor takes (name, estimator) pairs.
            estimators = [(gene[0], gene[1]) for gene in genes]

            voting_regressor = VotingRegressor(estimators)
            voting_regressor.fit(self.X_train, self.y_train)

            # scikit-learn metric signature is (y_true, y_pred).
            individual[-1] = mae(self.y_test,
                                 voting_regressor.predict(self.X_test))
            individual[-2] = voting_regressor

        return population

    def next_population(self, population):
        """Cross individuals over, in place, to form the next generation.

        For ``i`` in ``1 .. len(population)//2 - 1`` the individual at index
        ``i`` has its leading genes replaced by copies of the leading genes of
        the individual at index ``2*i``. The number of genes exchanged is half
        (rounded down) of the smaller of the two stored regressor counts, so
        the length of each gene list never changes. Index 0 and the indices
        from ``len(population)//2`` upwards are not modified.

        Returns
        -------
        The same ``population`` list; fitness and model slots of the modified
        individuals are stale until ``set_fitness`` is run again.
        """
        for i in range(1, len(population) // 2):
            child = population[i]
            donor = population[2 * i]

            # Mix the regressors: take the donor's leading genes.
            cut = min(child[0], donor[0]) // 2

            # Copy each gene list so child and donor stop sharing the same
            # mutable [name, estimator, params] object; otherwise renaming a
            # gene in one individual silently renames it in the other. The
            # estimator object itself is still shared, which is fine as long
            # as VotingRegressor fits clones (its documented behaviour).
            child[1][:cut] = [list(gene) for gene in donor[1][:cut]]

            # Rename regressors if a name is now repeated.
            self._dedupe_names(child[1])

        return population

    def early_stop(self):
        """Tell whether the best-fitness history has flattened out.

        Needs more than four recorded generations. Stops when the mean of the
        last two second differences is positive and the absolute mean of the
        last three first differences is below ``1e-3``.

        Returns
        -------
        bool : ``True`` when the search should stop.
        """
        history = self.fitness_array_
        if len(history) <= 4:
            return False

        first_diff = history[1:] - history[:-1]
        second_diff = first_diff[1:] - first_diff[:-1]

        if self.verbose_:
            print('second derivative: ', second_diff[-2:].mean())
            print('first derivative: ', abs(first_diff[-2:].mean()))
            print('featness: ', history[-1])

        # NOTE(review): the value printed above averages the last 2 first
        # differences, while the test below averages the last 3 - confirm
        # which window is intended.
        return bool(second_diff[-2:].mean() > 0
                    and abs(first_diff[-3:].mean()) < 1e-3)

    def search_best(self):
        """Run the evolutionary loop and keep the best ensemble found.

        Generates and evaluates an initial population, then for at most
        ``epochs`` generations applies crossover, re-evaluates, and records
        the best fitness, stopping early when ``early_stop`` says so.

        Returns
        -------
        self, with ``best_of_all_`` and ``fitness_array_`` populated.
        """
        # Start from a clean history so a repeated call is not compared
        # against (or stopped early by) the fitness values of a previous run.
        self.fitness_array_ = np.array([])

        population = self.set_fitness(self.gen_population())
        population.sort(key=lambda individual: individual[-1])
        self.fitness_array_ = np.append(self.fitness_array_, population[0][-1])
        self.best_of_all_ = population[0][-2]

        for _ in tqdm(range(self.epochs)):
            population = self.set_fitness(self.next_population(population))
            population.sort(key=lambda individual: individual[-1])

            # Keep the best model across all generations.
            if population[0][-1] < min(self.fitness_array_):
                self.best_of_all_ = population[0][-2]

            # Record the current generation's best fitness.
            self.fitness_array_ = np.append(self.fitness_array_, population[0][-1])

            if self.early_stop():
                break

        return self


# Demo: search for a voting ensemble on a synthetic one-feature regression task.
n_samples = 1000
n_outliers = 50  # not referenced in the statements below

# ensemble_search draws from both the stdlib `random` module and NumPy's
# global generator; seed both so a Restart & Run All reproduces the result.
SEARCH_SEED = 0
random.seed(SEARCH_SEED)
np.random.seed(SEARCH_SEED)

X, y, coef = make_regression(n_samples=n_samples, n_features=1, n_informative=1,
                             noise=10, coef=True, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

Ensearch = ensemble_search(X_train, y_train, X_test, y_test, size_pop=10, epochs=50).search_best()

# MAE of the best ensemble; scikit-learn metrics take (y_true, y_pred).
# The same split was used as the fitness data during the search, so this
# number is optimistic.
print(mae(y_test, Ensearch.best_of_all_.predict(X_test)))


  0%|          | 0/50 [00:00<?, ?it/s]
  2%|▏         | 1/50 [00:00<00:32,  1.52it/s]
  4%|▍         | 2/50 [00:01<00:32,  1.48it/s]
  6%|▌         | 3/50 [00:02<00:31,  1.48it/s]

second derivative:  -0.0012201155652138418
first derivative:  0.0012201155652138418
featness:  7.235882632629755



  8%|▊         | 4/50 [00:02<00:29,  1.53it/s]

second derivative:  0.0012201155652138418
first derivative:  0.0
featness:  7.238322863760183
7.235882632629755
