<a href="https://colab.research.google.com/github/koba341/AutoML/blob/main/RF_NSGAII.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summary

In [68]:
print('Manually tuned Random Forest R2 on Testset: ', rfr.score(X_test, y_test))
print('Manually tuned Random Forest CV: ', scores, 'Mean: ', sum(scores)/10)
print('NSGA2+RF R2 on Testset: ', rfr_nsga2.score(X_test, y_test))
print('NSGA2+RF CV: ', cross_val_score(rfr_nsga2, X_train, y_train, cv=10), 'Mean: ', sum(cross_val_score(rfr_nsga2, X_train, y_train, cv=10))/10)

Manually tuned Random Forest R2 on Testset:  0.93690770347663
Manually tuned Random Forest CV:  [0.74846417 0.86161926 0.85131862 0.74472037 0.66573246 0.81449942
 0.63027282 0.78725852 0.86241726 0.2283725 ] Mean:  0.7194675393179206
NSGA2+RF R2 on Testset:  0.9288816767105856
NSGA2+RF CV:  [0.75906659 0.89094697 0.87915517 0.78182392 0.65967713 0.75128555
 0.72591799 0.77943598 0.84684182 0.22024594] Mean:  0.7245013939684533


Import, dataset etc..

In [None]:
# install pymoo for NSGA II
!pip install pymoo
# implement a problem
import numpy as np
from pymoo.core.problem import ElementwiseProblem # one of three possible ways to implement a problem. Means that the _evaluate function is called for each solution x at a time
# initialize the algorithm
from pymoo.algorithms.moo.nsga2 import NSGA2
from pymoo.factory import get_sampling, get_crossover, get_mutation
from pymoo.factory import get_termination
from pymoo.optimize import minimize
import matplotlib.pyplot as plt

In [6]:
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import sklearn.ensemble
import pandas as pd

In [41]:
#import the data set
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/AutoML/slump_test.csv")
X = data.iloc[:, 1:10].to_numpy()
y = data.iloc[:, 10].to_numpy()


X_train, X_test, y_train, y_test = \
    sklearn.model_selection.train_test_split(X, y, train_size=0.90,random_state=1) # create training and test dataset. 

# print some information
print('Size of X: ', X.shape)
print('Size of X_train: ', X_train.shape)
print('Size of X_test: ', X_test.shape)

Size of X:  (103, 9)
Size of X_train:  (92, 9)
Size of X_test:  (11, 9)


# RF

In [29]:
from sklearn.model_selection import cross_val_score
rfr = sklearn.ensemble.RandomForestRegressor(max_depth = 9, random_state = 0)
rfr.fit(X_train, y_train)
print('R2: ', rfr.score(X_test, y_test))
scores = cross_val_score(rfr, X_train, y_train, cv=10)

R2:  0.93690770347663


# RF + NSGA II

In [56]:
class MyProblem(ElementwiseProblem):

    def __init__(self):
        super().__init__(n_var=2,
                         n_obj=1, # number of objectives: min f_1
                         n_constr=0, # number of constraints
                         xl=np.array([1, 1]), # lower bound
                         xu=np.array([300, 9])) # upper bound

    def _evaluate(self, x, out, *args, **kwargs):
        rfr = sklearn.ensemble.RandomForestRegressor(n_estimators = x[0], max_depth = x[1], random_state = 0)
        scores = cross_val_score(rfr, X_train, y_train, cv=10)
        to_min = 1-max(scores)

        out["F"] = [to_min] # output is supposed to be written as dictionary. Should be written as list of numpy arrays of length of number of objectives

problem = MyProblem()

In [57]:
# initialize the algorithm. pymoo was mainly made for continious problems. but it's possible to use it for discrete variables too
algorithm = NSGA2(
    pop_size=100,
    n_offsprings=20,
    sampling=get_sampling("int_random"),
    crossover=get_crossover("int_sbx"),
    mutation=get_mutation("int_pm"),
    eliminate_duplicates=True
)
# termination criterion

from pymoo.factory import get_termination

#termination = get_termination("n_gen", 10) # terminate after 40 iterations of the algorithm
termination = get_termination("time", "01:00:00") # terminate after 1 hour training

In [58]:
import time
start_time = time.time()
res = minimize(problem, 
               algorithm,
               termination,
               seed=1,
               save_history=True,
               verbose=True)
end_time = time.time()

print("Time: ", (end_time-start_time)/60, "min")

X_ = res.X # minimize returns result objects
F_ = res.F

n_gen |  n_eval |  n_nds  |     eps      |  indicator  
    1 |      97 |       1 |            - |            -
    2 |     117 |       1 |  0.00000E+00 |            f
    3 |     137 |       1 |  0.00000E+00 |            f
    4 |     157 |       1 |  0.000199375 |            f
    5 |     177 |       1 |  0.00000E+00 |            f
    6 |     197 |       1 |  0.00000E+00 |            f
    7 |     217 |       1 |  0.000305941 |            f
    8 |     237 |       1 |  0.00000E+00 |            f
    9 |     257 |       1 |  0.000541700 |            f
   10 |     277 |       1 |  0.00000E+00 |            f
   11 |     297 |       1 |  0.00000E+00 |            f
   12 |     317 |       1 |  0.00000E+00 |            f
   13 |     337 |       1 |  0.00000E+00 |            f
   14 |     357 |       1 |  0.00000E+00 |            f
   15 |     377 |       1 |  0.00000E+00 |            f
   16 |     397 |       1 |  0.00000E+00 |            f
   17 |     417 |       1 |  0.00000E+00 |      

In [59]:
rfr_nsga2 = sklearn.ensemble.RandomForestRegressor(n_estimators=X_[0], max_depth = X_[1])
rfr_nsga2.fit(X_train, y_train)
print(cross_val_score(rfr_nsga2, X_train, y_train, cv=10))
print(rfr_nsga2.score(X_test, y_test))

[0.74924635 0.86588794 0.82651609 0.7676958  0.64583324 0.75800429
 0.71215815 0.8022594  0.85258357 0.21162496]
0.9288816767105856
