In [None]:
from genetic_selection import GeneticSelectionCV
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [None]:
# Row labels of the molecules that were found to be wrongly labeled.
mislabeled_rows = [113, 396, 241, 256]

# Load the descriptor table and clean it in a single chain:
#   1. remove wrongly labeled molecules,
#   2. drop every column that contains at least one missing value,
#   3. derive the binary target: 0 when T < 1.5, otherwise 1,
#   4. drop the identifier column and the raw T column so that only the
#      remaining feature columns and the label are left.
df = (
    pd.read_csv("cleaned_pc.csv")
    .drop(index=mislabeled_rows)
    .dropna(axis="columns")
    .assign(active=lambda frame: frame['T'].apply(lambda t: 0 if t < 1.5 else 1))
    .drop(columns=['SMILES', 'T'])
)

In [None]:
# Separate the binary label from the feature columns.
target_column = 'active'
y = df[target_column]
X = df.drop(columns=target_column)

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
    stratify=y,
)

In [None]:
# Balance the classes of the training split only (the test split is left
# untouched) by random over-sampling with a fixed seed.
# NOTE(review): the resampled data replaces X_train / y_train, and the later
# cross-validated steps fit on it directly - confirm that resampling before
# the CV folds are formed is intended.
ros = RandomOverSampler(random_state=0)
resampled_features, resampled_labels = ros.fit_resample(X_train, y_train)
X_train, y_train = resampled_features, resampled_labels

In [None]:
# Candidate values for the random-forest hyper-parameter search.

# Number of trees: 10 evenly spaced values from 200 to 2000.
n_estimators = list(map(int, np.linspace(start=200, stop=2000, num=10)))

# Tree depth: 11 evenly spaced values from 10 to 110, plus None as an extra
# candidate.
max_depth = list(map(int, np.linspace(10, 110, num=11))) + [None]

min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Mapping of estimator parameter name -> list of candidate values, in the
# form expected by the `param_distributions` argument of the search.
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [None]:
# Randomized hyper-parameter search for the random forest.
#
# The base estimator is seeded as well as the search: with an unseeded forest
# every candidate fit draws fresh randomness, so the scores - and therefore
# `best_params_` - could change between otherwise identical runs.
rf = RandomForestClassifier(random_state=42)

# 10 sampled parameter combinations, each scored with 3-fold cross-validation
# on macro-averaged F1.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    cv=3,
    verbose=0,
    random_state=42,
    scoring='f1_macro',
)
rf_random.fit(X_train, y_train)

In [None]:
# Display the hyper-parameter combination that scored best in the search.
rf_random.best_params_

In [None]:
# Random forest used as the estimator inside the feature-selection step.
#
# NOTE(review): the hyper-parameters are hard-coded - presumably copied from an
# earlier run of the randomized search; verify against
# `rf_random.best_params_`.
#
# The forest is seeded so that a given feature subset always yields the same
# cross-validated score; an unseeded forest makes repeated evaluations of the
# same subset noisy and the selection result harder to reproduce.
model2 = RandomForestClassifier(
    n_estimators=600,
    min_samples_split=5,
    min_samples_leaf=2,
    max_depth=50,
    bootstrap=False,
    random_state=42,
)

In [None]:
# Settings for the genetic-algorithm feature search wrapped around `model2`.
# NOTE(review): this step scores on accuracy, while the hyper-parameter search
# scored on macro F1 - confirm the difference is intended.
ga_options = dict(
    cv=3,                               # 3-fold cross-validation per candidate
    verbose=1,
    scoring='accuracy',
    max_features=5,                     # upper bound on selected features
    n_population=100,
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_generations=50,
    crossover_independent_proba=0.5,
    mutation_independent_proba=0.04,
    tournament_size=4,
    n_gen_no_change=10,                 # presumably an early-stopping window; see library docs
    caching=True,
    n_jobs=-2,
)

# Build the selector, then fit it on the training data.
selectors = GeneticSelectionCV(model2, **ga_options)
selectors = selectors.fit(X_train, y_train)

In [None]:
# Display the names of the feature columns kept by the selector.
# Assumes the columns of X line up one-to-one with the columns the selector
# was fitted on (X_train after resampling) - TODO confirm.
X.columns[selectors.support_]