In [2]:
!pip install sklearn-genetic-opt

Collecting sklearn-genetic-opt
  Downloading sklearn_genetic_opt-0.10.1-py3-none-any.whl (33 kB)
Collecting deap>=1.3.3 (from sklearn-genetic-opt)
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: deap, sklearn-genetic-opt
Successfully installed deap-1.4.1 sklearn-genetic-opt-0.10.1


In [8]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn_genetic import GAFeatureSelectionCV
from sklearn_genetic.space import Categorical
from sklearn_genetic.callbacks import ProgressBar
from sklearn_genetic.plots import plot_fitness_evolution
from sklearn.metrics import make_scorer, accuracy_score

x_train = pd.read_csv('x_train.txt', header=None, sep=' ')
y_train = pd.read_csv('y_train.txt', header=None, sep=' ')
x_test = pd.read_csv('x_test.txt', header=None, sep=' ')
x_train.columns = [f"x{i+1}" for i in x_train.columns]
x_test.columns = [f"x{i+1}" for i in x_test.columns]


In [9]:
initial_features = [
    "x1",
    "x10",
    "x101",
    "x102",
    "x103",
    "x104",
    "x105",
    "x106",
    "x132",
    "x140",
    "x149",
    "x153",
    "x156",
    "x176",
    "x191",
    "x2",
    "x22",
    "x221",
    "x229",
    "x253",
    "x286",
    "x3",
    "x304",
    "x322",
    "x323",
    "x324",
    "x329",
    "x336",
    "x35",
    "x352",
    "x36",
    "x4",
    "x40",
    "x404",
    "x413",
    "x423",
    "x459",
    "x463",
    "x499",
    "x5",
    "x58",
    "x6",
    "x65",
    "x7",
    "x74",
    "x8",
    "x81",
    "x9",
    "x99",
]

x_train = x_train[initial_features]
x_test = x_test[initial_features]
X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42, stratify=y_train)


In [10]:
import pandas as pd
import numpy as np

def calculate_score(estimator, X, y_true, reward=10, punishment=200):
    number_of_features = X.shape[1]
    number_1s = sum(y_true==1)

    y_pred = estimator.predict_proba(X)

    probabilities = y_pred[:,1]
    top_indices = np.argpartition(probabilities, -number_1s)[-number_1s:]
    labels = np.zeros_like(probabilities, dtype=int)
    labels[top_indices] = 1
    labels = labels.tolist()
    y_pred = labels
    correct_class_1 = sum((np.array(y_pred)==1) & (np.array(list(y_true))==1))
    score = reward * correct_class_1*(1000/number_1s) - punishment * number_of_features
    return score

# Genetic Algorithm part 1

In [None]:
from functools import partial

params_list = [{'population_size': 20, 'mutation_probability': 0.5, 'crossover_probability': 0.2},
          {'population_size': 25, 'mutation_probability': 0.6, 'crossover_probability': 0.3},
          {'population_size': 15, 'mutation_probability': 0.5, 'crossover_probability': 0.4}]

scoring_list = ['accuracy',
           partial(calculate_score, reward=10, punishment=200),
           partial(calculate_score, reward=10, punishment=100)]

results = []

for params in params_list:
  for scoring in scoring_list:
    estimator = LogisticRegression()
    selector = GAFeatureSelectionCV(
        estimator=estimator,
        cv=3,
        scoring=scoring,
        generations=100,
        elitism=True,
        n_jobs=-1,
        verbose=2,
        keep_top_k=8,
        criteria="max",
        **params
    )
    print(f'Selecting for params: {params} and scoring: {scoring}')
    # Fit the genetic algorithm feature selector
    selector.fit(X_train, np.ravel(y_train))
    best_features = selector.support_

    # Transform the dataset to keep only the best features
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)
    estimator.fit(X_train_selected, y_train)
    accuracy = estimator.score(X_test_selected, y_test)
    results.append({'params': params, 'scoring': scoring, 'features': best_features, 'test_accuracy': accuracy,
                    'test_score': calculate_score(estimator, X_test_selected, np.ravel(y_test))})

    print("Selected features:", best_features)
    print("Accuracy with selected features:", accuracy)

Selecting for params: {'population_size': 20, 'mutation_probability': 0.5, 'crossover_probability': 0.2} and scoring: accuracy


  pid = os.fork()
  pid = os.fork()


gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	20    	0.536161	0.00595634 	0.546001   	0.52325    
1  	27    	0.538849	0.00303151 	0.545501   	0.532998   
2  	33    	0.5423  	0.00370771 	0.54825    	0.53675    
3  	27    	0.543462	0.00328699 	0.54825    	0.536997   
4  	29    	0.546037	0.00371835 	0.552253   	0.538997   
5  	26    	0.546537	0.00357078 	0.548999   	0.535499   
6  	33    	0.548212	0.00306159 	0.552751   	0.535748   
7  	28    	0.548987	0.00316812 	0.552751   	0.542249   
8  	31    	0.549062	0.00372068 	0.552751   	0.534246   
9  	33    	0.549225	0.00316745 	0.552749   	0.538253   
10 	30    	0.549161	0.00478134 	0.552749   	0.534      
11 	30    	0.551486	0.00386331 	0.5595     	0.540751   
12 	20    	0.551149	0.00481751 	0.5595     	0.53175    
13 	30    	0.551074	0.00327154 	0.552749   	0.539247   
14 	30    	0.549974	0.00496327 	0.552749   	0.539748   
15 	26    	0.550874	0.00435124 	0.553001   	0.539748   
16 	27    	0.550763	0.00448879 	0.553001   	0.53

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Selected features: [False False False False False False False  True  True  True  True False
  True False  True  True False  True False  True False  True False  True
 False  True  True  True  True  True False False  True  True  True  True
  True False  True  True  True  True  True False False False  True  True
 False]
Accuracy with selected features: 0.554
Selecting for params: {'population_size': 20, 'mutation_probability': 0.5, 'crossover_probability': 0.2} and scoring: functools.partial(<function calculate_score at 0x78b445b73e20>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	20    	387.259	574.395    	1768.01    	-492.06    
1  	24    	905.945	471.894    	2087.87    	122.902    
2  	27    	1305.92	431.484    	2092.92    	563.047    
3  	32    	1590.2 	406.044    	2092.92    	563.047    
4  	28    	1873.74	265.876    	2512.93    	1158       
5  	24    	2055.18	369.916    	2512.93    	808.025    
6  	24    	2238.17	292.008    	2512.93    	1558.

  y = column_or_1d(y, warn=True)


Selected features: [ True  True False False False False False False False False False False
 False False  True False False False False False False False False False
 False  True False  True False False False False False False False False
 False False False False False False False False False False  True False
 False]
Accuracy with selected features: 0.508
Selecting for params: {'population_size': 20, 'mutation_probability': 0.5, 'crossover_probability': 0.2} and scoring: functools.partial(<function calculate_score at 0x78b445b73e20>, reward=10, punishment=100)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	20    	2930.47	358.247    	3518.07    	2302.91    
1  	25    	3330.73	141.48     	3518.07    	3027.81    
2  	32    	3387.71	119.561    	3572.87    	3182.92    
3  	25    	3451.69	178.94     	3572.87    	2733.06    
4  	25    	3519.18	162.015    	3767.88    	2958.06    
5  	26    	3566.17	159.607    	3767.88    	3117.86    
6  	32    	3524.96	282.877    	3767.88    	2603.

  y = column_or_1d(y, warn=True)


Selected features: [False False  True False False False False False False False False False
 False False False False  True False False False  True False False  True
 False False False False False False  True False False  True False False
 False  True False False  True  True False False False False False False
 False]
Accuracy with selected features: 0.543
Selecting for params: {'population_size': 25, 'mutation_probability': 0.6, 'crossover_probability': 0.3} and scoring: accuracy
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	25    	0.532979	0.00699387 	0.544747   	0.521502   
1  	43    	0.538479	0.00616492 	0.546501   	0.522498   
2  	43    	0.54403 	0.00497639 	0.552999   	0.534999   
3  	41    	0.54744 	0.00471913 	0.557748   	0.538499   
4  	43    	0.550089	0.00398139 	0.555747   	0.541248   
5  	46    	0.550449	0.00658547 	0.555747   	0.526248   
6  	45    	0.552889	0.00342656 	0.556747   	0.538747   
7  	44    	0.553669	0.00170896 	0.556747   	0.548501   
8  	44    	

  y = column_or_1d(y, warn=True)


Selected features: [ True  True False False False False False  True False  True  True  True
  True  True  True  True False  True False  True False False False False
  True  True False  True  True  True False  True False  True  True  True
  True False False  True  True False  True False  True False  True  True
  True]
Accuracy with selected features: 0.545
Selecting for params: {'population_size': 25, 'mutation_probability': 0.6, 'crossover_probability': 0.3} and scoring: functools.partial(<function calculate_score at 0x78b445b73e20>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	25    	554.616	599.935    	1953.03    	-211.935   
1  	43    	1080.82	551.753    	1953.03    	-36.9151   
2  	45    	1602.63	404.167    	1963.08    	593.13     
3  	46    	1807.62	373.469    	2092.95    	398.067    
4  	44    	1741.82	400.345    	2092.95    	532.957    
5  	45    	1933.81	337.727    	2378.05    	507.932    
6  	42    	2146.79	402.817    	2753.04    	822.7

  y = column_or_1d(y, warn=True)


Selected features: [False False False False False  True False False False False False False
  True False False False False False False False False False False False
 False False False  True False False False False False  True False False
 False False False False False False False False  True False  True False
  True]
Accuracy with selected features: 0.537
Selecting for params: {'population_size': 25, 'mutation_probability': 0.6, 'crossover_probability': 0.3} and scoring: functools.partial(<function calculate_score at 0x78b445b73e20>, reward=10, punishment=100)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	25    	2921.93	252.192    	3542.96    	2462.99    
1  	46    	3144.17	201.148    	3542.96    	2807.99    
2  	43    	3258.77	204.355    	3622.95    	2933.01    
3  	44    	3306.36	198.375    	3622.95    	2933.01    
4  	41    	3459.95	146.208    	3742.96    	3212.91    
5  	43    	3554.57	195.478    	3868.02    	2947.96    
6  	43    	3670.95	166.496    	3903.03    	3232.

  y = column_or_1d(y, warn=True)


Selected features: [False False False  True False False False False False False False False
 False False False False False False False  True False False False  True
 False False False False False False False False False False False False
 False False False False False False False  True False False False False
 False]
Accuracy with selected features: 0.52
Selecting for params: {'population_size': 15, 'mutation_probability': 0.5, 'crossover_probability': 0.4} and scoring: accuracy
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	15    	0.533266	0.00588773 	0.5455     	0.52175    
1  	28    	0.5421  	0.00411444 	0.549001   	0.534754   
2  	25    	0.541549	0.00517566 	0.549001   	0.532249   
3  	25    	0.542799	0.003035   	0.549001   	0.535999   
4  	29    	0.543916	0.00418929 	0.55025    	0.53425    
5  	28    	0.544732	0.00370424 	0.55025    	0.53425    
6  	29    	0.546865	0.00330183 	0.55325    	0.538749   
7  	25    	0.548532	0.0026472  	0.55325    	0.5435     
8  	26    	0

  y = column_or_1d(y, warn=True)


Selected features: [False  True  True False  True  True False  True False  True False False
 False False  True  True  True  True False  True  True False  True  True
  True  True  True  True  True  True  True  True  True False  True False
 False False False False  True False  True  True False False  True False
  True]
Accuracy with selected features: 0.538
Selecting for params: {'population_size': 15, 'mutation_probability': 0.5, 'crossover_probability': 0.4} and scoring: functools.partial(<function calculate_score at 0x78b445b73e20>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	15    	530.978	727.315    	1692.93    	-771.888   
1  	26    	1241.31	432.776    	1692.93    	338        
2  	28    	1475.34	438.313    	2337.95    	338        
3  	27    	1736.32	354.289    	2337.95    	917.942    
4  	28    	1989.27	283.4      	2467.91    	1363.02    
5  	27    	2243.25	145.799    	2467.91    	2017.89    
6  	28    	2287.92	237.513    	2733.06    	2017.

  y = column_or_1d(y, warn=True)


Selected features: [False False  True False False  True False False False False False False
  True False False False  True False False False False False False False
 False False False False False False False False False False False False
 False False False False False  True False False False False  True False
 False]
Accuracy with selected features: 0.496
Selecting for params: {'population_size': 15, 'mutation_probability': 0.5, 'crossover_probability': 0.4} and scoring: functools.partial(<function calculate_score at 0x78b445b73e20>, reward=10, punishment=100)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	15    	2700.99	361.878    	3283.04    	2183.03    
1  	27    	3105.29	237.39     	3392.92    	2668.01    
2  	27    	3255.68	184.881    	3602.94    	2973.06    
3  	26    	3345.98	172.466    	3602.94    	2823.09    
4  	28    	3431.64	86.4298    	3542.95    	3277.95    
5  	30    	3552.31	118.807    	3737.88    	3392.92    
6  	28    	3687.93	140.077    	3912.91    	3423.

  y = column_or_1d(y, warn=True)


In [None]:
results

[{'params': {'population_size': 20,
   'mutation_probability': 0.5,
   'crossover_probability': 0.2},
  'scoring': 'accuracy',
  'features': array([False, False, False, False, False, False, False,  True,  True,
          True,  True, False,  True, False,  True,  True, False,  True,
         False,  True, False,  True, False,  True, False,  True,  True,
          True,  True,  True, False, False,  True,  True,  True,  True,
          True, False,  True,  True,  True,  True,  True, False, False,
         False,  True,  True, False]),
  'test_accuracy': 0.554,
  'test_score': -88.97795591182421},
 {'params': {'population_size': 20,
   'mutation_probability': 0.5,
   'crossover_probability': 0.2},
  'scoring': functools.partial(<function calculate_score at 0x78b445b73e20>, reward=10, punishment=200),
  'features': array([ True,  True, False, False, False, False, False, False, False,
         False, False, False, False, False,  True, False, False, False,
         False, False, False, False,

In [None]:
pd.DataFrame(results).to_csv("genetic_results_29052024.csv")

# 2nd test

In [43]:
from functools import partial

params_list = []

for population_size in [20, 30, 40]:
  for mutation_probability in [0.6, 0.5, 0.4]:
    for crossover_probability in [0.1, 0.2, 0.3]:
      params_list.append({'population_size': population_size, 'mutation_probability': mutation_probability, 'crossover_probability': crossover_probability})

scoring_list = [partial(calculate_score, reward=10, punishment=200)]

results = []

for params in params_list:
  for scoring in scoring_list:
    estimator = LogisticRegression()
    selector = GAFeatureSelectionCV(
        estimator=estimator,
        cv=3,
        scoring=scoring,
        generations=70,
        elitism=True,
        n_jobs=-1,
        verbose=2,
        keep_top_k=8,
        criteria="max",
        **params
    )
    print(f'Selecting for params: {params} and scoring: {scoring}')
    # Fit the genetic algorithm feature selector
    selector.fit(X_train, np.ravel(y_train))
    best_features = selector.support_

    # Transform the dataset to keep only the best features
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)
    estimator.fit(X_train_selected, y_train)
    accuracy = estimator.score(X_test_selected, y_test)
    results.append({'params': params, 'scoring': scoring, 'features': best_features, 'test_accuracy': accuracy,
                    'test_score': calculate_score(estimator, X_test_selected, np.ravel(y_test))})

    print("Selected features:", best_features)
    print("Accuracy with selected features:", accuracy)

Selecting for params: {'population_size': 20, 'mutation_probability': 0.6, 'crossover_probability': 0.1} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)


  pid = os.fork()
  pid = os.fork()


gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	20    	542.189	482.209    	1532.94    	-106.77    
1  	29    	772.952	474.518    	1532.94    	-72.0179   
2  	24    	949.444	436.487    	1532.94    	223.115    
3  	27    	1128.73	396.084    	1758.03    	553.03     
4  	24    	1449.48	286.755    	1758.05    	778.025    
5  	30    	1325.45	488.532    	1758.05    	-242.041   
6  	33    	1466.97	373.633    	1758.05    	92.9998    
7  	26    	1669.75	213.759    	2183.13    	1338.02    
8  	27    	1814.75	338.834    	2343.04    	1357.94    
9  	30    	1614.24	563.286    	2343.04    	318.04     
10 	27    	1832.99	446.529    	2343.04    	1012.75    
11 	26    	2022.76	302.623    	2392.82    	1532.94    
12 	30    	2142.77	257.012    	2392.82    	1532.94    
13 	35    	1973.5 	707.674    	2392.82    	-297.088   
14 	27    	2078.95	534.542    	2392.82    	203.042    
15 	33    	2183.64	487.604    	2392.82    	332.965    
16 	27    	2382.34	45.7024    	2392.82    	2183.13    
17 	26    

  y = column_or_1d(y, warn=True)


Selected features: [ True False False False False False False False False  True False False
  True False False False  True False  True  True False False False  True
  True False False False  True False False  True  True False  True False
 False False False False False False False False False False False  True
 False]
Accuracy with selected features: 0.524
Selecting for params: {'population_size': 20, 'mutation_probability': 0.6, 'crossover_probability': 0.2} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	20    	425.248	707.665    	2042.83    	-966.958   
1  	30    	906.76 	450.587    	1543.02    	142.982    
2  	31    	1219.49	313.234    	1563.04    	683.19     
3  	32    	1247.99	340.245    	1563.04    	418.042    
4  	34    	1313   	335.044    	1563.04    	442.829    
5  	33    	1399.04	282.005    	1563.04    	833.162    
6  	32    	1397.01	353.046    	1563.04    	237.7

  y = column_or_1d(y, warn=True)


Selected features: [False False  True False False False False False False  True  True False
  True False False False False False  True False False  True False False
 False False False False False  True False False False False False False
 False False False  True False False False False False False False False
 False]
Accuracy with selected features: 0.51
Selecting for params: {'population_size': 20, 'mutation_probability': 0.6, 'crossover_probability': 0.3} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	20    	457.947	533.241    	1817.83    	-451.99    
1  	38    	939.709	463.595    	2163.06    	87.9571    
2  	34    	1093.19	366.507    	2163.06    	293.007    
3  	34    	1311.94	466.535    	2163.06    	293.007    
4  	38    	1457.73	422.693    	2163.06    	727.952    
5  	35    	1734.72	299.765    	2163.06    	1097.92    
6  	36    	1834.97	323.412    	2163.06    	553.00

  y = column_or_1d(y, warn=True)


Selected features: [False False False False False False False  True False False  True  True
 False False False False False False False False False False False False
 False False False False  True False False False False False False  True
  True  True False False  True False False False False False False False
 False]
Accuracy with selected features: 0.522
Selecting for params: {'population_size': 20, 'mutation_probability': 0.5, 'crossover_probability': 0.1} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	20    	370.736	775.25     	1582.78    	-1322.11   
1  	22    	896.704	513.64     	1582.78    	187.774    
2  	21    	1217.47	445.008    	1732.99    	393.055    
3  	24    	1504.21	229.952    	1897.9     	727.982    
4  	27    	1494.95	139.201    	1732.99    	1247.84    
5  	29    	1553.41	234.683    	1958.07    	993.055    
6  	19    	1627.4 	231.371    	1958.07    	1117.

  y = column_or_1d(y, warn=True)


Selected features: [ True False  True  True  True False False  True False  True False False
 False False False False  True False False False False  True False False
 False False False False False False  True False  True False  True False
 False False False False False  True  True False False False False False
 False]
Accuracy with selected features: 0.524
Selecting for params: {'population_size': 20, 'mutation_probability': 0.5, 'crossover_probability': 0.2} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	20    	526.447	808.663    	2417.8     	-1052.07   
1  	28    	1167.42	605.866    	2417.8     	-141.812   
2  	28    	1705.39	522.665    	2417.8     	892.925    
3  	27    	1827.44	451.218    	2417.8     	847.842    
4  	23    	2069.63	334.372    	2652.84    	1632.77    
5  	27    	1991.4 	652.089    	2652.84    	358.08     
6  	29    	2377.61	546.422    	2652.84    	132.9

  y = column_or_1d(y, warn=True)


Selected features: [ True False False False  True  True False  True False False  True False
 False False False  True False False False False False False False  True
 False False  True  True False False False False False False False False
 False False False False False False False False False False False False
  True]
Accuracy with selected features: 0.533
Selecting for params: {'population_size': 20, 'mutation_probability': 0.5, 'crossover_probability': 0.3} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	20    	749.942	758.891    	2612.73    	-467.051   
1  	33    	1257.65	593.945    	2612.73    	492.932    
2  	32    	1679.42	493.122    	2612.73    	1017.76    
3  	34    	1915.72	490.994    	2612.73    	1112.98    
4  	29    	2376.63	491.4      	3203.03    	1148.02    
5  	29    	2643.66	411.705    	3203.03    	2112.97    
6  	30    	2786.41	498.092    	3203.03    	1497.

  y = column_or_1d(y, warn=True)


Selected features: [False False False False  True False False False False  True False False
 False  True False False False False False False False False  True False
 False False False False False False False False False  True False False
 False False False False  True False False False False False  True False
 False]
Accuracy with selected features: 0.541
Selecting for params: {'population_size': 20, 'mutation_probability': 0.4, 'crossover_probability': 0.1} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	20    	390.473	437.641    	1337.98    	-682.088   
1  	21    	800.443	419.732    	1702.97    	213.097    
2  	18    	1055.92	395.454    	1702.97    	303.01     
3  	17    	1313.37	236.214    	1702.97    	1067.86    
4  	19    	1565.4 	228.271    	2232.86    	1203.09    
5  	18    	1787.9 	310.389    	2232.86    	1382.76    
6  	16    	2055.16	281.007    	2523.02    	1568.

  y = column_or_1d(y, warn=True)


Selected features: [ True False False False False False False False False  True False False
 False False False False False False False False False  True False False
 False  True  True False  True False False False False False False False
 False False False False False False False False False  True  True False
 False]
Accuracy with selected features: 0.489
Selecting for params: {'population_size': 20, 'mutation_probability': 0.4, 'crossover_probability': 0.2} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	20    	176.265	419.613    	908.007    	-716.825   
1  	32    	662.987	291.571    	1192.97    	82.9747    
2  	22    	1035.21	347.425    	1542.96    	427.852    
3  	24    	1469.48	149.785    	1747.95    	1092.91    
4  	28    	1425   	239.655    	1747.95    	583.097    
5  	20    	1678.46	322.012    	2497.89    	1192.97    
6  	26    	1889.67	407.915    	2497.89    	1542.

  y = column_or_1d(y, warn=True)


Selected features: [ True False False False False  True  True False  True False False  True
  True False  True False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False]
Accuracy with selected features: 0.506
Selecting for params: {'population_size': 20, 'mutation_probability': 0.4, 'crossover_probability': 0.3} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	20    	358.455	611.864    	2002.82    	-386.948   
1  	29    	605.643	551.987    	2002.82    	-337.098   
2  	29    	1150.88	514.917    	2002.82    	402.789    
3  	30    	1450.58	511.242    	2002.82    	188.042    
4  	31    	1661.32	392.85     	2498       	942.744    
5  	25    	1940.06	292.863    	2498       	1262.84    
6  	26    	2155.84	250.151    	2498       	1597.

  y = column_or_1d(y, warn=True)


Selected features: [ True  True False False False False False False False False False  True
 False False False False False False False False False False False  True
 False False False False False  True False False False False False False
 False False False False False False False False False False False False
 False]
Accuracy with selected features: 0.511
Selecting for params: {'population_size': 30, 'mutation_probability': 0.6, 'crossover_probability': 0.1} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	30    	499.623	556.682    	1637.86    	-482.066   
1  	43    	902.939	293.494    	1637.86    	507.917    
2  	42    	1009.8 	234.596    	1302.96    	413.105    
3  	45    	1177.93	171.578    	1617.78    	813.075    
4  	38    	1256.13	110.949    	1383.03    	932.987    
5  	42    	1316.31	78.6061    	1383.03    	1062.87    
6  	47    	1339.17	108.969    	1477.91    	968.0

  y = column_or_1d(y, warn=True)


Selected features: [False False False False False False  True False False  True False False
  True  True False False False False False False False False False False
 False False  True False False  True False  True  True  True  True False
 False False False False False  True False  True False False False False
 False]
Accuracy with selected features: 0.528
Selecting for params: {'population_size': 30, 'mutation_probability': 0.6, 'crossover_probability': 0.2} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	30    	224.332	529.12     	1323.04    	-871.727   
1  	44    	795.474	473.85     	2072.87    	138.06     
2  	45    	986.96 	305.533    	1482.86    	398.075    
3  	54    	1232.46	315.893    	2383.1     	642.874    
4  	45    	1398.63	484.499    	2383.1     	-52.0054   
5  	48    	1475.28	474.116    	2383.1     	452.854    
6  	43    	1664.11	384.171    	2383.1     	538.0

  y = column_or_1d(y, warn=True)


Selected features: [False False False False False False  True False False False False  True
  True  True  True False False False False False False False False False
  True False False False False False  True False False False False False
 False False False False False False  True False False False False False
 False]
Accuracy with selected features: 0.534
Selecting for params: {'population_size': 30, 'mutation_probability': 0.6, 'crossover_probability': 0.3} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	30    	253.789	680.475    	1598.07    	-867.02    
1  	55    	756.629	491.989    	2373.03    	143.042    
2  	52    	908.633	358.286    	1437.91    	-67.0505   
3  	54    	1084.64	310.245    	1773.03    	577.734    
4  	55    	1157.62	330.827    	1917.89    	707.992    
5  	53    	1363.95	409.418    	2028.03    	367.985    
6  	55    	1561.8 	368.214    	2097.91    	797.7

  y = column_or_1d(y, warn=True)


Selected features: [False False False False False False  True False False False False  True
 False False False False  True False False False False False False False
 False False False False False False False False False  True False  True
 False False False False False False False False False False False False
 False]
Accuracy with selected features: 0.53
Selecting for params: {'population_size': 30, 'mutation_probability': 0.5, 'crossover_probability': 0.1} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	30    	376.311	706.61     	1627.81    	-1026.99   
1  	34    	1046.33	605.956    	2032.84    	-441.98    
2  	33    	1496.32	383.004    	2032.84    	442.882    
3  	40    	1743.82	341.732    	2617.77    	1027.9     
4  	37    	1795.99	284.621    	2617.77    	1042.91    
5  	35    	1784.17	226.964    	2032.84    	1042.91    
6  	30    	1821.35	210.061    	2032.84    	953.02

  y = column_or_1d(y, warn=True)


Selected features: [ True False False False False False  True False False  True  True False
 False False  True False False  True False False  True False  True False
 False False False  True False False False False False  True False False
 False False False False False  True False False False False False False
 False]
Accuracy with selected features: 0.553
Selecting for params: {'population_size': 30, 'mutation_probability': 0.5, 'crossover_probability': 0.2} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	30    	314.966	587.901    	1687.92    	-1057.07   
1  	43    	786.462	609.755    	1783.1     	-711.82    
2  	43    	1303.17	535.194    	2128.03    	427.912    
3  	45    	1573   	370.648    	2128.03    	497.914    
4  	41    	1813.67	190.446    	2128.03    	1382.73    
5  	46    	2013.14	256.265    	2632.87    	1692.97    
6  	47    	2233.6 	328.851    	2632.87    	1462.

  y = column_or_1d(y, warn=True)


Selected features: [False False False False False  True False False False False  True False
 False False False False False False False False False False False False
 False False  True  True False  True False False False False False False
 False  True False False False False False False False False False False
 False]
Accuracy with selected features: 0.512
Selecting for params: {'population_size': 30, 'mutation_probability': 0.5, 'crossover_probability': 0.3} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	30    	256.498	648.197    	1662.93    	-821.908   
1  	53    	774.457	405.386    	1662.93    	-156.88    
2  	48    	1171.76	275.939    	1662.93    	603.072    
3  	52    	1316.74	340.268    	1877.91    	262.872    
4  	51    	1484.22	475.363    	2647.85    	262.872    
5  	43    	1962.89	433.686    	2922.94    	1287.82    
6  	49    	2198.89	432.799    	3172.79    	1168.

  y = column_or_1d(y, warn=True)


Selected features: [False False False False False False False False False False False  True
 False False False False False False False False False False False False
 False  True  True False False False False False False False  True False
 False False False False False False False False False False False False
 False]
Accuracy with selected features: 0.493
Selecting for params: {'population_size': 30, 'mutation_probability': 0.4, 'crossover_probability': 0.1} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	30    	401.471	625.071    	1677.9     	-1051.94   
1  	34    	1044.47	374.574    	1677.9     	228.157    
2  	26    	1239.48	230.888    	1657.78    	558.325    
3  	22    	1414.25	158.452    	1657.78    	1158       
4  	25    	1603.88	85.601     	1672.97    	1322.99    
5  	30    	1593.83	302.318    	1892.8     	162.927    
6  	34    	1702.82	165.838    	2562.96    	1657.

  y = column_or_1d(y, warn=True)


Selected features: [False  True False False False False False  True False False  True False
 False False False  True  True False False  True False False False False
 False  True False False False False  True False False False False  True
 False False False False  True False False False False False False False
 False]
Accuracy with selected features: 0.52
Selecting for params: {'population_size': 30, 'mutation_probability': 0.4, 'crossover_probability': 0.2} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	30    	212.99 	626.917    	1583.01    	-1166.82   
1  	34    	813.3  	440.769    	1583.01    	-401.955   
2  	36    	1180.96	403.894    	2027.81    	512.975    
3  	37    	1357.61	380.182    	2027.81    	537.924    
4  	37    	1608.61	304.566    	2027.81    	912.967    
5  	39    	1691.93	291.679    	2027.81    	652.884    
6  	31    	1734.07	395.82     	2292.83    	652.88

  y = column_or_1d(y, warn=True)


Selected features: [False False False False False  True False False False False False False
 False False False False False False False False False False  True False
 False False False False  True  True False False False False False False
 False False False False False False False False False  True False  True
  True]
Accuracy with selected features: 0.547
Selecting for params: {'population_size': 30, 'mutation_probability': 0.4, 'crossover_probability': 0.3} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	30    	359.482	735.306    	1707.96    	-1526.84   
1  	41    	773.486	535.294    	2052.85    	-342.133   
2  	43    	1170.47	378.982    	2052.85    	618.095    
3  	43    	1395.61	358.018    	2052.85    	887.942    
4  	43    	1583.12	288.708    	2122.97    	1143       
5  	43    	1901.21	210.527    	2122.97    	1487.87    
6  	46    	2053.72	223.704    	2682.92    	1598.

  y = column_or_1d(y, warn=True)


Selected features: [False  True False False False False False False False False False False
 False False False False False False False  True False False False False
  True False  True False False False False False False False False False
 False  True False False False False False False False False False False
 False]
Accuracy with selected features: 0.504
Selecting for params: {'population_size': 40, 'mutation_probability': 0.6, 'crossover_probability': 0.1} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	40    	409.852	736.947    	1528.03    	-1761.96   
1  	56    	1075.17	508.845    	2102.9     	-342.126   
2  	54    	1430.18	443.994    	2102.9     	332.987    
3  	47    	1720.08	345.674    	2102.93    	892.985    
4  	60    	1754.49	489.195    	2102.93    	183.052    
5  	59    	1906.14	432.464    	2102.93    	282.877    
6  	54    	1976.94	389.768    	2102.93    	388.0

  y = column_or_1d(y, warn=True)


Selected features: [False False False False False False False False False  True False False
 False False False False False  True False False  True False False False
 False False False  True False False  True False False False False False
 False  True False  True False False False False False False False False
 False]
Accuracy with selected features: 0.539
Selecting for params: {'population_size': 40, 'mutation_probability': 0.6, 'crossover_probability': 0.2} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	40    	367.23 	692.577    	1593.05    	-1247.02   
1  	65    	1018.45	496.217    	1897.85    	-291.993   
2  	63    	1179.16	387.294    	1692.97    	42.8368    
3  	70    	1231.13	425.058    	1798.07    	107.947    
4  	67    	1416.26	369.626    	1877.92    	337.985    
5  	62    	1670.22	304.596    	2257.85    	502.965    
6  	61    	1859.46	549.112    	2817.71    	168.0

  y = column_or_1d(y, warn=True)


Selected features: [False False False  True False False False False False False False False
 False False False  True False False False False False False False False
 False False False False False False  True False False  True False False
 False False False False False False  True False False False False False
 False]
Accuracy with selected features: 0.523
Selecting for params: {'population_size': 40, 'mutation_probability': 0.6, 'crossover_probability': 0.3} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	40    	447.457	638.924    	1782.84    	-796.852   
1  	70    	918.199	445.373    	1782.84    	-37.013    
2  	75    	1143.17	454.276    	2002.81    	72.8593    
3  	72    	1434.15	474.663    	2377.82    	408.055    
4  	74    	1800.24	322.992    	2477.57    	852.832    
5  	75    	2020.98	359.372    	3417.72    	912.96     
6  	71    	2290.37	515.615    	3417.72    	1318.

  y = column_or_1d(y, warn=True)


Selected features: [False False  True False False False False False False False False  True
 False False False False False False False False False False False False
 False  True False False False False False False False  True False False
 False False False False False False False False False False False False
 False]
Accuracy with selected features: 0.52
Selecting for params: {'population_size': 40, 'mutation_probability': 0.5, 'crossover_probability': 0.1} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	40    	333.755	571.275    	1502.87    	-757.141   
1  	55    	739.748	509.586    	2232.81    	-531.855   
2  	58    	1024.57	399.336    	2232.81    	203.087    
3  	33    	1369.52	381.825    	2232.81    	627.852    
4  	44    	1729.38	386.146    	2232.81    	1187.75    
5  	53    	1866.49	402.445    	2232.81    	847.804    
6  	53    	1843.37	498.972    	2232.81    	758.05

  y = column_or_1d(y, warn=True)


Selected features: [False False False False False False False False False False False False
 False False False False False  True False False False  True False  True
 False False False False  True False False False False False False False
  True False False  True False False  True False False False False False
  True]
Accuracy with selected features: 0.539
Selecting for params: {'population_size': 40, 'mutation_probability': 0.5, 'crossover_probability': 0.2} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	40    	303.625	639.406    	1757.97    	-1181.85   
1  	57    	796.085	403.696    	1757.97    	208.115    
2  	56    	1125.44	384.049    	1757.97    	122.992    
3  	53    	1438.64	345.517    	2607.78    	257.912    
4  	51    	1614.28	379.198    	2607.78    	273.195    
5  	58    	1760.15	323.665    	2447.8     	822.824    
6  	56    	1851.53	248.616    	2447.8     	958.0

  y = column_or_1d(y, warn=True)


Selected features: [False False  True False False  True False False False False  True False
 False False False False False  True False False  True False  True False
 False False False False False False False False False False False  True
 False False False False False False False False False False False False
 False]
Accuracy with selected features: 0.555
Selecting for params: {'population_size': 40, 'mutation_probability': 0.5, 'crossover_probability': 0.3} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	40    	470.847	610.856    	2142.94    	-666.99    
1  	60    	926.205	458.255    	2142.94    	-206.968   
2  	61    	1248.11	310.818    	1757.97    	537.977    
3  	67    	1418.03	326.611    	2037.85    	398.037    
4  	65    	1680.35	317.456    	2452.85    	973.072    
5  	68    	1874.58	293.944    	2622.78    	1097.86    
6  	62    	2074.16	332.523    	2747.7     	1237.

  y = column_or_1d(y, warn=True)


Selected features: [False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
  True False False False False False False False False False False False
 False]
Accuracy with selected features: 0.517
Selecting for params: {'population_size': 40, 'mutation_probability': 0.4, 'crossover_probability': 0.1} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	40    	291.839	659.691    	1968.04    	-1061.98   
1  	41    	874.955	519.716    	1968.04    	117.919    
2  	53    	1237.71	412.493    	1968.04    	408.107    
3  	41    	1614.43	345.076    	2352.74    	912.96     
4  	35    	1784.95	282.216    	2553.01    	1107.9     
5  	46    	1911.21	306.441    	2553.01    	707.947    
6  	39    	2093.93	220.504    	2553.01    	1477.

  y = column_or_1d(y, warn=True)


Selected features: [False False  True False  True False False False  True False False False
 False False False False False  True False  True  True False False False
  True False False  True False False False False False False False False
 False False False False False False False False False False False False
 False]
Accuracy with selected features: 0.545
Selecting for params: {'population_size': 40, 'mutation_probability': 0.4, 'crossover_probability': 0.2} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	40    	541.981	633.269    	1907.98    	-651.968   
1  	53    	1107.34	470.858    	2177.83    	142.99     
2  	43    	1316.55	342.798    	1758.07    	383.097    
3  	44    	1590.12	264.36     	2112.97    	1072.92    
4  	50    	1746.22	236.519    	2112.97    	1098       
5  	52    	1872.75	269.836    	2192.79    	623.077    
6  	53    	2017.01	182.841    	2337.75    	1732.

  y = column_or_1d(y, warn=True)


Selected features: [False False False False False False False False  True False False False
 False False False  True False  True False False False False False False
 False False False False False False False False  True False False False
 False False False False False False False  True False False False False
 False]
Accuracy with selected features: 0.511
Selecting for params: {'population_size': 40, 'mutation_probability': 0.4, 'crossover_probability': 0.3} and scoring: functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200)
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	40    	498.22 	566.132    	1527.96    	-811.868   
1  	56    	817.833	372.667    	1603.11    	52.8694    
2  	58    	1235.31	386.984    	2332.91    	592.994    
3  	60    	1580.41	367.819    	2492.94    	877.902    
4  	53    	1779.31	363.305    	2492.94    	1052.81    
5  	60    	2118.3 	333.441    	2882.97    	1517.67    
6  	53    	2309.82	279.442    	3103.01    	1622.

  y = column_or_1d(y, warn=True)


In [45]:
results

[{'params': {'population_size': 20,
   'mutation_probability': 0.6,
   'crossover_probability': 0.1},
  'scoring': functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200),
  'features': array([ True, False, False, False, False, False, False, False, False,
          True, False, False,  True, False, False, False,  True, False,
          True,  True, False, False, False,  True,  True, False, False,
         False,  True, False, False,  True,  True, False,  True, False,
         False, False, False, False, False, False, False, False, False,
         False, False,  True, False]),
  'test_accuracy': 0.524,
  'test_score': 2650.501002004007},
 {'params': {'population_size': 20,
   'mutation_probability': 0.6,
   'crossover_probability': 0.2},
  'scoring': functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200),
  'features': array([False, False,  True, False, False, False, False, False, False,
          True,  True, False,

[{'params': {'population_size': 20,
   'mutation_probability': 0.6,
   'crossover_probability': 0.1},
  'scoring': functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200),
  'features': array([ True, False, False, False, False, False, False, False, False,
          True, False, False,  True, False, False, False,  True, False,
          True,  True, False, False, False,  True,  True, False, False,
         False,  True, False, False,  True,  True, False,  True, False,
         False, False, False, False, False, False, False, False, False,
         False, False,  True, False]),
  'test_accuracy': 0.524,
  'test_score': 2650.501002004007},
 {'params': {'population_size': 20,
   'mutation_probability': 0.6,
   'crossover_probability': 0.2},
  'scoring': functools.partial(<function calculate_score at 0x7f256d9bcdc0>, reward=10, punishment=200),
  'features': array([False, False,  True, False, False, False, False, False, False,
          True,  True, False,

In [46]:
pd.DataFrame(results).to_csv("genetic_results_30052024.csv")

In [18]:
df1 = pd.read_csv("genetic_results_29052024.csv")
df2 = pd.read_csv("genetic_results_30052024.csv")

df = pd.concat([df1, df2], axis=0)
best_features = list(df.sort_values(by='test_score', ascending=False)['features'][:10])
best_features

['[False False False False False False False False False False False False\n False False False False False False False False False False False False\n False False False False False False False False False False False  True\n False False False False False False False False False False False False\n False]',
 '[False False False False False False False False False False False False\n False False False False False False False False False False False False\n False False False False False False False False False False False False\n  True False False False False False False False False False False False\n False]',
 '[False False False False False False False False False False False False\n False False False False False False False False False False False False\n False False  True False False  True False False False False  True False\n False False False False False False False False False False False False\n False]',
 '[False False False  True False False False False False False False False\n

In [19]:
for features in best_features:
  tmp = [x for x in features.replace("\n", "").replace("[", "").replace("]", "").split()]
  tmp = [val == 'True' for val in tmp]
  print(np.array(initial_features)[tmp])

['x423']
['x459']
['x329' 'x352' 'x413']
['x102' 'x253' 'x322' 'x7']
['x101' 'x153' 'x324' 'x404']
['x105' 'x153' 'x22' 'x404' 'x423']
['x102' 'x2' 'x36' 'x404' 'x65']
['x153' 'x324' 'x329' 'x413']
['x132' 'x2' 'x221' 'x40' 'x7']
['x101' 'x104' 'x149' 'x221' 'x286' 'x304' 'x423']


In [None]:
from functools import partial

# Define the estimator
estimator = LogisticRegression()

# Convert custom scoring function to a scorer
# custom_scorer = make_scorer(penalized_score, greater_is_better=True)

# Define the genetic algorithm feature selector
selector = GAFeatureSelectionCV(
    estimator=estimator,
    cv=3,
    scoring=partial(calculate_score, reward=10, punishment=50),
    population_size=20,
    generations=100,
    mutation_probability=0.6,
    crossover_probability=0.3,
    elitism=True,
    n_jobs=-1,
    verbose=2,
    keep_top_k=8,
    criteria="max"
)

# Fit the genetic algorithm feature selector
selector.fit(X_train, np.ravel(y_train))

# Plot the evolution of the fitness
# plot_fitness_evolution(selector)

# Get the best features
best_features = selector.support_

# Transform the dataset to keep only the best features
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Train a model with the selected features
estimator.fit(X_train_selected, y_train)
accuracy = estimator.score(X_test_selected, y_test)

print("Selected features:", best_features)
print("Accuracy with selected features:", accuracy)


gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	20    	4204.2 	179.091    	4453.12    	3878.15    
1  	36    	4290.69	129.387    	4453.12    	3997.91    
2  	35    	4389.7 	134.588    	4642.84    	4123       
3  	36    	4391.65	143.31     	4563.04    	4123       
4  	36    	4392.12	144.905    	4563.04    	4063.02    
5  	37    	4420.69	145.591    	4587.73    	4148.1     
6  	39    	4415.69	171.711    	4627.65    	4067.98    
7  	38    	4437.96	147.256    	4627.65    	4047.99    
8  	39    	4498.94	103.146    	4627.65    	4222.98    
9  	37    	4456.35	179.48     	4627.65    	4008.15    
10 	36    	4520.79	168.594    	4627.65    	4038.09    
11 	37    	4561.5 	92.7278    	4627.65    	4317.95    
12 	37    	4510.56	175.309    	4667.96    	3942.85    
13 	32    	4499.83	193.702    	4667.96    	4023.04    
14 	35    	4566.01	133.078    	4667.96    	4048.16    
15 	36    	4587.74	109.781    	4637.75    	4142.95    
16 	38    	4605.23	111.079    	4697.7     	4142.95    
17 	34    

  y = column_or_1d(y, warn=True)


In [None]:
selector.support_

array([False, False, False, False, False, False, False,  True, False,
       False, False, False, False,  True, False, False,  True, False,
       False,  True, False,  True, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False])

In [None]:
features_selected = [
    [False,  True, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False,  True, False,  True, False, False,
       False, False, False, False],
    [False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False,  True, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False],
    [False, False, False, False, False, False, False,  True, False,
       False, False, False, False,  True, False, False,  True, False,
       False,  True, False,  True, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False]
]

# Random Forest Most Important Features Selector

In [11]:
from collections import defaultdict

importances_dict = defaultdict(list)

params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 2, 3],
    'min_samples_split': [2, 5, 10]
}

all_importances = []
for n_estimators in params['n_estimators']:
  for max_depth in params['max_depth']:
    for min_samples_split in params['min_samples_split']:
      importances_dict = defaultdict(list)
      for i in range(3):
        model = RandomForestClassifier(random_state=i, n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split)
        model.fit(X_train, np.ravel(y_train))
      for j in range(len(model.feature_names_in_)):
        importances_dict[model.feature_names_in_[j]] = model.feature_importances_[j]
      mean_importances = pd.Series(importances_dict).groupby(level=0).mean()
      all_importances.append(mean_importances)
      print(f"Finished - n_estimators: {n_estimators}, max_depth: {max_depth}, min_samples_split: {min_samples_split}")

Finished - n_estimators: 50, max_depth: None, min_samples_split: 2
Finished - n_estimators: 50, max_depth: None, min_samples_split: 5
Finished - n_estimators: 50, max_depth: None, min_samples_split: 10
Finished - n_estimators: 50, max_depth: 2, min_samples_split: 2
Finished - n_estimators: 50, max_depth: 2, min_samples_split: 5
Finished - n_estimators: 50, max_depth: 2, min_samples_split: 10
Finished - n_estimators: 50, max_depth: 3, min_samples_split: 2
Finished - n_estimators: 50, max_depth: 3, min_samples_split: 5
Finished - n_estimators: 50, max_depth: 3, min_samples_split: 10
Finished - n_estimators: 100, max_depth: None, min_samples_split: 2
Finished - n_estimators: 100, max_depth: None, min_samples_split: 5
Finished - n_estimators: 100, max_depth: None, min_samples_split: 10
Finished - n_estimators: 100, max_depth: 2, min_samples_split: 2
Finished - n_estimators: 100, max_depth: 2, min_samples_split: 5
Finished - n_estimators: 100, max_depth: 2, min_samples_split: 10
Finished - 

[x1      0.015927
 x10     0.017882
 x101    0.042112
 x102    0.034067
 x103    0.041231
 x104    0.031301
 x105    0.031085
 x106    0.042342
 x132    0.019102
 x140    0.017967
 x149    0.017337
 x153    0.016563
 x156    0.016782
 x176    0.018234
 x191    0.017842
 x2      0.017476
 x22     0.018827
 x221    0.018617
 x229    0.019727
 x253    0.018052
 x286    0.018795
 x3      0.019098
 x304    0.017083
 x322    0.018566
 x323    0.017650
 x324    0.019173
 x329    0.017336
 x336    0.017705
 x35     0.016347
 x352    0.018720
 x36     0.018644
 x4      0.019787
 x40     0.019203
 x404    0.020049
 x413    0.016342
 x423    0.018630
 x459    0.016970
 x463    0.018928
 x499    0.018706
 x5      0.016791
 x58     0.017050
 x6      0.021857
 x65     0.016777
 x7      0.017816
 x74     0.017519
 x8      0.015848
 x81     0.016934
 x9      0.023360
 x99     0.015843
 dtype: float64,
 x1      0.015826
 x10     0.017720
 x101    0.044833
 x102    0.035061
 x103    0.041396
 x104    0.

In [27]:
best_features = [set(x.sort_values(ascending=False)[:7].keys()) for x in all_importances]
best_features

[{'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'},
 {'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'},
 {'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'},
 {'x101', 'x102', 'x103', 'x105', 'x106', 'x6', 'x9'},
 {'x101', 'x102', 'x103', 'x105', 'x106', 'x6', 'x9'},
 {'x101', 'x102', 'x103', 'x105', 'x106', 'x6', 'x9'},
 {'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'},
 {'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'},
 {'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'},
 {'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'},
 {'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'},
 {'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'},
 {'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'},
 {'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'},
 {'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'},
 {'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'},
 {'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x9'},
 {'x101', 'x102', 'x103', 'x104', 'x1

In [28]:
whole_set = set()
for x in best_features:
  whole_set = whole_set.union(x)

whole_set

{'x101', 'x102', 'x103', 'x104', 'x105', 'x106', 'x6', 'x9'}

In [35]:
from itertools import combinations

scores = []

for i in range(len(whole_set), 2, -1):
  for features in combinations(whole_set, i):
    X_train_tmp = X_train[list(features)]
    X_test_tmp = X_test[list(features)]
    model = RandomForestClassifier()
    model.fit(X_train_tmp, np.ravel(y_train))
    score = calculate_score(model, X_test_tmp, np.ravel(y_test))
    score_train = calculate_score(model, X_train_tmp, np.ravel(y_train))
    scores.append({'features': features, 'score_test': score, 'score_train': score_train})
    print(f"Features: {features}, score test: {score}, score train: {score_train}")

Features: ('x105', 'x9', 'x102', 'x6', 'x103', 'x106', 'x101', 'x104'), score test: 5133.466933867735, score train: 8400.0
Features: ('x105', 'x9', 'x102', 'x6', 'x103', 'x106', 'x101'), score test: 5233.266533066131, score train: 8600.0
Features: ('x105', 'x9', 'x102', 'x6', 'x103', 'x106', 'x104'), score test: 5253.306613226452, score train: 8600.0
Features: ('x105', 'x9', 'x102', 'x6', 'x103', 'x101', 'x104'), score test: 5213.226452905811, score train: 8600.0
Features: ('x105', 'x9', 'x102', 'x6', 'x106', 'x101', 'x104'), score test: 4912.625250501002, score train: 8600.0
Features: ('x105', 'x9', 'x102', 'x103', 'x106', 'x101', 'x104'), score test: 5313.426853707414, score train: 8600.0
Features: ('x105', 'x9', 'x6', 'x103', 'x106', 'x101', 'x104'), score test: 5333.466933867735, score train: 8600.0
Features: ('x105', 'x102', 'x6', 'x103', 'x106', 'x101', 'x104'), score test: 5313.426853707414, score train: 8600.0
Features: ('x9', 'x102', 'x6', 'x103', 'x106', 'x101', 'x104'), scor

In [42]:
pd.DataFrame(scores).sort_values(by='score_test', ascending=False).head(10)['features'].to_list()

[('x105', 'x102', 'x103', 'x106', 'x101'),
 ('x102', 'x103', 'x106', 'x101', 'x104'),
 ('x102', 'x106', 'x101', 'x104'),
 ('x102', 'x103', 'x106', 'x101'),
 ('x102', 'x106', 'x104'),
 ('x102', 'x103', 'x101', 'x104'),
 ('x105', 'x9', 'x103', 'x101', 'x104'),
 ('x102', 'x6', 'x103', 'x106', 'x104'),
 ('x105', 'x102', 'x103', 'x104'),
 ('x105', 'x9', 'x102', 'x103', 'x104')]