### Calculate the fitness value for a chromosome of 0s and 1s

In [1]:
pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [4]:
import numpy as np
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from genetic_selection import GeneticSelectionCV

def compare_feature_selection_methods(X, y, clf, pop_size=50, mutation_prob=0.1, crossover_prob=0.8, n_generations=50, n_tournament=5, n_jobs=-1):
    """Compare a hand-written genetic feature selector with GeneticSelectionCV.

    The data is split once (80/20, ``random_state=42``). Both selectors only
    ever see the training part, and both are scored with ``f1_score`` on the
    same held-out part, so the two numbers are directly comparable.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Missing values are filled by ``SimpleImputer`` fitted
        on the training part only.
    y : array-like of shape (n_samples,)
        Labels. ``f1_score`` is called with its default arguments.
    clf : estimator
        Classifier used for the cross-validated fitness, for
        ``GeneticSelectionCV`` and for the final fits. It is fitted in place.
    pop_size : int
        Number of chromosomes (0/1 feature masks) per generation.
    mutation_prob : float
        Per-gene probability of flipping a bit.
    crossover_prob : float
        Probability that a selected pair undergoes one-point crossover.
    n_generations : int
        Number of generations to evolve.
    n_tournament : int
        Number of chromosomes drawn (with replacement) per tournament.
    n_jobs : int
        Passed to ``joblib.Parallel`` when scoring a population.

    Returns
    -------
    tuple
        ``(f1_score_adaptive, f1_score_genetic)``: held-out F1 of the custom
        genetic search and of ``GeneticSelectionCV`` respectively.

    Raises
    ------
    ValueError
        If ``pop_size`` or ``n_tournament`` is smaller than 1, or if no
        feature column is left after imputation.
    """
    if pop_size < 1:
        raise ValueError("pop_size must be at least 1")
    if n_tournament < 1:
        raise ValueError("n_tournament must be at least 1")

    # Split before anything is fitted so the test rows cannot influence the
    # imputer or the feature search.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Handle missing values (statistics learned from the training rows only)
    imputer = SimpleImputer()
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Count features after imputation: the chromosome length has to match the
    # matrix that is actually indexed below.
    n_features = X_train.shape[1]
    if n_features == 0:
        raise ValueError("X has no usable feature columns")

    # Initialize population; every chromosome must select at least one feature
    population = np.random.choice([0, 1], size=(pop_size, n_features))
    for chromosome in population:
        if not chromosome.any():
            chromosome[np.random.randint(0, n_features)] = 1

    # Fitness function
    def fitness(chromosome):
        mask = chromosome == 1
        # Check for an empty selection first: the estimator cannot be
        # cross-validated on a matrix with zero columns.
        if not mask.any():
            return 0
        return np.mean(cross_val_score(clf, X_train[:, mask], y_train, cv=5))

    # Scores already computed, keyed by the raw bytes of the chromosome, so a
    # feature mask is cross-validated once no matter how often it reappears.
    fitness_cache = {}

    def evaluate(individuals):
        keys = [chromosome.tobytes() for chromosome in individuals]
        pending = {}
        for key, chromosome in zip(keys, individuals):
            if key not in fitness_cache and key not in pending:
                pending[key] = chromosome
        if pending:
            # Parallelize the fitness function evaluation using joblib
            scores = Parallel(n_jobs=n_jobs)(delayed(fitness)(chromosome) for chromosome in pending.values())
            fitness_cache.update(zip(pending.keys(), scores))
        return np.array([fitness_cache[key] for key in keys])

    # Selection, crossover, and mutation functions
    def tournament_selection(individuals, scores, n_tournament):
        indices = np.random.randint(0, len(individuals), n_tournament)
        return individuals[indices[np.argmax(scores[indices])]]

    def crossover(parent1, parent2, crossover_prob):
        # A cut point needs at least two genes; randint's upper bound is
        # exclusive, so this draws from 1 .. len - 1.
        if len(parent1) > 1 and np.random.rand() < crossover_prob:
            point = np.random.randint(1, len(parent1))
            return np.hstack([parent1[:point], parent2[point:]]), np.hstack([parent2[:point], parent1[point:]])
        return parent1, parent2

    def mutate(chromosome, mutation_prob):
        mutation_indices = np.random.rand(len(chromosome)) < mutation_prob
        mutated_chromosome = chromosome.copy()
        mutated_chromosome[mutation_indices] = 1 - chromosome[mutation_indices]

        # Ensure at least one feature is selected
        if np.sum(mutated_chromosome) == 0:
            mutated_chromosome[np.random.randint(0, len(chromosome))] = 1

        return mutated_chromosome

    # Adaptive genetic feature selection algorithm
    for _ in range(n_generations):
        scores = evaluate(population)
        new_population = []

        # Children arrive in pairs; truncating afterwards keeps the
        # population at exactly pop_size even when pop_size is odd.
        while len(new_population) < pop_size:
            parent1 = tournament_selection(population, scores, n_tournament)
            parent2 = tournament_selection(population, scores, n_tournament)
            child1, child2 = crossover(parent1, parent2, crossover_prob)
            child1, child2 = mutate(child1, mutation_prob), mutate(child2, mutation_prob)
            new_population.extend([child1, child2])

        population = np.array(new_population[:pop_size])

    # Get the best solution of the final generation
    fitness_scores = evaluate(population)
    best_solution = population[np.argmax(fitness_scores)]
    best_mask = best_solution == 1

    clf.fit(X_train[:, best_mask], y_train)
    y_pred = clf.predict(X_test[:, best_mask])
    f1_score_adaptive = f1_score(y_test, y_pred)

    # Reference method, trained and scored on the very same split
    selector = GeneticSelectionCV(clf, cv=5)
    selector.fit(X_train, y_train)

    clf.fit(X_train[:, selector.support_], y_train)
    y_pred_genetic = clf.predict(X_test[:, selector.support_])
    f1_score_genetic = f1_score(y_test, y_pred_genetic)

    return f1_score_adaptive, f1_score_genetic


In [5]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

# Read the heart-disease table; the 'target' column holds the label.
data = pd.read_csv('heart.csv')
y = data['target'].values
X = data.drop(columns='target').values

# Reduce the feature matrix to its leading principal components.
n_components = 10
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X)

# Classifier shared by both feature-selection methods.
clf = RandomForestClassifier()

# Run the comparison on the PCA-transformed features.
f1_score_adaptive, f1_score_genetic = compare_feature_selection_methods(X_pca, y, clf)

# Report both F1 scores.
for label, value in (
    ("F1-score of the adaptive genetic feature selection:", f1_score_adaptive),
    ("F1-score of the GeneticSelectionCV:", f1_score_genetic),
):
    print(label, value)


F1-score of the adaptive genetic feature selection: 0.9852216748768473
F1-score of the GeneticSelectionCV: 0.9852216748768473


In [9]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from genetic_selection import GeneticSelectionCV
from sklearn.datasets import load_breast_cancer

# Breast-cancer dataset bundled with scikit-learn.
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 20% of the rows for the final accuracy check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator used by both approaches.
clf = LogisticRegression(solver='liblinear', multi_class='ovr', max_iter=1000)

# GeneticSelectionCV: search for a feature subset on the training split.
ga_settings = dict(
    cv=5,
    verbose=1,
    scoring='accuracy',
    n_population=50,
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_generations=10,
    crossover_independent_proba=0.5,
    mutation_independent_proba=0.05,
    n_gen_no_change=3,
    n_jobs=-1,
)
selector = GeneticSelectionCV(clf, **ga_settings)
selector = selector.fit(X_train, y_train)

# Keep only the selected columns, then refit and score the classifier.
X_train_genetic, X_test_genetic = (selector.transform(part) for part in (X_train, X_test))
clf_genetic = clf.fit(X_train_genetic, y_train)
y_pred_genetic = clf_genetic.predict(X_test_genetic)
acc_genetic = accuracy_score(y_test, y_pred_genetic)

# GridSearchCV: tune C and the penalty on all features.
param_grid = {'C': np.logspace(-4, 4, 20), 'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
y_pred_grid = grid_search.predict(X_test)
acc_grid = accuracy_score(y_test, y_pred_grid)

# Report both test accuracies as percentages.
for template, accuracy in (
    ("Accuracy of GeneticSelectionCV: {:.2f}%", acc_genetic),
    ("Accuracy of GridSearchCV: {:.2f}%", acc_grid),
):
    print(template.format(accuracy * 100))


Selecting features with genetic algorithm.
gen	nevals	avg                               	std                            	min                            	max                               
0  	50    	[  0.904308  15.88       0.024218]	[ 0.083987  8.860339  0.008051]	[ 0.628571  1.        0.004396]	[  0.951648  30.         0.039682]
1  	37    	[  0.945626  22.1        0.024816]	[ 0.006892  5.675385  0.00641 ]	[ 0.914286  4.        0.01615 ]	[  0.951648  30.         0.043068]
2  	32    	[  0.948967  23.08       0.022671]	[ 0.002498  4.073524  0.005126]	[  0.940659  11.         0.017855]	[  0.951648  30.         0.03304 ]
3  	25    	[  0.950681  23.22       0.020855]	[ 0.001401  3.074345  0.004396]	[  0.947253  17.         0.017855]	[  0.953846  29.         0.03304 ]
4  	34    	[  0.95156   22.48       0.023129]	[ 0.00298   2.968097  0.004743]	[  0.938462  15.         0.017855]	[  0.958242  27.         0.034331]
5  	29    	[  0.953363  21.14       0.02597 ]	[ 0.003413  3.358631  0.00427 ]	

In [22]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from genetic_selection import GeneticSelectionCV

# Read the heart-disease table; the 'target' column holds the label.
df = pd.read_csv('heart.csv')
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows for the final accuracy check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator used by both approaches.
clf = LogisticRegression(solver='liblinear', multi_class='ovr', max_iter=1000)

# Genetic feature search, configured in one place and fitted on the training split.
ga_settings = dict(
    cv=5,
    verbose=1,
    scoring='accuracy',
    n_population=50,
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_generations=10,
    crossover_independent_proba=0.5,
    mutation_independent_proba=0.05,
    n_gen_no_change=3,
    n_jobs=-1,
)
selector = GeneticSelectionCV(clf, **ga_settings)
selector.fit(X_train, y_train)

# Rebuild labelled frames that contain only the selected columns.
selected_columns = X.columns[selector.support_]
X_train_genetic = pd.DataFrame(selector.transform(X_train), columns=selected_columns)
X_test_genetic = pd.DataFrame(selector.transform(X_test), columns=selected_columns)

# Refit the classifier on the reduced training set and score it on the test set.
clf_genetic = clf.fit(X_train_genetic, y_train)
y_pred_genetic = clf_genetic.predict(X_test_genetic)
acc_genetic = accuracy_score(y_test, y_pred_genetic)

# GridSearchCV: tune C and the penalty on all features.
param_grid = {'C': np.logspace(-4, 4, 20), 'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
y_pred_grid = grid_search.predict(X_test)
acc_grid = accuracy_score(y_test, y_pred_grid)

# Report both test accuracies as percentages.
for template, accuracy in (
    ("Accuracy of GeneticSelectionCV: {:.2f}%", acc_genetic),
    ("Accuracy of GridSearchCV: {:.2f}%", acc_grid),
):
    print(template.format(accuracy * 100))


Selecting features with genetic algorithm.
gen	nevals	avg                            	std                            	min                            	max                               
0  	50    	[ 0.762634  6.12      0.033915]	[ 0.073385  3.519318  0.011657]	[ 0.512195  1.        0.012556]	[  0.856098  13.         0.058942]
1  	28    	[ 0.813732  8.62      0.030934]	[ 0.035732  2.770487  0.010885]	[ 0.693902  2.        0.012556]	[  0.856098  13.         0.056756]
2  	24    	[  0.836488  10.46       0.031573]	[ 0.018907  1.813395  0.007802]	[ 0.779268  6.        0.016632]	[  0.862195  13.         0.054946]
3  	28    	[  0.852293  11.3        0.03459 ]	[ 0.00947   1.526434  0.005166]	[ 0.818293  7.        0.019512]	[  0.862195  13.         0.05268 ]
4  	31    	[  0.857878  11.8        0.034344]	[ 0.004577  0.489898  0.003479]	[  0.843902  11.         0.019512]	[  0.862195  13.         0.042526]
5  	25    	[  0.858171  11.82       0.03514 ]	[ 0.00784   0.653911  0.002831]	[ 0.829268  9. 



Accuracy of GeneticSelectionCV: 80.00%
Accuracy of GridSearchCV: 78.54%


In [21]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from genetic_selection import GeneticSelectionCV

# Read the Parkinson's table and discard the 'name' column straight away.
df = pd.read_csv('parkinsons.data').drop(columns='name')

# 'status' is the label; every remaining column is a feature.
y = df['status']
X = df.drop(columns='status')

# Hold out 20% of the rows for the final accuracy check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator used by both approaches.
clf = LogisticRegression(solver='liblinear', multi_class='ovr', max_iter=1000)

# Genetic feature search, configured in one place and fitted on the training split.
ga_settings = dict(
    cv=5,
    verbose=1,
    scoring='accuracy',
    n_population=50,
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_generations=10,
    crossover_independent_proba=0.5,
    mutation_independent_proba=0.05,
    n_gen_no_change=3,
    n_jobs=-1,
)
selector = GeneticSelectionCV(clf, **ga_settings)
selector.fit(X_train, y_train)

# Boolean mask of the chosen features and the matching column names.
selected_features = selector.support_
feature_names = X_train.columns[selected_features]

# Reduced train/test sets, rebuilt as frames that carry the feature names.
X_train_genetic = pd.DataFrame(selector.transform(X_train), columns=feature_names)
X_test_genetic = pd.DataFrame(selector.transform(X_test), columns=feature_names)

# Refit the classifier on the reduced training set and score it on the test set.
clf_genetic = clf.fit(X_train_genetic, y_train)
y_pred_genetic = clf_genetic.predict(X_test_genetic)
acc_genetic = accuracy_score(y_test, y_pred_genetic)

# GridSearchCV: tune C and the penalty on all features.
param_grid = {'C': np.logspace(-4, 4, 20), 'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
y_pred_grid = grid_search.predict(X_test)
acc_grid = accuracy_score(y_test, y_pred_grid)

# Report both test accuracies as percentages.
for template, accuracy in (
    ("Accuracy of GeneticSelectionCV: {:.2f}%", acc_genetic),
    ("Accuracy of GridSearchCV: {:.2f}%", acc_grid),
):
    print(template.format(accuracy * 100))


Selecting features with genetic algorithm.
gen	nevals	avg                               	std                            	min                            	max                               
0  	50    	[  0.806024  13.28       0.043785]	[ 0.044486  6.431298  0.017228]	[ 0.730847  2.        0.009274]	[  0.865121  22.         0.07579 ]
1  	24    	[  0.837073  16.16       0.054034]	[ 0.017872  4.153842  0.010139]	[ 0.763105  2.        0.030185]	[  0.865121  22.         0.093514]
2  	32    	[  0.844423  17.26       0.055307]	[ 0.013409  2.855241  0.011372]	[  0.782056  11.         0.030185]	[  0.865323  22.         0.094079]
3  	31    	[  0.846379  16.52       0.054942]	[ 0.011425  2.467711  0.010526]	[  0.807863  12.         0.030185]	[  0.865323  22.         0.078288]
4  	33    	[  0.85173   15.4        0.052272]	[ 0.008632  1.865476  0.010009]	[  0.833468  10.         0.025378]	[  0.865323  19.         0.078288]
5  	29    	[  0.85294  15.04      0.05132]   	[ 0.015882  1.865047  0.011324]	



Accuracy of GeneticSelectionCV: 87.18%
Accuracy of GridSearchCV: 92.31%


In [23]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from genetic_selection import GeneticSelectionCV

# Read the cervical-cancer table; the 'ca_cervix' column holds the label.
df = pd.read_csv('cervical.csv')
y = df['ca_cervix']
X = df.drop(columns='ca_cervix')

# Hold out 20% of the rows for the final accuracy check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator used by both approaches.
clf = LogisticRegression(solver='liblinear', multi_class='ovr', max_iter=1000)

# Genetic feature search, configured in one place and fitted on the training split.
ga_settings = dict(
    cv=5,
    verbose=1,
    scoring='accuracy',
    n_population=50,
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_generations=10,
    crossover_independent_proba=0.5,
    mutation_independent_proba=0.05,
    n_gen_no_change=3,
    n_jobs=-1,
)
selector = GeneticSelectionCV(clf, **ga_settings)
selector.fit(X_train, y_train)

# Rebuild labelled frames that contain only the selected columns.
selected_columns = X.columns[selector.support_]
X_train_genetic = pd.DataFrame(selector.transform(X_train), columns=selected_columns)
X_test_genetic = pd.DataFrame(selector.transform(X_test), columns=selected_columns)

# Refit the classifier on the reduced training set and score it on the test set.
clf_genetic = clf.fit(X_train_genetic, y_train)
y_pred_genetic = clf_genetic.predict(X_test_genetic)
acc_genetic = accuracy_score(y_test, y_pred_genetic)

# GridSearchCV: tune C and the penalty on all features.
param_grid = {'C': np.logspace(-4, 4, 20), 'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
y_pred_grid = grid_search.predict(X_test)
acc_grid = accuracy_score(y_test, y_pred_grid)

# Report both test accuracies as percentages.
for template, accuracy in (
    ("Accuracy of GeneticSelectionCV: {:.2f}%", acc_genetic),
    ("Accuracy of GridSearchCV: {:.2f}%", acc_grid),
):
    print(template.format(accuracy * 100))


Selecting features with genetic algorithm.
gen	nevals	avg                               	std                            	min                            	max                               
0  	50    	[  0.846333  11.32       0.085427]	[ 0.055195  5.139805  0.022583]	[ 0.754545  1.        0.033402]	[  0.948485  19.         0.151014]
1  	30    	[  0.886697  14.         0.083724]	[ 0.047194  3.959798  0.025034]	[ 0.75303   2.        0.007423]	[  0.948485  19.         0.152301]
2  	30    	[  0.918333  16.18       0.077412]	[ 0.031148  2.471356  0.02193 ]	[ 0.825758  8.        0.034015]	[  0.965152  19.         0.126549]
3  	24    	[  0.933     16.78       0.071232]	[ 0.022953  1.345957  0.016402]	[  0.825758  12.         0.035013]	[  0.965152  19.         0.112203]
4  	25    	[  0.948606  17.06       0.064568]	[ 0.013309  1.138596  0.016505]	[  0.892424  13.         0.035013]	[  0.965152  18.         0.10632 ]
5  	27    	[  0.95603   16.34       0.054268]	[ 0.009709  1.031698  0.01357 ]	[  



Accuracy of GeneticSelectionCV: 80.00%
Accuracy of GridSearchCV: 86.67%
