In [12]:
%pip install deap

Defaulting to user installation because normal site-packages is not writeable
Collecting deap
  Downloading deap-1.4.1.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 2.9 MB/s eta 0:00:01
Building wheels for collected packages: deap
  Building wheel for deap (setup.py) ... [?25ldone
[?25h  Created wheel for deap: filename=deap-1.4.1-cp39-cp39-macosx_10_9_universal2.whl size=111773 sha256=6ead710c75546146f218ef0a60d3da975f9fa1fccf7283f8b51b4aea490394fb
  Stored in directory: /Users/haily/Library/Caches/pip/wheels/d9/58/fd/a64ce13f5e2324689e8bc1857c9562d53020d96cc88f5bf787
Successfully built deap
Installing collected packages: deap
Successfully installed deap-1.4.1
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
df = pd.read_csv('./data/processed_mrna_zscore.csv')

In [3]:
X = df.iloc[:, :-2]
y = df.iloc[:, -1]

Select 500 features by filtering methods.

In [8]:
file_path = './mrmr_top500_feats.txt'

# Read the content of the file and split it into lines
with open(file_path, 'r') as file:
    content = file.read().splitlines()

# Convert the content back to an array
top500 = content

The search space is now in 500 top features selected by a filtering methods

In [9]:
sel_X = X[top500]

In [14]:
import random
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from deap import base, creator, tools, algorithms

# Assuming X and y are your feature matrix and target variable
# X is a pandas DataFrame with 1800 samples and 500 features
# y is a pandas Series with the corresponding labels

# DEAP initialization
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# Function to create an individual (binary representation of features)
def create_individual():
    return [random.randint(0, 1) for _ in range(sel_X.shape[1])]

# Function to evaluate the fitness of an individual
def evaluate_individual(individual):
    selected_features = [i for i, gene in enumerate(individual) if gene]
    if not selected_features:
        return 0.0,  # Avoid all-zero individuals
    clf = RandomForestClassifier(random_state=42)
    clf.fit(sel_X.iloc[:, selected_features], y)
    y_pred = clf.predict(sel_X.iloc[:, selected_features])
    accuracy = accuracy_score(y, y_pred)
    return accuracy,

# DEAP Toolbox
toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("evaluate", evaluate_individual)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)

population_size = 50
generations = 20
crossover_rate = 0.8
mutation_rate = 0.1

population = toolbox.population(n=population_size)
algorithms.eaMuPlusLambda(population, toolbox, mu=population_size, lambda_=2*population_size,
                            cxpb=crossover_rate, mutpb=mutation_rate, ngen=generations,
                            stats=None, halloffame=None, verbose=True)

# Get the best individual from the final population
best_individual = tools.selBest(population, k=1)[0]
selected_features = [i for i, gene in enumerate(best_individual) if gene]

print("Best individual:", best_individual)
print("Selected features:", selected_features)


gen	nevals
0  	50    
1  	89    
2  	94    
3  	90    
4  	91    
5  	92    
6  	88    
7  	88    
8  	90    
9  	92    
10 	92    
11 	89    
12 	84    
13 	88    
14 	93    
15 	89    
16 	92    
17 	89    
18 	90    
19 	91    
20 	90    
Best individual: [1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0,

In [16]:
genetic_features = X.columns[selected_features]

In [17]:
from sklearn.ensemble import RandomForestClassifier
from k_fold import k_fold

k_fold(classifier=RandomForestClassifier(n_estimators=1000, n_jobs=-1), 
                X=X[genetic_features], y=y)

Fold 1: 0.868020304568528
Fold 2: 0.8629441624365483
Fold 3: 0.868020304568528
Fold 4: 0.8578680203045685
Fold 5: 0.8673469387755102
Fold 6: 0.8673469387755102
Fold 7: 0.8673469387755102
Fold 8: 0.8571428571428571
Fold 9: 0.8673469387755102
Fold 10: 0.8673469387755102
Average Weighted Accuracy: 0.865073034289858


0.865073034289858