In [1]:
import pandas as pd
from cluster_comparison import perform_umap, perform_hdbscan, calculate_silhouette
import random
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed

In [2]:
# Read the four CSV inputs. The grouped-symptom file is read twice:
# once restricted to five "Grouped_*" columns and once in full.
raw_data = pd.read_csv(
    '/Users/leo/Programming/PLR/Leo/data/cleaned_data_SYMPTOMS_9_13_23_DNA.csv'
)
data_symp_groups = pd.read_csv(
    'data/skew_corr_groupadd.csv',
    usecols=[
        'Grouped_Neuro_Sensory',
        'Grouped_Cognitive_Memory',
        'Grouped_Gastrointestinal',
        'Grouped_Respiratory_Cardiac',
        'Grouped_Eye_Vision',
    ],
)
data_symp_groups_all = pd.read_csv('data/skew_corr_groupadd.csv')
demographics = pd.read_csv(
    '/Users/leo/Programming/PLR/Leo/data/non_binary_data_processed.csv'
)

# Side-by-side (axis=1) joins of the demographics with each symptom table.
# NOTE(review): this presumes both CSVs list the same subjects in the same
# row order -- confirm before relying on the combined frames.
demo_groups = pd.concat([demographics, data_symp_groups], axis=1)
demo_all = pd.concat([demographics, data_symp_groups_all], axis=1)

In [3]:
# State shared with the GA loop for its adaptive mutation schedule
increased_mutation_rate = 0.2  # rate used when the best fitness did not improve on the previous generation
last_best_fitness = -1  # best score of the previous generation; -1 before the first one

# Fitness function
def fitness(params):
    """Score one genome by clustering a dataset with UMAP followed by HDBSCAN.

    Parameters
    ----------
    params : sequence of four numbers
        ``(n_neighbors, min_dist, min_cluster_size, n_components)``.
        Genes are coerced rather than rejected: the three count-like genes
        are truncated with ``int`` and raised to at least 2, and ``min_dist``
        is clamped to ``[0.0, 1.0]``.

    Returns
    -------
    The value returned by ``calculate_silhouette`` for the embedding and its
    cluster labels (presumably a float silhouette score -- confirm in
    ``cluster_comparison``).

    Notes
    -----
    The dataset is picked at random from the module-level ``datasets`` dict
    on every call, and rows containing missing values are dropped before the
    embedding is computed.
    """
    n_neighbors, min_dist, min_cluster_size, n_components = params

    # Count-like genes may arrive as floats (or as values below 2) from the
    # seed population and from mutation, so coerce them here.
    n_neighbors = max(2, int(n_neighbors))
    min_cluster_size = max(2, int(min_cluster_size))
    n_components = max(2, int(n_components))

    # Clamp on both sides. Only the upper bound used to be enforced, which let
    # a negative min_dist through to perform_umap (presumably a thin wrapper
    # around umap.UMAP, which rejects negative values -- TODO confirm).
    min_dist = max(0.0, min(min_dist, 1.0))

    dataset = random.choice(list(datasets.values())).dropna()
    umap_result = perform_umap(dataset, n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components)
    labels = perform_hdbscan(umap_result, min_cluster_size=min_cluster_size)
    return calculate_silhouette(umap_result, labels)

# Datasets to consider
datasets = {'data_symp_groups_all': data_symp_groups_all}  # Replace with your actual dataset
print(f"number of features: {len(datasets['data_symp_groups_all'].columns)}")

# Genetic Algorithm Parameters
population_size = 180  # number of genomes evaluated per generation
n_generations = 150
selection_rate = 0.3  # fraction of population_size drawn as parents each generation
# NOTE: the GA loop reassigns mutation_rate at the start of every generation
# (to 0.1 or to increased_mutation_rate) before reading it, so this starting
# value currently has no effect on the run.
mutation_rate = 0.05


def _seed_individuals():
    """Return one batch of eight seed genomes.

    Each genome is a tuple ``(n_neighbors, min_dist, min_cluster_size,
    n_components)``. The batch holds two fixed copies of previously
    best-performing genomes, two jittered variants of them, and four genomes
    drawn at random from fixed ranges. Values are not validated here;
    ``fitness`` truncates and clamps them before use (so a seed n_neighbors of
    0.6757... is evaluated as 2).
    """
    best_n_neighbors = 0.6757706356698308
    best_min_dist = 0.04030749239323905
    return [
        # First set of individuals (close to best performing genome)
        (best_n_neighbors, best_min_dist, 29, 48),
        (best_n_neighbors, best_min_dist, 38, 35),
        # Second set of individuals (random within a specific range)
        (random.uniform(0.1, 5), random.uniform(0.045, 0.75), random.randint(30, 45), random.randint(8, 22)),
        (random.uniform(6, 20), random.uniform(0.76, 1.0), random.randint(46, 50), random.randint(23, 25)),
        # Third set of individuals (best performing genomes with slight variations)
        (best_n_neighbors + random.uniform(-0.1, 0.1), best_min_dist + random.uniform(-0.01, 0.01), 29 + random.randint(-2, 2), 48 + random.randint(-2, 2)),
        (best_n_neighbors + random.uniform(-0.1, 0.1), best_min_dist + random.uniform(-0.01, 0.01), 38 + random.randint(-2, 2), 35 + random.randint(-2, 2)),
        # Fourth set of individuals (another set of random individuals)
        (random.uniform(21, 35), random.uniform(0.26, 0.5), random.randint(21, 35), random.randint(25, 38)),
        (random.uniform(36, 45), random.uniform(0.51, 0.75), random.randint(36, 45), random.randint(39, 65)),
    ]


# Initialize population
# The previous loop ran population_size // 4 times while appending eight
# genomes per pass, producing 2 * population_size individuals for the first
# generation. Build in batches and trim so the first generation has exactly
# population_size members, matching every later generation.
population = []
while len(population) < population_size:
    population.extend(_seed_individuals())
del population[population_size:]

# To store best fitness and variance for each generation
best_fitnesses = []
variances = []


def _roulette_select(individuals, scores, k):
    """Draw ``k`` individuals with replacement, favouring higher scores.

    When every score is positive the draw is proportional to the raw score
    (classic roulette wheel). Otherwise the scores are shifted so that the
    lowest one gets a tiny positive weight: accumulating raw scores against
    ``uniform(0, sum(scores))`` is not a valid roulette wheel once scores can
    be zero or negative, and could even select nobody.
    """
    lowest = min(scores)
    if lowest > 0:
        weights = list(scores)
    else:
        # The epsilon keeps the total weight positive even if all scores tie.
        weights = [score - lowest + 1e-9 for score in scores]
    return random.choices(individuals, weights=weights, k=k)


def _mutate_gene(position):
    """Return a fresh random value for the gene at ``position``.

    Genome layout is (n_neighbors, min_dist, min_cluster_size, n_components);
    each position has its own value range.
    """
    if position == 1:
        return random.uniform(0.0, 1.0)  # min_dist
    if position == 3:
        return random.randint(2, 25)  # n_components
    return random.randint(5, 50)  # n_neighbors (0) and min_cluster_size (2)


# Number of parents drawn per generation; at least two are needed for crossover.
n_selected = max(2, int(selection_rate * population_size))

# Main GA loop
for generation in tqdm(range(n_generations), desc="Generations"):
    # Evaluate fitness of each individual in parallel using joblib.
    # Every individual is re-scored each generation, including the elite
    # carried over from the previous one.
    scores = Parallel(n_jobs=-1)(delayed(fitness)(ind) for ind in population)

    # Store best fitness and variance
    best_fitness = max(scores)
    best_fitnesses.append(best_fitness)
    variances.append(np.var(scores))

    # Adaptive mutation rate: mutate more aggressively when the best score
    # did not improve on the previous generation.
    if best_fitness <= last_best_fitness:
        mutation_rate = increased_mutation_rate
    else:
        mutation_rate = 0.1
    last_best_fitness = best_fitness

    # Elitism: remember the best individual so it survives unchanged
    best_idx = scores.index(best_fitness)
    best_individual = population[best_idx]

    # Print the top 5 individuals
    top5_idx = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:5]
    print(f"\nTop 5 individuals in generation {generation+1}:")
    for i in top5_idx:
        print(f"  Genome: {population[i]}, Fitness: {scores[i]}")
        print(f"    n_neighbors={population[i][0]}, min_dist={population[i][1]}, min_cluster_size={population[i][2]}, n_components={population[i][3]}")

    # Roulette wheel selection (with replacement)
    selected_population = _roulette_select(population, scores, n_selected)

    # Crossover (mate) the selected individuals; one slot is left free for
    # the elite individual appended below.
    children = []
    n_children = population_size - len(selected_population) - 1
    for _ in range(n_children):
        parent1, parent2 = random.sample(selected_population, 2)
        crossover_point = random.randint(1, len(parent1) - 1)
        children.append(parent1[:crossover_point] + parent2[crossover_point:])

    # Mutation: replace one gene with a value drawn from THAT gene's range.
    # (Previously the replacement was picked at random from all four genes'
    # candidate values, regardless of which position was being mutated.)
    mutations = 0
    for i, child in enumerate(children):
        if random.random() < mutation_rate:
            mutations += 1
            mutate_pos = random.randint(0, len(child) - 1)
            children[i] = child[:mutate_pos] + (_mutate_gene(mutate_pos),) + child[mutate_pos + 1:]

    print(f"Number of mutations: {mutations}")

    # Create new population and add the best individual back into it
    population = selected_population + children
    population.append(best_individual)

# Score the population left after the last generation and locate its top genome
final_scores = Parallel(n_jobs=-1)(delayed(fitness)(candidate) for candidate in population)
best_idx = max(enumerate(final_scores), key=lambda pair: pair[1])[0]
best_individual = population[best_idx]

# Report the winning hyperparameters together with their score
print("\nBest parameters found:")
print(f"  n_neighbors={best_individual[0]}, min_dist={best_individual[1]}, min_cluster_size={best_individual[2]}, n_components={best_individual[3]}")
print(f"Best silhouette score: {final_scores[best_idx]}")

# Best fitness per generation, with a band of +/- the per-generation score variance
generation_axis = range(1, n_generations + 1)
best_curve = np.array(best_fitnesses)
score_variance = np.array(variances)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(generation_axis, best_curve, label='Best Fitness')
ax.fill_between(
    generation_axis,
    best_curve - score_variance,
    best_curve + score_variance,
    color='gray',
    alpha=0.5,
    label='Variance',
)
ax.set_xlabel('Generation')
ax.set_ylabel('Fitness')
ax.set_title('Evolution of Best Fitness Over Generations')
ax.legend()
plt.show()


number of features: 112


failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitte