# In [1]:  (notebook cell prompt kept from the original export)
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors
from scipy.stats import friedmanchisquare
import warnings
warnings.filterwarnings('ignore')

# BMFK class
class BMFK:
    """Fuzzy k-nearest-neighbour classifier that scores classes with a Bonferroni mean.

    For every query sample the ``n_neighbors`` nearest training samples
    (Euclidean distance) receive a membership ``1 / (d ** (2 / (m - 1)) + 1e-8)``
    which is normalised to sum to one.  Each class is then scored with the
    Bonferroni mean (parameters ``p`` and ``q``) of the memberships of the
    neighbours carrying that label, and the highest-scoring class is predicted.

    Args:
        n_neighbors: Number of neighbours consulted per query sample.
        m: Fuzzifier used in the membership exponent; must be greater than 1.
        p: First Bonferroni-mean exponent.
        q: Second Bonferroni-mean exponent.

    Raises:
        ValueError: If ``n_neighbors < 1``, ``m <= 1`` or ``p + q == 0``.
    """

    def __init__(self, n_neighbors=5, m=2, p=2, q=2):
        if n_neighbors < 1:
            raise ValueError("n_neighbors must be at least 1")
        if m <= 1:
            # The exponent 2 / (m - 1) is undefined for m == 1.
            raise ValueError("m must be greater than 1")
        if p + q == 0:
            # The Bonferroni mean takes the 1 / (p + q) root.
            raise ValueError("p + q must be non-zero")
        self.n_neighbors = n_neighbors
        self.m = m
        self.p = p
        self.q = q

    def fit(self, X, y):
        """Store the training data and build the neighbour index.

        Args:
            X: Training samples, one row per sample.
            y: Labels, one per row of ``X``.

        Returns:
            The fitted instance (callers may ignore it).

        Raises:
            ValueError: If ``X`` and ``y`` have different numbers of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        self.X = X
        self.y = y
        self.classes = np.unique(y)
        # Never ask for more neighbours than there are training samples,
        # otherwise the neighbour query raises.
        k = min(self.n_neighbors, X.shape[0])
        self.nn = NearestNeighbors(n_neighbors=k, metric='minkowski', p=2)
        self.nn.fit(X)
        return self

    def bonferroni_mean(self, values):
        """Return the Bonferroni mean of ``values`` with exponents ``p`` and ``q``.

        Computes ``(sum_{i != j} v_i**p * v_j**q / (n * (n - 1))) ** (1 / (p + q))``.
        A single value is returned unchanged and an empty input yields ``0.0``.
        """
        values = np.asarray(values, dtype=float).ravel()
        n = values.size
        if n == 0:
            return 0.0
        if n == 1:
            return float(values[0])
        cross = np.outer(values ** self.p, values ** self.q)
        # Zero the i == j terms before summing; subtracting them afterwards
        # would lose the (often tiny) off-diagonal products to cancellation.
        np.fill_diagonal(cross, 0.0)
        return (cross.sum() / (n * (n - 1))) ** (1 / (self.p + self.q))

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Args:
            X: Query samples with the same number of columns used in ``fit``.

        Returns:
            A NumPy array with one predicted label per query sample.
        """
        X = np.asarray(X)
        if X.shape[0] == 0:
            return np.array([], dtype=self.classes.dtype)

        # One batched neighbour query instead of one query per sample.
        distances, indices = self.nn.kneighbors(X)
        memberships = 1 / (distances ** (2 / (self.m - 1)) + 1e-8)
        memberships /= memberships.sum(axis=1, keepdims=True)
        neighbor_labels = self.y[indices]

        predictions = []
        for sample_memberships, sample_labels in zip(memberships, neighbor_labels):
            class_memberships = {}
            for c in self.classes:
                in_class = sample_labels == c
                if np.any(in_class):
                    class_memberships[c] = self.bonferroni_mean(sample_memberships[in_class])
                else:
                    # A class absent from the neighbourhood gets no support.
                    class_memberships[c] = 0
            # Ties resolve to the first class in ``self.classes`` order.
            predictions.append(max(class_memberships, key=class_memberships.get))

        return np.array(predictions)

# Heuristic Algorithms
class GA:
    """Genetic algorithm that searches for a binary feature mask.

    Every individual is a 0/1 vector with one gene per feature.  Each
    generation keeps the fitter half of the population as parents, breeds
    them with single-point crossover and bit-flip mutation, and carries
    parents plus offspring into the next generation so the population size
    stays constant and good masks are never lost.

    Args:
        pop_size: Number of individuals per generation.
        n_generations: Number of generations to run.
        crossover_rate: Probability that a parent pair is recombined.
        mutation_rate: Probability that one gene of an offspring is flipped.
    """

    def __init__(self, pop_size=50, n_generations=100, crossover_rate=0.8, mutation_rate=0.1):
        self.pop_size = pop_size
        self.n_generations = n_generations
        self.crossover_rate = crossover_rate
        self.mutation_rate = mutation_rate

    def optimize(self, X, y):
        """Run the search and return ``(best_mask, best_fitness, mean_fitness)``.

        ``best_mask`` is the fittest individual seen in any generation,
        ``best_fitness`` its fitness and ``mean_fitness`` the average of the
        per-generation mean fitness values.
        """
        n_features = X.shape[1]
        population = np.random.randint(2, size=(self.pop_size, n_features))

        # Start below any reachable fitness so a solution is always recorded,
        # even when every individual scores 0.
        best_fitness = -np.inf
        best_solution = None
        fitness_history = []

        for _ in range(self.n_generations):
            fitness = self.calculate_fitness(population, X, y)
            fitness_history.append(np.mean(fitness))

            best_idx = np.argmax(fitness)
            if fitness[best_idx] > best_fitness:
                best_fitness = fitness[best_idx]
                best_solution = population[best_idx].copy()

            parents = self.selection(population, fitness)
            offspring = self.mutation(self.crossover(parents))
            # Keeping the parents restores the population to ``pop_size``;
            # replacing it with the offspring alone would halve it once and
            # turn every later selection into a no-op.
            population = np.vstack((parents, offspring))[:self.pop_size]

        return best_solution, best_fitness, np.mean(fitness_history)

    def calculate_fitness(self, population, X, y):
        """Return one fitness value per individual.

        Fitness is the accuracy of a default ``BMFK`` fitted on the selected
        columns; an individual that selects no feature scores 0.

        NOTE(review): the model is fitted and evaluated on the same samples,
        so the scores are likely optimistic -- consider a hold-out split.
        """
        fitness = []
        for individual in population:
            selected_features = X[:, individual.astype(bool)]
            if selected_features.shape[1] == 0:
                fitness.append(0)
            else:
                model = BMFK()
                model.fit(selected_features, y)
                y_pred = model.predict(selected_features)
                fitness.append(accuracy_score(y, y_pred))
        return np.array(fitness)

    def selection(self, population, fitness):
        """Return the fitter half (rounded up) of ``population``, ascending by fitness."""
        n_parents = -(-self.pop_size // 2)
        return population[np.argsort(fitness)[-n_parents:]]

    def crossover(self, parents):
        """Pair parents at random and recombine them with single-point crossover.

        The input array is left untouched.  Pairs are copied unchanged when
        the crossover draw fails or when there are fewer than two genes (no
        valid cut point exists); an unpaired last parent is copied as well.
        """
        parents = np.asarray(parents)
        shuffled = parents[np.random.permutation(len(parents))]
        n_genes = shuffled.shape[1]

        offspring = []
        for i in range(0, len(shuffled) - 1, 2):
            first, second = shuffled[i], shuffled[i + 1]
            if n_genes >= 2 and np.random.rand() < self.crossover_rate:
                point = np.random.randint(1, n_genes)
                offspring.append(np.concatenate((first[:point], second[point:])))
                offspring.append(np.concatenate((second[:point], first[point:])))
            else:
                offspring.extend([first, second])

        # With an odd number of parents the last one has no partner.
        if len(shuffled) % 2 != 0:
            offspring.append(shuffled[-1])

        return np.array(offspring, dtype=shuffled.dtype).reshape(len(offspring), n_genes)

    def mutation(self, offspring):
        """Flip one random gene of each offspring with probability ``mutation_rate``.

        ``offspring`` is modified in place and also returned.
        """
        for individual in offspring:
            if len(individual) and np.random.rand() < self.mutation_rate:
                point = np.random.randint(0, len(individual))
                individual[point] = 1 - individual[point]
        return offspring

class PSO:
    """Particle swarm optimiser over continuous positions in ``[0, 1]``.

    A feature counts as selected when its coordinate exceeds 0.5.

    Args:
        n_particles: Swarm size.
        n_iterations: Number of update rounds.
        w: Inertia weight applied to the previous velocity.
        c1: Weight of the pull towards each particle's own best position.
        c2: Weight of the pull towards the swarm's best position.
    """

    def __init__(self, n_particles=30, n_iterations=100, w=0.7, c1=1.5, c2=1.5):
        self.n_particles, self.n_iterations = n_particles, n_iterations
        self.w, self.c1, self.c2 = w, c1, c2

    def optimize(self, X, y):
        """Run the swarm and return ``(best_mask, best_fitness, mean_fitness)``.

        ``best_mask`` is the thresholded best position found,
        ``best_fitness`` its fitness and ``mean_fitness`` the average of the
        per-iteration mean fitness values.
        """
        dim = X.shape[1]
        swarm = np.random.rand(self.n_particles, dim)
        velocity = np.zeros_like(swarm)
        own_best = swarm.copy()
        own_best_score = np.zeros(self.n_particles)
        swarm_best, swarm_best_score = None, -np.inf
        mean_scores = []

        for _ in range(self.n_iterations):
            scores = self.calculate_fitness(swarm, X, y)
            mean_scores.append(np.mean(scores))

            # Remember each particle's best position so far.
            better = scores > own_best_score
            own_best[better] = swarm[better]
            own_best_score[better] = scores[better]

            # Remember the best position seen by the whole swarm.
            leader = np.argmax(scores)
            if scores[leader] > swarm_best_score:
                swarm_best = swarm[leader]
                swarm_best_score = scores[leader]

            # Inertia plus random pulls towards the personal and global bests.
            rand_own, rand_swarm = np.random.rand(2, self.n_particles, dim)
            own_pull = self.c1 * rand_own * (own_best - swarm)
            swarm_pull = self.c2 * rand_swarm * (swarm_best - swarm)
            velocity = self.w * velocity + own_pull + swarm_pull
            swarm = np.clip(swarm + velocity, 0, 1)

        return (swarm_best > 0.5).astype(int), swarm_best_score, np.mean(mean_scores)

    def calculate_fitness(self, particles, X, y):
        """Return one fitness value per particle.

        Fitness is the accuracy of a default ``BMFK`` fitted and evaluated on
        the columns whose coordinate exceeds 0.5; a particle that selects no
        column scores 0.
        """
        def score(position):
            subset = X[:, position > 0.5]
            if subset.shape[1] == 0:
                return 0
            classifier = BMFK()
            classifier.fit(subset, y)
            return accuracy_score(y, classifier.predict(subset))

        return np.array([score(position) for position in particles])

class GWO:
    """Grey wolf optimiser over continuous positions in ``[0, 1]``.

    A feature counts as selected when its coordinate exceeds 0.5.  The three
    best positions seen so far (alpha, beta, delta) steer the pack.

    Args:
        n_wolves: Pack size.
        n_iterations: Number of update rounds.
    """

    def __init__(self, n_wolves=30, n_iterations=100):
        self.n_wolves = n_wolves
        self.n_iterations = n_iterations

    def optimize(self, X, y):
        """Run the pack and return ``(best_mask, best_fitness, mean_fitness)``.

        ``best_mask`` is the thresholded alpha position, ``best_fitness`` the
        alpha fitness and ``mean_fitness`` the average of the per-iteration
        mean fitness values.

        Raises:
            ValueError: If ``n_wolves`` or ``n_iterations`` is smaller than 1.
        """
        if self.n_wolves < 1 or self.n_iterations < 1:
            raise ValueError("n_wolves and n_iterations must be at least 1")

        n_features = X.shape[1]
        wolves = np.random.rand(self.n_wolves, n_features)
        # (score, position) for alpha, beta and delta, best first.
        leaders = [(-np.inf, None)] * 3
        fitness_history = []

        for t in range(self.n_iterations):
            fitness = self.calculate_fitness(wolves, X, y)
            fitness_history.append(np.mean(fitness))

            leaders = self._update_leaders(leaders, wolves, fitness)
            alpha, beta, delta = (position for _, position in leaders)
            # With fewer than three ranked wolves reuse the next-better leader.
            if beta is None:
                beta = alpha
            if delta is None:
                delta = beta

            # ``a`` shrinks linearly from 2 as the iterations progress.
            a = 2 - t * (2 / self.n_iterations)
            pulled = (self._pull(alpha, wolves, a)
                      + self._pull(beta, wolves, a)
                      + self._pull(delta, wolves, a))
            wolves = np.clip(pulled / 3, 0, 1)

        alpha_score, alpha = leaders[0]
        return (alpha > 0.5).astype(int), alpha_score, np.mean(fitness_history)

    @staticmethod
    def _update_leaders(leaders, wolves, fitness):
        """Merge the current pack into the ``(score, position)`` leader list.

        A wolf that beats a leader takes its place and pushes the displaced
        leaders one rank down, so the three best positions are retained.
        """
        alpha, beta, delta = leaders
        for score, position in zip(fitness, wolves):
            if score > alpha[0]:
                alpha, beta, delta = (score, position.copy()), alpha, beta
            elif score > beta[0]:
                beta, delta = (score, position.copy()), beta
            elif score > delta[0]:
                delta = (score, position.copy())
        return [alpha, beta, delta]

    @staticmethod
    def _pull(leader, wolves, a):
        """Return the candidate positions obtained by moving every wolf relative to ``leader``."""
        r1 = np.random.rand(*wolves.shape)
        r2 = np.random.rand(*wolves.shape)
        A = 2 * a * r1 - a
        C = 2 * r2
        D = np.abs(C * leader - wolves)
        return leader - A * D

    def calculate_fitness(self, wolves, X, y):
        """Return one fitness value per wolf.

        Fitness is the accuracy of a default ``BMFK`` fitted on the columns
        whose coordinate exceeds 0.5; a wolf that selects no column scores 0.

        NOTE(review): the model is fitted and evaluated on the same samples,
        so the scores are likely optimistic -- consider a hold-out split.
        """
        fitness = []
        for wolf in wolves:
            selected_features = X[:, (wolf > 0.5).astype(bool)]
            if selected_features.shape[1] == 0:
                fitness.append(0)
            else:
                model = BMFK()
                model.fit(selected_features, y)
                y_pred = model.predict(selected_features)
                fitness.append(accuracy_score(y, y_pred))
        return np.array(fitness)

# Load and preprocess data
def load_and_preprocess_data(file_path):
    """Read the "Full_new" sheet and return scaled features, labels and feature names.

    Steps: drop the identifier/placeholder columns, coerce every column to
    numeric (unparseable cells become NaN), fill NaN with the column median,
    replace the listed categorical columns by their category codes, split off
    the 'PCOS (Y/N)' target and standardise the remaining columns.

    NOTE(review): the scaler is fitted on all rows, i.e. before any
    train/test split performed by the caller.

    Returns:
        Tuple ``(X_scaled, y, feature_names)``.
    """
    dropped = ['Sl. No', 'Patient File No.', 'Unnamed: 44']
    categorical_columns = (
        'Blood Group', 'Cycle(R/I)', 'Pregnant(Y/N)',
        'Weight gain(Y/N)', 'hair growth(Y/N)',
        'Skin darkening (Y/N)', 'Hair loss(Y/N)',
        'Pimples(Y/N)', 'Fast food (Y/N)',
        'Reg.Exercise(Y/N)',
    )
    target = 'PCOS (Y/N)'

    frame = (
        pd.read_excel(file_path, sheet_name="Full_new")
        .drop(columns=dropped)
        .apply(pd.to_numeric, errors='coerce')
    )
    frame = frame.fillna(frame.median())

    # Only encode the categorical columns that are actually present.
    for name in categorical_columns:
        if name not in frame.columns:
            continue
        frame[name] = frame[name].astype('category').cat.codes

    features = frame.drop(columns=[target])
    labels = frame[target].values

    return StandardScaler().fit_transform(features), labels, features.columns

def run_algorithm(algorithm, X, y, n_splits=10, return_fold_accuracies=False):
    """Cross-validate ``algorithm`` and summarise accuracy and fitness.

    The folds come from a shuffled ``KFold`` with a fixed seed, so every
    method evaluated on data with the same number of rows sees the same
    splits.  A ``BMFK`` instance is evaluated directly; any other object is
    treated as a feature-selection optimiser whose ``optimize(X, y)`` returns
    ``(feature_mask, best_fitness, mean_fitness)``, after which a default
    ``BMFK`` is trained on the selected columns.

    Args:
        algorithm: A ``BMFK`` instance or an optimiser exposing ``optimize``.
        X: Feature matrix, one row per sample.
        y: Labels, one per row of ``X``.
        n_splits: Number of cross-validation folds.
        return_fold_accuracies: When true, append the list of per-fold test
            accuracies to the returned tuple.

    Returns:
        ``(mean_accuracy, mean_best_fitness, mean_mean_fitness)``, followed
        by the per-fold accuracy list when ``return_fold_accuracies`` is set.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    accuracies = []
    best_fitnesses = []
    mean_fitnesses = []

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if isinstance(algorithm, BMFK):
            # For BMFK, we don't need feature selection
            model = algorithm
            model.fit(X_train, y_train)
            accuracy = accuracy_score(y_test, model.predict(X_test))
            best_fitness = accuracy
            mean_fitness = accuracy
        else:
            # For optimization algorithms
            best_solution, best_fitness, mean_fitness = algorithm.optimize(X_train, y_train)
            if best_solution is None:
                mask = np.ones(X_train.shape[1], dtype=bool)
            else:
                mask = np.asarray(best_solution).astype(bool)
                if not mask.any():
                    # A classifier cannot be trained on zero columns; fall
                    # back to the full feature set instead of crashing.
                    mask = np.ones_like(mask)
            model = BMFK()
            model.fit(X_train[:, mask], y_train)
            accuracy = accuracy_score(y_test, model.predict(X_test[:, mask]))

        accuracies.append(accuracy)
        best_fitnesses.append(best_fitness)
        mean_fitnesses.append(mean_fitness)

    summary = (np.mean(accuracies), np.mean(best_fitnesses), np.mean(mean_fitnesses))
    if return_fold_accuracies:
        return summary + (accuracies,)
    return summary

def calculate_friedman_ranks(cv_accuracies):
    """Return the mean rank of every method across folds (rank 1 = most accurate).

    Args:
        cv_accuracies: Mapping of method name to its per-fold accuracies; all
            lists must have the same length.

    Returns:
        NumPy array of mean ranks in the mapping's iteration order.  Ties
        share the average of the ranks they span.
    """
    methods = list(cv_accuracies.keys())
    n_methods = len(methods)
    n_folds = len(cv_accuracies[methods[0]])

    accuracy_matrix = np.zeros((n_folds, n_methods))
    for i, method in enumerate(methods):
        accuracy_matrix[:, i] = cv_accuracies[method]

    # ``rank`` is ascending, so flip it to make the highest accuracy rank 1.
    rank_matrix = n_methods + 1 - pd.DataFrame(accuracy_matrix).rank(axis=1)
    mean_ranks = rank_matrix.mean(axis=0).values

    return mean_ranks

# Main execution
if __name__ == "__main__":
    # File path
    file_path = "PCOS_data_without_infertility.xlsx"

    # Load and preprocess data
    X_scaled, y, feature_names = load_and_preprocess_data(file_path)

    # Define the proposed feature set
    proposed_selected_features = ['Follicle No. (L)', 'hair growth(Y/N)', 'Follicle No. (R)',
                                 'Cycle(R/I)', 'Fast food (Y/N)', 'Skin darkening (Y/N)',
                                 'Cycle length(days)', 'FSH/LH']

    # Prepare the proposed feature set
    all_feature_names = list(feature_names)
    X_proposed = X_scaled[:, [all_feature_names.index(feature) for feature in proposed_selected_features]]

    # Define algorithms
    algorithms = {
        "Ensemble filter+BEEO(RL)+BMFK(proposed)": (BMFK(), X_proposed),
        "GA-BMFK": (GA(), X_scaled),
        "PSO-BMFK": (PSO(), X_scaled),
        "GWO-BMFK": (GWO(), X_scaled)
    }

    # Run algorithms and collect results; the per-fold accuracies are kept
    # because both the Friedman ranks and the Friedman test compare the
    # methods fold by fold.
    results = {}
    cv_accuracies = {}
    for name, (algorithm, X) in algorithms.items():
        accuracy, best_fitness, mean_fitness, fold_accuracies = run_algorithm(
            algorithm, X, y, return_fold_accuracies=True)
        results[name] = {
            "Accuracy": accuracy,
            "Best Fitness": best_fitness,
            "Mean Fitness": mean_fitness
        }
        cv_accuracies[name] = fold_accuracies

    # Calculate Friedman ranks (mean rank over the folds)
    friedman_ranks = calculate_friedman_ranks(cv_accuracies)

    # Create results DataFrame
    results_df = pd.DataFrame({
        "Methods": list(algorithms.keys()),
        "Best Fitness": [results[name]["Best Fitness"] for name in algorithms.keys()],
        "Mean Fitness": [results[name]["Mean Fitness"] for name in algorithms.keys()],
        "Accuracy": [results[name]["Accuracy"] for name in algorithms.keys()],
        "Friedman mean rank": friedman_ranks
    })

    # Sort by Accuracy
    results_df = results_df.sort_values('Accuracy', ascending=False)

    # Format and display results
    pd.set_option('display.float_format', '{:.4f}'.format)
    print("\nResults Table:")
    print(results_df.to_string(index=False))

    # Save results to CSV before the significance test so a long run is not
    # lost if the test fails.
    results_df.to_csv("algorithm_comparison_results.csv", index=False)
    print("\nResults saved to 'algorithm_comparison_results.csv'")

    # Perform Friedman test: one sample per method, one measurement per fold
    statistic, p_value = friedmanchisquare(*cv_accuracies.values())
    print(f"\nFriedman test statistic: {statistic:.4f}")
    print(f"p-value: {p_value:.4f}")

# KeyboardInterrupt:  (notebook output kept from the original export; the cell run was interrupted)