In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors
from scipy.stats import friedmanchisquare
import warnings
warnings.filterwarnings('ignore')

class BMFK:
    """Fuzzy k-nearest-neighbour classifier with Bonferroni-mean aggregation.

    For every query point the k nearest training points are found with a
    Euclidean ``NearestNeighbors`` index. Each neighbour receives a fuzzy
    membership ``1 / (d ** (2 / (m - 1)) + eps)``, normalised so that the
    memberships of one query sum to one. The memberships of the neighbours
    belonging to a class are then aggregated with a Bonferroni mean, and the
    class with the largest aggregate is predicted.

    Parameters
    ----------
    n_neighbors : int, default 5
        Number of neighbours consulted per query.
    m : float, default 2
        Fuzzy strength parameter; must be greater than 1 because the
        membership exponent is ``2 / (m - 1)``.
    p, q : float, default 2
        Bonferroni mean exponents; ``p + q`` must be non-zero because the
        mean is raised to ``1 / (p + q)``.

    Raises
    ------
    ValueError
        If ``m <= 1`` or ``p + q == 0``.
    """

    # Added to the membership denominator so a zero distance (query equal to
    # a training point) does not divide by zero.
    _EPS = 1e-8

    def __init__(self, n_neighbors=5, m=2, p=2, q=2):
        if m <= 1:
            raise ValueError(f"m must be greater than 1, got {m!r}")
        if p + q == 0:
            raise ValueError("p + q must be non-zero")
        self.n_neighbors = n_neighbors
        self.m = m  # fuzzy strength parameter
        self.p = p  # Bonferroni p parameter
        self.q = q  # Bonferroni q parameter

    def fit(self, X, y):
        """Store the training data and build the neighbour index.

        Parameters
        ----------
        X : array-like, one row per training sample.
        y : array-like of labels, one per row of ``X``.

        Returns
        -------
        BMFK
            The fitted instance, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of entries.
        """
        # Plain arrays guarantee positional indexing in predict(); a pandas
        # Series would otherwise be indexed by label.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y have inconsistent lengths: {len(X)} != {len(y)}"
            )
        self.X = X
        self.y = y
        self.classes = np.unique(y)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors, metric='minkowski', p=2)
        self.nn.fit(X)
        return self

    def bonferroni_mean(self, values):
        """Return the Bonferroni mean of ``values`` for exponents ``p`` and ``q``.

        Computes ``(sum_{i != j} v_i**p * v_j**q / (n * (n - 1))) ** (1 / (p + q))``.
        With zero or one value the plain arithmetic mean is returned instead,
        since no ``i != j`` pair exists.
        """
        values = np.asarray(values, dtype=float)
        n = len(values)
        if n <= 1:
            return np.mean(values)
        # All pairwise products at once; the diagonal holds the i == j terms,
        # which the Bonferroni mean excludes. Zeroing them (rather than
        # subtracting them from the total) avoids cancellation error.
        pair_products = np.outer(values ** self.p, values ** self.q)
        np.fill_diagonal(pair_products, 0.0)
        return (pair_products.sum() / (n * (n - 1))) ** (1 / (self.p + self.q))

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Parameters
        ----------
        X : array-like, one row per query sample.

        Returns
        -------
        numpy.ndarray
            One predicted label per query row; an empty array for empty input.
        """
        X = np.asarray(X)
        if X.shape[0] == 0:
            return np.array([])

        # One batched neighbour query instead of one query per sample.
        distances, indices = self.nn.kneighbors(X)
        exponent = 2 / (self.m - 1)

        predictions = []
        for row_distances, row_indices in zip(distances, indices):
            neighbor_labels = self.y[row_indices]

            # Closer neighbours get larger memberships; normalise to sum to 1.
            memberships = 1 / (row_distances ** exponent + self._EPS)
            memberships = memberships / np.sum(memberships)

            class_memberships = {}
            for c in self.classes:
                in_class = neighbor_labels == c
                if np.any(in_class):
                    class_memberships[c] = self.bonferroni_mean(memberships[in_class])
                else:
                    # A class absent from the neighbourhood gets no support.
                    class_memberships[c] = 0

            # Ties resolve to the first class in self.classes order.
            predictions.append(max(class_memberships, key=class_memberships.get))

        return np.array(predictions)

# Load and preprocess data
def load_and_preprocess_data(file_path):
    """Read the ``Full_new`` sheet and return scaled features plus labels.

    Steps, in order:
      1. Drop the bookkeeping columns ``'Sl. No'``, ``'Patient File No.'``
         and ``'Unnamed: 44'`` when they are present.
      2. Coerce every column to numeric; unparseable cells become NaN.
      3. Drop rows whose ``'PCOS (Y/N)'`` label is missing, so that no label
         is ever invented by imputation.
      4. Fill the remaining NaNs with the per-column median.
      5. Re-encode the listed categorical columns as integer category codes.
      6. Standardise all feature columns with ``StandardScaler``.

    NOTE: the medians and the scaler are fitted on every retained row. If the
    result is cross-validated afterwards, statistics of the held-out folds are
    therefore already baked into the features.

    Parameters
    ----------
    file_path : path of the Excel workbook, passed to ``pandas.read_excel``.

    Returns
    -------
    X_scaled : numpy.ndarray
        Standardised feature matrix, one row per retained sample.
    y : numpy.ndarray
        Values of the ``'PCOS (Y/N)'`` column for the same rows.
    feature_names : pandas.Index
        Column names matching the columns of ``X_scaled``.

    Raises
    ------
    KeyError
        If the sheet has no ``'PCOS (Y/N)'`` column.
    """
    target_column = 'PCOS (Y/N)'

    df = pd.read_excel(file_path, sheet_name="Full_new")
    # errors='ignore': exports of the sheet without the stray trailing column
    # (or without the id columns) should still load.
    df = df.drop(columns=['Sl. No', 'Patient File No.', 'Unnamed: 44'], errors='ignore')
    df = df.apply(pd.to_numeric, errors='coerce')

    if target_column not in df.columns:
        raise KeyError(f"required column {target_column!r} not found in sheet 'Full_new'")

    # A missing label must not be imputed like a feature value.
    df = df.dropna(subset=[target_column])
    df = df.fillna(df.median())

    categorical_columns = ['Blood Group', 'Cycle(R/I)', 'Pregnant(Y/N)', 
                          'Weight gain(Y/N)', 'hair growth(Y/N)', 
                          'Skin darkening (Y/N)', 'Hair loss(Y/N)', 
                          'Pimples(Y/N)', 'Fast food (Y/N)', 
                          'Reg.Exercise(Y/N)']
    for col in categorical_columns:
        if col in df.columns:
            # Map the distinct values of the column to 0..k-1 in sorted order.
            df[col] = df[col].astype('category').cat.codes

    X = df.drop(columns=[target_column])
    y = df[target_column].values

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y, X.columns

def run_bmfk_cv(X, y, n_splits=10, n_neighbors=5, m=2, p=1, q=1, random_state=42):
    """Evaluate a BMFK classifier with shuffled k-fold cross-validation.

    A fresh ``BMFK`` model is fitted on the training part of every fold and
    scored with ``accuracy_score`` on the held-out part.

    Parameters
    ----------
    X : array-like, one row per sample.
    y : array-like of labels, one per row of ``X``.
    n_splits : int, default 10
        Number of folds.
    n_neighbors, m, p, q : BMFK hyper-parameters, forwarded unchanged.
        The defaults (5, 2, 1, 1) reproduce the previously hard-coded model.
    random_state : int, default 42
        Seed of the fold shuffling.

    Returns
    -------
    list of float
        Accuracy of each fold, in fold order.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of entries.
    """
    # Arrays guarantee positional indexing with the fold index arrays.
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(f"X and y have inconsistent lengths: {len(X)} != {len(y)}")

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    accuracies = []
    for train_idx, test_idx in kf.split(X):
        model = BMFK(n_neighbors=n_neighbors, m=m, p=p, q=q)
        model.fit(X[train_idx], y[train_idx])
        y_pred = model.predict(X[test_idx])
        accuracies.append(accuracy_score(y[test_idx], y_pred))

    return accuracies

def calculate_friedman_ranks(cv_accuracies):
    """Rank the methods by their per-fold accuracies.

    Parameters
    ----------
    cv_accuracies : dict
        Maps a method name to its sequence of per-fold accuracies. Every
        method must provide the same number of folds.

    Returns
    -------
    mean_ranks : numpy.ndarray
        Average within-fold rank of each method, in dict order. Rank 1 is the
        best accuracy of a fold; tied methods share the average of their ranks.
    final_ranks : numpy.ndarray
        Rank of each method by its mean rank (lowest mean rank gets rank 1).
    """
    names = list(cv_accuracies.keys())
    fold_total = len(cv_accuracies[names[0]])

    # Rows are folds, columns are methods.
    scores = np.zeros((fold_total, len(names)))
    for column, name in enumerate(names):
        scores[:, column] = cv_accuracies[name]

    # Descending rank inside every fold: the highest accuracy gets rank 1.
    per_fold_ranks = pd.DataFrame(scores).rank(axis=1, ascending=False)
    mean_ranks = per_fold_ranks.mean(axis=0).to_numpy()
    final_ranks = pd.Series(mean_ranks).rank().to_numpy()

    return mean_ranks, final_ranks

# Main execution
if __name__ == "__main__":
    # Workbook holding the raw data (read from the working directory)
    file_path = "PCOS_data_without_infertility.xlsx"

    # Scaled feature matrix, labels and the column name of every feature
    X_scaled, y, feature_names = load_and_preprocess_data(file_path)

    # Feature subsets compared against each other
    ensemble_selected_features = [
        'Follicle No. (L)', 'hair growth(Y/N)', 'Follicle No. (R)',
        'Cycle(R/I)', 'Fast food (Y/N)', 'AMH(ng/mL)', 'Skin darkening (Y/N)',
        'Weight gain(Y/N)', 'Pimples(Y/N)', 'Cycle length(days)',
        'Hip(inch)', 'Weight (Kg)', 'FSH/LH', 'FSH(mIU/mL)',
    ]

    rl_selected_features = [
        ' Age (yrs)', 'Weight (Kg)', 'Pulse rate(bpm) ', 'RR (breaths/min)',
        'Hb(g/dl)', 'Cycle(R/I)', 'Cycle length(days)', 'Marraige Status (Yrs)',
        '  I   beta-HCG(mIU/mL)', 'FSH(mIU/mL)', 'Waist(inch)', 'Waist:Hip Ratio',
        'TSH (mIU/L)', 'AMH(ng/mL)', 'Vit D3 (ng/mL)', 'PRG(ng/mL)', 'RBS(mg/dl)',
        'Weight gain(Y/N)', 'hair growth(Y/N)', 'Skin darkening (Y/N)', 'Pimples(Y/N)',
        'BP _Systolic (mmHg)', 'BP _Diastolic (mmHg)', 'Follicle No. (L)', 'Follicle No. (R)',
    ]

    proposed_selected_features = [
        'Follicle No. (L)', 'hair growth(Y/N)', 'Follicle No. (R)',
        'Cycle(R/I)', 'Fast food (Y/N)', 'Skin darkening (Y/N)',
        'Cycle length(days)', 'FSH/LH',
    ]

    # Translate feature names into column positions of X_scaled; the name
    # list is materialised once and shared by all three lookups.
    all_feature_names = list(feature_names)
    ensemble_columns = [all_feature_names.index(name) for name in ensemble_selected_features]
    rl_columns = [all_feature_names.index(name) for name in rl_selected_features]
    proposed_columns = [all_feature_names.index(name) for name in proposed_selected_features]

    X_ensemble = X_scaled[:, ensemble_columns]
    X_rl = X_scaled[:, rl_columns]
    X_proposed = X_scaled[:, proposed_columns]
    X_all = X_scaled

    # Per-fold accuracies of every configuration, in reporting order
    cv_accuracies = {}
    cv_accuracies["Ensemble filter+BEEO(RL)+BMFK(proposed)"] = run_bmfk_cv(X_proposed, y)
    cv_accuracies["With ensemble filter & with BEO alone (NO RL)"] = run_bmfk_cv(X_ensemble, y)
    cv_accuracies["Without ensemble filter & with BEO-RL"] = run_bmfk_cv(X_rl, y)
    cv_accuracies["Without filter & wrapper(all 44 features to BMFK classifier)"] = run_bmfk_cv(X_all, y)

    # Mean within-fold rank per method and the ordering derived from it
    mean_ranks, final_ranks = calculate_friedman_ranks(cv_accuracies)

    # Summary table: one row per method, best rank first
    methods = list(cv_accuracies)
    mean_accuracies = [np.mean(cv_accuracies[name]) for name in methods]

    results_df = pd.DataFrame({
        'Methods': methods,
        'Accuracy': mean_accuracies,
        'Friedman mean rank': mean_ranks,
        'Rank': final_ranks,
    }).sort_values(by='Rank')

    # Show floats with four decimals
    pd.set_option('display.float_format', '{:.4f}'.format)
    print("\nResults Table:")
    print(results_df.to_string(index=False))

    # Friedman test across methods; rows of the matrix are folds, columns are
    # methods, so the transpose yields one accuracy sample per method.
    accuracies_array = np.array([cv_accuracies[name] for name in methods]).T
    statistic, p_value = friedmanchisquare(*accuracies_array.T)
    print(f"\nFriedman test statistic: {statistic:.4f}")
    print(f"p-value: {p_value:.4f}")


Results Table:
                                                     Methods  Accuracy  Friedman mean rank   Rank
                     Ensemble filter+BEEO(RL)+BMFK(proposed)    0.8559              1.6500 1.0000
                       Without ensemble filter & with BEO-RL    0.8261              2.5500 2.0000
Without filter & wrapper(all 44 features to BMFK classifier)    0.8151              2.7500 3.0000
               With ensemble filter & with BEO alone (NO RL)    0.8113              3.0500 4.0000

Friedman test statistic: 7.2667
p-value: 0.0639
