In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from scipy.stats import rankdata
import warnings

# Suppress every warning category for the rest of the session
warnings.filterwarnings("ignore")

# Read the "Full_new" sheet, drop three columns that are not used as features,
# force every remaining column to numeric (unparseable cells become NaN) and
# fill the resulting gaps with the per-column median
file_path = "PCOS_data_without_infertility.xlsx"
df = (
    pd.read_excel(file_path, sheet_name="Full_new")
    .drop(columns=['Sl. No', 'Patient File No.', 'Unnamed: 44'])
    .apply(pd.to_numeric, errors='coerce')
)
df = df.fillna(df.median())

# Re-code the listed columns as consecutive integer labels; the single encoder
# object is refit for each column, and names absent from the frame are skipped
le = LabelEncoder()
categorical_columns = [
    'Blood Group',
    'Cycle(R/I)',
    'Pregnant(Y/N)',
    'Weight gain(Y/N)',
    'hair growth(Y/N)',
    'Skin darkening (Y/N)',
    'Hair loss(Y/N)',
    'Pimples(Y/N)',
    'Fast food (Y/N)',
    'Reg.Exercise(Y/N)',
]
for col in categorical_columns:
    if col not in df.columns:
        continue
    df[col] = le.fit_transform(df[col])

# Target is the 'PCOS (Y/N)' column; every other column is a feature
y = df['PCOS (Y/N)']
X = df.drop(columns=['PCOS (Y/N)'])

# Function to evaluate the SVM classifier
def evaluate_svm(X_data, y_data, *, test_size=0.1, random_state=42):
    """Fit a default ``SVC`` on a hold-out split and return its test accuracy.

    Parameters
    ----------
    X_data
        Feature table passed to ``train_test_split`` and then ``SVC.fit``.
    y_data
        Target labels aligned with ``X_data``.
    test_size
        Size of the held-out portion, forwarded to ``train_test_split``.
        Defaults to 0.1 (a 90/10 split), the value that used to be hard-coded.
    random_state
        Seed for the shuffle performed by ``train_test_split``. Defaults to 42,
        the value that used to be hard-coded, so existing calls give the same
        split as before.

    Returns
    -------
    The value of ``accuracy_score`` computed on the held-out samples.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=test_size, random_state=random_state
    )

    # NOTE(review): the features reach SVC exactly as given (no scaling step
    # here) -- confirm that this is intended for the default kernel.
    svm_clf = SVC()
    svm_clf.fit(X_train, y_train)

    # Score the predictions for the held-out rows against their true labels
    return accuracy_score(y_test, svm_clf.predict(X_test))

# Feature subset chosen by Ensemble Filter + BEEO (RL) + SVM (Proposed)
selected_features_proposed = [
    "Follicle No. (L)",
    "hair growth(Y/N)",
    "Follicle No. (R)",
    "Cycle(R/I)",
    "Fast food (Y/N)",
    "Skin darkening (Y/N)",
    "Cycle length(days)",
    "FSH/LH",
]

# Feature subset chosen by the Ensemble Filter only
ensemble_selected_features = [
    "Follicle No. (L)",
    "hair growth(Y/N)",
    "Follicle No. (R)",
    "Cycle(R/I)",
    "Fast food (Y/N)",
    "AMH(ng/mL)",
    "Skin darkening (Y/N)",
    "Weight gain(Y/N)",
    "Pimples(Y/N)",
    "Cycle length(days)",
    "Hip(inch)",
    "Weight (Kg)",
    "FSH/LH",
    "FSH(mIU/mL)",
]

# Feature subset chosen by RL alone, without the ensemble filter.
# Some names carry leading/trailing spaces or unusual spelling; presumably they
# mirror the spreadsheet headers, so keep them verbatim.
rl_selected_features = [
    " Age (yrs)",
    "Height(Cm) ",
    "RR (breaths/min)",
    "No. of aborptions",
    "Hip(inch)",
    "Waist:Hip Ratio",
    "AMH(ng/mL)",
    "PRG(ng/mL)",
    "Fast food (Y/N)",
    "Follicle No. (R)",
    "Avg. F size (L) (mm)",
    "Endometrium (mm)",
]

# Check if all selected features exist in the DataFrame
def validate_features(features, X):
    """Tell whether every name in ``features`` is a column of ``X``.

    When at least one name is absent, the absent names are printed (in the
    order they appear in ``features``) and False is returned; otherwise the
    function returns True without printing anything.
    """
    known_columns = X.columns
    absent = [name for name in features if name not in known_columns]
    if not absent:
        return True
    print(f"Missing features: {absent}")
    return False

# Run the comparison only when all three feature subsets name existing columns
# of X; checking stops at the first subset that reports a missing name
if not all(
    validate_features(subset, X)
    for subset in (
        selected_features_proposed,
        ensemble_selected_features,
        rl_selected_features,
    )
):
    print("Validation failed due to missing features. Please check your selected features.")
else:
    # One hold-out SVM accuracy per feature-selection strategy
    accuracy_proposed = evaluate_svm(X[selected_features_proposed], y)
    accuracy_ensemble = evaluate_svm(X[ensemble_selected_features], y)
    accuracy_rl = evaluate_svm(X[rl_selected_features], y)
    accuracy_all = evaluate_svm(X, y)

    # Method label -> accuracy, in the order the rows are reported
    accuracies = {
        "Ensemble Filter + BEEO (RL) + SVM (Proposed)": accuracy_proposed,
        "With Ensemble Filter & BEO alone (No RL)": accuracy_ensemble,
        "Without Ensemble Filter & with BEO-RL": accuracy_rl,
        "Without Filter & Wrapper (All 44 features to SVM classifier)": accuracy_all,
    }
    accuracy_values = np.array([*accuracies.values()])

    # Negating the scores makes the highest accuracy receive rank 1; tied
    # accuracies share the average of the ranks they span.
    # NOTE(review): these ranks come from a single accuracy per method, so the
    # "Friedman Mean Rank" column equals the plain rank -- confirm whether
    # averaging over several runs was intended.
    ranks = rankdata(-accuracy_values)

    # NOTE(review): ranking the ranks again reproduces the same values here,
    # so "Final Rank" repeats the two preceding columns.
    final_ranks = rankdata(ranks)

    # Assemble the report table with all rank columns side by side
    df_results = pd.DataFrame({
        "Method": list(accuracies),
        "Accuracy": accuracy_values,
        "Friedman Mean Rank": ranks,
        "Rank": ranks,
        "Final Rank": final_ranks,
    })

    print("\n--- Accuracy, Friedman Mean Ranks, and Final Ranks ---")
    print(df_results)



--- Accuracy, Friedman Mean Ranks, and Final Ranks ---
                                              Method  Accuracy  \
0       Ensemble Filter + BEEO (RL) + SVM (Proposed)  0.854545   
1           With Ensemble Filter & BEO alone (No RL)  0.672727   
2              Without Ensemble Filter & with BEO-RL  0.690909   
3  Without Filter & Wrapper (All 44 features to S...  0.690909   

   Friedman Mean Rank  Rank  Final Rank  
0                 1.0   1.0         1.0  
1                 4.0   4.0         4.0  
2                 2.5   2.5         2.5  
3                 2.5   2.5         2.5  
