In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
import warnings

# Ignore warnings
warnings.filterwarnings("ignore")

# Load and preprocess the PCOS dataset
file_path = "PCOS_data_without_infertility.xlsx"
df = pd.read_excel(file_path, sheet_name="Full_new")
df = df.drop(columns=['Sl. No', 'Patient File No.', 'Unnamed: 44'])
df = df.apply(pd.to_numeric, errors='coerce')
df.fillna(df.median(), inplace=True)

# Encode categorical columns
le = LabelEncoder()
categorical_columns = ['Blood Group', 'Cycle(R/I)', 'Pregnant(Y/N)', 'Weight gain(Y/N)', 
                       'hair growth(Y/N)', 'Skin darkening (Y/N)', 'Hair loss(Y/N)', 
                       'Pimples(Y/N)', 'Fast food (Y/N)', 'Reg.Exercise(Y/N)']
for col in categorical_columns:
    if col in df.columns:
        df[col] = le.fit_transform(df[col])

# Split into features and target
X = df.drop(columns=['PCOS (Y/N)'])
y = df['PCOS (Y/N)']

# Selected features from Ensemble Filter only
ensemble_selected_features = ['Follicle No. (L)', 'hair growth(Y/N)', 'Follicle No. (R)', 
                              'Cycle(R/I)', 'Fast food (Y/N)', 'AMH(ng/mL)', 'Skin darkening (Y/N)', 
                              'Weight gain(Y/N)', 'Pimples(Y/N)', 'Cycle length(days)', 
                              'Hip(inch)', 'Weight (Kg)', 'FSH/LH', 'FSH(mIU/mL)']

# Selected Features without ensemble, only RL
rl_selected_features = [' Age (yrs)', 'Height(Cm) ', 'RR (breaths/min)', 'No. of aborptions', 
                        'Hip(inch)', 'Waist:Hip Ratio', 'AMH(ng/mL)', 'PRG(ng/mL)', 
                        'Fast food (Y/N)', 'Follicle No. (R)', 'Avg. F size (L) (mm)', 
                        'Endometrium (mm)'] 

# Selected features from Ensemble Filter + BEEO (RL) + BMFK (Proposed)
proposed_selected_features = ['Follicle No. (L)', 'hair growth(Y/N)', 'Follicle No. (R)', 
                              'Cycle(R/I)', 'Fast food (Y/N)', 'AMH(ng/mL)', 
                              'Skin darkening (Y/N)', 'Weight gain(Y/N)', 
                              'Pimples(Y/N)', 'Cycle length(days)', 
                              'Hip(inch)', 'Weight (Kg)', 'FSH/LH', 'FSH(mIU/mL)']

# Prepare datasets for each feature set
X_ensemble = X[ensemble_selected_features]
X_rl = X[rl_selected_features]
X_proposed = X[proposed_selected_features]

# Function to evaluate classifiers with a 10% test split
def evaluate_classifiers(X_data):
    results = {}
    classifiers = {
        'SVM': SVC(kernel='rbf', C=1.0, gamma='scale'),
        'Random Forest': RandomForestClassifier(),
        'Decision Tree': DecisionTreeClassifier(),
        'Naive Bayes': GaussianNB(),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'AdaBoost': AdaBoostClassifier()
    }

    # Split the data into training (90%) and testing (10%) sets
    X_train, X_test, y_train, y_test = train_test_split(X_data, y, test_size=0.1, random_state=42)

    for clf_name, clf in classifiers.items():
        # Train the model
        clf.fit(X_train, y_train)
        
        # Calculate accuracy on the test set
        test_accuracy = clf.score(X_test, y_test)
        
        # Store results
        results[clf_name] = test_accuracy
        print(f"{clf_name} Test Accuracy with Selected Features: {test_accuracy:.4f}")
    
    return results

# Evaluate classifiers for each set of features
print("\n--- Accuracy Results for Ensemble Selected Features ---")
ensemble_results = evaluate_classifiers(X_ensemble)

print("\n--- Accuracy Results for Selected Features without Ensemble (only RL) ---")
rl_results = evaluate_classifiers(X_rl)

print("\n--- Accuracy Results for Selected Features from Proposed Model ---")
proposed_results = evaluate_classifiers(X_proposed)

# Display final results
print("\n--- Final Accuracy Results ---")
print("Ensemble Results:", ensemble_results)
print("RL Results:", rl_results)
print("Proposed Results:", proposed_results)



--- Accuracy Results for Ensemble Selected Features ---
SVM Test Accuracy with Selected Features: 0.6727
Random Forest Test Accuracy with Selected Features: 0.9273
Decision Tree Test Accuracy with Selected Features: 0.8182
Naive Bayes Test Accuracy with Selected Features: 0.7273
KNN Test Accuracy with Selected Features: 0.8182
AdaBoost Test Accuracy with Selected Features: 0.9091

--- Accuracy Results for Selected Features without Ensemble (only RL) ---
SVM Test Accuracy with Selected Features: 0.6909
Random Forest Test Accuracy with Selected Features: 0.8545
Decision Tree Test Accuracy with Selected Features: 0.8364
Naive Bayes Test Accuracy with Selected Features: 0.6545
KNN Test Accuracy with Selected Features: 0.8182
AdaBoost Test Accuracy with Selected Features: 0.8000

--- Accuracy Results for Selected Features from Proposed Model ---
SVM Test Accuracy with Selected Features: 0.6727
Random Forest Test Accuracy with Selected Features: 0.9273
Decision Tree Test Accuracy with Selec