In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Load the dataset
file_path = "PCOS_data_without_infertility.xlsx"
df = pd.read_excel(file_path, sheet_name="Full_new")

# Data Preprocessing
df = df.drop(columns=['Sl. No', 'Patient File No.', 'Unnamed: 44'])
df = df.apply(pd.to_numeric, errors='coerce')
df.fillna(df.median(), inplace=True)

# Encode categorical columns
categorical_columns = ['Blood Group', 'Cycle(R/I)', 'Pregnant(Y/N)', 
                       'Weight gain(Y/N)', 'hair growth(Y/N)', 
                       'Skin darkening (Y/N)', 'Hair loss(Y/N)', 
                       'Pimples(Y/N)', 'Fast food (Y/N)', 
                       'Reg.Exercise(Y/N)']
for col in categorical_columns:
    if col in df.columns:
        df[col] = df[col].astype('category').cat.codes

# Split into features and target
X = df.drop(columns=['PCOS (Y/N)'])
y = df['PCOS (Y/N)'].values  # Convert to numpy array

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define the different feature sets
ensemble_selected_features = ['Follicle No. (L)', 'hair growth(Y/N)', 'Follicle No. (R)', 
                              'Cycle(R/I)', 'Fast food (Y/N)', 'AMH(ng/mL)', 'Skin darkening (Y/N)', 
                              'Weight gain(Y/N)', 'Pimples(Y/N)', 'Cycle length(days)', 
                              'Hip(inch)', 'Weight (Kg)', 'FSH/LH', 'FSH(mIU/mL)']

rl_selected_features = [' Age (yrs)', 'Weight (Kg)', 'Pulse rate(bpm) ', 'RR (breaths/min)', 'Hb(g/dl)', 
                        'Cycle(R/I)', 'Cycle length(days)', 'Marraige Status (Yrs)', '  I   beta-HCG(mIU/mL)', 
                        'FSH(mIU/mL)', 'Waist(inch)', 'Waist:Hip Ratio', 'TSH (mIU/L)', 'AMH(ng/mL)', 
                        'Vit D3 (ng/mL)', 'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight gain(Y/N)', 'hair growth(Y/N)', 
                        'Skin darkening (Y/N)', 'Pimples(Y/N)', 'BP _Systolic (mmHg)', 'BP _Diastolic (mmHg)', 
                        'Follicle No. (L)', 'Follicle No. (R)']

proposed_selected_features = [
        'Follicle No. (L)', 'hair growth(Y/N)', 'Follicle No. (R)',
        'Cycle(R/I)', 'Fast food (Y/N)', 'Skin darkening (Y/N)',
        'Cycle length(days)', 'FSH/LH', ' Age (yrs)', 'Weight (Kg)',
        'Hip(inch)'  # Added more features identified as important
    ]

# 1. SVM on Ensemble Selected Features
X_ensemble = X_scaled[:, [X.columns.get_loc(feature) for feature in ensemble_selected_features]]

# 2. SVM on RL Selected Features
X_rl = X_scaled[:, [X.columns.get_loc(feature) for feature in rl_selected_features]]

# 3. SVM on Proposed Selected Features
X_proposed = X_scaled[:, [X.columns.get_loc(feature) for feature in proposed_selected_features]]

# 4. SVM on All Features (no feature selection)
X_all = X_scaled  # Use all features

# Helper function to run cross-validation
def run_svm_cv(X, y, svm_params=None):
    if svm_params is None:
        svm_params = {}
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    accuracies = []
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        model = SVC(**svm_params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
    
    return np.mean(accuracies)

# Run SVM for Ensemble, RL, and All Features with initial parameters
svm_params_initial = {'kernel': 'rbf', 'C': 1, 'gamma': 'scale'}  # Initial SVM parameters

accuracy_ensemble = run_svm_cv(X_ensemble, y, svm_params_initial)
accuracy_rl = run_svm_cv(X_rl, y, svm_params_initial)
accuracy_all = run_svm_cv(X_all, y, svm_params_initial)

# Hyperparameter tuning for Proposed Selected Features
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
}

grid = GridSearchCV(SVC(), param_grid, cv=10, scoring='accuracy')
grid.fit(X_proposed, y)

print("Best parameters for Proposed Selected Features:", grid.best_params_)
best_svm_params_proposed = grid.best_params_

# Run SVM for Proposed Selected Features with best parameters
accuracy_proposed = run_svm_cv(X_proposed, y, best_svm_params_proposed)

print(f"SVM on Ensemble Selected Features Accuracy: {accuracy_ensemble:.4f}")
print(f"SVM on RL Selected Features Accuracy: {accuracy_rl:.4f}")
print(f"SVM on Proposed Selected Features Accuracy: {accuracy_proposed:.4f}")
print(f"SVM on All Features Accuracy: {accuracy_all:.4f}")

Best parameters for Proposed Selected Features: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
SVM on Ensemble Selected Features Accuracy: 0.9019
SVM on RL Selected Features Accuracy: 0.8982
SVM on Proposed Selected Features Accuracy: 0.8983
SVM on All Features Accuracy: 0.8798
