In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import NearestNeighbors

# BMFK Classifier
class BMFK:
    def __init__(self, n_neighbors=5, m=2, p=2, q=2):
        self.n_neighbors = n_neighbors
        self.m = m
        self.p = p
        self.q = q

    def fit(self, X, y):
        self.X = X
        self.y = y
        self.classes = np.unique(y)
        # Using Euclidean distance (p=2)
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors, metric='minkowski', p=2)
        self.nn.fit(X)

    def bonferroni_mean(self, values):
        n = len(values)
        if n <= 1:
            return np.mean(values)
        sum_pq = 0
        for i in range(n):
            for j in range(n):
                if i != j:
                    sum_pq += values[i] ** self.p * values[j] ** self.q
        return (sum_pq / (n * (n - 1))) ** (1 / (self.p + self.q))

    def predict(self, X):
        predictions = []
        for x in X:
            distances, indices = self.nn.kneighbors([x])
            neighbor_labels = self.y[indices[0]]

            # Avoid division by zero or negative distances
            distances = distances[0]
            distances[distances == 0] = 1e-8

            # Calculate memberships
            memberships = 1 / (distances ** (2 / (self.m - 1)) + 1e-8)
            memberships /= np.sum(memberships)

            # Calculate class memberships using Bonferroni mean
            class_memberships = {}
            for c in self.classes:
                idx = neighbor_labels == c
                if np.any(idx):
                    class_memberships[c] = self.bonferroni_mean(memberships[idx])
                else:
                    class_memberships[c] = 0

            # Predict class with highest membership
            predictions.append(max(class_memberships, key=class_memberships.get))

        return np.array(predictions)

# Preprocess and Augment Dataset
def preprocess_and_augment_pcos_data(file_path):
    # Load the PCOS dataset
    df = pd.read_excel(file_path, sheet_name="Full_new")
    # Drop unwanted columns
    df = df.drop(columns=['Sl. No', 'Patient File No.', 'Unnamed: 44'], errors='ignore')
    # Convert to numeric and handle missing values
    df = df.apply(pd.to_numeric, errors='coerce')
    df.fillna(df.median(), inplace=True)
    
    # Encode categorical variables
    categorical_columns = ['Blood Group', 'Cycle(R/I)', 'Pregnant(Y/N)',
                           'Weight gain(Y/N)', 'hair growth(Y/N)',
                           'Skin darkening (Y/N)', 'Hair loss(Y/N)',
                           'Pimples(Y/N)', 'Fast food (Y/N)',
                           'Reg.Exercise(Y/N)']
    for col in categorical_columns:
        if col in df.columns:
            df[col] = df[col].astype('category').cat.codes
    
    # Separate features and target
    X = df.drop(columns=['PCOS (Y/N)'], errors='ignore')
    y = df['PCOS (Y/N)'].values
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    
    # Data augmentation: Add small noise
    noise = np.random.normal(0, 0.01, X_scaled.shape)
    X_augmented = X_scaled + noise
    y_augmented = y.copy()  # Labels remain the same for augmented data
    
    # Combine original and augmented datasets
    X_combined = pd.concat([X_scaled, pd.DataFrame(X_augmented, columns=X.columns)], ignore_index=True)
    y_combined = np.concatenate([y, y_augmented])
    
    return X_combined, y_combined

# Select Proposed Features
def select_proposed_features(X):
    proposed_selected_features = [
        'Follicle No. (L)', 'hair growth(Y/N)', 'Follicle No. (R)',
        'Cycle(R/I)', 'Fast food (Y/N)', 'Skin darkening (Y/N)',
        'Cycle length(days)', 'LH(mIU/mL)', 'FSH(mIU/mL)', 
        ' Age (yrs)', 'Weight (Kg)', 'Marraige Status (Yrs)', 'PRL(ng/mL)'
    ]
    # Filter columns based on proposed selected features
    selected_features = [feature for feature in proposed_selected_features if feature in X.columns]
    return X[selected_features]

# Train and Evaluate BMFK
def train_and_evaluate_bmfk(X_train, X_test, y_train, y_test, n_neighbors=5, m=2, p=2, q=2):
    # Initialize and fit BMFK
    bmfk = BMFK(n_neighbors=n_neighbors, m=m, p=p, q=q)
    bmfk.fit(X_train.values, y_train)
    # Predict and evaluate
    y_pred = bmfk.predict(X_test.values)
    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1 Score': f1_score(y_test, y_pred, average='weighted')
    }
    return metrics

# Main Workflow
if __name__ == "__main__":
    # File path for PCOS dataset
    file_path = "/kaggle/input/pcos-ml/PCOS_data_without_infertility.xlsx"
    
    # Preprocess and augment data
    X, y = preprocess_and_augment_pcos_data(file_path)
    
    # Select proposed features
    X_selected = select_proposed_features(X)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
    
    # Train and evaluate BMFK
    bmfk_metrics = train_and_evaluate_bmfk(X_train, X_test, y_train, y_test)
    print("BMFK Classifier Metrics on Proposed Features:")
    for metric, value in bmfk_metrics.items():
        print(f"{metric}: {value:.4f}")


BMFK Classifier Metrics on Proposed Features:
Accuracy: 0.9631
Precision: 0.9631
Recall: 0.9631
F1 Score: 0.9631
