In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import joblib
from sklearn.neighbors import NearestNeighbors

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

# ----------------------------------------
# BMFK Classifier (Custom Implementation)
# ----------------------------------------
class BMFK:
    def __init__(self, n_neighbors=5, m=2, p=2, q=2):
        self.n_neighbors = n_neighbors
        self.m = m
        self.p = p
        self.q = q

    def fit(self, X, y):
        self.X = X
        self.y = y
        self.classes = np.unique(y)
        # Use the public 'euclidean' metric
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors, metric='euclidean')
        self.nn.fit(X)

    def bonferroni_mean(self, values):
        n = len(values)
        if n <= 1:
            return np.mean(values)
        sum_pq = 0
        for i in range(n):
            for j in range(n):
                if i != j:
                    sum_pq += values[i] ** self.p * values[j] ** self.q
        return (sum_pq / (n * (n - 1))) ** (1 / (self.p + self.q))

    def predict(self, X):
        predictions = []
        for x in X:
            distances, indices = self.nn.kneighbors([x])
            neighbor_labels = self.y[indices[0]]

            # Avoid division by zero or negative distances
            distances = distances[0]
            distances[distances == 0] = 1e-8

            # Calculate memberships
            memberships = 1 / (distances ** (2 / (self.m - 1)) + 1e-8)
            memberships /= np.sum(memberships)

            # Calculate class memberships using Bonferroni mean
            class_memberships = {}
            for c in self.classes:
                idx = neighbor_labels == c
                if np.any(idx):
                    class_memberships[c] = self.bonferroni_mean(memberships[idx])
                else:
                    class_memberships[c] = 0

            # Predict class with highest membership
            predictions.append(max(class_memberships, key=class_memberships.get))

        return np.array(predictions)

# ----------------------------------------
# DataPreprocessor Class
# ----------------------------------------
class DataPreprocessor:
    def __init__(self):
        self.scaler = StandardScaler()
        self.fill_medians = None
        self.training_features = None

    def fit_transform_train(self, df):
        """
        Preprocesses the training data:
        - Drops unnecessary columns.
        - Converts to numeric.
        - Fills missing values with median.
        - Encodes categorical variables.
        - Scales features.
        """
        # Drop unnecessary columns
        df = df.drop(columns=['Sl. No', 'Patient File No.', 'Unnamed: 44'], errors='ignore')

        # Convert to numeric and fill missing values with median
        df = df.apply(pd.to_numeric, errors='coerce')
        df.fillna(df.median(), inplace=True)

        # Encode categorical variables
        categorical_columns = ['Blood Group', 'Cycle(R/I)', 'Pregnant(Y/N)',
                               'Weight gain(Y/N)', 'hair growth(Y/N)',
                               'Skin darkening (Y/N)', 'Hair loss(Y/N)',
                               'Pimples(Y/N)', 'Fast food (Y/N)',
                               'Reg.Exercise(Y/N)']
        for col in categorical_columns:
            if col in df.columns:
                df[col] = df[col].astype('category').cat.codes

        # Separate features and target
        if 'PCOS (Y/N)' not in df.columns:
            raise ValueError("Training data must contain 'PCOS (Y/N)' column as target.")
        X = df.drop(columns=['PCOS (Y/N)'], errors='ignore')
        y = df['PCOS (Y/N)'].values

        # Feature scaling
        X_scaled = pd.DataFrame(self.scaler.fit_transform(X), columns=X.columns)

        # Store medians for future use
        self.fill_medians = df.median()

        # Store training features
        self.training_features = X.columns.tolist()

        return X_scaled, y

    def transform_test(self, df):
        """
        Preprocesses the test data:
        - Ensures all training features are present.
        - Fills missing features with median values from training data.
        - Encodes categorical variables.
        - Scales features using the trained scaler.
        """
        # Check if 'PCOS (Y/N)' is present
        if 'PCOS (Y/N)' in df.columns:
            y = df['PCOS (Y/N)'].values
            X = df.drop(columns=['PCOS (Y/N)'], errors='ignore')
        else:
            y = None
            X = df.copy()

        # Add missing training features with median values
        for feature in self.training_features:
            if feature not in X.columns:
                X[feature] = self.fill_medians[feature]

        # Drop any extra features not in training
        X = X[self.training_features]

        # Encode categorical variables
        categorical_columns = ['Blood Group', 'Cycle(R/I)', 'Pregnant(Y/N)',
                               'Weight gain(Y/N)', 'hair growth(Y/N)',
                               'Skin darkening (Y/N)', 'Hair loss(Y/N)',
                               'Pimples(Y/N)', 'Fast food (Y/N)',
                               'Reg.Exercise(Y/N)']
        for col in categorical_columns:
            if col in X.columns:
                X[col] = X[col].astype('category').cat.codes

        # Feature scaling using the fitted scaler
        X_scaled = pd.DataFrame(self.scaler.transform(X), columns=X.columns)

        return X_scaled, y

# ----------------------------------------
# ModelEvaluator Class
# ----------------------------------------
class ModelEvaluator:
    def __init__(self, model_path):
        # Load the saved model and scaler
        loaded_data = joblib.load(model_path)
        self.loaded_model = loaded_data['model']
        self.best_test_idx_loaded = loaded_data.get('test_idx', None)
        self.scaler = loaded_data['scaler']

    def evaluate(self, train_df, test_df):
        """
        Preprocesses the training and test data, makes predictions on the test set,
        and calculates accuracy.
        """
        # Initialize the data preprocessor and fit on training data
        preprocessor = DataPreprocessor()
        X_train_scaled, y_train = preprocessor.fit_transform_train(train_df)

        # Store medians from training data
        train_medians = preprocessor.fill_medians

        # Get the list of training features
        training_features = preprocessor.training_features

        # Preprocess test data
        X_test_scaled, y_test = preprocessor.transform_test(test_df)

        # Define the proposed selected features
        proposed_selected_features = [
            'Follicle No. (L)', 'hair growth(Y/N)', 'Follicle No. (R)',
            'Cycle(R/I)', 'Fast food (Y/N)', 'Skin darkening (Y/N)',
            'Cycle length(days)', 'LH(mIU/mL)', 'FSH(mIU/mL)', ' Age (yrs)', 
            'Weight (Kg)', 'Marraige Status (Yrs)', 'PRL(ng/mL)'
        ]

        # Identify existing features in the test set
        existing_features = [feature for feature in proposed_selected_features if feature in X_test_scaled.columns]
        missing_features = [feature for feature in proposed_selected_features if feature not in X_test_scaled.columns]

        # Add missing features with median values
        for feature in missing_features:
            if feature in preprocessor.fill_medians:
                X_test_scaled[feature] = preprocessor.fill_medians[feature]
            else:
                # If the feature wasn't in training medians, fill with 0 or another appropriate value
                X_test_scaled[feature] = 0

        # Reorder columns to match the proposed_selected_features order
        X_test_scaled = X_test_scaled[proposed_selected_features]

        # Prepare the final test set for prediction
        X_proposed_test = X_test_scaled.values

        # Make predictions using the loaded model
        y_pred_loaded = self.loaded_model.predict(X_proposed_test)

        # Calculate accuracy
        loaded_accuracy = accuracy_score(y_test, y_pred_loaded)
        print(f"\nLoaded Proposed BMFK Model Accuracy on Test Dataset: {loaded_accuracy:.4f}")

        # Optionally, display a sample comparison of actual vs. predicted values
        results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_loaded})
        print("\nSample of Test Results:")
        print(results.head())

# ----------------------------------------
# Main Execution
# ----------------------------------------
def main():
    # Define file paths
    model_path = 'proposed_bmfk_model.pkl'
    train_file_path = "PCOS_data_without_infertility.xlsx"
    test_file_path = "PCOS_test.xlsx"

    # Load training and test data
    train_df = pd.read_excel(train_file_path, sheet_name="Full_new")
    test_df = pd.read_excel(test_file_path)

    # Initialize the model evaluator
    evaluator = ModelEvaluator(model_path)

    # Evaluate the model on the test data
    evaluator.evaluate(train_df, test_df)

if __name__ == "__main__":
    main()



Loaded Proposed BMFK Model Accuracy on Test Dataset: 0.0000

Sample of Test Results:
   Actual  Predicted
0       1          0
1       1          0
2       1          0
3       1          0
4       1          0
