In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load Dataset
disease_df = pd.read_csv('disease_diagnosis.csv')

In [3]:

# Split Blood Pressure into Systolic and Diastolic
bp_split = disease_df['Blood_Pressure_mmHg'].str.split('/', expand=True).astype(float)
disease_df['BP_Systolic'] = bp_split[0]
disease_df['BP_Diastolic'] = bp_split[1]

# One-Hot Encode Symptoms
all_symptoms = pd.unique(
    disease_df[['Symptom_1', 'Symptom_2', 'Symptom_3']].values.ravel()
)

for sym in all_symptoms:
    disease_df[f"symptom_{sym}"] = disease_df[['Symptom_1', 'Symptom_2', 'Symptom_3']].isin([sym]).any(axis=1).astype(int)

# Feature and Target Selection
features = ['Heart_Rate_bpm', 'Body_Temperature_C',
            'Oxygen_Saturation_%', 'BP_Systolic', 'BP_Diastolic'] + \
           [f"symptom_{sym}" for sym in all_symptoms]

X = disease_df[features]
# Target Columns
y = disease_df[['Diagnosis', 'Severity']]

# Encode each target
diagnosis_encoder = LabelEncoder()
severity_encoder = LabelEncoder()

y['Diagnosis'] = diagnosis_encoder.fit_transform(y['Diagnosis'])
y['Severity'] = severity_encoder.fit_transform(y['Severity'])

# Scale Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Diagnosis'] = diagnosis_encoder.fit_transform(y['Diagnosis'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Severity'] = severity_encoder.fit_transform(y['Severity'])


In [4]:
# Initialize KNN Classifier
knn = KNeighborsClassifier(
    n_neighbors=17,        # You can tune this!
    weights='uniform',    # Try 'distance' for weighted voting
    metric='manhattan',   # Default (Euclidean)
    p=2                   # p=2 -> Euclidean, p=1 -> Manhattan
)

multi_knn = MultiOutputClassifier(knn)

# Train Model
multi_knn.fit(X_train, y_train)

# Predictions
y_pred = multi_knn.predict(X_test)

# Decode Predictions
y_pred_diagnosis = diagnosis_encoder.inverse_transform(y_pred[:, 0])
y_pred_severity = severity_encoder.inverse_transform(y_pred[:, 1])

In [5]:
# Diagnosis Evaluation
print("\nDiagnosis Classification Report:")
print(classification_report(y_test['Diagnosis'], y_pred[:, 0]))

print("\nDiagnosis Confusion Matrix:")
print(confusion_matrix(y_test['Diagnosis'], y_pred[:, 0]))

# Severity Evaluation
print("\nSeverity Classification Report:")
print(classification_report(y_test['Severity'], y_pred[:, 1]))

print("\nSeverity Confusion Matrix:")
print(confusion_matrix(y_test['Severity'], y_pred[:, 1]))


Diagnosis Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.86      0.84        64
           1       0.84      0.95      0.89       238
           2       0.75      0.79      0.77        58
           3       0.00      0.00      0.00        11
           4       0.75      0.10      0.18        29

    accuracy                           0.82       400
   macro avg       0.63      0.54      0.54       400
weighted avg       0.79      0.82      0.79       400


Diagnosis Confusion Matrix:
[[ 55   7   2   0   0]
 [  8 225   4   0   1]
 [  1  11  46   0   0]
 [  2   1   8   0   0]
 [  1  24   1   0   3]]

Severity Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.98      0.94       267
           1       0.80      0.78      0.79        58
           2       0.91      0.67      0.77        75

    accuracy                           0.89       400
   macro avg       0.87    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
