In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, classification_report, roc_auc_score
)
import warnings
warnings.filterwarnings("ignore")

# Load the raw table; the path is resolved relative to the working directory.
df = pd.read_csv("dataset.csv")
print("Dataset Shape:", df.shape)

# Normalise every column to stripped text, with missing cells stored as ''.
# On pandas versions where astype(str) renders NaN as the literal 'nan', the
# replace() removes it. The trailing fillna('') covers pandas versions whose
# string conversion keeps missing values as NaN instead of stringifying them,
# so the later sorted(set(...)) never sees a mix of floats and strings.
for col in df.columns:
    df[col] = df[col].astype(str).str.strip().replace('nan', '').fillna('')

# Create binary symptom matrix: one row per record, one 0/1 column per
# distinct symptom string. Column 0 of df is skipped; every remaining column
# is treated as a symptom slot, and '' marks an empty slot.
symptom_cells = df.iloc[:, 1:].to_numpy()
all_symptoms = sorted({s for s in symptom_cells.ravel() if s != ''})

# Fill a NumPy array by position and wrap it in a DataFrame once at the end.
# Per-cell `X.loc[i, symptom] = 1` writes are slow on a DataFrame, and using
# positions here avoids assuming that df.index is exactly 0..n-1.
symptom_position = {symptom: j for j, symptom in enumerate(all_symptoms)}
indicator = np.zeros((len(df), len(all_symptoms)), dtype=np.int64)
for row_number, row in enumerate(symptom_cells):
    for symptom in row:
        if symptom:  # skip empty symptom slots
            indicator[row_number, symptom_position[symptom]] = 1

X = pd.DataFrame(indicator, index=df.index, columns=all_symptoms)

# Encode target: turn the values of df['Disease'] into integer class labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Disease'])

# Print the label mapping; each class is paired with its position in classes_.
print("\nDisease Classes:")
for val, cls in enumerate(label_encoder.classes_):
    print(f"{cls} -> {val}")

# Hold out 25% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is repeatable from run to run.
# NOTE(review): if dataset.csv contains repeated rows, identical samples can
# land on both sides of the split and inflate the test metrics — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# Candidate models: logistic regression under three different penalties.
# Every hyperparameter except the penalty (and l1_ratio for elastic net) is
# shared, so the three entries differ only in their regulariser.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version.
_shared_params = dict(
    C=0.3,
    solver='saga',
    multi_class='ovr',
    class_weight='balanced',
    max_iter=3000,
    random_state=42,
)

models = {
    "L1 (Lasso)": LogisticRegression(penalty='l1', **_shared_params),
    "L2 (Ridge)": LogisticRegression(penalty='l2', **_shared_params),
    # l1_ratio=0.5 weights the L1 and L2 terms equally
    "Elastic Net": LogisticRegression(
        penalty='elasticnet', l1_ratio=0.5, **_shared_params
    ),
}

# Shuffled, seeded 5-fold stratified splitter reused for every candidate.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = {}

# For each candidate: cross-validate on the training split, refit on the
# whole training split, then score once on the held-out test split.
for name, model in models.items():
    print(f"\nTraining {name}...")

    # Weighted-F1 per fold, computed on the training data only
    cv_f1 = cross_val_score(model, X_train, y_train, scoring='f1_weighted', cv=cv)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)  # class probabilities for ROC-AUC

    # Keys are inserted in the same order as before so any later tabulation
    # of `results` is unaffected.
    scores = {"model": model, "cv_f1_mean": cv_f1.mean()}
    scores["accuracy"] = accuracy_score(y_test, y_pred)
    scores["precision"] = precision_score(y_test, y_pred, average='weighted')
    scores["recall"] = recall_score(y_test, y_pred, average='weighted')
    scores["f1"] = f1_score(y_test, y_pred, average='weighted')
    scores["roc_auc"] = roc_auc_score(y_test, y_proba, multi_class='ovr')
    results[name] = scores

    print(f"CV F1 Mean : {scores['cv_f1_mean']:.4f}")
    print(f"Test F1    : {scores['f1']:.4f}")
    print(f"ROC-AUC    : {scores['roc_auc']:.4f}")

# Pick the candidate with the highest mean cross-validated F1; on a tie,
# max() keeps the first one encountered in `results`.
best_model_name = max(results, key=lambda label: results[label]['cv_f1_mean'])
best_metrics = results[best_model_name]
best_model = best_metrics['model']

print("\n==============================")
print("BEST REGULARIZED MODEL")
print("==============================")
print("Model:", best_model_name)

# Held-out test metrics recorded for the winner during the training loop
print("\nFinal Evaluation Metrics:")
print(f"Accuracy  : {best_metrics['accuracy']:.4f}")
print(f"Precision : {best_metrics['precision']:.4f}")
print(f"Recall    : {best_metrics['recall']:.4f}")
print(f"F1-score  : {best_metrics['f1']:.4f}")
print(f"ROC-AUC   : {best_metrics['roc_auc']:.4f}")

# Per-class breakdown on the test split, labelled with the original class names
print("\nClassification Report:")
best_predictions = best_model.predict(X_test)
report = classification_report(
    y_test, best_predictions, target_names=label_encoder.classes_
)
print(report)


Dataset Shape: (4920, 18)

Disease Classes:
(vertigo) Paroymsal  Positional Vertigo -> 0
AIDS -> 1
Acne -> 2
Alcoholic hepatitis -> 3
Allergy -> 4
Arthritis -> 5
Bronchial Asthma -> 6
Cervical spondylosis -> 7
Chicken pox -> 8
Chronic cholestasis -> 9
Common Cold -> 10
Dengue -> 11
Diabetes -> 12
Dimorphic hemmorhoids(piles) -> 13
Drug Reaction -> 14
Fungal infection -> 15
GERD -> 16
Gastroenteritis -> 17
Heart attack -> 18
Hepatitis B -> 19
Hepatitis C -> 20
Hepatitis D -> 21
Hepatitis E -> 22
Hypertension -> 23
Hyperthyroidism -> 24
Hypoglycemia -> 25
Hypothyroidism -> 26
Impetigo -> 27
Jaundice -> 28
Malaria -> 29
Migraine -> 30
Osteoarthristis -> 31
Paralysis (brain hemorrhage) -> 32
Peptic ulcer diseae -> 33
Pneumonia -> 34
Psoriasis -> 35
Tuberculosis -> 36
Typhoid -> 37
Urinary tract infection -> 38
Varicose veins -> 39
hepatitis A -> 40

Training L1 (Lasso)...
CV F1 Mean : 0.9995
Test F1    : 1.0000
ROC-AUC    : 1.0000

Training L2 (Ridge)...
CV F1 Mean : 1.0000
Test F1    : 1.