In [6]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, classification_report, roc_auc_score
)
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings("ignore")

# Load dataset
df = pd.read_csv("dataset.csv")
print("Dataset Shape:", df.shape)

# Clean every column (the loop covers all of df.columns, not only symptoms):
# cast to text, trim surrounding whitespace, and blank out missing cells.
# A missing cell normally becomes the literal 'nan' after astype(str) and is
# mapped to ''. The trailing fillna('') additionally covers the case where the
# cast keeps the cell as a missing value instead of the text 'nan'
# (NOTE(review): believed to happen with newer pandas string dtypes — confirm).
for col in df.columns:
    cleaned = df[col].astype(str).str.strip().replace('nan', '')
    df[col] = cleaned.fillna('')

# Create binary symptom matrix
# Every column after the first is treated as a symptom slot; '' marks an
# empty slot. X gets one 0/1 column per distinct symptom string, in sorted
# order, and one row per row of df.
symptom_cells = df.iloc[:, 1:].to_numpy(dtype=object)
all_symptoms = sorted(set(symptom_cells.ravel()) - {''})

# Map each symptom name to its column position once, then set all indicator
# cells in a single vectorised assignment instead of one .loc write per cell.
symptom_position = {name: j for j, name in enumerate(all_symptoms)}
row_idx, slot_idx = np.nonzero(symptom_cells != '')
col_idx = np.fromiter(
    (symptom_position[name] for name in symptom_cells[row_idx, slot_idx]),
    dtype=np.intp,
    count=len(row_idx),
)

indicator = np.zeros((len(df), len(all_symptoms)), dtype=np.int64)
indicator[row_idx, col_idx] = 1

# Rows are addressed by position above, so the result is correct whatever
# df's index labels are; the index is carried over unchanged.
X = pd.DataFrame(indicator, index=df.index, columns=all_symptoms)

# Encode target
# Fit the encoder on the 'Disease' column and turn it into integer codes.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Disease'])

# List each class name next to its position in label_encoder.classes_.
print("\nDisease Classes:")
for val, cls in enumerate(label_encoder.classes_):
    print(f"{cls} -> {val}")

# Leakage diagnostic: count rows whose (features, label) combination already
# appeared earlier in the data. A random split places exact duplicates on both
# sides of the train/test (and CV fold) boundary, which inflates every score.
# NOTE(review): a high count here would explain perfect metrics — consider
# de-duplicating or a group-aware split before trusting the evaluation.
labelled = X.assign(__label__=y)
n_duplicates = int(labelled.duplicated().sum())
print(f"\nDuplicate (features, label) rows: {n_duplicates} of {len(labelled)}")

# Train-test split
# 80/20 split, stratified on the encoded label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# CatBoost Model
# Hyper-parameters are gathered in one mapping and unpacked into the
# constructor; multi-class loss and metric, fixed seed, no training log.
catboost_params = {
    "iterations": 300,
    "depth": 6,
    "learning_rate": 0.05,
    "loss_function": 'MultiClass',
    "eval_metric": 'MultiClass',
    "random_seed": 42,
    "verbose": False,
}
model = CatBoostClassifier(**catboost_params)

# Cross-validation
# 5-fold stratified CV on the training split only, scored with weighted F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_f1 = cross_val_score(
    model,
    X_train,
    y_train,
    cv=cv,
    scoring='f1_weighted',
    # Warnings are globally silenced in this notebook, so the default
    # behaviour (failed fold -> NaN score plus a warning) would hide a broken
    # fit; raise instead so a failing fold is impossible to miss.
    error_score='raise'
)

print("\nCross-Validation F1 Scores:", cv_f1)
print("Mean CV F1:", cv_f1.mean())

# Train model
# Fit on the full training split.
model.fit(X_train, y_train)

# Predictions
# Class probabilities for the held-out split, then hard labels flattened to
# 1-D (the original author notes CatBoost returns them with shape (n, 1)).
y_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test).flatten()

# Evaluation metrics
# Accuracy plus weighted-average precision / recall / F1 on the held-out
# split; ROC-AUC is computed one-vs-rest from the predicted probabilities.
weighted = {"average": 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')

# Print the summary table; labels are pre-padded so the colons line up.
print("\nFinal Evaluation Metrics:")
for label, value in (
    ("Accuracy  ", accuracy),
    ("Precision ", precision),
    ("Recall    ", recall),
    ("F1-score  ", f1),
    ("ROC-AUC   ", roc_auc),
):
    print(f"{label}: {value:.4f}")

# Per-class breakdown, with rows named by the encoder's class names.
print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(report)


Dataset Shape: (4920, 18)

Disease Classes:
(vertigo) Paroymsal  Positional Vertigo -> 0
AIDS -> 1
Acne -> 2
Alcoholic hepatitis -> 3
Allergy -> 4
Arthritis -> 5
Bronchial Asthma -> 6
Cervical spondylosis -> 7
Chicken pox -> 8
Chronic cholestasis -> 9
Common Cold -> 10
Dengue -> 11
Diabetes -> 12
Dimorphic hemmorhoids(piles) -> 13
Drug Reaction -> 14
Fungal infection -> 15
GERD -> 16
Gastroenteritis -> 17
Heart attack -> 18
Hepatitis B -> 19
Hepatitis C -> 20
Hepatitis D -> 21
Hepatitis E -> 22
Hypertension -> 23
Hyperthyroidism -> 24
Hypoglycemia -> 25
Hypothyroidism -> 26
Impetigo -> 27
Jaundice -> 28
Malaria -> 29
Migraine -> 30
Osteoarthristis -> 31
Paralysis (brain hemorrhage) -> 32
Peptic ulcer diseae -> 33
Pneumonia -> 34
Psoriasis -> 35
Tuberculosis -> 36
Typhoid -> 37
Urinary tract infection -> 38
Varicose veins -> 39
hepatitis A -> 40

Cross-Validation F1 Scores: [1. 1. 1. 1. 1.]
Mean CV F1: 1.0

Final Evaluation Metrics:
Accuracy  : 1.0000
Precision : 1.0000
Recall    : 1.00