In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, classification_report, roc_auc_score
)
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

# Load dataset
df = pd.read_csv("dataset.csv")
print("Dataset Shape:", df.shape)

# Clean every column: cast to string, trim surrounding whitespace, and map
# missing values (which become the literal 'nan' after the cast) to ''.
for col in df.columns:
    df[col] = df[col].astype(str).str.strip().replace('nan', '')

# Every column after the first is treated as a symptom column.
symptom_cells = df.iloc[:, 1:].to_numpy()

# Sorted vocabulary of distinct, non-empty symptom strings; its order defines
# the feature column order used by everything downstream.
all_symptoms = sorted(set(symptom_cells.ravel()) - {''})

# Create binary symptom matrix.
# The matrix is filled positionally in a NumPy buffer instead of assigning
# through X.loc cell by cell: per-cell .loc writes are very slow and are
# label-based, so they only hit the right row when the index is 0..n-1.
symptom_column = {symptom: pos for pos, symptom in enumerate(all_symptoms)}
indicator = np.zeros((len(df), len(all_symptoms)), dtype=np.int64)

for row_pos, row_symptoms in enumerate(symptom_cells):
    for symptom in row_symptoms:
        if symptom:  # skip the '' placeholders left by missing cells
            indicator[row_pos, symptom_column[symptom]] = 1

X = pd.DataFrame(indicator, index=df.index, columns=all_symptoms)

# Encode target: turn the 'Disease' strings into integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Disease'])

# Show the class-name -> id mapping (ids follow the order of classes_)
print("\nDisease Classes:")
for class_id, class_name in enumerate(label_encoder.classes_):
    print(f"{class_name} -> {class_id}")

sample_count, feature_count = X.shape
print(f"\nNumber of features (symptoms): {feature_count}")
print(f"Number of samples: {sample_count}")

# Train-test split: hold out 20% of the rows, stratified on the encoded
# label, with a fixed seed so the split is reproducible.
# NOTE(review): if dataset.csv contains duplicate rows, identical samples can
# land in both parts and inflate the test scores - worth confirming.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(
    f"\nTraining set shape: {X_train.shape}",
    f"Test set shape: {X_test.shape}",
    sep="\n",
)

# XGBoost Model: hyperparameters are collected in one mapping and unpacked
# into the classifier constructor.
xgb_params = {
    "n_estimators": 300,
    "max_depth": 6,
    "learning_rate": 0.05,
    "objective": 'multi:softprob',
    "num_class": len(label_encoder.classes_),
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.1,
    "reg_lambda": 1,
    "gamma": 0.1,
    "random_state": 42,
    "n_jobs": -1,
    "eval_metric": 'mlogloss',
}
model = xgb.XGBClassifier(**xgb_params)

# Cross-validation: 5 shuffled, stratified folds over the training split only,
# scored with weighted F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_f1 = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='f1_weighted',
    cv=cv,
)

print("\nCross-Validation F1 Scores:", cv_f1)
print("Mean CV F1:", cv_f1.mean())
print(f"CV F1 Std: {cv_f1.std():.4f}")

# Train model on the full training split
model.fit(X_train, y_train)

# Predictions: hard labels for the classification metrics, class
# probabilities for ROC-AUC
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)

# Evaluation metrics (precision / recall / F1 are support-weighted averages)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# ROC-AUC (One-vs-Rest). This metric is best-effort: a failure here must not
# abort the run, so the exception is reported and roc_auc is left as None.
# The result line is only built here and printed below, so that it appears
# inside the metrics section instead of above its header.
try:
    roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted')
    roc_auc_line = f"ROC-AUC   : {roc_auc:.4f}"
except Exception as e:
    roc_auc = None
    roc_auc_line = f"ROC-AUC calculation failed: {e}"

print("\nFinal Evaluation Metrics:")
print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1-score  : {f1:.4f}")
print(roc_auc_line)

print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_
))

# Feature importance: pair each symptom column with the fitted model's
# importance score and rank from most to least important.
print("\nFeature Importance Analysis:")
importance_df = pd.DataFrame(
    {'symptom': all_symptoms, 'importance': model.feature_importances_}
)
importance_df = importance_df.sort_values('importance', ascending=False)

print("Top 10 most important symptoms:")
top_ranked = importance_df.head(10)
for rank, entry in enumerate(top_ranked.itertuples(index=False), start=1):
    print(f"{rank:2}. {entry.symptom:<30} : {entry.importance:.4f}")

# Create a prediction function
def predict_disease_from_symptoms(symptoms_list, top_n=5):
    """
    Predict disease based on given symptoms

    Relies on the module-level ``model``, ``label_encoder`` and
    ``all_symptoms`` objects.

    Args:
        symptoms_list: List of symptom strings. Surrounding whitespace is
            ignored; entries that do not match a known symptom are not fed
            to the model and are reported back in the result.
        top_n: Maximum number of ranked predictions to return (default 5).

    Returns:
        Dictionary with predictions:
            'predicted_disease': label of the most probable class
            'confidence': probability of that class, as a float
            'top_predictions': up to ``top_n`` dicts with 'disease' and
                'probability', ordered from most to least probable
            'unrecognized_symptoms': input entries that matched no known
                symptom

    Raises:
        ValueError: if ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # One dict lookup per symptom instead of scanning the list for each one
    symptom_index = {name: pos for pos, name in enumerate(all_symptoms)}

    # Create binary vector
    input_vector = np.zeros(len(all_symptoms))
    unrecognized = []
    for symptom in symptoms_list:
        # Non-string entries can never match a symptom name
        key = symptom.strip() if isinstance(symptom, str) else None
        pos = symptom_index.get(key)
        if pos is None:
            unrecognized.append(symptom)
        else:
            input_vector[pos] = 1

    # Wrap as a one-row frame carrying the same column names as the training data
    input_data = pd.DataFrame([input_vector], columns=all_symptoms)

    # Predict
    probabilities = model.predict_proba(input_data)[0]
    predicted_class_idx = np.argmax(probabilities)
    predicted_disease = label_encoder.inverse_transform([predicted_class_idx])[0]
    confidence = probabilities[predicted_class_idx]

    # Get top predictions. Reversing before slicing keeps top_n == 0 empty
    # (a trailing [-0:] slice would return every class instead).
    top_indices = np.argsort(probabilities)[::-1][:top_n]
    top_diseases = label_encoder.inverse_transform(top_indices) if len(top_indices) else []

    return {
        'predicted_disease': predicted_disease,
        'confidence': float(confidence),
        'top_predictions': [
            {'disease': disease, 'probability': float(probabilities[idx])}
            for idx, disease in zip(top_indices, top_diseases)
        ],
        'unrecognized_symptoms': unrecognized,
    }

# Test predictions: run the predictor on three hand-picked symptom sets and
# print the winner plus the ranked alternatives for each.
section_rule = "=" * 50
print("\n" + section_rule)
print("TEST PREDICTIONS")
print(section_rule)

# Case 1: cold symptoms, case 2: digestive issues, case 3: skin issues
test_symptoms_1 = ['continuous_sneezing', 'chills', 'cough', 'headache', 'fatigue']
test_symptoms_2 = ['stomach_pain', 'vomiting', 'nausea', 'abdominal_pain']
test_symptoms_3 = ['itching', 'skin_rash', 'nodal_skin_eruptions']

case_results = []
for case_no, case_symptoms in enumerate(
    (test_symptoms_1, test_symptoms_2, test_symptoms_3), start=1
):
    print(f"\nTest {case_no} - Symptoms: {case_symptoms}")
    outcome = predict_disease_from_symptoms(case_symptoms)
    print(f"Predicted: {outcome['predicted_disease']} (Confidence: {outcome['confidence']:.2%})")
    print("Top 5 predictions:")
    for pred in outcome['top_predictions']:
        print(f"  - {pred['disease']}: {pred['probability']:.2%}")
    case_results.append(outcome)

# Keep the per-case result names available for later use
prediction_1, prediction_2, prediction_3 = case_results

# Model saving: bundle the fitted model with everything needed to reuse it
# (label encoder, feature order, held-out metrics) and write it to disk.
import joblib
import os

evaluation_metrics = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'roc_auc': roc_auc,  # None when the ROC-AUC computation failed
}

model_data = {
    'model': model,
    'label_encoder': label_encoder,
    'symptom_list': all_symptoms,
    'feature_names': all_symptoms,
    'metrics': evaluation_metrics,
}

# Make sure the output directory is there before writing
os.makedirs('models', exist_ok=True)

# Save model
joblib.dump(model_data, 'models/xgboost_disease_model.pkl')
print("\nModel saved to 'models/xgboost_disease_model.pkl'")

# Print final summary
closing_rule = "=" * 50
print("\n" + closing_rule)
print("MODEL TRAINING COMPLETE")
print(closing_rule)

summary_lines = [
    "\nModel Summary:",
    f"- Number of diseases: {len(label_encoder.classes_)}",
    f"- Number of symptoms: {len(all_symptoms)}",
    f"- Test Accuracy: {accuracy:.2%}",
    f"- Test F1-score: {f1:.2%}",
    "- Model saved to: models/xgboost_disease_model.pkl",
]
print("\n".join(summary_lines))

Dataset Shape: (4920, 18)

Disease Classes:
(vertigo) Paroymsal  Positional Vertigo -> 0
AIDS -> 1
Acne -> 2
Alcoholic hepatitis -> 3
Allergy -> 4
Arthritis -> 5
Bronchial Asthma -> 6
Cervical spondylosis -> 7
Chicken pox -> 8
Chronic cholestasis -> 9
Common Cold -> 10
Dengue -> 11
Diabetes -> 12
Dimorphic hemmorhoids(piles) -> 13
Drug Reaction -> 14
Fungal infection -> 15
GERD -> 16
Gastroenteritis -> 17
Heart attack -> 18
Hepatitis B -> 19
Hepatitis C -> 20
Hepatitis D -> 21
Hepatitis E -> 22
Hypertension -> 23
Hyperthyroidism -> 24
Hypoglycemia -> 25
Hypothyroidism -> 26
Impetigo -> 27
Jaundice -> 28
Malaria -> 29
Migraine -> 30
Osteoarthristis -> 31
Paralysis (brain hemorrhage) -> 32
Peptic ulcer diseae -> 33
Pneumonia -> 34
Psoriasis -> 35
Tuberculosis -> 36
Typhoid -> 37
Urinary tract infection -> 38
Varicose veins -> 39
hepatitis A -> 40

Number of features (symptoms): 131
Number of samples: 4920

Training set shape: (3936, 131)
Test set shape: (984, 131)
