In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, classification_report
)
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")

# Load dataset
df = pd.read_csv("dataset.csv")
print("Dataset Shape:", df.shape)

# Clean every column: cast to string, trim surrounding whitespace, and map the
# literal string 'nan' (what the string cast yields for missing cells) to ''.
for col in df.columns:
    df[col] = df[col].astype(str).str.strip().replace('nan', '')

# Create binary symptom matrix.
# Every column after the first holds symptom names; '' marks an unused slot.
symptom_values = df.iloc[:, 1:].to_numpy()
all_symptoms = sorted(set(symptom_values.ravel()) - {''})

# Fill the indicator matrix with a single vectorised scatter instead of one
# label-based `X.loc[i, symptom] = 1` assignment per cell. Working by position
# also removes the hidden assumption that the row labels are 0..n-1.
symptom_to_col = {symptom: pos for pos, symptom in enumerate(all_symptoms)}
row_idx, slot_idx = np.nonzero(symptom_values != '')
col_idx = np.fromiter(
    (symptom_to_col[name] for name in symptom_values[row_idx, slot_idx]),
    dtype=np.intp,
    count=len(row_idx),
)
indicator = np.zeros((len(df), len(all_symptoms)), dtype=np.int64)
indicator[row_idx, col_idx] = 1

# One row per record, one 0/1 column per distinct symptom (sorted by name).
X = pd.DataFrame(indicator, index=df.index, columns=all_symptoms)

# Encode target: map each disease name in the 'Disease' column to an integer id.
label_encoder = LabelEncoder().fit(df['Disease'])
y = label_encoder.transform(df['Disease'])

print(f"\nDisease Classes: {len(label_encoder.classes_)}")

# Train-test split: 80/20, stratified on the encoded label, fixed seed.
# NOTE(review): the recorded run reports 1.0 for every CV fold and every test
# metric. Worth checking `df.duplicated()` in case identical rows end up on
# both sides of the split -- TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print("\nTraining set:", X_train.shape)
print("Test set:", X_test.shape)

# LightGBM Model
model = lgb.LGBMClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    # LightGBM only applies row subsampling when the bagging frequency is
    # positive; with the default of 0 the `subsample=0.8` above is ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

# Cross-validation: 5 shuffled, stratified folds over the training split only,
# scored with weighted F1. cross_val_score fits clones, so `model` itself is
# still unfitted after this call.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_f1 = cross_val_score(
    model,
    X_train,
    y_train,
    cv=cv,
    scoring='f1_weighted'
)

print("\nCross-Validation F1 Scores:", cv_f1)
print("Mean CV F1:", cv_f1.mean())

# Train model on the full training split
model.fit(X_train, y_train)

# Predictions on the held-out test split
y_pred = model.predict(X_test)

# Evaluation metrics (precision/recall/F1 are support-weighted class averages)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nFinal Evaluation Metrics:")
print(f"Accuracy  : {accuracy:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1-score  : {f1:.4f}")

print("\nClassification Report:")
# Pass the full list of encoded labels explicitly. Without `labels`, the report
# infers classes from y_test/y_pred and raises when that count differs from
# len(target_names), i.e. whenever a class is absent from both.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_
))

# Feature importance: pair each symptom column with the fitted model's
# importance score and list the ten highest.
print("\nTop 10 Important Symptoms:")
feature_importance = pd.DataFrame(
    {'symptom': all_symptoms, 'importance': model.feature_importances_}
)
feature_importance = feature_importance.sort_values('importance', ascending=False)

top_rows = feature_importance.head(10).itertuples(index=False)
for i, entry in enumerate(top_rows, start=1):
    symptom, imp = entry.symptom, entry.importance
    print(f"{i:2}. {symptom:<30} : {imp:.4f}")

# Prediction function
def predict_disease(symptoms_list):
    """Predict a disease name from a collection of symptom names.

    Builds a one-row 0/1 vector over ``all_symptoms``, runs it through the
    fitted ``model`` and decodes the predicted class with ``label_encoder``.

    Parameters
    ----------
    symptoms_list : iterable of str, or a single str
        Symptom names. Surrounding whitespace is stripped so the input is
        normalised the same way the training columns were. A bare string is
        treated as one symptom rather than iterated character by character.
        Names that are not in ``all_symptoms`` are ignored.

    Returns
    -------
    The decoded class label predicted by ``model`` for the encoded symptoms.
    """
    if isinstance(symptoms_list, str):
        symptoms_list = [symptoms_list]

    # Name -> column position, so each lookup is a single dict access instead
    # of a membership scan followed by a second `.index()` scan of the list.
    column_of = {name: pos for pos, name in enumerate(all_symptoms)}

    input_vector = np.zeros(len(all_symptoms))
    for symptom in symptoms_list:
        idx = column_of.get(str(symptom).strip())
        if idx is not None:
            input_vector[idx] = 1

    # Wrap in a DataFrame carrying the same column names used for training.
    input_data = pd.DataFrame([input_vector], columns=all_symptoms)
    prediction = model.predict(input_data)[0]
    disease = label_encoder.inverse_transform([prediction])[0]

    return disease

# Test predictions: run a few hand-written symptom sets through
# predict_disease and print what comes back.
print("\nTest Predictions:")
print("".ljust(30, "-"))

test_cases = [
    ['continuous_sneezing', 'chills', 'cough'],
    ['itching', 'skin_rash', 'nodal_skin_eruptions'],
    ['stomach_pain', 'vomiting', 'nausea'],
]

i = 0
for symptoms in test_cases:
    i += 1
    prediction = predict_disease(symptoms)
    # Two lines per case followed by a blank separator line.
    print(
        f"Test {i} - Symptoms: {symptoms}",
        f"Prediction: {prediction}\n",
        sep="\n",
    )

print("LightGBM model trained successfully!")

Dataset Shape: (4920, 18)

Disease Classes: 41

Training set: (3936, 131)
Test set: (984, 131)

Cross-Validation F1 Scores: [1. 1. 1. 1. 1.]
Mean CV F1: 1.0

Final Evaluation Metrics:
Accuracy  : 1.0000
Precision : 1.0000
Recall    : 1.0000
F1-score  : 1.0000

Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        24
                                   AIDS       1.00      1.00      1.00        24
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        24
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        24
                       Bronchial Asthma       1.00      1.00      1.00        24
                   Cervical spondylosis       1.00      1.00      1