In [2]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score

!pip install fuzzywuzzy
from fuzzywuzzy import process

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0




In [3]:
dataset = pd.read_csv("/content/Training.csv")


In [4]:
X = dataset.drop("prognosis", axis=1)
y = dataset["prognosis"]


In [6]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded
)

In [7]:
models = {
    "RandomForest": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        n_jobs=-1
    ),
    "GradientBoosting": GradientBoostingClassifier(
        n_estimators=100,
        random_state=42
    )
}


In [10]:
cv_scores = {}

for name, model in models.items():
    scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=5,
        scoring="f1_weighted"
    )
    cv_scores[name] = scores.mean()
    print(f"{name} | Mean CV F1-score: {scores.mean():.4f}")


RandomForest | Mean CV F1-score: 0.9500
GradientBoosting | Mean CV F1-score: 0.9100


In [13]:
sorted_models = sorted(
    cv_scores.items(),
    key=lambda x: x[1],
    reverse=True
)

print("\nModel Ranking:")
for name, score in sorted_models:
    print(f"{name}: {score:.4f}")

best_model_name = sorted_models[0][0]
best_model = models[best_model_name]

print("\n Selected Model:", best_model_name)


Model Ranking:
RandomForest: 0.9500
GradientBoosting: 0.9100
Selected Model: RandomForest


In [None]:
best_model.fit(X_train, y_train)

In [None]:
pickle.dump(best_model, open("RandomForest.pkl", "wb"))
pickle.dump(le, open("label_encoder.pkl", "wb"))


In [None]:
Rf = pickle.load(open("RandomForest.pkl", "rb"))
le = pickle.load(open("label_encoder.pkl", "rb"))


In [None]:
symptoms_df = pd.read_csv("/content/symptoms_df.csv")
precautions_df = pd.read_csv("/content/precautions_df.csv")
workout_df = pd.read_csv("/content/workout_df.csv")
description_df = pd.read_csv("/content/description.csv")
medications_df = pd.read_csv("/content/medications.csv")
diets_df = pd.read_csv("/content/diets.csv")


In [None]:
symptoms_list = {
    col.replace("_", " ").lower(): idx
    for idx, col in enumerate(X.columns)
}

diseases_list = {i: d for i, d in enumerate(le.classes_)}


In [None]:
def correct_spelling(symptom):
    match, score = process.extractOne(symptom, symptoms_list.keys())
    return match if score >= 80 else None


In [None]:
def get_disease_info(disease):
    return {
        "description": " ".join(description_df[description_df["Disease"] == disease]["Description"]),
        "precautions": precautions_df[precautions_df["Disease"] == disease].iloc[:, 1:].values[0],
        "medications": medications_df[medications_df["Disease"] == disease]["Medication"].values,
        "diet": diets_df[diets_df["Disease"] == disease]["Diet"].values,
        "workout": workout_df[workout_df["disease"] == disease]["workout"].values
    }


In [None]:
def predict_disease(symptoms):
    vector = np.zeros(len(symptoms_list))
    for s in symptoms:
        if s in symptoms_list:
            vector[symptoms_list[s]] = 1
    pred = Rf.predict([vector])[0]
    return diseases_list[pred]


In [14]:
raw = input("Enter symptoms (comma-separated): ").lower()
user_symptoms = [s.strip() for s in raw.split(",")]

corrected = []
for s in user_symptoms:
    cs = correct_spelling(s)
    if cs:
        corrected.append(cs)
    else:
        print(f"‚ö†Ô∏è '{s}' not recognized")

if not corrected:
    print("‚ùå No valid symptoms found.")
else:
    disease = predict_disease(corrected)
    info = get_disease_info(disease)

    print("\nü©∫ Predicted Disease:", disease)
    print("\nüìñ Description:", info["description"])

    print("\nüõ°Ô∏è Precautions:")
    for i, p in enumerate(info["precautions"], 1):
        print(f"{i}. {p}")

    print("\nüíä Medications:")
    for i, m in enumerate(info["medications"], 1):
        print(f"{i}. {m}")

    print("\nü•ó Diet:")
    for i, d in enumerate(info["diet"], 1):
        print(f"{i}. {d}")

    print("\nüèÉ Workout:")
    for i, w in enumerate(info["workout"], 1):
        print(f"{i}. {w}")


Enter symptoms (comma-separated): stomach_pain,acidity,ulcers,vomiting

ü©∫ Predicted Disease: GERD

üìñ Description: GERD (Gastroesophageal Reflux Disease) is a digestive disorder that affects the lower esophageal sphincter.

üõ°Ô∏è Precautions:
1. GERD
2. avoid fatty spicy food
3. avoid lying down after eating
4. maintain healthy weight
5. exercise

üíä Medications:
1. ['Proton Pump Inhibitors (PPIs)', 'H2 Blockers', 'Antacids', 'Prokinetics', 'Antibiotics']

ü•ó Diet:
1. ['Low-Acid Diet', 'Fiber-rich foods', 'Ginger', 'Licorice', 'Aloe vera juice']

üèÉ Workout:
1. Consume smaller meals
2. Avoid trigger foods (spicy, fatty)
3. Eat high-fiber foods
4. Limit caffeine and alcohol
5. Chew food thoroughly
6. ...
7. Consume non-citrus fruits
8. Include lean proteins
9. Stay hydrated
10. Avoid carbonated beverages

