In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read both CSV splits from the working directory.
try:
    train_df = pd.read_csv('Training.csv')
    test_df = pd.read_csv('Testing.csv')
except FileNotFoundError:
    print(" Error: Make sure 'Training.csv' and 'Testing.csv' are uploaded to Colab.")
    # Re-raise so execution stops here; otherwise later cells fail with a
    # NameError on train_df / test_df that hides the real cause.
    raise
else:
    print("Files loaded successfully! ")


Files loaded successfully! 


In [None]:
# Print a banner followed by the head of the training frame as a quick sanity check.
print("\n--- Training Data Sample ---", train_df.head(), sep="\n")


--- Training Data Sample ---
   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  scurring  \
0       0           0             0        0                 0  ...         0   
1       0           0             0        0                 0  ...         0   
2       0           0             0        0                 0  ...         0   
3       0           0             0        0                 0  ...         0   
4       0           0             0        0                 0  ...         0   

   skin_

In [None]:
# --- Data Preparation ---
# Separate the features (symptoms) from the target (prognosis).
X_train = train_df.drop(columns='prognosis')
y_train = train_df['prognosis']
X_test = test_df.drop(columns='prognosis')
y_test = test_df['prognosis']

# pandas labels header-less CSV columns 'Unnamed: <position>'. The training
# file carries one such column ('Unnamed: 133') that the testing file lacks,
# which makes model.predict(X_test) fail with "feature names should match".
# Strip these placeholder columns from both feature sets so they line up.
X_train = X_train.loc[:, ~X_train.columns.astype(str).str.startswith('Unnamed:')]
X_test = X_test.loc[:, ~X_test.columns.astype(str).str.startswith('Unnamed:')]


In [None]:
# Disease names are strings; the classifier needs integer class ids.
# Learn the name -> id mapping from the training labels only, then apply that
# same mapping to both splits.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

print("\nData is prepped and ready for training!")


Data is prepped and ready for training!


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build the classifier: a forest of 100 trees, with a fixed seed so repeated
# runs produce the same model.
model = RandomForestClassifier(random_state=42, n_estimators=100)

# Announce the training step (the fit itself happens in the next cell).
print("Training the Random Forest model... ")

Training the Random Forest model... 


In [None]:
# Fit the forest on the training symptoms against the integer-encoded diagnoses.
model.fit(X=X_train, y=y_train_encoded)
print("Model training complete! ")

Model training complete! 


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Score the held-out test features with the fitted model; the result holds
# integer-encoded class ids, not disease names.
print("Making predictions on the test data... ")
y_pred_encoded = model.predict(X=X_test)

Making predictions on the test data... 


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Unnamed: 133


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the training and testing datasets
try:
    train_df = pd.read_csv('Training.csv')
    test_df = pd.read_csv('Testing.csv')
except FileNotFoundError:
    print(" Error: Make sure 'Training.csv' and 'Testing.csv' are uploaded to Colab.")
    # Everything below needs both frames; stop with the real error instead of
    # continuing into a NameError.
    raise
else:
    print("Files loaded successfully! ")

# --- FIX THE ERROR ---
# The training data has an extra, empty column at the end called 'Unnamed: 133',
# so the training and testing columns don't match. pandas uses the
# 'Unnamed: <position>' label for any header-less column, so match on the
# prefix rather than on one hard-coded position.
extra_columns = [col for col in train_df.columns if str(col).startswith('Unnamed:')]
if extra_columns:
    train_df = train_df.drop(columns=extra_columns)
    removed = ", ".join(f"'{col}'" for col in extra_columns)
    print(f"\nFound and removed extra {removed} column from the training data.")

# --- Data Preparation ---
# Separate the features (symptoms) from the target (prognosis)
X_train = train_df.drop(columns='prognosis')
y_train = train_df['prognosis']
X_test = test_df.drop(columns='prognosis')
y_test = test_df['prognosis']

# Check that both feature sets now expose the same columns in the same order
if list(X_train.columns) == list(X_test.columns):
    print("Training and testing columns now match perfectly. ")
else:
    print(" Warning: Columns still do not match.")

# The 'prognosis' column is text. We need to convert it to numbers for the model.
# A LabelEncoder turns each unique disease name into a unique number.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

print("\nData is prepped and ready for training!")

Files loaded successfully! 

Found and removed extra 'Unnamed: 133' column from the training data.
Training and testing columns now match perfectly. 

Data is prepped and ready for training!


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees; the fixed seed keeps results repeatable.
model = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit on the training symptoms against the integer-encoded diagnoses,
# announcing the step before and after.
print("Training the Random Forest model... ")
model.fit(X=X_train, y=y_train_encoded)
print("Model training complete! ")

Training the Random Forest model... 
Model training complete! 


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict integer class ids for the held-out test features.
print("Making predictions on the test data... ")
y_pred_encoded = model.predict(X=X_test)

# --- Evaluate the Model's Performance ---

# 1. Fraction of test rows predicted correctly, shown as a percentage.
accuracy = accuracy_score(y_true=y_test_encoded, y_pred=y_pred_encoded)
print(f"\n Overall Accuracy: {accuracy * 100:.2f}%")

# 2. Per-disease precision / recall / F1, labelled with the original disease
#    names taken from the fitted label encoder.
print('\n--- Detailed Classification Report ---')
report = classification_report(
    y_true=y_test_encoded,
    y_pred=y_pred_encoded,
    target_names=label_encoder.classes_,
)
print(report)

Making predictions on the test data... 

 Overall Accuracy: 97.62%

--- Detailed Classification Report ---
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox       1.00      1.00      1.00         1
                    Chronic cholestasis       1.00      1.00      1.00         1
 

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import ipywidgets as widgets
from IPython.display import display, HTML

# --- 1. Load and Clean the Training Dataset ---
try:
    df = pd.read_csv('Training.csv')
except FileNotFoundError:
    print(" Error: Make sure 'Training.csv' is uploaded to Colab.")
    # Re-raise: the cleaning step below would otherwise fail with a NameError
    # on df and hide the real cause.
    raise
else:
    print("Training.csv loaded successfully! ")

# Clean the data: Drop the extra, empty column if it exists.
if 'Unnamed: 133' in df.columns:
    df = df.drop(columns='Unnamed: 133')
    print("Cleaned the extra 'Unnamed: 133' column.")

Training.csv loaded successfully! 
Cleaned the extra 'Unnamed: 133' column.


In [None]:
# Split the cleaned frame into symptom features and the diagnosis column, and
# remember the feature order for building prediction inputs later.
X, y = df.drop(columns='prognosis'), df['prognosis']
symptom_names = list(X.columns)

# Map each disease name to an integer class id.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [None]:
# --- 2. Train the Model on the Full Dataset ---
# 100-tree random forest with a fixed seed, fitted on every training row.
model = RandomForestClassifier(random_state=42, n_estimators=100)
print("\nTraining the model on the full dataset... ")
model.fit(X=X, y=y_encoded)
print("Model is trained and ready! ")


Training the model on the full dataset... 
Model is trained and ready! 


In [None]:
# --- 3. Create the Top-1 and Top-3 Prediction Function ---
def get_top_predictions(symptoms):
    """
    Return the most likely disease and the three most likely diseases.

    Relies on the notebook globals ``model``, ``symptom_names`` and
    ``label_encoder``.

    Args:
        symptoms: Iterable of symptom names (a single string is treated as one
            symptom). Names must match entries of ``symptom_names`` exactly.

    Returns:
        ``(top_1, top_3)`` where ``top_1`` is the disease name with the highest
        predicted probability and ``top_3`` is a list of up to three disease
        names ordered from most to least probable.

    Warns:
        UserWarning: if any supplied name is not in ``symptom_names``; such
        names have no effect on the prediction.
    """
    import warnings

    # A bare string would otherwise be matched by substring / per character.
    if isinstance(symptoms, str):
        symptoms = [symptoms]
    selected = set(symptoms)

    # Misspelled names used to be dropped silently; surface them instead.
    unknown = sorted(selected.difference(symptom_names))
    if unknown:
        warnings.warn(f"Ignoring unrecognised symptom name(s): {unknown}", stacklevel=2)

    # One row of 0/1 flags in the same column order the model was trained on.
    input_vector = [1 if name in selected else 0 for name in symptom_names]
    input_df = pd.DataFrame([input_vector], columns=symptom_names)

    # Confidence score for every class known to the model.
    probabilities = model.predict_proba(input_df)[0]

    # argsort is ascending: keep the last three positions, highest first.
    top_3_positions = np.argsort(probabilities)[-3:][::-1]

    # predict_proba columns follow model.classes_, so translate positions to
    # encoded labels before decoding them back to disease names.
    top_3_labels = model.classes_[top_3_positions]
    top_3_predictions = label_encoder.inverse_transform(top_3_labels)

    return top_3_predictions[0], list(top_3_predictions)

In [None]:
# --- 4. Build the Interactive User Interface ---
print("\n---  Interactive Symptom Checker ---")

# One checkbox per symptom, laid out in a four-column grid.
checkboxes = [widgets.Checkbox(description=symptom, layout={'width': '250px'}) for symptom in symptom_names]
grid = widgets.GridBox(checkboxes, layout=widgets.Layout(grid_template_columns="repeat(4, 270px)"))

# CSS tweak for label size plus the two HTML widgets that show the results.
style = "<style>div.widget-label { font-size: 16px; }</style>"
top1_output = widgets.HTML(value="<b>Top-1 Prediction:</b> (Select symptoms to see a prediction)")
top3_output = widgets.HTML(value="<b>Top-3 Predictions:</b>")

def on_symptom_change(change):
    """Refresh both result widgets from the currently ticked checkboxes.

    ``change`` is the event payload passed by the widget; it is not used
    because the full selection is re-read on every event.
    """
    selected_symptoms = [cb.description for cb in checkboxes if cb.value]

    if selected_symptoms:
        top1, top3 = get_top_predictions(selected_symptoms)
        top1_output.value = f"<b>Top-1 Prediction:</b> <font color='red'>{top1}</font>"
        top3_list_html = "".join([f"<li>{pred}</li>" for pred in top3])
        top3_output.value = f"<b>Top-3 Predictions:</b> <ul>{top3_list_html}</ul>"
    else:
        top1_output.value = "<b>Top-1 Prediction:</b> (Select symptoms to see a prediction)"
        top3_output.value = "<b>Top-3 Predictions:</b>"

# Without registration the handler never runs: listen for value changes on
# every checkbox.
for checkbox in checkboxes:
    checkbox.observe(on_symptom_change, names='value')

# Render the stylesheet, the checkbox grid and the result widgets; previously
# nothing was displayed, so the cell produced no usable UI.
display(HTML(style))
display(grid, top1_output, top3_output)


---  Interactive Symptom Checker ---


In [None]:
# KFold and cross_val_score were never imported before this cell, which is
# what raised "NameError: name 'KFold' is not defined".
from sklearn.model_selection import KFold, cross_val_score

# Ten shuffled folds with a fixed seed so the split is repeatable.
k_folds = KFold(n_splits=10, shuffle=True, random_state=42)

print("\nRunning 10-Fold Cross-Validation to calculate accuracy... ")
# This command runs the entire process: splits, trains, and tests 10 times.
scores = cross_val_score(model, X, y_encoded, cv=k_folds)

# --- 4. Show the Results ---
print("\n--- Cross-Validation Results ---")
print(f"Scores for each of the 10 folds: {np.round(scores, 3)}")
# The emoji in these two messages were mis-encoded (mojibake); restored.
print(f"\n✅ Average Accuracy: {scores.mean() * 100:.2f}%")
print(f"📊 Standard Deviation: {scores.std():.4f}")


NameError: name 'KFold' is not defined

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score
import numpy as np

# --- 1. Load ALL Data (Training + Testing) ---
# We combine them because cross-validation creates its own test sets automatically.
try:
    train_df = pd.read_csv('Training.csv')
    test_df = pd.read_csv('Testing.csv')
    full_df = pd.concat([train_df, test_df], ignore_index=True)
except FileNotFoundError:
    # The leading emoji was mis-encoded (mojibake); restored.
    print("❌ Error: Make sure both 'Training.csv' and 'Testing.csv' are uploaded to Colab.")
    # Stop here rather than hitting a NameError on full_df below.
    raise
else:
    print("Successfully loaded and combined Training.csv and Testing.csv! ")


# Drop the empty 'Unnamed: 133' column if it exists
if 'Unnamed: 133' in full_df.columns:
    full_df = full_df.drop(columns='Unnamed: 133')

# --- 2. Prepare the Combined Data ---
X = full_df.drop(columns='prognosis')
y = full_df['prognosis']

# Encode the target variable (disease names) into numbers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# --- 3. Set Up and Run Cross-Validation ---
# cross_val_score fits clones of the estimator, so the object passed in stays
# unfitted. Use a dedicated name so the already-fitted `model` used by
# get_top_predictions is not replaced by an unfitted one (that replacement is
# what led to NotFittedError in the following cell).
cv_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Set up the K-Fold process. We'll split the data into 10 parts (folds).
k_folds = KFold(n_splits=10, shuffle=True, random_state=42)

print("\nRunning 10-Fold Cross-Validation to calculate accuracy... ")
# This command runs the entire process: splits, trains, and tests 10 times.
scores = cross_val_score(cv_model, X, y_encoded, cv=k_folds)

# --- 4. Show the Results ---
print("\n--- Cross-Validation Results ---")
print(f"Scores for each of the 10 folds: {np.round(scores, 3)}")
print(f"\n Average Accuracy: {scores.mean() * 100:.2f}%")
print(f" Standard Deviation: {scores.std():.4f}")

Successfully loaded and combined Training.csv and Testing.csv! 

Running 10-Fold Cross-Validation to calculate accuracy... 

--- Cross-Validation Results ---
Scores for each of the 10 folds: [1.    1.    0.998 1.    1.    1.    1.    1.    1.    1.   ]

 Average Accuracy: 99.98%
 Standard Deviation: 0.0006


In [None]:
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

# --- Define Your Symptoms Here ---
# Simply add or remove symptom strings in this list to test.
# Make sure the spellings are exactly as they appear in the dataset.

# Test Case 1: Fungal infection
my_symptoms = ['itching', 'skin_rash', 'nodal_skin_eruptions']

# Test Case 2: Common Cold (uncomment to try)
# my_symptoms = ['continuous_sneezing', 'chills', 'cough', 'fatigue']

# Test Case 3: Jaundice (uncomment to try)
# my_symptoms = ['itching', 'yellowish_skin', 'dark_urine', 'vomiting']


# --- Get the Prediction ---
# A `model` name existing is not enough: it may be bound to an estimator that
# was created but never fitted, which makes predict_proba raise
# NotFittedError. Check for a fitted model explicitly.
try:
    check_is_fitted(model)
except (NameError, NotFittedError):
    print(" Error: Please make sure you have run the model training cell first.")
else:
    # This uses the same prediction function from the previous step
    top1, top3 = get_top_predictions(my_symptoms)

    # --- Display the Results ---
    print(f"Symptoms Provided:\n{my_symptoms}\n")
    print("--- Predictions ---")
    print(f" Top-1 (Best Guess): {top1}")
    print(f" Top-3 (Most Likely): {top3}")

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# --- 1. Load and Clean the Training Dataset ---
try:
    df = pd.read_csv('Training.csv')
except FileNotFoundError:
    # The leading emoji was mis-encoded (mojibake); restored.
    print("❌ Error: Make sure 'Training.csv' is uploaded to Colab.")
    # Re-raise: the rest of the cell cannot run without df.
    raise
else:
    print("Training.csv loaded successfully! ")

# Clean the data: Drop the extra, empty column if it exists.
if 'Unnamed: 133' in df.columns:
    df = df.drop(columns='Unnamed: 133')
    print("Cleaned the extra 'Unnamed: 133' column.")

# Prepare the data for the model: symptom features, diagnosis target, and the
# feature order used later to build prediction inputs.
X = df.drop(columns='prognosis')
y = df['prognosis']
symptom_names = X.columns.tolist()

# Encode the disease names (target) into numbers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


# --- 2. Train the Model on the Full Dataset ---
model = RandomForestClassifier(n_estimators=100, random_state=42)
print("\nTraining the model on the full dataset... ")
model.fit(X, y_encoded)
print("Model is trained and ready! ")


# --- 3. Prediction Function (Top-1 and Top-3) ---
def get_top_predictions(symptoms):
    """
    Return the most likely disease and the three most likely diseases.

    Relies on the notebook globals ``model``, ``symptom_names`` and
    ``label_encoder``.

    Args:
        symptoms: Iterable of symptom names (a single string is treated as one
            symptom). Names must match entries of ``symptom_names`` exactly.

    Returns:
        ``(top_1, top_3)`` where ``top_1`` is the disease name with the highest
        predicted probability and ``top_3`` is a list of up to three disease
        names ordered from most to least probable.

    Warns:
        UserWarning: if any supplied name is not in ``symptom_names``; such
        names have no effect on the prediction.
    """
    import warnings

    # A bare string would otherwise be matched by substring / per character.
    if isinstance(symptoms, str):
        symptoms = [symptoms]
    selected = set(symptoms)

    # Misspelled names used to be dropped silently; surface them instead.
    unknown = sorted(selected.difference(symptom_names))
    if unknown:
        warnings.warn(f"Ignoring unrecognised symptom name(s): {unknown}", stacklevel=2)

    # One row of 0/1 flags in the same column order the model was trained on.
    input_vector = [1 if name in selected else 0 for name in symptom_names]
    input_df = pd.DataFrame([input_vector], columns=symptom_names)

    # Confidence score for every class known to the model.
    probabilities = model.predict_proba(input_df)[0]

    # argsort is ascending: keep the last three positions, highest first.
    top_3_positions = np.argsort(probabilities)[-3:][::-1]

    # predict_proba columns follow model.classes_, so translate positions to
    # encoded labels before decoding them back to disease names.
    top_3_labels = model.classes_[top_3_positions]
    top_3_predictions = label_encoder.inverse_transform(top_3_labels)

    return top_3_predictions[0], list(top_3_predictions)


# --- 4. TEST YOUR MODEL HERE ---
#
# How to use:
# 1. Edit the `my_symptoms` list below.
# 2. Spell every symptom exactly like the matching dataset column.
# 3. Run the cell to see the predictions.
#
print("\n---  Symptom Checker Test ---")

# Symptoms to score
my_symptoms = [
    'itching',
    'skin_rash',
    'nodal_skin_eruptions',
]

# --- Get and Display the Prediction ---
# Guard first: bail out with a message when no `model` name exists.
if 'model' not in locals():
    print(" Error: The model was not trained. Please check the code above.")
else:
    top1, top3 = get_top_predictions(my_symptoms)
    print(
        f"Symptoms Provided:\n{my_symptoms}\n",
        "--- Predictions ---",
        f"Top-1 (Best Guess): {top1}",
        f" Top-3 (Most Likely): {top3}",
        sep="\n",
    )

Training.csv loaded successfully! 
Cleaned the extra 'Unnamed: 133' column.

Training the model on the full dataset... 
Model is trained and ready! 

---  Symptom Checker Test ---
Symptoms Provided:
['itching', 'skin_rash', 'nodal_skin_eruptions']

--- Predictions ---
Top-1 (Best Guess): Fungal infection
 Top-3 (Most Likely): ['Fungal infection', 'Varicose veins', 'Urinary tract infection']


In [None]:
# Symptom selection for the next manual check; names must match dataset columns.
my_symptoms = [
    'continuous_sneezing',
    'shivering',
    'chills',
    'watering_from_eyes',
]

In [None]:
  import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# --- 1. Load and Clean the Training Dataset ---
try:
    df = pd.read_csv('Training.csv')
except FileNotFoundError:
    print(" Error: Make sure 'Training.csv' is uploaded to Colab.")
    # Re-raise: the rest of the cell cannot run without df and would
    # otherwise fail with a misleading NameError.
    raise
else:
    print(" Training.csv loaded successfully!")

# Clean the data: Drop the extra, empty column if it exists.
if 'Unnamed: 133' in df.columns:
    df = df.drop(columns='Unnamed: 133')
    print(" Cleaned the extra 'Unnamed: 133' column.")

# Prepare the data for the model: symptom features, diagnosis target, and the
# feature order used later to build prediction inputs.
X = df.drop(columns='prognosis')
y = df['prognosis']
symptom_names = X.columns.tolist()

# Encode the disease names (target) into numbers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


# --- 2. Train the Model on the Full Dataset ---
model = RandomForestClassifier(n_estimators=100, random_state=42)
print("\n Training the model on the full dataset...")
model.fit(X, y_encoded)
print(" Model is trained and ready!")


# --- 3. Prediction Function (Top-1 and Top-3) ---
def get_top_predictions(symptoms):
    """
    Return the most likely disease and the three most likely diseases.

    Relies on the notebook globals ``model``, ``symptom_names`` and
    ``label_encoder``.

    Args:
        symptoms: Iterable of symptom names (a single string is treated as one
            symptom). Names must match entries of ``symptom_names`` exactly.

    Returns:
        ``(top_1, top_3)`` where ``top_1`` is the disease name with the highest
        predicted probability and ``top_3`` is a list of up to three disease
        names ordered from most to least probable.

    Warns:
        UserWarning: if any supplied name is not in ``symptom_names``; such
        names have no effect on the prediction.
    """
    import warnings

    # A bare string would otherwise be matched by substring / per character.
    if isinstance(symptoms, str):
        symptoms = [symptoms]
    selected = set(symptoms)

    # Misspelled names used to be dropped silently; surface them instead.
    unknown = sorted(selected.difference(symptom_names))
    if unknown:
        warnings.warn(f"Ignoring unrecognised symptom name(s): {unknown}", stacklevel=2)

    # One row of 0/1 flags in the same column order the model was trained on.
    input_vector = [1 if name in selected else 0 for name in symptom_names]
    input_df = pd.DataFrame([input_vector], columns=symptom_names)

    # Confidence score for every class known to the model.
    probabilities = model.predict_proba(input_df)[0]

    # argsort is ascending: keep the last three positions, highest first.
    top_3_positions = np.argsort(probabilities)[-3:][::-1]

    # predict_proba columns follow model.classes_, so translate positions to
    # encoded labels before decoding them back to disease names.
    top_3_labels = model.classes_[top_3_positions]
    top_3_predictions = label_encoder.inverse_transform(top_3_labels)

    return top_3_predictions[0], list(top_3_predictions)


# --- 4. Automated Testing with Clear Print Statements ---
print("\n---  Running Automated Symptom Checks ---")

# Expected disease -> symptom list that should lead the model to it
test_cases = {
    "Fungal infection": ['itching', 'skin_rash', 'nodal_skin_eruptions'],
    "Allergy": ['continuous_sneezing', 'shivering', 'chills', 'watering_from_eyes'],
    "GERD": ['stomach_pain', 'acidity', 'ulcers_on_tongue', 'vomiting', 'cough'],
    "Dengue": [
        'skin_rash', 'chills', 'joint_pain', 'vomiting', 'high_fever',
        'headache', 'nausea', 'loss_of_appetite', 'pain_behind_the_eyes',
    ],
    "Migraine": ['acidity', 'headache', 'blurred_and_distorted_vision', 'excessive_hunger'],
    "Psoriasis": ['skin_rash', 'joint_pain', 'inflammatory_nails', 'small_dents_in_nails'],
}

# Score every case and report whether the top prediction equals the expected
# disease (the dictionary key).
for disease, symptoms in test_cases.items():
    top1, top3 = get_top_predictions(symptoms)

    print(
        "--------------------------------------------------",
        f"  Testing for: {disease}",
        f"    Symptoms: {symptoms}",
        f"    Top-1 Prediction (Best Guess): {top1}",
        f"     Top-3 Predictions (Most Likely): {top3}",
        sep="\n",
    )
    # Verdict line: exact string equality between prediction and expectation
    print(
        "     Result: CORRECT"
        if top1 == disease
        else f"     Result: INCORRECT (Expected {disease}, but got {top1})"
    )

print("--------------------------------------------------")
print("\n All tests complete.")

 Training.csv loaded successfully!
 Cleaned the extra 'Unnamed: 133' column.

 Training the model on the full dataset...
 Model is trained and ready!

---  Running Automated Symptom Checks ---
--------------------------------------------------
  Testing for: Fungal infection
    Symptoms: ['itching', 'skin_rash', 'nodal_skin_eruptions']
    Top-1 Prediction (Best Guess): Fungal infection
     Top-3 Predictions (Most Likely): ['Fungal infection', 'Varicose veins', 'Urinary tract infection']
     Result: CORRECT
--------------------------------------------------
  Testing for: Allergy
    Symptoms: ['continuous_sneezing', 'shivering', 'chills', 'watering_from_eyes']
    Top-1 Prediction (Best Guess): Allergy
     Top-3 Predictions (Most Likely): ['Allergy', 'Varicose veins', 'Urinary tract infection']
     Result: CORRECT
--------------------------------------------------
  Testing for: GERD
    Symptoms: ['stomach_pain', 'acidity', 'ulcers_on_tongue', 'vomiting', 'cough']
    Top-1 Pred