clean and transform dataset
data preprocessing
combine symptoms into a list per row
drop rows with no symptoms

In [None]:
import pandas as pd
import random

# Load the dataset
df = pd.read_csv('/Users/jessemoreno/PythonProgramming/Project/data/Symptom2Disease.csv'
)

# Combine all symptom columns into one list per row
symptom_cols = [col for col in df.columns if "Symptom" in col]
df["Symptom_List"] = df[symptom_cols].values.tolist()

# Remove empty values and strip whitespaces
df["Symptom_List"] = df["Symptom_List"].apply(
    lambda x: [str(symptom).strip() for symptom in x if pd.notna(symptom)]
)

# Filter to rows with at least 3 symptoms
df = df[df["Symptom_List"].apply(len) >= 3]

# Create a new dataset where we randomly select 3–7 symptoms
def generate_partial_symptoms(row):
    symptom_list = row["Symptom_List"]
    num_symptoms = random.randint(3, min(7, len(symptom_list)))
    return random.sample(symptom_list, num_symptoms)

df["User_Input_Symptoms"] = df.apply(generate_partial_symptoms, axis=1)

# Final cleaned dataset
processed_df = df[["User_Input_Symptoms", "Disease"]].copy()

# Optional: display a few rows
print(processed_df.head(10))


                                 User_Input_Symptoms           Disease
0  [skin_rash, nodal_skin_eruptions, dischromic _...  Fungal infection
1  [skin_rash, dischromic _patches, nodal_skin_er...  Fungal infection
2  [dischromic _patches, itching, nodal_skin_erup...  Fungal infection
3          [skin_rash, itching, dischromic _patches]  Fungal infection
4         [skin_rash, nodal_skin_eruptions, itching]  Fungal infection
5  [nodal_skin_eruptions, skin_rash, dischromic _...  Fungal infection
6  [itching, nodal_skin_eruptions, dischromic _pa...  Fungal infection
7          [skin_rash, itching, dischromic _patches]  Fungal infection
8         [nodal_skin_eruptions, skin_rash, itching]  Fungal infection
9          [itching, skin_rash, dischromic _patches]  Fungal infection


convert symptoms to features
create a ML friendly format
['itching', 'skin rash'] = binary vector [1, 1, 1, 0, 0]
use multilabelbinarizer from sklearn

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: Vectorize the symptoms using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(processed_df["User_Input_Symptoms"])

# Step 2: Labels (Diseases)
y = processed_df["Disease"]

# Step 3: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 5: Make predictions
y_pred = model.predict(X_test)

# Step 6: Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Accuracy: {accuracy:.2f}\n")
print("📊 Classification Report:")
print(classification_report(y_test, y_pred))

✅ Accuracy: 0.97

📊 Classification Report:
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       0.95      1.00      0.97        18
                                   AIDS       1.00      1.00      1.00        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        25
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        23
                       Bronchial Asthma       1.00      1.00      1.00        33
                   Cervical spondylosis       1.00      1.00      1.00        23
                            Chicken pox       0.91      1.00      0.95        21
                    Chronic cholestasis       0.81      0.87      0.84        15
                            Common Cold       1.00      1.00     