In [49]:
# 1️⃣ Import Required Libraries
import pandas as pd
import os

# Define paths
RAW_DATA_PATH = "../data/raw/"
PROCESSED_DATA_PATH = "../data/processed/"

In [50]:
# Ensure processed data directory exists
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)


# 2️⃣ Load Datasets

In [51]:

dataset_path = os.path.join(RAW_DATA_PATH, "dataset.csv")
description_path = os.path.join(RAW_DATA_PATH, "symptom_Description.csv")
precaution_path = os.path.join(RAW_DATA_PATH, "symptom_precaution.csv")
severity_path = os.path.join(RAW_DATA_PATH, "Symptom-severity.csv")

In [52]:
df_disease = pd.read_csv(dataset_path)
df_description = pd.read_csv(description_path)
df_precaution = pd.read_csv(precaution_path)
df_severity = pd.read_csv(severity_path)

In [53]:
# 3️⃣ Inspect Column Names
print("Columns in dataset.csv:", df_disease.columns)
print("Columns in symptom_Description.csv:", df_description.columns)
print("Columns in symptom_precaution.csv:", df_precaution.columns)
print("Columns in Symptom-severity.csv:", df_severity.columns)


Columns in dataset.csv: Index(['Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
       'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14',
       'Symptom_15', 'Symptom_16', 'Symptom_17'],
      dtype='object')
Columns in symptom_Description.csv: Index(['Disease', 'Description'], dtype='object')
Columns in symptom_precaution.csv: Index(['Disease', 'Precaution_1', 'Precaution_2', 'Precaution_3',
       'Precaution_4'],
      dtype='object')
Columns in Symptom-severity.csv: Index(['Symptom', 'weight'], dtype='object')


In [54]:
# 4️⃣ Rename columns for consistency
df_disease.columns = df_disease.columns.str.strip()
df_description.columns = df_description.columns.str.strip()
df_precaution.columns = df_precaution.columns.str.strip()
df_severity.columns = df_severity.columns.str.strip()

In [55]:
# 5️⃣ Convert Wide to Long Format for Symptoms
df_long = df_disease.melt(id_vars=["Disease"], var_name="Symptom_Type", value_name="Symptom")

In [57]:
# Remove NaN Symptoms
df_long.dropna(subset=["Symptom"], inplace=True)

In [58]:
# 6️⃣ Standardize Symptoms Names
df_long["Symptom"] = df_long["Symptom"].str.strip().str.lower()
df_severity["Symptom"] = df_severity["Symptom"].str.strip().str.lower()

In [59]:
# 7️⃣ Merge Severity Data
df_long = df_long.merge(df_severity, on="Symptom", how="left")

In [60]:
# 8️⃣ Handle Missing Severity Scores
df_long["weight"].fillna(0, inplace=True)  # If severity is missing, assign 0

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_long["weight"].fillna(0, inplace=True)  # If severity is missing, assign 0


In [61]:
# 9️⃣ Save Cleaned Dataset
processed_file_path = os.path.join(PROCESSED_DATA_PATH, "disease_symptoms_severity.csv")
df_long.to_csv(processed_file_path, index=False)

print(f"✅ Processed dataset saved at: {processed_file_path}")

✅ Processed dataset saved at: ../data/processed/disease_symptoms_severity.csv


In [62]:

# Display first few rows
df_long.head()

Unnamed: 0,Disease,Symptom_Type,Symptom,weight
0,Fungal infection,Symptom_1,itching,1.0
1,Fungal infection,Symptom_1,skin_rash,3.0
2,Fungal infection,Symptom_1,itching,1.0
3,Fungal infection,Symptom_1,itching,1.0
4,Fungal infection,Symptom_1,itching,1.0


## Load and Inspect Processed Data

In [63]:
import pandas as pd

# Load the processed dataset
processed_file_path = "../data/processed/disease_symptoms_severity.csv"
df_cleaned = pd.read_csv(processed_file_path)

# Display first few rows
df_cleaned.head()

Unnamed: 0,Disease,Symptom_Type,Symptom,weight
0,Fungal infection,Symptom_1,itching,1.0
1,Fungal infection,Symptom_1,skin_rash,3.0
2,Fungal infection,Symptom_1,itching,1.0
3,Fungal infection,Symptom_1,itching,1.0
4,Fungal infection,Symptom_1,itching,1.0


## Check for Missing Values and Duplicates

In [64]:
# Check for missing values
missing_values = df_cleaned.isnull().sum()

# Check for duplicate rows
duplicate_rows = df_cleaned.duplicated().sum()

print("Missing Values:\n", missing_values)
print("\nNumber of Duplicate Rows:", duplicate_rows)


Missing Values:
 Disease         0
Symptom_Type    0
Symptom         0
weight          0
dtype: int64

Number of Duplicate Rows: 36160


In [65]:
# Remove duplicate rows
df_cleaned = df_cleaned.drop_duplicates()

# Check the shape of the dataset after removing duplicates
print("Shape after removing duplicates:", df_cleaned.shape)

# Save the cleaned data
df_cleaned.to_csv("../data/processed/disease_symptoms_severity_cleaned.csv", index=False)


Shape after removing duplicates: (602, 4)


## Check Data Distribution

In [66]:
# Check the distribution of symptoms
symptom_counts = df_cleaned["Symptom"].value_counts()

# Display top 10 most common symptoms
print("Top 10 most common symptoms:")
print(symptom_counts.head(10))

# Display top 10 diseases with the most symptom associations
disease_counts = df_cleaned["Disease"].value_counts()
print("\nTop 10 diseases with most symptoms:")
print(disease_counts.head(10))


Top 10 most common symptoms:
Symptom
fatigue              28
vomiting             26
high_fever           24
nausea               20
loss_of_appetite     20
headache             19
abdominal_pain       18
yellowish_skin       16
yellowing_of_eyes    14
malaise              12
Name: count, dtype: int64

Top 10 diseases with most symptoms:
Disease
Common Cold       33
Tuberculosis      31
Dengue            27
Hepatitis E       25
Hypothyroidism    25
Hypoglycemia      23
Hepatitis B       23
Chicken pox       21
hepatitis A       21
Pneumonia         21
Name: count, dtype: int64


## Encode Symptoms for ML Readiness

In [67]:
# Pivot table to create a structured dataset with binary encoding for symptoms
df_encoded = df_cleaned.pivot_table(index="Disease", columns="Symptom", values="weight", aggfunc="max").fillna(0)



# Reset index to bring Disease back as a column
df_encoded.reset_index(inplace=True)


# Ensure columns are in sorted order
df_encoded = df_encoded.reindex(sorted(df_encoded.columns), axis=1)

# Save the processed data
df_encoded.to_csv("../data/processed/disease_symptom_encoded.csv", index=False)

print("✅ Encoded dataset saved at: ../data/processed/disease_symptom_encoded.csv")

✅ Encoded dataset saved at: ../data/processed/disease_symptom_encoded.csv


## Data Augmentation

In [68]:
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv("../data/processed/disease_symptom_encoded.csv")

# Set a minimum number of cases per disease (e.g., 5)
min_cases = 5

# Augment data
augmented_data = []
for disease, group in df.groupby("Disease"):
    if len(group) < min_cases:
        additional_samples = group.sample(n=min_cases - len(group), replace=True)
        # Slightly modify symptom severity to create variations
        additional_samples.iloc[:, 1:] += np.random.randint(-1, 2, size=additional_samples.iloc[:, 1:].shape)
        additional_samples.iloc[:, 1:] = additional_samples.iloc[:, 1:].clip(0, 1)  # ✅ Ensuring binary encoding
        augmented_data.append(additional_samples)

# Combine original and augmented data
if augmented_data:
    df = pd.concat([df] + augmented_data, ignore_index=True)

# Save the augmented dataset
df.to_csv("../data/processed/disease_symptom_encoded_augmented.csv", index=False)

print(f"New dataset shape after augmentation: {df.shape}")


New dataset shape after augmentation: (205, 132)


In [69]:
df = pd.read_csv("../data/processed/disease_symptom_encoded_augmented.csv")

# Check if all diseases now have at least 5 cases
print(df["Disease"].value_counts())


Disease
(vertigo) Paroymsal  Positional Vertigo    5
AIDS                                       5
Acne                                       5
Alcoholic hepatitis                        5
Allergy                                    5
Arthritis                                  5
Bronchial Asthma                           5
Cervical spondylosis                       5
Chicken pox                                5
Chronic cholestasis                        5
Common Cold                                5
Dengue                                     5
Diabetes                                   5
Dimorphic hemmorhoids(piles)               5
Drug Reaction                              5
Fungal infection                           5
GERD                                       5
Gastroenteritis                            5
Heart attack                               5
Hepatitis B                                5
Hepatitis C                                5
Hepatitis D                                5
He

In [70]:
df.head()

Unnamed: 0,Disease,abdominal_pain,abnormal_menstruation,acidity,acute_liver_failure,altered_sensorium,anxiety,back_pain,belly_pain,blackheads,...,vomiting,watering_from_eyes,weakness_in_limbs,weakness_of_one_body_side,weight_gain,weight_loss,yellow_crust_ooze,yellow_urine,yellowing_of_eyes,yellowish_skin
0,(vertigo) Paroymsal Positional Vertigo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AIDS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Acne,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Alcoholic hepatitis,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
4,Allergy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Define Features (X) and Target (y)

In [71]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the augmented dataset
df = pd.read_csv("../data/processed/disease_symptom_encoded_augmented.csv")

# Define Features & Target
X = df.drop(columns=["Disease"])  # Features
y = df["Disease"]  # Target

# Train-Test Split (80-20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Save split datasets
X_train.to_csv("../data/processed/X_train.csv", index=False)
X_test.to_csv("../data/processed/X_test.csv", index=False)
y_train.to_csv("../data/processed/y_train.csv", index=False)
y_test.to_csv("../data/processed/y_test.csv", index=False)

# Print shapes to verify
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Training set shape: (164, 131)
Test set shape: (41, 131)
