# Disease Prediction and Care Suggestion Model

## Data Loading and Preprocessing

In [1]:
import pandas as pd
from sklearn.utils import shuffle

# Read the disease/symptom table and randomise its row order in one step.
# The fixed random_state makes the permutation identical on every run.
df = shuffle(pd.read_csv('disease_symptoms.csv'), random_state=42)

# Preview the top rows; the index still carries the pre-shuffle row labels
df.head()


Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_1.1,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
4379,Hypoglycemia,vomiting,fatigue,anxiety,sweating,headache,nausea,blurred_and_distorted_vision,excessive_hunger,drying_and_tingling_lips,slurred_speech,irritability,palpitations,,,,,
393,Psoriasis,skin_rash,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,,,,,,,,,,,,
1164,Osteoarthritis,joint_pain,neck_pain,hip_joint_pain,swelling_joints,,,,,,,,,,,,,
4478,Bronchial Asthma,fatigue,cough,high_fever,breathlessness,family_history,mucoid_sputum,,,,,,,,,,,
731,Hyperthyroidism,fatigue,mood_swings,weight_loss,restlessness,sweating,diarrhoea,fast_heart_rate,excessive_hunger,muscle_weakness,irritability,abnormal_menstruation,,,,,,


In [2]:
# Symptom values use underscores as word separators (e.g. "skin_rash");
# replace them with spaces. Column 0 is "Disease", so slicing from 1 onward
# touches only the symptom columns.
for col in df.columns[1:]:
    # regex=False: "_" is matched as a literal character, independent of the
    # pandas version's default for the `regex` argument.
    df[col] = df[col].str.replace('_', ' ', regex=False)

# Strip leading/trailing whitespace from every text column (this includes "Disease")
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Replace NaN (unused symptom slots) with the integer 0.
# NOTE: the symptom columns hold a mix of strings and the int 0 after this step.
df = df.fillna(0)

# Display the cleaned dataset
df.head()


Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_1.1,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
4379,Hypoglycemia,vomiting,fatigue,anxiety,sweating,headache,nausea,blurred and distorted vision,excessive hunger,drying and tingling lips,slurred speech,irritability,palpitations,0,0,0,0,0
393,Psoriasis,skin rash,skin peeling,silver like dusting,small dents in nails,inflammatory nails,0,0,0,0,0,0,0,0,0,0,0,0
1164,Osteoarthritis,joint pain,neck pain,hip joint pain,swelling joints,0,0,0,0,0,0,0,0,0,0,0,0,0
4478,Bronchial Asthma,fatigue,cough,high fever,breathlessness,family history,mucoid sputum,0,0,0,0,0,0,0,0,0,0,0
731,Hyperthyroidism,fatigue,mood swings,weight loss,restlessness,sweating,diarrhoea,fast heart rate,excessive hunger,muscle weakness,irritability,abnormal menstruation,0,0,0,0,0,0


## Exploratory Data Analysis (EDA)

#### Distribution of Diseases:

In [3]:
import matplotlib.pyplot as plt

In [4]:
# Dimensions of the cleaned frame as (rows, columns)
df.shape

(4961, 18)

In [5]:
# Per-column dtype and non-null count. The cleaning cell already replaced NaN
# with 0, so these counts no longer reveal how many symptom slots were empty.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4961 entries, 4379 to 860
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Disease      4961 non-null   object
 1   Symptom_1    4961 non-null   object
 2   Symptom_2    4961 non-null   object
 3   Symptom_3    4961 non-null   object
 4   Symptom_4    4961 non-null   object
 5   Symptom_5    4961 non-null   object
 6   Symptom_6    4961 non-null   object
 7   Symptom_7    4961 non-null   object
 8   Symptom_8    4961 non-null   object
 9   Symptom_9    4961 non-null   object
 10  Symptom_1.1  4961 non-null   object
 11  Symptom_11   4961 non-null   object
 12  Symptom_12   4961 non-null   object
 13  Symptom_13   4961 non-null   object
 14  Symptom_14   4961 non-null   object
 15  Symptom_15   4961 non-null   object
 16  Symptom_16   4961 non-null   object
 17  Symptom_17   4961 non-null   object
dtypes: object(18)
memory usage: 736.4+ KB
