In [44]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [45]:
# Define the data path and SEED.
# The dataset directory can be overridden through the DOCTOR_OCTOPUS_DATA_PATH
# environment variable so the notebook is not tied to one machine; when the
# variable is unset, the original local folder is used as the fallback.
DATA_PATH = os.environ.get(
    'DOCTOR_OCTOPUS_DATA_PATH',
    r'C:\Users\user\Downloads\My-Projects\Doctor-Octopus',
)
# Seed constant; it is not consumed by this cell.
SEED = 120

# Read the data
data = pd.read_csv(os.path.join(DATA_PATH, 'Disease_symptom_and_patient_profile_dataset.csv'))

# Print the head
print(data.head())


       Disease Fever Cough Fatigue Difficulty Breathing  Age  Gender  \
0    Influenza   Yes    No     Yes                  Yes   19  Female   
1  Common Cold    No   Yes     Yes                   No   25  Female   
2       Eczema    No   Yes     Yes                   No   25  Female   
3       Asthma   Yes   Yes      No                  Yes   25    Male   
4       Asthma   Yes   Yes      No                  Yes   25    Male   

  Blood Pressure Cholesterol Level Outcome Variable  
0            Low            Normal         Positive  
1         Normal            Normal         Negative  
2         Normal            Normal         Negative  
3         Normal            Normal         Positive  
4         Normal            Normal         Positive  


In [46]:
# Count how many distinct diseases appear in the 'Disease' column
data.loc[:, 'Disease'].nunique()

116

Looking at the dataset, we have the feature `Outcome Variable`, which indicates whether the suspected disease was confirmed or not. Our task is to build a model that can predict the disease given a person's input, so we need to subset our dataset to the records that have a positive outcome.

In [47]:
# Keep only the rows whose Outcome Variable is 'Positive' and renumber
# the remaining rows from 0, discarding the old index
data = data.loc[data['Outcome Variable'] == 'Positive'].reset_index(drop=True)

# Preview the first ten remaining rows
print(data.head(10))

        Disease Fever Cough Fatigue Difficulty Breathing  Age  Gender  \
0     Influenza   Yes    No     Yes                  Yes   19  Female   
1        Asthma   Yes   Yes      No                  Yes   25    Male   
2        Asthma   Yes   Yes      No                  Yes   25    Male   
3        Eczema   Yes    No      No                   No   25  Female   
4     Influenza   Yes   Yes     Yes                  Yes   25  Female   
5     Influenza   Yes   Yes     Yes                  Yes   25  Female   
6        Asthma   Yes    No      No                  Yes   28    Male   
7    Depression   Yes   Yes     Yes                  Yes   29    Male   
8  Liver Cancer   Yes   Yes     Yes                  Yes   29  Female   
9        Stroke   Yes   Yes     Yes                  Yes   29  Female   

  Blood Pressure Cholesterol Level Outcome Variable  
0            Low            Normal         Positive  
1         Normal            Normal         Positive  
2         Normal            Normal

The dataset contains features that are irrelevant to our task, such as Age and Gender. We do not care about the age or the gender of the patient; we only want to predict the disease, so we will drop these features.

In [48]:
# Drop Irrelevant Features
# Reassign rather than dropping in place, and ignore columns that are already
# absent, so this cell can be re-run without raising a KeyError.
data = data.drop(columns=['Age', 'Gender'], errors='ignore')

# Check the columns
print(data.head())


     Disease Fever Cough Fatigue Difficulty Breathing Blood Pressure  \
0  Influenza   Yes    No     Yes                  Yes            Low   
1     Asthma   Yes   Yes      No                  Yes         Normal   
2     Asthma   Yes   Yes      No                  Yes         Normal   
3     Eczema   Yes    No      No                   No         Normal   
4  Influenza   Yes   Yes     Yes                  Yes         Normal   

  Cholesterol Level Outcome Variable  
0            Normal         Positive  
1            Normal         Positive  
2            Normal         Positive  
3            Normal         Positive  
4            Normal         Positive  


In [50]:
# Is there at least one missing value in any column?
data.isna().any().any()

False

In [53]:
# Does any row repeat an earlier row across all remaining columns?
data.duplicated(keep='first').any()

True

In [54]:
# Show the rows that were flagged as repeats of an earlier row
print(data.loc[data.duplicated(keep='first')])

                                          Disease Fever Cough Fatigue  \
2                                          Asthma   Yes   Yes      No   
5                                       Influenza   Yes   Yes     Yes   
11                                         Asthma   Yes   Yes      No   
12                                         Asthma   Yes   Yes      No   
17                                     Bronchitis   Yes   Yes     Yes   
23                                      Influenza   Yes   Yes     Yes   
30                                         Asthma   Yes   Yes      No   
31                                         Asthma   Yes   Yes      No   
37                                Hyperthyroidism   Yes   Yes     Yes   
42                                        Malaria   Yes    No      No   
59                                         Asthma   Yes   Yes      No   
60                                         Asthma   Yes   Yes      No   
63                                         Asthma  

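After viewing the duplicated rows, it makes no sense to drop them: they are flagged only because they share the same values in the columns that remain, and they are not necessarily the same instances, since some of the original features (for example the dropped Age and Gender) can differ between them. We will keep these rows in our dataset.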

In [None]:
# 