In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Exploratory Data Analysis

#### Loading the data

In [2]:
# Location of the raw CSV, relative to the notebook's working directory.
ASTHMA_CSV_PATH = '../data/asthma_disease_data.csv'
asthma_dataset = pd.read_csv(ASTHMA_CSV_PATH)

In [4]:
# Remove the cap on displayed columns (session-wide) so wide frames render in full,
# then preview the first five rows.
pd.options.display.max_columns = None
asthma_dataset.head(n=5)

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,PollutionExposure,PollenExposure,DustExposure,PetAllergy,FamilyHistoryAsthma,HistoryOfAllergies,Eczema,HayFever,GastroesophagealReflux,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis,DoctorInCharge
0,5034,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,7.388481,2.855578,0.974339,1,1,0,0,0,0,1.369051,4.941206,0,0,1,0,0,1,0,Dr_Confid
1,5035,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,1.969838,7.457665,6.584631,0,0,1,0,0,0,2.197767,1.702393,1,0,0,1,1,1,0,Dr_Confid
2,5036,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,1.460593,1.448189,5.445799,0,1,1,0,1,0,1.698011,5.022553,1,1,1,0,1,1,0,Dr_Confid
3,5037,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,0.581905,7.571845,3.965316,0,0,0,0,1,0,3.032037,2.300159,1,0,1,1,1,0,0,Dr_Confid
4,5038,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,0.980875,3.049807,8.260605,0,0,0,0,1,0,3.470589,3.067944,1,1,1,0,0,1,0,Dr_Confid


In [5]:
# Dimensions of the loaded frame as (rows, columns).
row_count, column_count = asthma_dataset.shape
(row_count, column_count)

(2392, 29)

#### Exploring the data

In [9]:
# List the dtype pandas assigned to each column of the loaded frame.
asthma_dataset.dtypes

PatientID                   int64
Age                         int64
Gender                      int64
Ethnicity                   int64
EducationLevel              int64
BMI                       float64
Smoking                     int64
PhysicalActivity          float64
DietQuality               float64
SleepQuality              float64
PollutionExposure         float64
PollenExposure            float64
DustExposure              float64
PetAllergy                  int64
FamilyHistoryAsthma         int64
HistoryOfAllergies          int64
Eczema                      int64
HayFever                    int64
GastroesophagealReflux      int64
LungFunctionFEV1          float64
LungFunctionFVC           float64
Wheezing                    int64
ShortnessOfBreath           int64
ChestTightness              int64
Coughing                    int64
NighttimeSymptoms           int64
ExerciseInduced             int64
Diagnosis                   int64
DoctorInCharge             object
dtype: object

In [53]:
# Isolate the integer-coded categorical variables.
# Most of these are 0/1 flags, but in this dataset Ethnicity and EducationLevel
# each take four levels (0-3), so the set is not strictly binary.
# Age and PatientID are integer-typed as well but are excluded from this set.
asthma_int_cols = (
    asthma_dataset
    .select_dtypes(include='int')
    # errors='ignore' keeps the cell tolerant of a frame lacking either column.
    .drop(columns=['Age', 'PatientID'], errors='ignore')
)
asthma_int_cols.head()

Unnamed: 0,Gender,Ethnicity,EducationLevel,Smoking,PetAllergy,FamilyHistoryAsthma,HistoryOfAllergies,Eczema,HayFever,GastroesophagealReflux,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis
0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,1,0
1,1,2,2,0,0,0,1,0,0,0,1,0,0,1,1,1,0
2,0,2,1,0,0,1,1,0,1,0,1,1,1,0,1,1,0
3,1,2,1,0,0,0,0,0,1,0,1,0,1,1,1,0,0
4,0,0,3,0,0,0,0,0,1,0,1,1,1,0,0,1,0


In [51]:
# Print the frequency of every level for each integer-coded column.
for column_name in asthma_int_cols:
    level_counts = asthma_dataset[column_name].value_counts()
    print(level_counts)

Gender
0    1212
1    1180
Name: count, dtype: int64
Ethnicity
0    1465
1     475
2     229
3     223
Name: count, dtype: int64
EducationLevel
1    933
2    749
0    478
3    232
Name: count, dtype: int64
Smoking
0    2053
1     339
Name: count, dtype: int64
PetAllergy
0    1995
1     397
Name: count, dtype: int64
FamilyHistoryAsthma
0    1672
1     720
Name: count, dtype: int64
HistoryOfAllergies
0    1437
1     955
Name: count, dtype: int64
Eczema
0    1933
1     459
Name: count, dtype: int64
HayFever
0    1786
1     606
Name: count, dtype: int64
GastroesophagealReflux
0    2014
1     378
Name: count, dtype: int64
Wheezing
1    1426
0     966
Name: count, dtype: int64
ShortnessOfBreath
1    1197
0    1195
Name: count, dtype: int64
ChestTightness
1    1204
0    1188
Name: count, dtype: int64
Coughing
1    1204
0    1188
Name: count, dtype: int64
NighttimeSymptoms
1    1441
0     951
Name: count, dtype: int64
ExerciseInduced
1    1447
0     945
Name: count, dtype: int64
Diagnosis
0   