In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import unittest
from sklearn.preprocessing import StandardScaler

In [2]:
import sys
# Put ../src/ at the front of the module search path so the project-local
# `functions` module resolves from this notebook's location.
sys.path.insert(0, '../src/') 
# NOTE(review): the wildcard import hides where names come from. The cells in
# this notebook call extract_categorical_cols, extract_numerical_cols,
# calculate_pair_wise_correlation, calculate_pair_wise_chi_square_test,
# drop_cols_no_cardinality and standardize_numeric_variables, presumably all
# provided by this import -- consider importing them explicitly.
from functions import *

### Running unit tests of custom functions

In [3]:
# Execute the helper test script inside this kernel before any analysis runs;
# the unittest summary is printed directly below the cell.
%run ../src/test_functions.py

...
----------------------------------------------------------------------
Ran 3 tests in 0.002s

OK


### Load the data

In [4]:
# Lift the column display limit so wide frames are rendered without eliding
# columns in the previews that follow.
pd.set_option('display.max_columns', None)

# Read the raw asthma table (path is relative to the notebook) and preview
# the first five rows.
asthma_dataset = pd.read_csv('../data/asthma_disease_data.csv')
asthma_dataset.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,PollutionExposure,PollenExposure,DustExposure,PetAllergy,FamilyHistoryAsthma,HistoryOfAllergies,Eczema,HayFever,GastroesophagealReflux,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis,DoctorInCharge
0,5034,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,7.388481,2.855578,0.974339,1,1,0,0,0,0,1.369051,4.941206,0,0,1,0,0,1,0,Dr_Confid
1,5035,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,1.969838,7.457665,6.584631,0,0,1,0,0,0,2.197767,1.702393,1,0,0,1,1,1,0,Dr_Confid
2,5036,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,1.460593,1.448189,5.445799,0,1,1,0,1,0,1.698011,5.022553,1,1,1,0,1,1,0,Dr_Confid
3,5037,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,0.581905,7.571845,3.965316,0,0,0,0,1,0,3.032037,2.300159,1,0,1,1,1,0,0,Dr_Confid
4,5038,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,0.980875,3.049807,8.260605,0,0,0,0,1,0,3.470589,3.067944,1,1,1,0,0,1,0,Dr_Confid


#### Check the data dimensions

In [5]:
# (n_rows, n_columns) of the raw frame as loaded from the CSV.
asthma_dataset.shape

(2392, 29)

### Data Exploration

#### Check for missing values

In [6]:
# Number of null entries per column; a column of zeros means nothing needs imputing.
asthma_dataset.isnull().sum()

PatientID                 0
Age                       0
Gender                    0
Ethnicity                 0
EducationLevel            0
BMI                       0
Smoking                   0
PhysicalActivity          0
DietQuality               0
SleepQuality              0
PollutionExposure         0
PollenExposure            0
DustExposure              0
PetAllergy                0
FamilyHistoryAsthma       0
HistoryOfAllergies        0
Eczema                    0
HayFever                  0
GastroesophagealReflux    0
LungFunctionFEV1          0
LungFunctionFVC           0
Wheezing                  0
ShortnessOfBreath         0
ChestTightness            0
Coughing                  0
NighttimeSymptoms         0
ExerciseInduced           0
Diagnosis                 0
DoctorInCharge            0
dtype: int64

#### Preview categorical variables

In [7]:
# Subset of the raw frame returned by the project helper; presumably the
# categorical / low-cardinality columns -- confirm against functions.py.
asthma_dataset_cat = extract_categorical_cols(asthma_dataset)

# Print the frequency table of each column in turn so class balance can be
# checked at a glance.
for col, col_values in asthma_dataset_cat.items():
    print(col_values.value_counts())

Gender
0    1212
1    1180
Name: count, dtype: int64
Ethnicity
0    1465
1     475
2     229
3     223
Name: count, dtype: int64
EducationLevel
1    933
2    749
0    478
3    232
Name: count, dtype: int64
Smoking
0    2053
1     339
Name: count, dtype: int64
PetAllergy
0    1995
1     397
Name: count, dtype: int64
FamilyHistoryAsthma
0    1672
1     720
Name: count, dtype: int64
HistoryOfAllergies
0    1437
1     955
Name: count, dtype: int64
Eczema
0    1933
1     459
Name: count, dtype: int64
HayFever
0    1786
1     606
Name: count, dtype: int64
GastroesophagealReflux
0    2014
1     378
Name: count, dtype: int64
Wheezing
1    1426
0     966
Name: count, dtype: int64
ShortnessOfBreath
1    1197
0    1195
Name: count, dtype: int64
ChestTightness
1    1204
0    1188
Name: count, dtype: int64
Coughing
1    1204
0    1188
Name: count, dtype: int64
NighttimeSymptoms
1    1441
0     951
Name: count, dtype: int64
ExerciseInduced
1    1447
0     945
Name: count, dtype: int64
Diagnosis
0   

#### Calculate pair-wise correlation of numeric variables

In [8]:
# Subset of the raw frame returned by the project helper; presumably the
# continuous/numeric columns -- confirm against functions.py.
asthma_dataset_num = extract_numerical_cols(asthma_dataset)
# Pair-wise correlation table from the project helper.
# NOTE(review): the correlation method and rounding are decided inside the
# helper and are not visible here -- confirm. In the saved output every
# off-diagonal value lies within +/-0.06.
corr_matrix = calculate_pair_wise_correlation(asthma_dataset_num)
corr_matrix

Unnamed: 0,BMI,PhysicalActivity,DietQuality,SleepQuality,PollutionExposure,PollenExposure,DustExposure,LungFunctionFEV1,LungFunctionFVC,Age
BMI,1.0,-0.01,-0.01,-0.01,-0.03,-0.01,0.06,-0.01,0.0,0.02
PhysicalActivity,-0.01,1.0,0.05,0.01,-0.03,0.01,-0.01,0.02,-0.03,0.0
DietQuality,-0.01,0.05,1.0,0.01,0.01,0.02,-0.03,-0.01,-0.0,0.0
SleepQuality,-0.01,0.01,0.01,1.0,-0.03,-0.04,-0.03,-0.0,0.0,0.0
PollutionExposure,-0.03,-0.03,0.01,-0.03,1.0,0.01,-0.01,-0.02,0.03,-0.04
PollenExposure,-0.01,0.01,0.02,-0.04,0.01,1.0,-0.0,-0.01,0.01,-0.01
DustExposure,0.06,-0.01,-0.03,-0.03,-0.01,-0.0,1.0,-0.0,0.02,0.01
LungFunctionFEV1,-0.01,0.02,-0.01,-0.0,-0.02,-0.01,-0.0,1.0,0.01,-0.01
LungFunctionFVC,0.0,-0.03,-0.0,0.0,0.03,0.01,0.02,0.01,1.0,0.01
Age,0.02,0.0,0.0,0.0,-0.04,-0.01,0.01,-0.01,0.01,1.0


#### Explore categorical variables by calculating pair-wise chi-square tests of independence

In [9]:
# Run the project helper over the categorical columns with a p-value threshold
# of 0.1; the saved output lists only pairs whose p_value is below that value.
# NOTE(review): 0.1 is a loose cut-off and no multiple-comparison correction is
# visible from this cell -- confirm whether the helper applies one.
# NOTE(review): the saved output echoes a warning on a `pd.concat` line that
# apparently lives in the helper; worth fixing in functions.py.
chi_square_test_results = calculate_pair_wise_chi_square_test(asthma_dataset_cat, p_value_threshold = 0.1)
chi_square_test_results

  output_df = pd.concat([output_df, new_row], ignore_index=True)


Unnamed: 0,first_col,second_col,chi_square_stat,p_value
0,ChestTightness,Diagnosis,3.34454,0.067428
1,EducationLevel,Diagnosis,7.041078,0.0706
2,EducationLevel,GastroesophagealReflux,6.452523,0.091554
3,EducationLevel,PetAllergy,7.387168,0.06053
4,Ethnicity,ShortnessOfBreath,6.48279,0.090344
9,ExerciseInduced,Diagnosis,6.474892,0.010941
5,HayFever,Wheezing,8.135196,0.004341
6,ShortnessOfBreath,ChestTightness,3.537593,0.059992
7,Smoking,ShortnessOfBreath,8.012524,0.004645
8,Wheezing,Coughing,3.280153,0.070122


### Data Processing and Cleaning

#### Drop columns with no information value

In [11]:
# Rebinds `asthma_dataset` to the helper's result, so the raw frame loaded
# earlier is no longer reachable under this name after the cell runs.
# In the saved output the only column that disappears is DoctorInCharge.
# NOTE(review): PatientID is still present after this step; it looks like a row
# identifier, so consider excluding it before any modelling.
asthma_dataset = drop_cols_no_cardinality(asthma_dataset)
asthma_dataset.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,PollutionExposure,PollenExposure,DustExposure,PetAllergy,FamilyHistoryAsthma,HistoryOfAllergies,Eczema,HayFever,GastroesophagealReflux,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis
0,5034,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,7.388481,2.855578,0.974339,1,1,0,0,0,0,1.369051,4.941206,0,0,1,0,0,1,0
1,5035,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,1.969838,7.457665,6.584631,0,0,1,0,0,0,2.197767,1.702393,1,0,0,1,1,1,0
2,5036,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,1.460593,1.448189,5.445799,0,1,1,0,1,0,1.698011,5.022553,1,1,1,0,1,1,0
3,5037,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,0.581905,7.571845,3.965316,0,0,0,0,1,0,3.032037,2.300159,1,0,1,1,1,0,0
4,5038,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,0.980875,3.049807,8.260605,0,0,0,0,1,0,3.470589,3.067944,1,1,1,0,0,1,0


#### Feature Standardization

Standardize numeric variables by converting them to z-scores.

In [12]:
# Frame of standardized columns produced by the project helper.
# NOTE(review): which columns count as numeric, and the exact scaling used, are
# decided inside the helper -- confirm against functions.py.
asthma_dataset_num_standardized = standardize_numeric_variables(asthma_dataset)

In [13]:
# Columns the standardization step did not return; they are carried over unchanged.
untouched_cols = [
    name
    for name in asthma_dataset.columns
    if name not in asthma_dataset_num_standardized.columns
]

# Re-attach the standardized columns to the untouched ones, aligning rows on
# the index (default inner join).
asthma_dataset_standardized = asthma_dataset[untouched_cols].merge(
    asthma_dataset_num_standardized,
    left_index=True,
    right_index=True,
)

In [14]:
# The merge put the untouched columns first; select the columns again in the
# order used by `asthma_dataset` so both frames line up column-for-column.
original_column_order = asthma_dataset.columns
asthma_dataset_standardized = asthma_dataset_standardized.loc[:, original_column_order]
asthma_dataset_standardized

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,PollutionExposure,PollenExposure,DustExposure,PetAllergy,FamilyHistoryAsthma,HistoryOfAllergies,Eczema,HayFever,GastroesophagealReflux,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis
0,5034,0.965740,0,1,0,-1.582769,0,-1.432099,0.160113,0.971063,0.809355,-0.780866,-1.401921,1,1,0,0,0,0,-1.368934,0.920608,0,0,1,0,0,1,0
1,5035,-0.747054,1,2,2,-0.623300,0,0.291269,0.453069,-1.076746,-1.036866,0.810184,0.560684,0,0,1,0,0,0,-0.407132,-1.564256,1,0,0,1,1,1,0
2,5036,0.687989,0,2,1,-1.229074,0,0.581330,1.434458,-0.102976,-1.210374,-1.267434,0.162295,0,1,1,0,1,0,-0.987146,0.983019,1,1,1,0,1,1,0
3,5037,-0.098970,1,2,1,1.565307,0,-1.256398,0.276233,-1.596880,-1.509757,0.849659,-0.355611,0,0,0,0,1,0,0.561114,-1.105641,1,0,1,1,1,0,0
4,5038,0.873156,0,0,3,-1.105686,0,-0.154081,-0.651625,1.504976,-1.373822,-0.713717,1.146977,0,0,0,0,1,0,1.070095,-0.516586,1,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,7421,0.039905,1,0,2,0.252042,0,-0.699950,0.376978,0.740107,-0.861740,0.760717,-0.544470,1,0,1,0,0,0,0.669296,1.093099,0,1,0,0,0,1,1
2388,7422,-1.117388,1,0,1,-0.903322,0,0.259526,-0.218561,0.411163,0.927074,-0.980178,0.519779,0,0,0,1,0,0,-1.642920,1.356614,0,0,0,1,1,0,1
2389,7423,0.549114,0,3,2,1.365905,0,-0.109067,1.096868,0.268175,-0.755772,-0.711873,1.574952,0,0,0,0,1,0,-1.001130,-0.302584,1,0,1,1,0,1,1
2390,7424,0.178780,1,0,2,-0.527792,0,1.591768,0.804295,-0.174204,1.511361,0.898316,0.024327,0,1,1,0,0,1,1.082816,-1.555908,0,1,1,0,1,1,0
