In [2]:
import numpy as np 
import pandas as pd

In [89]:
df = pd.read_csv('heart_2020_cleaned.csv')
df.head(10)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
5,Yes,28.87,Yes,No,No,6.0,0.0,Yes,Female,75-79,Black,No,No,Fair,12.0,No,No,No
6,No,21.63,No,No,No,15.0,0.0,No,Female,70-74,White,No,Yes,Fair,4.0,Yes,No,Yes
7,No,31.64,Yes,No,No,5.0,0.0,Yes,Female,80 or older,White,Yes,No,Good,9.0,Yes,No,No
8,No,26.45,No,No,No,0.0,0.0,No,Female,80 or older,White,"No, borderline diabetes",No,Fair,5.0,No,Yes,No
9,No,40.69,No,No,No,0.0,0.0,Yes,Male,65-69,White,No,Yes,Good,10.0,No,No,No


In [39]:
#check for null values
df.any()

HeartDisease        True
BMI                 True
Smoking             True
AlcoholDrinking     True
Stroke              True
PhysicalHealth      True
MentalHealth        True
DiffWalking         True
Sex                 True
AgeCategory         True
Race                True
Diabetic            True
PhysicalActivity    True
GenHealth           True
SleepTime           True
Asthma              True
KidneyDisease       True
SkinCancer          True
dtype: bool

In [37]:
#unique values of each column
for col in df.columns:
    x = df[col].unique() 
    print(col, x, "(",len(x),")")

HeartDisease ['No' 'Yes'] ( 2 )
BMI [16.6  20.34 26.58 ... 62.42 51.46 46.56] ( 3604 )
Smoking ['Yes' 'No'] ( 2 )
AlcoholDrinking ['No' 'Yes'] ( 2 )
Stroke ['No' 'Yes'] ( 2 )
PhysicalHealth [ 3.  0. 20. 28.  6. 15.  5. 30.  7.  1.  2. 21.  4. 10. 14. 18.  8. 25.
 16. 29. 27. 17. 24. 12. 23. 26. 22. 19.  9. 13. 11.] ( 31 )
MentalHealth [30.  0.  2.  5. 15.  8.  4.  3. 10. 14. 20.  1.  7. 24.  9. 28. 16. 12.
  6. 25. 17. 18. 21. 29. 22. 13. 23. 27. 26. 11. 19.] ( 31 )
DiffWalking ['No' 'Yes'] ( 2 )
Sex ['Female' 'Male'] ( 2 )
AgeCategory ['55-59' '80 or older' '65-69' '75-79' '40-44' '70-74' '60-64' '50-54'
 '45-49' '18-24' '35-39' '30-34' '25-29'] ( 13 )
Race ['White' 'Black' 'Asian' 'American Indian/Alaskan Native' 'Other'
 'Hispanic'] ( 6 )
Diabetic ['Yes' 'No' 'No, borderline diabetes' 'Yes (during pregnancy)'] ( 4 )
PhysicalActivity ['Yes' 'No'] ( 2 )
GenHealth ['Very good' 'Fair' 'Good' 'Poor' 'Excellent'] ( 5 )
SleepTime [ 5.  7.  8.  6. 12.  4.  9. 10. 15.  3.  2.  1. 16. 18. 14.

In Diabetic column, there are "No, borderline diabetes" and "Yes (during pregnancy)" values. They are likely considered as "No"

In [90]:
df['Diabetic'] = df['Diabetic'].apply(lambda x : "No" if (x == 'No, borderline diabetes' or x ==  'Yes (during pregnancy)') else x)
#re-check
df['Diabetic'].unique()

array(['Yes', 'No'], dtype=object)

### Data analysis

In [91]:
#create an array to contain columns that have only Yes-No values
arr = []
for col in df.columns:
    if len(df[col].unique()) == 2 :
        arr.append(col)
arr.remove('Sex')

In [92]:
def yesno_process(x):
    return 1 if x =='Yes' else 0
for col in arr:
    df[col]= df[col].apply(yesno_process)
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.60,1,0,0,3.0,30.0,0,Female,55-59,White,1,1,Very good,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,Female,80 or older,White,0,1,Very good,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,Male,65-69,White,1,1,Fair,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,Female,75-79,White,0,0,Good,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,Female,40-44,White,0,1,Very good,8.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7.0,0.0,1,Male,60-64,Hispanic,1,0,Fair,6.0,1,0,0
319791,0,29.84,1,0,0,0.0,0.0,0,Male,35-39,Hispanic,0,1,Very good,5.0,1,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,Female,45-49,Hispanic,0,1,Good,6.0,0,0,0
319793,0,32.81,0,0,0,0.0,0.0,0,Female,25-29,Hispanic,0,0,Good,12.0,0,0,0
