In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### 1: The Data and Features

Data Source and Feature Codebook: https://www.cdc.gov/brfss/annual_data/annual_2019.html

According to the CDC, important risk factors for heart disease are high blood pressure, high cholesterol, diabetes, smoking, obesity, unhealthy diet, and physical inactivity. Based on these risk factors, the health indicators listed below were selected from the 343 variables in the dataset.

Source: https://www.cdc.gov/chronicdisease/resources/publications/factsheets/heart-disease-stroke.htm#:~:text=Leading%20risk%20factors%20for%20heart,unhealthy%20diet%2C%20and%20physical%20inactivity.

TARGET VARIABLE:
heart disease - _MICHD

DEMOGRAPHICS:
age - _AGEG5YR,
sex - SEXVAR,
education - EDUCA,
income - INCOME2,

INDICATORS:
cholesterol (high) - TOLDHI2,
BP (high) (hypertension) - _RFHYPE5,
obesity (BMI) - _BMI5,
alcohol consumption (heavy) - _RFDRHV7,
smoking (current) - _RFSMOK3,
stroke - CVDSTRK3,
diabetes - DIABETE4,
diet fruit - _FRTLT1A,
diet vegetables -  _VEGLT1A,
physical activity - _TOTINDA,

HEALTH STATUS:
general health - GENHLTH,
mental health - MENTHLTH,
physical health - PHYSHLTH

In [None]:
#load the BRFSS 2019 survey extract; the file path is relative to the working directory
#NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which looks like
#a row index saved by an earlier export -- consider index_col=0; TODO confirm
brfss_raw = pd.read_csv('brfss2019.csv')

In [38]:
#preview the first rows of the raw survey data
brfss_raw.head()

Unnamed: 0.1,Unnamed: 0,_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,_PSU,...,_VEGESU1,_FRTLT1A,_VEGLT1A,_FRT16A,_VEG23A,_FRUITE1,_VEGETE1,_FLSHOT7,_PNEUMO3,_AIDTST4
0,1,1,1,1182019,1,18,2019,1100,2019000001,2019000001,...,114.0,1,1,1,1,0,0,2.0,1.0,2.0
1,2,1,1,1132019,1,13,2019,1100,2019000002,2019000002,...,121.0,1,1,1,1,0,0,1.0,1.0,2.0
2,3,1,1,1182019,1,18,2019,1100,2019000003,2019000003,...,164.0,1,1,1,1,0,0,1.0,2.0,2.0
3,4,1,1,1182019,1,18,2019,1200,2019000004,2019000004,...,,9,9,1,1,1,1,9.0,9.0,
4,5,1,1,1042019,1,4,2019,1100,2019000005,2019000005,...,178.0,1,1,1,1,0,0,2.0,1.0,2.0


In [39]:
#(rows, columns) of the raw survey data
brfss_raw.shape

(418268, 343)

In [40]:
#columns kept for the heart disease dataset, in the order they appear in the output
selected_columns = [
    '_MICHD',    #target: heart disease
    '_AGEG5YR',  #age
    'SEXVAR',    #sex
    'EDUCA',     #education
    'INCOME2',   #income
    'TOLDHI2',   #high cholesterol
    '_RFHYPE5',  #high blood pressure
    '_BMI5',     #BMI
    '_RFDRHV7',  #heavy alcohol consumption
    '_RFSMOK3',  #smoker
    'CVDSTRK3',  #stroke
    'DIABETE4',  #diabetes
    '_TOTINDA',  #physical activity
    '_VEGLT1A',  #vegetables
    '_FRTLT1A',  #fruits
    'GENHLTH',   #general health
    'MENTHLTH',  #mental health
    'PHYSHLTH',  #physical health
]
brfss_selected = brfss_raw[selected_columns]

In [41]:
#preview the first rows of the selected columns
brfss_selected.head()

Unnamed: 0,_MICHD,_AGEG5YR,SEXVAR,EDUCA,INCOME2,TOLDHI2,_RFHYPE5,_BMI5,_RFDRHV7,_RFSMOK3,CVDSTRK3,DIABETE4,_TOTINDA,_VEGLT1A,_FRTLT1A,GENHLTH,MENTHLTH,PHYSHLTH
0,2.0,13,2,3.0,3.0,1.0,2,2817.0,1,1,2.0,3.0,2,1,1,3.0,88.0,15.0
1,2.0,11,2,5.0,5.0,2.0,1,1854.0,1,1,2.0,3.0,1,1,1,4.0,88.0,10.0
2,2.0,10,2,6.0,7.0,2.0,2,3162.0,1,1,2.0,1.0,1,1,1,3.0,30.0,88.0
3,2.0,13,2,5.0,6.0,2.0,2,2030.0,9,9,2.0,3.0,9,9,9,4.0,88.0,30.0
4,2.0,13,2,5.0,99.0,1.0,1,2148.0,1,1,2.0,3.0,2,1,1,2.0,88.0,88.0


In [42]:
#(rows, columns) after column selection
brfss_selected.shape

(418268, 18)

In [43]:
#storage type of each selected column
brfss_selected.dtypes

_MICHD      float64
_AGEG5YR      int64
SEXVAR        int64
EDUCA       float64
INCOME2     float64
TOLDHI2     float64
_RFHYPE5      int64
_BMI5       float64
_RFDRHV7      int64
_RFSMOK3      int64
CVDSTRK3    float64
DIABETE4    float64
_TOTINDA      int64
_VEGLT1A      int64
_FRTLT1A      int64
GENHLTH     float64
MENTHLTH    float64
PHYSHLTH    float64
dtype: object

2: Missing Values

In [44]:
#number of missing values in each selected column
brfss_selected.isna().sum()

_MICHD       4325
_AGEG5YR        0
SEXVAR          0
EDUCA          26
INCOME2      6881
TOLDHI2     24443
_RFHYPE5        0
_BMI5       36203
_RFDRHV7        0
_RFSMOK3        0
CVDSTRK3       11
DIABETE4        9
_TOTINDA        0
_VEGLT1A        0
_FRTLT1A        0
GENHLTH        26
MENTHLTH       19
PHYSHLTH       32
dtype: int64

In [45]:
#drop every row that has a missing value in any selected column
#heart disease, cholesterol, income and BMI have too many missing rows to fill in
#the dataset is large, so the few dozen rows missing in the other columns are
#dropped as well instead of being interpolated
brfss_selected = brfss_selected.dropna(axis='index', how='any')

In [46]:
#(rows, columns) after dropping rows with missing values
brfss_selected.shape

(356710, 18)

In [47]:
#confirm that no missing values remain in any column
brfss_selected.isna().sum()

_MICHD      0
_AGEG5YR    0
SEXVAR      0
EDUCA       0
INCOME2     0
TOLDHI2     0
_RFHYPE5    0
_BMI5       0
_RFDRHV7    0
_RFSMOK3    0
CVDSTRK3    0
DIABETE4    0
_TOTINDA    0
_VEGLT1A    0
_FRTLT1A    0
GENHLTH     0
MENTHLTH    0
PHYSHLTH    0
dtype: int64

3: Renaming the Variables

In [48]:
#map each BRFSS codebook name to a readable column name
readable_names = {
    '_MICHD': 'HeartDisease',
    '_AGEG5YR': 'Age',
    'SEXVAR': 'Sex',
    'EDUCA': 'Education',
    'INCOME2': 'Income',
    'TOLDHI2': 'HighChol',
    '_RFHYPE5': 'HighBP',
    '_BMI5': 'BMI',
    '_RFDRHV7': 'HeavyAlcoholConsump',
    '_RFSMOK3': 'Smoker',
    'CVDSTRK3': 'Stroke',
    'DIABETE4': 'Diabetes',
    '_TOTINDA': 'PhysicalActivity',
    '_VEGLT1A': 'Vegetables',
    '_FRTLT1A': 'Fruits',
    'GENHLTH': 'GenHealth',
    'MENTHLTH': 'MenHealth',
    'PHYSHLTH': 'PhysHealth',
}
#rename returns a new frame, so brfss_selected keeps the original codebook names
brfss = brfss_selected.rename(columns=readable_names)

In [49]:
#preview the renamed columns (values are still the raw survey codes)
brfss.head()

Unnamed: 0,HeartDisease,Age,Sex,Education,Income,HighChol,HighBP,BMI,HeavyAlcoholConsump,Smoker,Stroke,Diabetes,PhysicalActivity,Vegetables,Fruits,GenHealth,MenHealth,PhysHealth
0,2.0,13,2,3.0,3.0,1.0,2,2817.0,1,1,2.0,3.0,2,1,1,3.0,88.0,15.0
1,2.0,11,2,5.0,5.0,2.0,1,1854.0,1,1,2.0,3.0,1,1,1,4.0,88.0,10.0
2,2.0,10,2,6.0,7.0,2.0,2,3162.0,1,1,2.0,1.0,1,1,1,3.0,30.0,88.0
3,2.0,13,2,5.0,6.0,2.0,2,2030.0,9,9,2.0,3.0,9,9,9,4.0,88.0,30.0
4,2.0,13,2,5.0,99.0,1.0,1,2148.0,1,1,2.0,3.0,2,1,1,2.0,88.0,88.0


4: Cleaning Variables

In [50]:
#HeartDisease (response variable)
#survey codes are 1 = yes, 2 = no; recode no to 0 so the target is 1/0
brfss['HeartDisease'] = brfss['HeartDisease'].replace(2, 0)
brfss['HeartDisease'].unique()

array([0., 1.])

In [51]:
#Age
#ordinal age category from 1 to 13, kept as is
#category 14 means unknown/missing, so those rows are removed
brfss = brfss.drop(index=brfss.index[brfss['Age'].eq(14)])
brfss['Age'].unique()

array([13, 11, 10, 12,  8,  7,  9,  6,  5,  4,  1,  3,  2], dtype=int64)

In [52]:
#Sex
#survey codes are 1 = male, 2 = female; recode female to 0 so the column is 1/0
brfss['Sex'] = brfss['Sex'].replace(2, 0)
brfss['Sex'].unique()

array([0, 1], dtype=int64)

In [53]:
#Education
#ordinal level from 1 to 6, kept as is
#code 9 means refused to answer, so those rows are removed
brfss = brfss.drop(index=brfss.index[brfss['Education'].eq(9)])
brfss['Education'].unique()

array([3., 5., 6., 4., 2., 1.])

In [54]:
#Income
#ordinal bracket from 1 to 8, kept as is
#codes 77 and 99 mean don't know/refused, so those rows are removed in one pass
brfss = brfss.drop(index=brfss.index[brfss['Income'].isin([77, 99])])
brfss['Income'].unique()

array([3., 5., 7., 6., 8., 4., 2., 1.])

In [55]:
#HighChol
#binary variable with survey codes 1 = yes, 2 = no; recode no to 0
#codes 7 (not sure) and 9 (refused) are removed in one pass
brfss['HighChol'] = brfss['HighChol'].replace(2, 0)
brfss = brfss.drop(index=brfss.index[brfss['HighChol'].isin([7, 9])])
brfss['HighChol'].unique()

array([1., 0.])

In [56]:
#HighBP
#binary variable, change response for no/yes from 1/2 to 0/1
#remove 9 for not sure/refused/missing
#The recode reads the untouched source column (_RFHYPE5 in brfss_selected) for the
#rows still in brfss instead of brfss['HighBP'] itself: {1:0, 2:1} is not idempotent,
#so re-running this cell on already-recoded values would turn every 1 (yes) into 0.
brfss['HighBP'] = brfss_selected.loc[brfss.index, '_RFHYPE5'].replace({1: 0, 2: 1})
brfss = brfss.drop(index=brfss.index[brfss['HighBP'] == 9])
brfss['HighBP'].unique()

array([1, 0], dtype=int64)

In [57]:
#BMI
#has 2 implied decimal places (e.g. 2000 bmi = 20.00)
#divide all values by 100 to get the BMI value, then round to a whole number
#(pandas/numpy round exact halves to the nearest even number, e.g. 24.5 -> 24)
#The scaling reads the untouched source column (_BMI5 in brfss_selected) for the rows
#still in brfss, so re-running this cell does not divide by 100 a second time.
brfss['BMI'] = brfss_selected.loc[brfss.index, '_BMI5'].div(100).round(0)
brfss['BMI'].unique()

array([28., 19., 32., 20., 33., 24., 17., 22., 26., 23., 25., 31., 27.,
       35., 36., 30., 16., 34., 41., 37., 29., 42., 43., 39., 38., 21.,
       55., 40., 18., 50., 45., 48., 47., 44., 52., 46., 51., 57., 49.,
       59., 60., 15., 54., 58., 61., 53., 63., 67., 13., 70., 14., 56.,
       62., 64., 68., 12., 69., 72., 74., 76., 65., 66., 92., 86., 75.,
       79., 80., 82., 73., 87., 84., 88., 71., 81., 78., 77., 97., 94.,
       89., 99., 83., 96., 91., 98., 95., 90., 85.])

In [58]:
#HeavyAlcoholConsump
#binary variable, change response for no/yes from 1/2 to 0/1
#remove 9 for not sure/missing
#The recode reads the untouched source column (_RFDRHV7 in brfss_selected) for the
#rows still in brfss instead of the column itself: {1:0, 2:1} is not idempotent,
#so re-running this cell on already-recoded values would turn every 1 (yes) into 0.
brfss['HeavyAlcoholConsump'] = brfss_selected.loc[brfss.index, '_RFDRHV7'].replace({1: 0, 2: 1})
brfss = brfss.drop(index=brfss.index[brfss['HeavyAlcoholConsump'] == 9])
brfss['HeavyAlcoholConsump'].unique()

array([0, 1], dtype=int64)

In [59]:
#Smoker
#binary variable, change response for no/yes from 1/2 to 0/1
#remove 9 for not sure/missing
#The recode reads the untouched source column (_RFSMOK3 in brfss_selected) for the
#rows still in brfss instead of the column itself: {1:0, 2:1} is not idempotent,
#so re-running this cell on already-recoded values would turn every 1 (yes) into 0.
brfss['Smoker'] = brfss_selected.loc[brfss.index, '_RFSMOK3'].replace({1: 0, 2: 1})
brfss = brfss.drop(index=brfss.index[brfss['Smoker'] == 9])
brfss['Smoker'].unique()

array([0, 1], dtype=int64)

In [60]:
#Stroke
#binary variable with survey codes 1 = yes, 2 = no; recode no to 0
#codes 7 (don't know/not sure) and 9 (refused) are removed in one pass
brfss['Stroke'] = brfss['Stroke'].replace(2, 0)
brfss = brfss.drop(index=brfss.index[brfss['Stroke'].isin([7, 9])])
brfss['Stroke'].unique()

array([0., 1.])

In [61]:
#Diabetes
#response from 1, 2, 3, 4 (yes, only during preg, no, pre-diabetes/borderline)
#change response to 0, 1, 2 (no/during preg, pre/borderline, yes)
#remove 7 for not sure/don't know
#remove 9 for refused
#The recode reads the untouched source column (DIABETE4 in brfss_selected) for the
#rows still in brfss instead of the column itself: the mapping overlaps its own
#output values, so re-running this cell on already-recoded data would turn
#2 (yes) into 0 and 1 (pre/borderline) into 2.
brfss['Diabetes'] = brfss_selected.loc[brfss.index, 'DIABETE4'].replace({2: 0, 3: 0, 1: 2, 4: 1})
brfss = brfss.drop(index=brfss.index[brfss['Diabetes'].isin([7, 9])])
brfss['Diabetes'].unique()

array([0., 2., 1.])

In [62]:
#PhysicalActivity
#binary variable with survey codes 1 = yes, 2 = no; recode no to 0
#code 9 means refused/not sure, so those rows are removed
brfss['PhysicalActivity'] = brfss['PhysicalActivity'].replace(2, 0)
brfss = brfss.drop(index=brfss.index[brfss['PhysicalActivity'].eq(9)])
brfss['PhysicalActivity'].unique()

array([0, 1], dtype=int64)

In [63]:
#Vegetables
#binary variable; code 2 (less than 1 vegetable a day) is recoded to 0
#code 9 means refused/don't know/missing, so those rows are removed
brfss['Vegetables'] = brfss['Vegetables'].replace(2, 0)
brfss = brfss.drop(index=brfss.index[brfss['Vegetables'].eq(9)])
brfss['Vegetables'].unique()

array([1, 0], dtype=int64)

In [64]:
#Fruits
#binary variable; code 2 (less than 1 fruit a day) is recoded to 0
#code 9 means refused/don't know/missing, so those rows are removed
brfss['Fruits'] = brfss['Fruits'].replace(2, 0)
brfss = brfss.drop(index=brfss.index[brfss['Fruits'].eq(9)])
brfss['Fruits'].unique()

array([1, 0], dtype=int64)

In [65]:
#GenHealth
#ordinal rating from 1 (excellent) to 5 (poor), kept as is
#codes 7 (don't know/not sure) and 9 (refused) are removed in one pass
brfss = brfss.drop(index=brfss.index[brfss['GenHealth'].isin([7, 9])])
brfss['GenHealth'].unique()

array([3., 4., 2., 5., 1.])

In [66]:
#MenHealth
#number of days from 1 to 30, kept as is
#code 88 means none, so it becomes 0 days
#codes 77 (not sure) and 99 (refused) are removed in one pass
brfss['MenHealth'] = brfss['MenHealth'].replace(88, 0)
brfss = brfss.drop(index=brfss.index[brfss['MenHealth'].isin([77, 99])])
brfss['MenHealth'].unique()

array([ 0., 30.,  4.,  1.,  2., 15.,  5., 10.,  7.,  3.,  6., 20.,  8.,
       25., 14., 21., 17., 28., 16., 12., 27., 26., 24., 29.,  9., 13.,
       18., 23., 22., 11., 19.])

In [67]:
#PhysHealth
#number of days from 1 to 30, kept as is
#code 88 means none, so it becomes 0 days
#codes 77 (not sure) and 99 (refused) are removed in one pass
brfss['PhysHealth'] = brfss['PhysHealth'].replace(88, 0)
brfss = brfss.drop(index=brfss.index[brfss['PhysHealth'].isin([77, 99])])
brfss['PhysHealth'].unique()

array([15., 10.,  0., 30., 20.,  2.,  1.,  7., 14.,  5., 25.,  4.,  6.,
        3., 21.,  8., 28., 27., 12., 17., 23., 18., 13., 29., 19.,  9.,
       16., 24., 26., 11., 22.])

5: Cleaned and Modified Dataset

In [68]:
#preview the cleaned dataset; the index keeps the original row labels, so gaps mark dropped rows
brfss.head()

Unnamed: 0,HeartDisease,Age,Sex,Education,Income,HighChol,HighBP,BMI,HeavyAlcoholConsump,Smoker,Stroke,Diabetes,PhysicalActivity,Vegetables,Fruits,GenHealth,MenHealth,PhysHealth
0,0.0,13,0,3.0,3.0,1.0,1,28.0,0,0,0.0,0.0,0,1,1,3.0,0.0,15.0
1,0.0,11,0,5.0,5.0,0.0,0,19.0,0,0,0.0,0.0,1,1,1,4.0,0.0,10.0
2,0.0,10,0,6.0,7.0,0.0,1,32.0,0,0,0.0,2.0,1,1,1,3.0,30.0,0.0
6,0.0,11,1,6.0,7.0,0.0,0,33.0,0,1,0.0,2.0,1,1,1,2.0,0.0,30.0
9,0.0,11,0,2.0,3.0,1.0,0,17.0,0,0,0.0,2.0,0,0,0,5.0,0.0,20.0


In [69]:
#(rows, columns) of the cleaned dataset
brfss.shape

(256203, 18)

In [70]:
#class balance of the target (0 = no heart disease, 1 = heart disease)
brfss['HeartDisease'].value_counts()

HeartDisease
0.0    233139
1.0     23064
Name: count, dtype: int64

In [71]:
#write the cleaned dataset as a comma-separated file (the default separator),
#leaving out the row index, which only holds the original row labels
brfss.to_csv('heart_disease_brfss2019.csv', index=False)