In [1]:
import pandas as pd
import pyreadstat
import numpy as np

In [2]:
# Read the raw CDC BRFSS export (XPORT file). `data` keeps the whole return value
# of read_xport; the cell below indexes data[0] as the DataFrame.
xpt_path = './data/CDC_BRFSS_data.XPT'
data = pyreadstat.read_xport(xpt_path)

In [3]:
# Columns carried forward from the raw frame; every later cell works on this subset only.
selected_columns = [
    'CVDCRHD4',
    'BPHIGH6',
    'TOLDHI3',
    'CHOLCHK3',
    '_RFBMI5',
    'SMOKE100',
    'CVDSTRK3',
    'DIABETE4',
    '_TOTINDA',
    '_FRTLT1A',
    '_VEGLT1A',
    '_RFDRHV7',
    'PRIMINSR',
    'MEDCOST1',
    'GENHLTH',
    '_RFHLTH',
    'PHYSHLTH',
    'DIFFWALK',
    'SEXVAR',
    '_AGEG5YR',
    'EDUCA',
    'INCOME3',
]
raw_frame = data[0]
df = raw_frame[selected_columns]

In [4]:
# Initial clean-up: discard every row with a NaN in any of the selected columns.
df = df.dropna(axis='index', how='any')

In [5]:
# Heart disease (CVDCRHD4): recode to Yes = 1 / No = 0 and discard rows with any other code.
# NOTE(review): the previous comment called this the "independent" variable; if it is
# the modelling target it should be described as the dependent variable — confirm.
cvdcrhd4_recoded = df['CVDCRHD4'].replace(to_replace=2, value=0)
df = df.assign(CVDCRHD4=cvdcrhd4_recoded).loc[cvdcrhd4_recoded.isin([0, 1])]

In [6]:
# TOLDHI3: turn code 2 into 0, then keep only the rows whose value is 0 or 1.
toldhi3_recoded = df['TOLDHI3'].replace(to_replace=2, value=0)
df = df.assign(TOLDHI3=toldhi3_recoded).loc[toldhi3_recoded.isin([0, 1])]

In [7]:
# _RFBMI5: swap the codes (1 -> 0, 2 -> 1), then keep only the rows that end up as 0 or 1.
rfbmi5_code_swap = {1: 0, 2: 1}
rfbmi5_recoded = df['_RFBMI5'].replace(rfbmi5_code_swap)
df = df.assign(_RFBMI5=rfbmi5_recoded).loc[rfbmi5_recoded.isin([0, 1])]

In [8]:
# SMOKE100: turn code 2 into 0, then keep only the rows whose value is 0 or 1.
smoke100_recoded = df['SMOKE100'].replace(to_replace=2, value=0)
df = df.assign(SMOKE100=smoke100_recoded).loc[smoke100_recoded.isin([0, 1])]

In [9]:
# CVDSTRK3: turn code 2 into 0, then keep only the rows whose value is 0 or 1.
cvdstrk3_recoded = df['CVDSTRK3'].replace(to_replace=2, value=0)
df = df.assign(CVDSTRK3=cvdstrk3_recoded).loc[cvdstrk3_recoded.isin([0, 1])]

In [10]:
# _TOTINDA: turn code 2 into 0, then keep only the rows whose value is 0 or 1.
totinda_recoded = df['_TOTINDA'].replace(to_replace=2, value=0)
df = df.assign(_TOTINDA=totinda_recoded).loc[totinda_recoded.isin([0, 1])]

In [11]:
# _FRTLT1A: turn code 2 into 0, then keep only the rows whose value is 0 or 1.
frtlt1a_recoded = df['_FRTLT1A'].replace(to_replace=2, value=0)
df = df.assign(_FRTLT1A=frtlt1a_recoded).loc[frtlt1a_recoded.isin([0, 1])]

In [12]:
# _VEGLT1A: turn code 2 into 0, then keep only the rows whose value is 0 or 1.
veglt1a_recoded = df['_VEGLT1A'].replace(to_replace=2, value=0)
df = df.assign(_VEGLT1A=veglt1a_recoded).loc[veglt1a_recoded.isin([0, 1])]

In [13]:
# _RFDRHV7 is stored the other way round (1 = No, 2 = Yes), so flip it to
# 0 = No / 1 = Yes before keeping only the rows coded 0 or 1.
rfdrhv7_code_swap = {1: 0, 2: 1}
rfdrhv7_recoded = df['_RFDRHV7'].replace(rfdrhv7_code_swap)
df = df.assign(_RFDRHV7=rfdrhv7_recoded).loc[rfdrhv7_recoded.isin([0, 1])]

In [14]:
# MEDCOST1: turn code 2 into 0, then keep only the rows whose value is 0 or 1.
medcost1_recoded = df['MEDCOST1'].replace(to_replace=2, value=0)
df = df.assign(MEDCOST1=medcost1_recoded).loc[medcost1_recoded.isin([0, 1])]

In [15]:
# _RFHLTH: turn code 2 into 0, then keep only the rows whose value is 0 or 1.
rfhlth_recoded = df['_RFHLTH'].replace(to_replace=2, value=0)
df = df.assign(_RFHLTH=rfhlth_recoded).loc[rfhlth_recoded.isin([0, 1])]

In [16]:
# PHYSHLTH: code 88 becomes 0, and rows carrying the 77 or 99 codes are discarded.
# All other values are kept unchanged.
physhlth_recoded = df['PHYSHLTH'].replace(to_replace=88, value=0)
physhlth_keep = (physhlth_recoded != 77) & (physhlth_recoded != 99)
df = df.assign(PHYSHLTH=physhlth_recoded).loc[physhlth_keep]

In [17]:
# DIFFWALK: turn code 2 into 0, then keep only the rows whose value is 0 or 1.
diffwalk_recoded = df['DIFFWALK'].replace(to_replace=2, value=0)
df = df.assign(DIFFWALK=diffwalk_recoded).loc[diffwalk_recoded.isin([0, 1])]

In [18]:
# SEXVAR: recode so that 1 = Male and 0 = Female, discarding rows with any other code.
sexvar_recoded = df['SEXVAR'].replace(to_replace=2, value=0)
df = df.assign(SEXVAR=sexvar_recoded).loc[sexvar_recoded.isin([0, 1])]

In [19]:
# One-hot encode every categorical column that has more than a Yes/No answer.
# The dtype is pinned because pandas >= 2.0 emits boolean (True/False) dummies by
# default, whereas earlier releases emitted uint8 0/1. Pinning it keeps the
# indicator columns numeric 0/1 on every pandas version.
multi_category_columns = [
    'BPHIGH6',
    'CHOLCHK3',
    'DIABETE4',
    'PRIMINSR',
    'GENHLTH',
    '_AGEG5YR',
    'EDUCA',
    'INCOME3',
]
df_dummies = pd.get_dummies(data=df, columns=multi_category_columns, dtype='uint8')

In [20]:
# Remove the dummy columns whose answer code means "Don't know" or "Refused".
# errors='ignore' makes the cell safe to re-run and tolerant of codes that no longer
# occur after the earlier row filters. A plain drop raises KeyError for any missing
# label, which is presumably why the GENHLTH entries had been commented out.
# NOTE(review): only the indicator columns are removed; rows that gave those answers
# stay in the frame with all-zero dummies for that variable — confirm this is intended.
unknown_answer_columns = [
    'BPHIGH6_7.0',
    'BPHIGH6_9.0',
    'DIABETE4_7.0',
    'DIABETE4_9.0',
    'PRIMINSR_77.0',
    'PRIMINSR_99.0',
    'GENHLTH_7.0',
    'GENHLTH_9.0',
    '_AGEG5YR_14.0',
    'EDUCA_9.0',
    'INCOME3_77.0',
    'INCOME3_99.0',
]
df_dummies = df_dummies.drop(columns=unknown_answer_columns, errors='ignore')

Now every column is encoded as 1 = Yes, 0 = No, except PHYSHLTH, which is a continuous variable: the number of bad health days, from 0 to 30.