In [1]:
import warnings
warnings.filterwarnings('ignore')

# sampling methods
from imblearn.over_sampling import SMOTE, SVMSMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.under_sampling import TomekLinks, RandomUnderSampler

import pandas as pd
import matplotlib.pyplot as plt
import pyreadstat

In [2]:
# NOTE(review): star import from the project-local `functions` module. Names used
# in later cells that are not imported explicitly in this notebook (for example
# get_preprocessed, results_df and train_test_split) presumably come from here;
# prefer explicit imports so the dependencies are visible.
from functions import *

# Selected Subset of Features from BRFSS 2021

**BRFSS 2021 Codebook:** https://www.cdc.gov/brfss/annual_data/2021/pdf/codebook21_llcp-v2-508.pdf


The **selected features** from the BRFSS 2021 dataset are:

**Response Variable / Dependent Variable:**
*   Respondents that have ever reported having coronary heart disease (CHD) or myocardial infarction (MI) --> _MICHD


**Independent Variables:**

**High Blood Pressure**
*   Adults who have been told they have high blood pressure by a doctor, nurse, or other health professional --> _RFHYPE6

**High Cholesterol**
*   Have you ever been told by a doctor, nurse or other health professional that your cholesterol is high? --> TOLDHI3
*   About how long has it been since you last had your cholesterol checked? --> CHOLCHK3

**BMI**
*   Body Mass Index (BMI) --> _BMI5

**Smoking**
*   Have you smoked at least 100 cigarettes in your entire life? [Note: 5 packs = 100 cigarettes] --> SMOKE100

**Other Chronic Health Conditions**
*   (Ever told) you had a stroke. --> CVDSTRK3
*   (Ever told) you had diabetes (If "Yes" and respondent is female, ask "Was this only when you were pregnant?". If Respondent says pre-diabetes or borderline diabetes, use response code 4.) --> DIABETE4

**Physical Activity**
*   Adults who reported doing physical activity or exercise during the past 30 days other than their regular job --> _TOTINDA

**Diet**
*   Consume Fruit 1 or more times per day --> _FRTLT1A
*   Consume Vegetables 1 or more times per day --> _VEGLT1A

**Alcohol Consumption**
*   Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week) --> _RFDRHV7

**Health Care**
*   Adults who had some form of health insurance  --> _HLTHPLN
*   Was there a time in the past 12 months when you needed to see a doctor but could not because you could not afford it? --> MEDCOST1

**Health General and Mental Health**
*   Would you say that in general your health is: --> GENHLTH
*   Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good? --> MENTHLTH
*   Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good? --> PHYSHLTH
*   Do you have serious difficulty walking or climbing stairs? --> DIFFWALK

**Demographics**
*   Calculated sex variable --> _SEX
*   Fourteen-level age category --> _AGEG5YR
*   What is the highest grade or year of school you completed? --> EDUCA
*   Is your annual household income from all sources: (If respondent refuses at any income level, code "Refused.") --> INCOME3

In [3]:
# Define the XPT file path (relative path, one directory above the working directory)
xpt_file_path = '../LLCP2021.XPT'

# Load the XPT file; the call returns the survey data frame and a metadata object
df, meta = pyreadstat.read_xport(xpt_file_path)

In [4]:
# Select the Features: the _MICHD target plus the 21 predictors listed in the
# markdown section above (22 columns in total)
data = df[['_MICHD',  # target: ever reported CHD or MI
            '_RFHYPE6',  # high blood pressure
            'TOLDHI3', 'CHOLCHK3',  # high cholesterol / last cholesterol check
            '_BMI5',  # body mass index
            'SMOKE100',  # smoked at least 100 cigarettes
            'CVDSTRK3', 'DIABETE4',  # other chronic conditions: stroke, diabetes
            '_TOTINDA',  # physical activity in the past 30 days
            '_FRTLT1A', '_VEGLT1A',  # diet: fruit / vegetables 1+ times per day
            '_RFDRHV7',  # heavy alcohol consumption
            '_HLTHPLN', 'MEDCOST1',  # health care: insurance, cost barrier
            'GENHLTH', 'MENTHLTH', 'PHYSHLTH', 'DIFFWALK',  # general, mental, physical health; difficulty walking
            '_SEX', '_AGEG5YR', 'EDUCA', 'INCOME3']]  # demographics

In [5]:
# Drop every respondent with a missing value in any of the selected columns.
# Reassign rather than calling dropna(inplace=True): `data` is a column subset
# of `df`, and mutating such a subset in place is the chained-assignment pattern
# behind pandas' SettingWithCopyWarning. Reassignment also keeps the cell
# idempotent when it is re-run.
data = data.dropna()
data.shape

(330355, 22)

In [6]:
# get_preprocessed is not defined in this notebook (presumably it comes from the
# local `functions` module). Judging by the outputs below, the returned frame has
# fewer rows than `data` and a 0.0/1.0 encoded _MICHD target.
data_cleaned = get_preprocessed(data)

In [7]:
# Check row count, dtypes and non-null counts of the preprocessed frame
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 236378 entries, 0 to 438692
Data columns (total 22 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   _MICHD    236378 non-null  float64
 1   _RFHYPE6  236378 non-null  float64
 2   TOLDHI3   236378 non-null  float64
 3   CHOLCHK3  236378 non-null  float64
 4   _BMI5     236378 non-null  float64
 5   SMOKE100  236378 non-null  float64
 6   CVDSTRK3  236378 non-null  float64
 7   DIABETE4  236378 non-null  float64
 8   _TOTINDA  236378 non-null  float64
 9   _FRTLT1A  236378 non-null  float64
 10  _VEGLT1A  236378 non-null  float64
 11  _RFDRHV7  236378 non-null  float64
 12  _HLTHPLN  236378 non-null  float64
 13  MEDCOST1  236378 non-null  float64
 14  GENHLTH   236378 non-null  float64
 15  MENTHLTH  236378 non-null  float64
 16  PHYSHLTH  236378 non-null  float64
 17  DIFFWALK  236378 non-null  float64
 18  _SEX      236378 non-null  float64
 19  _AGEG5YR  236378 non-null  float64
 20  EDUCA    

In [8]:
# Class balance of the target: the positive class (1.0) is a small minority,
# which is the motivation for the resampling experiments below
data_cleaned.groupby(['_MICHD']).size()

_MICHD
0.0    215920
1.0     20458
dtype: int64

In [9]:
# Empty accumulator table; every experiment cell below appends its metrics rows to it
results_all = pd.DataFrame(columns=['resampling','model_name', 'accuracy', 'precision', 'recall', 'f1', 'roc_auc'])

# Raw Data (No Resampling)

In [10]:
# Baseline: no resampling, models see the original class ratio.
# Separate the _MICHD target from the predictors.
y = data_cleaned['_MICHD']
X = data_cleaned.drop(columns='_MICHD')

# 30% hold-out, stratified on the target, fixed seed (same split in every experiment)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=13, stratify=y
)

# results_df is not defined in this notebook (presumably from `functions`);
# the metrics table it returns is displayed below.
result = results_df(
    'x',
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)
results_all = pd.concat([results_all, result], ignore_index=True)

display(result)


Unnamed: 0,resampling,model_name,accuracy,precision,recall,f1,roc_auc
0,x,m_lr_clf_train,0.743032,0.222187,0.787375,0.346575,0.763103
1,x,m_lr_clf_test,0.745227,0.223305,0.784422,0.347644,0.762968
2,,,,,,,
3,x,m_dt_clf_train,0.996712,0.963404,1.0,0.981361,0.9982
4,x,m_dt_clf_test,0.867642,0.233032,0.231058,0.232041,0.579505
5,,,,,,,
6,x,m_lgbm_clf_train,0.747196,0.23131,0.826828,0.361491,0.783239
7,x,m_lgbm_clf_test,0.74493,0.225176,0.797784,0.35122,0.768854
8,,,,,,,
9,x,m_xgb_clf_train,0.775196,0.257402,0.847427,0.394866,0.807889


# Undersampling

## Random Undersampling

In [11]:
# Separate the _MICHD target from the predictors.
y = data_cleaned['_MICHD']
X = data_cleaned.drop(columns='_MICHD')

# 30% hold-out, stratified on the target, fixed seed (same split in every experiment)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=13, stratify=y
)

# Resample the training fold only; the test fold is left as split
resample = RandomUnderSampler(random_state=13)
X_train, y_train = resample.fit_resample(X_train, y_train)

# results_df is not defined in this notebook (presumably from `functions`);
# the metrics table it returns is displayed below.
result = results_df(
    'Random Undersampling',
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)
results_all = pd.concat([results_all, result], ignore_index=True)

display(result)

Unnamed: 0,resampling,model_name,accuracy,precision,recall,f1,roc_auc
0,Random Undersampling,m_lr_clf_train,0.766008,0.754357,0.788911,0.771247,0.766008
1,Random Undersampling,m_lr_clf_test,0.744141,0.222689,0.785563,0.347009,0.76289
2,,,,,,,
3,Random Undersampling,m_dt_clf_train,0.998394,0.99993,0.996858,0.998391,0.998394
4,Random Undersampling,m_dt_clf_test,0.67737,0.164031,0.665961,0.263227,0.672206
5,,,,,,,
6,Random Undersampling,m_lgbm_clf_train,0.675372,0.607517,0.990922,0.753238,0.675372
7,Random Undersampling,m_lgbm_clf_test,0.404631,0.1248,0.977839,0.221349,0.664082
8,,,,,,,
9,Random Undersampling,m_xgb_clf_train,0.709517,0.63335,0.995112,0.774048,0.709517


## Tomek Links

In [12]:
# Separate the _MICHD target from the predictors.
y = data_cleaned['_MICHD']
X = data_cleaned.drop(columns='_MICHD')

# 30% hold-out, stratified on the target, fixed seed (same split in every experiment)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=13, stratify=y
)

# Resample the training fold only; TomekLinks is built with its default arguments
resample = TomekLinks()
X_train, y_train = resample.fit_resample(X_train, y_train)

# results_df is not defined in this notebook (presumably from `functions`);
# the metrics table it returns is displayed below.
result = results_df(
    'Tomek Links',
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)
results_all = pd.concat([results_all, result], ignore_index=True)

display(result)

Unnamed: 0,resampling,model_name,accuracy,precision,recall,f1,roc_auc
0,Tomek Links,m_lr_clf_train,0.752836,0.235048,0.793939,0.362714,0.77139
1,Tomek Links,m_lr_clf_test,0.744451,0.223708,0.790614,0.348739,0.765346
2,,,,,,,
3,Tomek Links,m_dt_clf_train,0.996635,0.963404,1.0,0.981361,0.998154
4,Tomek Links,m_dt_clf_test,0.863045,0.227889,0.24393,0.235637,0.582815
5,,,,,,,
6,Tomek Links,m_lgbm_clf_train,0.754711,0.24234,0.831785,0.375329,0.789502
7,Tomek Links,m_lgbm_clf_test,0.740094,0.222808,0.805117,0.349027,0.769525
8,,,,,,,
9,Tomek Links,m_xgb_clf_train,0.782202,0.269164,0.85029,0.408892,0.812937


# Oversampling

## SMOTE

In [13]:
# Separate the _MICHD target from the predictors.
y = data_cleaned['_MICHD']
X = data_cleaned.drop(columns='_MICHD')

# 30% hold-out, stratified on the target, fixed seed (same split in every experiment)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=13, stratify=y
)

# Resample the training fold only; the test fold is left as split
resample = SMOTE(random_state=13)
X_train, y_train = resample.fit_resample(X_train, y_train)

# results_df is not defined in this notebook (presumably from `functions`);
# the metrics table it returns is displayed below.
result = results_df(
    'SMOTE',
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)
results_all = pd.concat([results_all, result], ignore_index=True)

display(result)

Unnamed: 0,resampling,model_name,accuracy,precision,recall,f1,roc_auc
0,SMOTE,m_lr_clf_train,0.776437,0.76203,0.803927,0.782418,0.776437
1,SMOTE,m_lr_clf_test,0.753603,0.227421,0.770572,0.351194,0.761284
2,,,,,,,
3,SMOTE,m_dt_clf_train,0.998697,0.999907,0.997486,0.998695,0.998697
4,SMOTE,m_dt_clf_test,0.85763,0.227904,0.270165,0.247241,0.591726
5,,,,,,,
6,SMOTE,m_lgbm_clf_train,0.847052,0.772204,0.984538,0.865539,0.847052
7,SMOTE,m_lgbm_clf_test,0.721014,0.21319,0.826462,0.338947,0.768743
8,,,,,,,
9,SMOTE,m_xgb_clf_train,0.863388,0.792372,0.984836,0.878182,0.863388


## SVMSMOTE

In [14]:
# Separate the _MICHD target from the predictors.
y = data_cleaned['_MICHD']
X = data_cleaned.drop(columns='_MICHD')

# 30% hold-out, stratified on the target, fixed seed (same split in every experiment)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=13, stratify=y
)

# Resample the training fold only; the test fold is left as split.
# SVMSMOTE lives in imblearn.over_sampling and must be imported explicitly
# rather than relied upon through a star import.
resample = SVMSMOTE(random_state=13)
X_train, y_train = resample.fit_resample(X_train, y_train)

# results_df is not defined in this notebook (presumably from `functions`);
# the metrics table it returns is displayed below.
result = results_df(
    'SVMSMOTE',
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)
results_all = pd.concat([results_all, result], ignore_index=True)

display(result)

Unnamed: 0,resampling,model_name,accuracy,precision,recall,f1,roc_auc
0,SVMSMOTE,m_lr_clf_train,0.850321,0.841456,0.863302,0.852239,0.850321
1,SVMSMOTE,m_lr_clf_test,0.821079,0.269898,0.626039,0.377184,0.732798
2,,,,,,,
3,SVMSMOTE,m_dt_clf_train,0.9987,0.999907,0.997492,0.998698,0.9987
4,SVMSMOTE,m_dt_clf_test,0.856432,0.223468,0.266254,0.242992,0.5893
5,,,,,,,
6,SVMSMOTE,m_lgbm_clf_train,0.854806,0.782645,0.98246,0.871243,0.854806
7,SVMSMOTE,m_lgbm_clf_test,0.736145,0.22003,0.805117,0.345609,0.767364
8,,,,,,,
9,SVMSMOTE,m_xgb_clf_train,0.857721,0.786138,0.982804,0.873539,0.857721


## Borderline-SMOTE

In [15]:
# Separate the _MICHD target from the predictors.
y = data_cleaned['_MICHD']
X = data_cleaned.drop(columns='_MICHD')

# 30% hold-out, stratified on the target, fixed seed (same split in every experiment)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=13, stratify=y
)

# Resample the training fold only; the test fold is left as split
resample = BorderlineSMOTE(random_state=13)
X_train, y_train = resample.fit_resample(X_train, y_train)

# results_df is not defined in this notebook (presumably from `functions`);
# the metrics table it returns is displayed below.
result = results_df(
    'BorderlineSMOTE',
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)
results_all = pd.concat([results_all, result], ignore_index=True)

display(result)

Unnamed: 0,resampling,model_name,accuracy,precision,recall,f1,roc_auc
0,BorderlineSMOTE,m_lr_clf_train,0.805628,0.785927,0.840079,0.812101,0.805628
1,BorderlineSMOTE,m_lr_clf_test,0.772372,0.237526,0.737657,0.359343,0.756659
2,,,,,,,
3,BorderlineSMOTE,m_dt_clf_train,0.998697,0.999881,0.997512,0.998695,0.998697
4,BorderlineSMOTE,m_dt_clf_test,0.856671,0.221777,0.261528,0.240018,0.587292
5,,,,,,,
6,BorderlineSMOTE,m_lgbm_clf_train,0.848332,0.773912,0.984181,0.866472,0.848332
7,BorderlineSMOTE,m_lgbm_clf_test,0.724074,0.21462,0.822878,0.340446,0.768796
8,,,,,,,
9,BorderlineSMOTE,m_xgb_clf_train,0.862415,0.791208,0.984677,0.877404,0.862415


## ADASYN

In [16]:
# Separate the _MICHD target from the predictors.
y = data_cleaned['_MICHD']
X = data_cleaned.drop(columns='_MICHD')

# 30% hold-out, stratified on the target, fixed seed (same split in every experiment)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=13, stratify=y
)

# Resample the training fold only; the test fold is left as split
resample = ADASYN(random_state=13)
X_train, y_train = resample.fit_resample(X_train, y_train)

# results_df is not defined in this notebook (presumably from `functions`);
# the metrics table it returns is displayed below.
result = results_df(
    'ADASYN',
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)
results_all = pd.concat([results_all, result], ignore_index=True)

display(result)

Unnamed: 0,resampling,model_name,accuracy,precision,recall,f1,roc_auc
0,ADASYN,m_lr_clf_train,0.762445,0.75095,0.791022,0.770465,0.762213
1,ADASYN,m_lr_clf_test,0.739826,0.219688,0.786215,0.343416,0.760823
2,,,,,,,
3,ADASYN,m_dt_clf_train,0.998707,0.999896,0.997539,0.998716,0.998717
4,ADASYN,m_dt_clf_test,0.859999,0.230331,0.26381,0.245937,0.590146
5,,,,,,,
6,ADASYN,m_lgbm_clf_train,0.847069,0.773708,0.984531,0.86648,0.845954
7,ADASYN,m_lgbm_clf_test,0.72014,0.212925,0.828418,0.338775,0.76915
8,,,,,,,
9,ADASYN,m_xgb_clf_train,0.863703,0.793908,0.985377,0.87934,0.862716


# Over + Undersampling

## SMOTETomek

In [17]:
# Separate the _MICHD target from the predictors.
y = data_cleaned['_MICHD']
X = data_cleaned.drop(columns='_MICHD')

# 30% hold-out, stratified on the target, fixed seed (same split in every experiment)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=13, stratify=y
)

# Resample the training fold only; the test fold is left as split
resample = SMOTETomek(random_state=13)
X_train, y_train = resample.fit_resample(X_train, y_train)

# results_df is not defined in this notebook (presumably from `functions`);
# the metrics table it returns is displayed below.
result = results_df(
    'SMOTETomek',
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)
results_all = pd.concat([results_all, result], ignore_index=True)

display(result)

Unnamed: 0,resampling,model_name,accuracy,precision,recall,f1,roc_auc
0,SMOTETomek,m_lr_clf_train,0.776906,0.7618,0.805757,0.783162,0.776906
1,SMOTETomek,m_lr_clf_test,0.752221,0.226669,0.772527,0.350497,0.761412
2,,,,,,,
3,SMOTETomek,m_dt_clf_train,0.998696,0.999907,0.997485,0.998695,0.998696
4,SMOTETomek,m_dt_clf_test,0.85615,0.221796,0.263973,0.241053,0.588113
5,,,,,,,
6,SMOTETomek,m_lgbm_clf_train,0.847203,0.772536,0.984188,0.865612,0.847203
7,SMOTETomek,m_lgbm_clf_test,0.72203,0.213644,0.825159,0.33941,0.768709
8,,,,,,,
9,SMOTETomek,m_xgb_clf_train,0.862439,0.791197,0.984764,0.877432,0.862439


## SMOTEENN

In [18]:
# Separate the _MICHD target from the predictors.
y = data_cleaned['_MICHD']
X = data_cleaned.drop(columns='_MICHD')

# 30% hold-out, stratified on the target, fixed seed (same split in every experiment)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=13, stratify=y
)

# Resample the training fold only; the test fold is left as split
resample = SMOTEENN(random_state=13)
X_train, y_train = resample.fit_resample(X_train, y_train)

# results_df is not defined in this notebook (presumably from `functions`);
# the metrics table it returns is displayed below.
result = results_df(
    'SMOTEENN',
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)
results_all = pd.concat([results_all, result], ignore_index=True)

display(result)

Unnamed: 0,resampling,model_name,accuracy,precision,recall,f1,roc_auc
0,SMOTEENN,m_lr_clf_train,0.853699,0.887176,0.862608,0.87472,0.851687
1,SMOTEENN,m_lr_clf_test,0.70288,0.203086,0.832166,0.326493,0.761398
2,,,,,,,
3,SMOTEENN,m_dt_clf_train,1.0,1.0,1.0,1.0,1.0
4,SMOTEENN,m_dt_clf_test,0.803311,0.222759,0.511325,0.310324,0.671149
5,,,,,,,
6,SMOTEENN,m_lgbm_clf_train,0.911131,0.875379,0.990987,0.929602,0.893102
7,SMOTEENN,m_lgbm_clf_test,0.651197,0.18478,0.888219,0.305918,0.75848
8,,,,,,,
9,SMOTEENN,m_xgb_clf_train,0.924518,0.892413,0.992126,0.939631,0.909254


In [19]:
# Save all results to a CSV file in the notebook's working directory.
# Called with default arguments, so the row index is written as the first column.

results_all.to_csv('./BRFSS2021_results.csv')