# BDA Final - ML for predicting heart diseases / attacks 

The goal of this machine learning paper is to reveal hidden / non-obvious features contributing to heart diseases / attacks.

In [None]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy import stats

In [None]:
# Show every column when a wide CDC frame is displayed
pd.options.display.max_columns = 330

# Raw CDC survey export and the cleaned Kaggle variant, both read from the Datasets folder
df_cdc = pd.read_csv('Datasets/BRFSS_2015.csv')
df_kaggle = pd.read_csv('Datasets/heart_disease_indicators(KaeggleImport).csv')

In [None]:
# Compare Kaggle's columns with the columns of the original CDC dataset

# Iterating a DataFrame yields its column labels, so the labels can be listed directly
column_names_cdc = list(df_cdc.columns)
column_names_kaggle = list(df_kaggle.columns)

print(column_names_cdc)
print('\n')
print(column_names_kaggle)

In [None]:
# Compare the (rows, columns) shapes of both datasets, one per output line
print('Shape of CDC dataset: {}'.format(df_cdc.shape),
      'Shape of Kaggle dataset: {}'.format(df_kaggle.shape),
      sep='\n')

### Removing all the columns that contain more than 5% of NaN values and/or are irrelevant for our ML

In [None]:
def remove_columns(df_name, redundant_columns=None):
    """Drop the redundant survey columns from ``df_name`` in place.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Frame to clean. It is modified in place, so existing callers that
        ignore the return value keep working.
    redundant_columns : iterable of str, optional
        Column labels to drop. When omitted, the notebook's hand-picked list
        of columns below is used.

    Returns
    -------
    pandas.DataFrame
        The same (now reduced) frame that was passed in. The previous version
        assigned the result of ``drop(..., inplace=True)`` -- which is always
        ``None`` -- and therefore returned ``None``.

    Raises
    ------
    KeyError
        If one of the labels to drop is not a column of ``df_name``.
    """
    if redundant_columns is None:
        redundant_columns = ['FMONTH','IDATE','IMONTH','IDAY','IYEAR','DISPCODE','SEQNO','_PSU','CTELENUM',
                             'PVTRESD1','COLGHOUS','STATERES','CELLFON3','LADULT','NUMADULT','NUMMEN','NUMWOMEN',
                             'CTELNUM1','CELLFON2','CADULT','PVTRESD2','CCLGHOUS','CSTATE','LANDLINE','HHADULT',
                             'POORHLTH','BPMEDS','ASTHNOW','DIABAGE2','NUMHHOL2','NUMPHON2','CPDEMO1','PREGNANT',
                             'SMOKDAY2','STOPSMK2','LASTSMK2','AVEDRNK2','DRNK3GE5','MAXDRNKS','FRUITJU1','FRUIT1',
                             'FVBEANS','FVGREEN','FVORANG','VEGETAB1','EXERANY2','EXRACT11','EXEROFT1','EXERHMM1',
                             'EXRACT21','EXEROFT2','EXERHMM2','STRENGTH','LMTJOIN3','ARTHDIS2','ARTHSOCL','JOINPAIN',
                             'SEATBELT','FLUSHOT6','FLSHTMY2','IMFVPLAC','PNEUVAC3','HIVTST6','HIVTSTD3','WHRTST10',
                             'PDIABTST','PREDIAB1','INSULIN','BLDSUGAR','FEETCHK2','DOCTDIAB','CHKHEMO3','FEETCHK',
                             'EYEEXAM','DIABEYE','DIABEDU','PAINACT2','QLMENTL2','QLSTRES2','QLHLTH2','CAREGIV1',
                             'CRGVREL1','CRGVLNG1','CRGVHRS1','CRGVPRB1','CRGVPERS','CRGVHOUS','CRGVMST2','CRGVEXPT',
                             'VIDFCLT2','VIREDIF3','VIREDIF3','VINOCRE2','VIEYEXM2','VIINSUR2','VICTRCT4','VIGLUMA2',
                             'VIMACDG2','CIMEMLOS','CDHOUSE','CDASSIST','CDHELP','CDSOCIAL','CDDISCUS','WTCHSALT',
                             'LONGWTCH','DRADVISE','ASTHMAGE','ASATTACK','ASERVIST','ASDRVIST','ASRCHKUP','ASACTLIM',
                             'ASRCHKUP','ASACTLIM','ASYMPTOM','ASNOSLEP','ASTHMED3','ASINHALR','HAREHAB1','STREHAB1',
                             'CVDASPRN','ASPUNSAF','RLIVPAIN','RDUCHART','RDUCSTRK','ARTTODAY','ARTHWGT','ARTHEXER',
                             'ARTHEDU','TETANUS', 'HPVADVC2','HPVADSHT','SHINGLE2','HADMAM','HOWLONG','HADPAP2',
                             'LASTPAP2','HPVTEST','HPLSTTST','HADHYST2','PROFEXAM','LENGEXAM','BLDSTOOL','LSTBLDS3',
                             'HADSIGM3','HADSGCO1','LASTSIG3','PCPSAAD2','PCPSADI1','PCPSARE1','PSATEST1','PSATIME',
                             'PCPSARS1','PCDMDECN','SCNTMNY1','SCNTMEL1','SCNTPAID','SCNTWRK1','SCNTLPAD','SCNTLWK1',
                             'RCSGENDR','RCSRLTN2','CASTHDX2','CASTHNO2','EMTSUPRT','LSATISFY','ADPLEASR','ADDOWN',
                             'ADSLEEP','ADENERGY','ADEAT1','ADFAIL','ADTHINK','ADMOVE','MISTMNT','ADANXEV','EXACTOT1',
                             'EXACTOT2','_STSTR','_STRWT','_RAWRAKE','_WT2RAKE','_CRACE1','_CPRACE','_CLLCPWT','_DUALCOR',
                             '_CLLCPWT','PADUR2_','METVL11_','METVL21_','ACTIN11_','ACTIN21_','PADUR1_','PAFREQ1_',
                             'PAFREQ2_','_MINAC11','_MINAC21','PAMIN11_','PAMIN21_','PA1MIN_','PAVIG11_','PAVIG21_',
                             'PA1VIGM_', 'VIPRFVS2', '_FLSHOT6', '_PNEUMO2', 'PCPSADE1', '_LLCPWT', 'SXORIENT',
                             'TRNSGNDR', '_CHISPNC']

    # The hand-maintained list repeats a few labels; keep the first occurrence of each
    # (dict.fromkeys preserves order) so every column is requested exactly once.
    unique_columns = list(dict.fromkeys(redundant_columns))

    # In-place drop returns None, so the frame itself is returned explicitly.
    df_name.drop(columns=unique_columns, inplace=True)
    return df_name

# Drop the redundant columns from df_cdc; the call modifies the frame in place
remove_columns(df_cdc)

In [None]:
# Count the NaN values per column and print the complete, untruncated listing
nan_counts = df_cdc.isnull().sum()
print(nan_counts.to_string(max_rows=None))

In [None]:
# MSCODE stood out with 190k NaN values, but the CodeBook tells us that in this case NaN
# stands for something rather than "no answer", so it is recoded to 7 instead of dropped.
# The column is assigned back explicitly: calling fillna(inplace=True) on a selected column
# is a chained in-place operation that is deprecated and does not update the frame when
# pandas Copy-on-Write is active.
df_cdc['MSCODE'] = df_cdc['MSCODE'].fillna(7)

In [None]:
# Remove every row that still has at least one NaN value (in place)
df_cdc.dropna(axis=0, how='any', inplace=True)

In [None]:
# Print the (rows, columns) shape of the cleaned frame, then show the frame as the cell output
print(df_cdc.shape)
df_cdc

In [None]:
# For simplicity the cleaned df_cdc is saved to a separate file.
# The path (folder and capitalisation) matches the one the "Working with the data" section
# reads back, so a fresh Restart & Run All finds the file it has just written.
df_cdc.to_csv('Datasets/NaNCleanFinal.csv', sep=",", index=False)

## Working with the data

In [None]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy import stats

In [None]:
# Display up to 125 columns, then load the NaN-cleaned dataset
pd.options.display.max_columns = 125
df = pd.read_csv('Datasets/NaNCleanFinal.csv')

## Visualizations to better understand the data we are dealing with

### Age distribution of the participants in the survey

In [None]:
# Age-group labels in code order: code 1 is the first label, code 14 the last
age_labels = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
              '55-59', '60-64', '65-69', '70-74', '75-79', '80+', 'Not known']

# Dictionary that maps the numeric codes (1..14) to their labels
age_map = dict(enumerate(age_labels, start=1))

# Replace the codes in the '_AGEG5YR' column with the readable labels
df['_AGEG5YR'] = df['_AGEG5YR'].map(age_map)

In [None]:
# Number of rows per age-group label
df['_AGEG5YR'].value_counts()

In [None]:
# Histogram settings: bar heights as percentages, one slightly narrowed bar per distinct value
hist_options = dict(stat='percent', discrete=True, shrink=.8)

# Create the figure first, then draw the histogram onto its axes via ax=ax
fig, ax = plt.subplots(figsize=(15, 8), dpi=100)
p = sns.histplot(df['_AGEG5YR'], ax=ax, **hist_options)

In [None]:
# Count the instances of each age group; value_counts() sorts them by frequency (descending)
height = df['_AGEG5YR'].value_counts()

# Take the bar labels from the counts themselves. A hard-coded label tuple only matches the
# bars while the data happens to keep the assumed frequency order and number of groups.
bars = tuple(height.index)
x_pos = np.arange(len(bars))

fig, ax = plt.subplots(figsize=(15, 10), dpi=100)

# Plot on the axes created above and select the colours
ax.bar(x_pos, height, width=.8, color='#1f77b4', edgecolor='#000000')

ax.set_title('Distribution of Age groups')
ax.set_xlabel('Age')
ax.set_ylabel('Instances')

# Create names on the x axis
ax.set_xticks(x_pos)
ax.set_xticklabels(bars)

plt.show()

### Race distribution of the participants of the survey 

In [None]:
# Replace the numeric codes of the '_RACEGR3' column with readable race labels.
# Dictionary that maps each code to its label:
race_map = {
    1: 'White',
    2: 'Black',
    3: 'Other race',
    4: 'Multiracial',
    5: 'Hispanic',
    9: "Don't know/Refused",
}

# Codes without an entry in race_map become NaN
race_labels = df['_RACEGR3'].map(race_map)
df['_RACEGR3'] = race_labels


In [None]:
# Number of rows per race label
df['_RACEGR3'].value_counts()

In [None]:
# Histogram settings: bar heights as percentages, one slightly narrowed bar per distinct value
hist_options = dict(stat='percent', discrete=True, shrink=.8)

# Create the figure first, then draw the histogram onto its axes via ax=ax
fig, ax = plt.subplots(figsize=(15, 8), dpi=100)
p = sns.histplot(df['_RACEGR3'], ax=ax, **hist_options)

In [None]:
# Count the instances of each race; value_counts() sorts them by frequency (descending)
height = df['_RACEGR3'].value_counts()

# Take the bar labels from the counts themselves. The previous hard-coded tuple listed five
# labels although six codes are mapped for this column, and it assumed a fixed frequency
# order, so bars could be mislabelled or the plot could fail on a length mismatch.
bars = tuple(height.index)
x_pos = np.arange(len(bars))

fig, ax = plt.subplots(figsize=(15, 10), dpi=100)

# Plot on the axes created above and select the colours
ax.bar(x_pos, height, width=.8, color='#1f77b4', edgecolor='#000000')

ax.set_title('Distribution of race')
ax.set_xlabel('Race')
ax.set_ylabel('Instances')

# Create names on the x axis
ax.set_xticks(x_pos)
ax.set_xticklabels(bars)

plt.show()

### Creating a sub-dataset for the visualization of percentage of heart attacks/diseases per state

In [None]:
# Create the subset as an independent copy: assigning into a plain slice of df raises
# SettingWithCopyWarning and leaves it ambiguous whether df itself is modified.
df_map = df[['_STATE', '_MICHD']].copy()

# Bring the target value to a binary scale of '0' and '1' (code 2 becomes 0)
df_map['_MICHD'] = df_map['_MICHD'].replace({2: 0})
df_map._MICHD.unique()

In [None]:
# Sum the binary target per state; the result is indexed by '_STATE'
df_map_usa = df_map.groupby(by=['_STATE']).sum()

# Survey participants per state serve as the population (aligned on the state index)
df_map_usa['Population'] = df_map['_STATE'].value_counts()

# Share of heart attack / disease cases per population, as a percentage with two decimals
case_share = df_map_usa['_MICHD'] / df_map_usa['Population']
df_map_usa['Percentage'] = (case_share * 100).round(2)

In [None]:
# Dictionary that maps the numeric state codes to state names
state_map = {
    1: 'Alabama', 2: 'Alaska', 4: 'Arizona', 5: 'Arkansas',
    6: 'California', 8: 'Colorado', 9: 'Connecticut', 10: 'Delaware',
    11: 'District of Columbia', 12: 'Florida', 13: 'Georgia', 15: 'Hawaii',
    16: 'Idaho', 17: 'Illinois', 18: 'Indiana', 19: 'Iowa',
    20: 'Kansas', 21: 'Kentucky', 22: 'Louisiana', 23: 'Maine',
    24: 'Maryland', 25: 'Massachusetts', 26: 'Michigan', 27: 'Minnesota',
    28: 'Mississippi', 29: 'Missouri', 30: 'Montana', 31: 'Nebraska',
    32: 'Nevada', 33: 'New Hampshire', 34: 'New Jersey', 35: 'New Mexico',
    36: 'New York', 37: 'North Carolina', 38: 'North Dakota', 39: 'Ohio',
    40: 'Oklahoma', 41: 'Oregon', 42: 'Pennsylvania', 44: 'Rhode Island',
    45: 'South Carolina', 46: 'South Dakota', 47: 'Tennessee', 48: 'Texas',
    49: 'Utah', 50: 'Vermont', 51: 'Virginia', 53: 'Washington',
    54: 'West Virginia', 55: 'Wisconsin', 56: 'Wyoming',
    66: 'Guam', 72: 'Puerto Rico',
}

# Turn the '_STATE' index back into a regular column ...
df_map_usa = df_map_usa.reset_index()

# ... and replace its codes with the state names
df_map_usa = df_map_usa.assign(_STATE=df_map_usa['_STATE'].map(state_map))
df_map_usa.head()

In [None]:
# Export the per-state table as a comma-separated file (the default separator) for Tableau
df_map_usa.to_csv('MapUSAFinal.csv', index=False)

### Correlation matrix (General overview of potential correlations)

In [None]:
# TODO: placeholder -- Bennet's correlation-matrix code still has to be added here

### Per-class feature histograms 

Plotting the distribution of all features in their respective class (heart attack/disease: yes/no)

In [None]:
# Divide the rows by the target variable: _MICHD == 1 is plotted as 'yes', _MICHD == 2 as 'no'
yes_data = df[df['_MICHD'] == 1]
no_data = df[df['_MICHD'] == 2]

# Every column except the target is plotted
features = [name for name in df.columns if name != '_MICHD']

# Grid layout: three plots per row, as many rows as needed
n_cols = 3
n_rows = math.ceil(len(features) / n_cols)

# One figure that holds all subplots
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, n_rows * 5))
fig.subplots_adjust(hspace=0.5)

# (rows, colour, legend label) of each class, in drawing order
class_layers = ((yes_data, '#1f77b4', 'yes'),
                (no_data, '#ff7f0e', 'no'))

# Pair each feature with one subplot and overlay the two class densities on it
for feature, ax in zip(features, axes.ravel()):
    for class_rows, colour, class_label in class_layers:
        sns.histplot(class_rows[feature], color=colour, label=class_label,
                     alpha=0.5, stat='density', ax=ax)
    ax.legend(title='Heart Disease')
    ax.set_title(f'Distribution of {feature} by heart disease')
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')

# Save the figure to disk and close it to free up memory
fig.savefig('Per-classHistALL.png', dpi=300, bbox_inches='tight')
plt.close(fig)

After carefully inspecting the per-class histograms for all 121 features, we have selected a few interesting ones to be displayed in greater detail.
To do so, we need to change the scales of some of the features and remove some of the outliers.

In [None]:
# Columns picked for the detailed plots, with the target '_MICHD' listed last
selected_columns = ['WTKG3', '_BMI5', '_AGEG5YR', '_VEGESUM', 'MAXVO2_', '_FRUTSUM', '_MICHD']
new_features = df[selected_columns]

In [None]:
# Change the scale of WTKG3, _BMI5, _VEGESUM, MAXVO2_ and _FRUTSUM by dividing them by 100
for column in ('WTKG3', '_BMI5', '_VEGESUM', 'MAXVO2_', '_FRUTSUM'):
    df[column] = df[column] / 100

# On the new MAXVO2_ scale the value 999 is replaced by 0
# NOTE(review): 999 is presumably the rescaled missing-value code -- confirm against the CodeBook
df['MAXVO2_'] = df['MAXVO2_'].replace({999: 0})

In [None]:
# Remove values that lie 3 or more standard deviations from their column mean.
# The rows are kept; only the outlying cells are blanked out (set to NaN).
for column in ('_BMI5', '_VEGESUM', '_FRUTSUM', 'WTKG3'):
    within_three_std = np.abs(stats.zscore(df[column])) < 3
    df[column] = df[column].where(within_three_std)

In [None]:
from matplotlib.patches import Patch

# Re-split the classes from the current df. The yes_data / no_data frames built for the
# all-feature overview are copies taken before the rescaling and outlier removal, so
# reusing them would plot the untouched raw values.
yes_data = df[df['_MICHD'] == 1]
no_data = df[df['_MICHD'] == 2]

# Redefine the features: the selected columns without the target
features = [feature for feature in new_features if feature != '_MICHD']

# Calculate the number of rows and columns for subplots
n_cols = 3
n_rows = math.ceil(len(features) / n_cols)

# Create a single figure with subplots
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, n_rows * 5))
fig.subplots_adjust(hspace=0.5)

# Iterate through features and axes to create subplots
for feature, ax in zip(features, axes.flatten()):
    sns.histplot(yes_data[feature], color='#1f77b4', label='yes', alpha=0.5, stat='density', ax=ax)
    sns.histplot(no_data[feature], color='#ff7f0e', label='no', alpha=0.5, stat='density', ax=ax)
    ax.set_title(f'Distribution of {feature} by heart disease')
    ax.set_xlabel(feature)

# Create a custom legend for the entire figure
legend_elements = [Patch(facecolor='#1f77b4', alpha=0.5, label='Yes'),
                   Patch(facecolor='#ff7f0e', alpha=0.5, label='No')]

# Adjust the location and font size of the legend
fig.legend(handles=legend_elements, title='Heart Disease', fontsize='large')

# Save the figure to disk and close it to free up memory
fig.savefig('Per-classHist.png', dpi=300, bbox_inches='tight')
plt.close(fig)

In [None]:
from matplotlib.patches import Patch

# Split the classes from the current df so the rescaled / outlier-filtered values are plotted
yes_data = df[df['_MICHD'] == 1]
no_data = df[df['_MICHD'] == 2]

# Start from a fresh figure. Drawing onto the axes of the density figure would stack the
# count histograms on top of the density histograms (and add a second legend), because
# closing a figure does not clear what was already drawn on it.
n_cols = 3
n_rows = math.ceil(len(features) / n_cols)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, n_rows * 5))
fig.subplots_adjust(hspace=0.5)

# Reiterate through features and axes to create subplots but now for instances instead of density
for feature, ax in zip(features, axes.flatten()):
    sns.histplot(yes_data[feature], color='#1f77b4', label='yes', alpha=0.5, ax=ax)
    sns.histplot(no_data[feature], color='#ff7f0e', label='no', alpha=0.5, ax=ax)
    ax.set_title(f'Distribution of {feature} by heart disease')
    ax.set_xlabel(feature)
    ax.set_ylabel('Instances')

# Create a custom legend for the entire figure
legend_elements = [Patch(facecolor='#1f77b4', alpha=0.5, label='Yes'),
                   Patch(facecolor='#ff7f0e', alpha=0.5, label='No')]

# Adjust the location and font size of the legend
fig.legend(handles=legend_elements, title='Heart Disease', fontsize='large')

# Save the figure to disk and close it to free up memory
fig.savefig('Per-classHist(Instances).png', dpi=300, bbox_inches='tight')
plt.close(fig)

## Appendix:


### Color palette

In [None]:
# Hex codes of the palette, kept here for reference:
#['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# The palette object is the last expression of the cell, so it is shown as the cell output
sns.color_palette() # color palette 

In [None]:
# Print the colours of the current seaborn palette as hex strings
palette_hex_codes = sns.color_palette().as_hex()
print(palette_hex_codes)