In [None]:
'''
attribute documentation: 
3 age: age in years 
4 sex: sex (1 = male; 0 = female) 
9 cp: chest pain type -- Value 1: typical angina -- Value 2: atypical angina -- Value 3: non-anginal pain -- Value 4: asymptomatic 
10 trestbps: resting blood pressure (in mm Hg on admission to the hospital) 
12 chol: serum cholesterol in mg/dl 
16 fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
19 restecg: resting electrocardiographic results -- Value 0: normal -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 
32 thalach: maximum heart rate achieved 
38 exang: exercise induced angina (1 = yes; 0 = no) 
40 oldpeak = ST depression induced by exercise relative to rest 
41 slope: the slope of the peak exercise ST segment -- Value 1: upsloping -- Value 2: flat -- Value 3: downsloping 
44 ca: number of major vessels (0-3) colored by fluoroscopy 
51 thal: 3 = normal; 6 = fixed defect; 7 = reversible defect 
58 num: diagnosis of heart disease (angiographic disease status) -- Value 0: < 50% diameter narrowing -- Value 1: > 50% diameter narrowing (in any major vessel: attributes 59 through 68 are vessels) 
'''

In [None]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
#Read the heart-disease workbook into the raw (never modified) frame
heart_data_raw = pd.read_excel(io="heart.xlsx")
#Preview the first five rows
heart_data_raw.head(5)

In [None]:
#Missing-value check: one boolean per column, True when the column holds any NaN
#(author's note from the original run: no column had missing values)
print('Check for NaNs')
columns_with_nan = heart_data_raw.isna().any()
print(columns_with_nan)

#Skewness of every column
#(author's note from the original run: all values fell within +-2, treated as acceptable)
print('\n Check Skew')
column_skew = heart_data_raw.skew()
print(column_skew)

#Descriptive statistics of the untouched data
#(author's note from the original run: 303 observations per variable, consistent with no NaNs)
print('\n Summary Statistics')
heart_data_raw.describe()

In [None]:
#Check for outliers on continuous variables
#An outlier is any value more than 1.5x the IQR below Q1 or above Q3
#Detected outliers are replaced with NaN; the observations holding them are dropped afterwards
def detect_outlier(data):
    """Return a copy of ``data`` with IQR outliers replaced by NaN.

    A value is an outlier when it is strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th
    percentiles of ``data`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to screen. Any index is accepted.

    Returns
    -------
    pandas.Series
        New Series with the same index as ``data``; outliers (and values that
        were already NaN) are NaN. The input Series is left unmodified.
    """
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1

    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR

    # Vectorised mask instead of a positional loop: no dependence on a 0..n-1
    # index and no element-wise writes into the caller's Series.
    within_fences = (data >= lower_fence) & (data <= upper_fence)
    return data.where(within_fences)

#Start from a copy so the raw frame is preserved
heart_data_no_outliers = heart_data_raw.copy()

#Screen each continuous column; detect_outlier marks outliers as NaN
for column_name in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']:
    heart_data_no_outliers[column_name] = detect_outlier(heart_data_no_outliers[column_name])

#Discard every observation that holds a NaN, then renumber the rows from 0
heart_data_no_outliers = heart_data_no_outliers.dropna().reset_index(drop=True)

#Repeat the validation on the filtered data
#Missing-value check (none can remain after dropna)
print('Check for NaNs')
columns_with_nan = heart_data_no_outliers.isna().any()
print(columns_with_nan)

#Skewness of every column
#(author's note from the original run: all values fell within +-2, treated as acceptable)
print('\n Check Skew')
column_skew = heart_data_no_outliers.skew()
print(column_skew)

#Descriptive statistics once outliers are gone
print('Summary Statistics')
heart_data_no_outliers.describe()

In [None]:
#Separate into data frames, one with categorical and one with numerical variables
#Both keep 'target' so each frame can still be related to the disease result.
#.copy() makes each subset an independent frame, so later edits to it can never
#touch (or raise chained-assignment warnings about) heart_data_no_outliers.
#numeric
data_numeric = heart_data_no_outliers[['target','age','trestbps','chol','thalach','oldpeak']].copy()
#categoric (variable name spelling kept as-is in case other cells refer to it)
data_categiorical = heart_data_no_outliers[['target','sex','cp','fbs','restecg','exang','slope','ca','thal']].copy()

#Summary Statistics grouped by presence of heart disease
#1 indicates heart disease, 0 indicates no heart disease
heart_data_grouped = heart_data_no_outliers.groupby('target')
#String names select pandas' own grouped mean / sample standard deviation, which is
#what np.mean / np.std resolve to here; recent pandas warns on the callable form.
heart_data_grouped.aggregate(['mean', 'std'])

In [None]:
#Histogram for spread of data (one panel per column)
#A larger canvas keeps the panels legible; tight_layout stops titles and tick
#labels of neighbouring panels from overlapping; plt.show() renders the figure
#without echoing the array-of-Axes repr as cell output.
heart_data_no_outliers.hist(figsize=(12, 10))
plt.tight_layout()
plt.show()

In [None]:
#Boxplots for the continuous variables, split by disease result
def plot_box_by_target(data, column, title, ylabel):
    """Draw one boxplot of ``column`` grouped by the ``target`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column`` and a ``target`` column (0 / 1).
    column : str
        Name of the continuous variable shown on the y axis.
    title : str
        Figure title.
    ylabel : str
        Label for the y axis.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the boxplot was drawn on (already shown).
    """
    fig, ax = plt.subplots()
    sns.boxplot(y=column, x='target', data=data, palette="colorblind", ax=ax)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('')
    # target 0 is drawn first, target 1 second
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['No Disease', 'Has Disease'])
    plt.show()
    return ax


#(column, title, y-axis label) for every continuous variable
boxplot_specs = [
    ('age', 'Age vs Disease Result', 'Age'),
    # unit corrected from "mm/Hh" to "mm Hg" (see the trestbps attribute description)
    ('trestbps', 'Resting Blood Pressure vs Disease Result', 'Resting Blood Pressure (mm Hg)'),
    ('chol', 'Serum Cholestoral vs Disease Result ', 'Serum Cholestoral in mg/dl'),
    ('thalach', 'Maximum Heart Rate Achieved vs Disease Result', 'Maximum Heart Rate Achieved'),
    ('oldpeak', 'ST depression induced by exercise relative to rest vs Disease Result',
     'ST depression induced by exercise vs rest'),
]

for box_column, box_title, box_ylabel in boxplot_specs:
    graph = plot_box_by_target(heart_data_no_outliers, box_column, box_title, box_ylabel)

In [None]:
#Bar graphs for categorical variables
heart_data = heart_data_raw.copy()
#UID is a per-row identifier; it is only used as the column that gets counted.
#(The former `heart_data.set_index('UID')` discarded its result and had no effect.)
heart_data['UID'] =  range(1, len(heart_data) + 1)
targetData=['No Disease', 'Has Disease']


def plot_count_by_target(data, column, title, legend_labels,
                         xlabel=None, tick_labels=None, legend_loc=1):
    """Bar chart of observation counts per category of ``column``, split by ``target``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``target``, ``column`` and ``UID`` columns.
    column : str
        Categorical column whose values form the x axis (sorted ascending).
    title : str
        Figure title.
    legend_labels : list of str
        Legend entries, in ascending order of ``target`` value.
    xlabel : str or None
        X-axis label; ``None`` keeps the label pandas assigns by default.
    tick_labels : list of str or None
        Replacement tick labels, one per bar group in ascending order of the
        category value; ``None`` keeps the raw category values.
    legend_loc : int
        Matplotlib legend location code.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the chart was drawn on (already shown).
    """
    counts = (
        data[['target', column, 'UID']]
        .groupby(['target', column])['UID']
        .count()
        .unstack('target')
    )
    ax = counts.plot(kind='bar', legend=True, figsize=(3, 3))
    ax.set_title(title)
    ax.set_ylabel('Count')
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if tick_labels is not None:
        ax.set_xticks(range(len(tick_labels)))
        ax.set_xticklabels(tick_labels)
    ax.legend(legend_labels, loc=legend_loc)
    plt.show()
    return ax


#One entry per categorical variable, plotted in this order.
#tick_labels are listed in ascending order of the coded value (position 0 = value 0).
bar_specs = [
    dict(column='sex', title='Comparison of Gender and Disease Result',
         tick_labels=['Female', 'Male'], legend_loc=2),
    dict(column='cp', title='Comparison of Chest Pain Types and Disease Result',
         xlabel='', legend_loc=1),
    dict(column='restecg',
         title='Comparison of Resting Electrocardiographic Result and Disease Result',
         xlabel='',
         tick_labels=['Normal', 'having ST-T wave abnormality ',
                      'Possible Left Ventricular Hypertrophy'],
         legend_loc=1),
    # exang: 0 = no, 1 = yes -> the first bar group is "No" (labels were previously swapped)
    dict(column='exang', title='Comparison of Exercise Induced Angina and Disease Result',
         xlabel='Exercise Induced Angina', tick_labels=['No', 'Yes'], legend_loc=1),
    dict(column='slope',
         title='Comparison of Peak Exercise ST Segment Peak and Disease Result',
         tick_labels=['Upsloping', 'Flat', 'Downsloping'], legend_loc=2),
    dict(column='ca',
         title='Comparison of Flourosopy Coloured Major Vessels and Disease Result',
         xlabel='Number of Major Vessels Coloured by Flourosopy', legend_loc=1),
    dict(column='thal', title='Comparison of Thalassemia Result and Disease Result',
         xlabel='', legend_loc=2),
    # fbs: 0 = false, 1 = true -> the first bar group is "False" (labels were previously swapped)
    dict(column='fbs', title='Comparison of Fasting Blood Sugar and Disease Result',
         xlabel='Fasting Blood Sugar > 120 mg/dl', tick_labels=['False', 'True'],
         legend_loc=1),
]

for bar_spec in bar_specs:
    graph = plot_count_by_target(heart_data, legend_labels=targetData, **bar_spec)


In [None]:
def z_normalize(data):
    """Return the z-scores of ``data``: ``(value - mean) / std``.

    The standard deviation is the population form (``ddof=0``), matching
    ``np.std``. If every value is identical the standard deviation is 0 and
    the result follows NumPy/pandas division rules (NaN) rather than raising.

    Parameters
    ----------
    data : pandas.Series, numpy.ndarray or sequence of numbers
        Values to normalize. Any Series index is accepted.

    Returns
    -------
    pandas.Series or numpy.ndarray
        New object with the z-scores (a Series keeps its index; other inputs
        come back as a float array). The input is left unmodified.
    """
    if not isinstance(data, (pd.Series, np.ndarray)):
        # plain sequences have no arithmetic; work on a float array instead
        data = np.asarray(data, dtype=float)

    mean_1 = data.mean()
    std_1 = data.std(ddof=0)

    # Vectorised: builds a new float result instead of writing z-scores
    # element by element into the (possibly integer) input.
    return (data - mean_1) / std_1

#normalize the numeric variables to their respective z-scores
#NOTE(review): data_numeric also contains 'target', so the 0/1 disease label is
#z-scored together with the features - confirm this is intended before modelling.
data_numeric = data_numeric.apply(z_normalize)

#Preview the first rows instead of printing the frame as plain text
data_numeric.head()