In [None]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
#Load data
#Read the Excel workbook into the raw frame and preview its first five rows
heart_data_raw = pd.read_excel(io="heart.xlsx")
heart_data_raw.head(n=5)

In [None]:
#Validate missing values
#Recorded finding: no column contains missing values
print('Check for NaNs')
has_nan_by_column = heart_data_raw.isna().any()
print(has_nan_by_column)

#Assessing the Skewness of data
#Recorded finding: all values lie within +- 2, which is considered acceptable
print('\n Check Skew')
skew_by_column = heart_data_raw.skew()
print(skew_by_column)

#Statistics of original data
#Recorded finding: 303 observations for each variable (consistent with no NaNs)
print('\n Summary Statistics')
heart_data_raw.describe()

In [None]:
#Check for outliers on continuous variables
#Outlier was defined as value exceeding 3 std (z score greater than 3) from the mean
#If an outlier was detected it was replaced with NaN and the observation was subsequently eliminated
def detect_outlier(data, threshold=3):
    """Return a copy of ``data`` with outliers replaced by NaN.

    A value is an outlier when the absolute value of its z score,
    ``(value - mean) / std`` with the population standard deviation
    (``ddof=0``), is strictly greater than ``threshold``.

    Parameters
    ----------
    data : pandas.Series (or anything ``pd.Series`` accepts)
        Values to screen. The input is never modified; its index is kept.
    threshold : float, default 3
        Maximum absolute z score that is still treated as a regular value.

    Returns
    -------
    pandas.Series
        New Series with the same index as ``data``. Outliers are NaN; when
        nothing is masked the values are returned unchanged.
    """
    series = pd.Series(data)
    mean = series.mean()
    #ddof=0 matches what np.std computes by default
    std = series.std(ddof=0)

    #A zero std yields NaN/inf z scores; NaN never exceeds the threshold,
    #so a constant column is returned without any masked value
    z_score = (series - mean) / std

    #mask builds a new Series, so the caller's data is left untouched and
    #the index may be of any kind (no positional label lookups)
    return series.mask(z_score.abs() > threshold)

#Work on a copy so heart_data_raw keeps the original values
heart_data_no_outliers = heart_data_raw.copy()

#Screen each continuous variable in turn; flagged values come back as NaN
for continuous_column in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']:
    heart_data_no_outliers[continuous_column] = detect_outlier(heart_data_no_outliers[continuous_column])

#Drop every observation that holds at least one NaN
heart_data_no_outliers = heart_data_no_outliers.dropna()

#Rerun summary statistics with outliers removed
#Validate missing values
#Expected after dropna: no column contains missing values
print('Check for NaNs')
has_nan_after_removal = heart_data_no_outliers.isna().any()
print(has_nan_after_removal)

#Assessing the Skewness of data
#Recorded finding: all values lie within +- 2, which is considered acceptable
print('\n Check Skew')
skew_after_removal = heart_data_no_outliers.skew()
print(skew_after_removal)

#Statistics with outliers removed
print('Summary Statistics')
heart_data_no_outliers.describe()

In [None]:
#Summary Statistics grouped by presence of heart disease
#1 indicates heart disease 0 indicates no heart disease
heart_data_grouped = heart_data_no_outliers.groupby('target')
#String names pick the groupby mean/std directly; passing the numpy callables
#np.mean/np.std to aggregate is deprecated in recent pandas releases
heart_data_grouped.aggregate(['mean', 'std'])

In [None]:
#created boxplots for continuous variables
#One figure per variable, with the two target groups side by side on the x axis
for continuous_var in ('age', 'trestbps', 'chol', 'thalach', 'oldpeak'):
    sns.boxplot(x='target', y=continuous_var, data=heart_data_no_outliers, palette="colorblind")
    plt.show()

In [None]:
#Created bar graphs for categorical variables




In [None]:
#Histograms of the raw data; DataFrame.hist draws one panel per numeric column
heart_data_raw.hist()

In [None]:
heart_data_subset = heart_data[['target','ageGroup', 'chol']].groupby(['target', 'ageGroup'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True, figsize=(2,2))
plt.show()

In [None]:
heart_data_subset = heart_data[['target','sex', 'chol']].groupby(['target', 'sex'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True,  figsize=(3,3))
plt.show();

In [None]:
heart_data_subset = heart_data[['target','cp', 'chol']].groupby(['target', 'cp'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True, figsize=(2,2))
plt.show();

In [None]:
heart_data_subset = heart_data[['target', 'cp', 'chol']].groupby(['target', 'cp'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True, figsize=(4,4))
plt.show();
heart_data_subset

In [None]:

heart_data_subset = heart_data[['target','restecg', 'chol']].groupby(['target', 'restecg'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True, figsize=(4,4))
plt.show();
heart_data_subset

In [None]:
heart_data_subset = heart_data[['target','exang', 'chol']].groupby(['target', 'exang'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True, figsize=(4,4))
plt.show();

In [None]:

heart_data_subset = heart_data[['target','slope', 'chol']].groupby(['target', 'slope'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True, figsize=(4,4))
plt.show();


In [None]:
heart_data_subset = heart_data[['target','ca', 'chol']].groupby(['target', 'ca'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True, figsize=(4,4))
plt.show();

In [None]:

heart_data_subset = heart_data[['target','thal', 'chol']].groupby(['target', 'thal'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True, figsize=(4,4))
plt.show();

In [None]:
# not sure what this shows
heart_data_subset = heart_data[['target','OldPeak2', 'chol']].groupby(['target', 'OldPeak2'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True, figsize=(4,4))
plt.show();


In [None]:

# trestbps too many values
heart_data_subset = heart_data[['target','RestBloodPressure', 'chol']].groupby(['target', 'RestBloodPressure'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True, figsize=(4,4))
plt.show();



In [None]:
# chol too many values
heart_data_subset = heart_data[['target','Cholestoral', 'chol']].groupby(['target', 'Cholestoral'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True, figsize=(4,4))
plt.show();

In [None]:
# fbs both higher, not useful
heart_data_subset = heart_data[['target','fbs', 'chol']].groupby(['target', 'fbs'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True, figsize=(4,4))
plt.show();

In [None]:
# thalach too many values
heart_data_subset = heart_data[['target','MaxHeartRate', 'chol']].groupby(['target', 'MaxHeartRate'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True, figsize=(4,4))
plt.show();

In [None]:
heart_data_subset = heart_data[['target','RestBloodPressure', 'chol']].groupby(['target', 'RestBloodPressure'])['chol'].count().unstack('target')
heart_data_subset.plot(kind='bar', legend = True, figsize=(2,2))
plt.show();

In [None]:
#Clean ups
heart_data = heart_data_raw.copy()

#Clean up by definition
heart_data['sex'] = ['Male' if x == 1 else 'Female' for x in heart_data['sex']]
heart_data['cp'] = ['Typical Angine' if x == 0 else 'Atypical angina' if x == 1 else 'Non-anginal pain' if x==2 else 'Asymptomatic' for x in heart_data['cp']]
heart_data['fbs'] = ['True' if x == 1 else 'False' for x in heart_data['fbs']]
heart_data['restecg'] = ['Normal' if x == 0 else 'STT wave abnormality' if x ==1 else 'Hypertrophy' for x in heart_data['restecg']]
heart_data['exang'] = ['Yes' if x == 1 else 'No' for x in heart_data['exang']]
heart_data['slope'] = ['Unsloping' if x == 1 else 'Flat' if x == 2 else 'Downsloping' for x in heart_data['slope']]
heart_data['thal'] = ['Normal' if x == 3 else 'Fixed defect' if x == 6 else 'reversable defect' for x in heart_data['thal']]
heart_data['target'] = ['No' if x == 0 else 'Yes' for x in heart_data['target']]

#Additional grouping
heart_data['ageGroup'] = ['Youth' if age < 19 else 'Senior' if age > 60 else 'Adult' for age in heart_data['age']]
heart_data['OldPeak2'] = ['Lower' if x < 0.8 else 'Higher' for x in heart_data['oldpeak']]

#Additional grouping from research
#assuming resting blood pressure is diastolic blood pressure. based on https://www.webmd.com/hypertension-high-blood-pressure/guide/diastolic-and-systolic-blood-pressure-know-your-numbers#1-3
heart_data['RestBloodPressure'] = ['High' if x < 120 else 'Hypertensive Crisis' for x in heart_data['trestbps']]
heart_data['Cholestoral'] = ['Normal' if x < 130 else 'High' for x in heart_data['chol']]
def calc_maxHeartRate (num):
    age, maxrate = num
    return 'Normal' if maxrate <= 220 -age else 'High'
heart_data['MaxHeartRate'] = heart_data[['age', 'thalach']].apply(calc_maxHeartRate, axis= 1)

heart_data.head(