In [None]:
import pandas as pd

# Use for preprocessing of the data
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

# Figuring out if our data variables' distributions are Gaussian
from scipy.stats import normaltest
from scipy.stats import shapiro

# Used for the models to predict the data
#Logistic Regression Model
from sklearn.linear_model import LogisticRegression

#Neural Net Model
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

# Getting rid of outliers
from sklearn.neighbors import LocalOutlierFactor

# Training
from sklearn.model_selection import train_test_split

# Used for visualization
#import scikitplot as skplt
from sklearn.metrics import classification_report, confusion_matrix

# Used for accuracy of the models
from sklearn.metrics import mean_absolute_error

# Objective #1
- Read from the heart_data.csv file into a pandas dataframe.

In [None]:
# Read the heart-disease training data from CSV into a DataFrame and preview the first five rows.
heartData = pd.read_csv("/work/heart_data.csv")
heartData.head(5)

Unnamed: 0,PatientId,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,0,34,M,ATA,150,214,0,ST,168,N,0.0,Up,0
1,1,59,M,ASY,178,0,1,LVH,120,Y,0.0,Flat,1
2,2,58,M,ASY,115,0,1,Normal,138,N,0.5,Up,1
3,3,60,M,ASY,130,253,0,Normal,144,Y,1.4,Up,1
4,4,52,M,ASY,165,0,1,Normal,122,Y,1.0,Up,1


In [None]:


# Encode each categorical (string) column as integer labels, stored in a new
# "<column>01" column next to the original one.
le = LabelEncoder()
for categoricalName in ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']:
    heartData[categoricalName + '01'] = le.fit_transform(heartData[categoricalName])

# Model input features: the numeric columns followed by the encoded categorical columns.
featureNames = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
                'Sex01', 'ChestPainType01', 'RestingECG01', 'ExerciseAngina01', 'ST_Slope01']
cols = heartData[featureNames]

# Positional indices of the chosen columns, looked up by name so they stay
# correct regardless of where the columns sit in the frame.
colsValues = [heartData.columns.get_loc(name) for name in featureNames]

# Display the selected column names as a sanity check.
heartData.columns[colsValues]

Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
       'Sex01', 'ChestPainType01', 'RestingECG01', 'ExerciseAngina01',
       'ST_Slope01'],
      dtype='object')

# Objective #2

Pre-processing and cleaning the data given

### Steps to be taken:
1. Checking for NULL/NaN values
2. Checking for any duplicate values 
3. Normalization
---
## Why no Standardization? 
- This data will NOT have a bell curve
- This data is not Gaussian-like

In [None]:
# Before wrangling the data, first understand what is under the hood:
# describe() reports count, mean, std, min, quartiles and max for each numeric column.
heartData.describe()

Unnamed: 0,PatientId,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,818.0,818.0,818.0,818.0,818.0,818.0,818.0,818.0
mean,408.5,53.512225,132.506112,197.96088,0.229829,137.122249,0.878851,0.545232
std,236.280554,9.494595,18.690105,108.132239,0.42098,25.609098,1.068329,0.498254
min,0.0,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,204.25,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,408.5,54.0,130.0,221.0,0.0,138.0,0.5,1.0
75%,612.75,60.0,140.0,265.75,0.0,156.0,1.5,1.0
max,817.0,77.0,200.0,603.0,1.0,202.0,6.2,1.0


# Is the input variable distribution Gaussian?

### Why do we need to figure this out?
My goal is to normalize the data for my INPUT variables. To be able to achieve this I want to use MinMaxScaler to rescale the data set such that all feature values are in the range [0, 1]

When normalizing data I should identify whether there are any outliers in my dataset which could skew the data (as normalization is sensitive to outliers)  

In order to identify outliers I can check if the distribution is Gaussian

If I find that my data is Gaussian I can use a few steps to figure out whether there are any outliers within my data 

In [None]:
# Run two normality tests on every selected feature column and report, per column,
# whether the null hypothesis "the data is Gaussian" is rejected at the 5% level.
# NOTE(review): colsValues also contains the label-encoded categorical columns
# (Sex01, ChestPainType01, ...); a normality test is only meaningful for the
# numeric ones [Age, RestingBP, Cholesterol, FastingBS, MaxHR, Oldpeak].
normalityTests = (
    ('\n###D’Agostino and Pearson’s Test####', normaltest),
    ('\n###SHAPIRO TEST####', shapiro),
)

for heading, runTest in normalityTests:
    print(heading)
    for i in heartData.columns[colsValues]:
        # Evaluate the test once and reuse the result for both the verdict and the printout.
        result = runTest(heartData[i].values)
        verdict = "Not Gaussian" if result[1] < 0.05 else "Gaussian"  # result[1] is the p-value
        print(f'{i}: {verdict}  {result}')


###D’Agostino and Pearson’s Test####
Age: Not Gaussian  NormaltestResult(statistic=14.10154250215735, pvalue=0.0008667402251263827)
RestingBP: Not Gaussian  NormaltestResult(statistic=76.50982522759105, pvalue=2.4327783718283894e-17)
Cholesterol: Not Gaussian  NormaltestResult(statistic=45.6811678375251, pvalue=1.203539755796199e-10)
FastingBS: Not Gaussian  NormaltestResult(statistic=147.93997898036378, pvalue=7.503117333266766e-33)
MaxHR: Not Gaussian  NormaltestResult(statistic=13.336898191027805, pvalue=0.0012703674425520017)
Oldpeak: Not Gaussian  NormaltestResult(statistic=132.9970956996849, pvalue=1.3184012746452946e-29)

###SHAPIRO TEST####
Age: Not Gaussian  ShapiroResult(statistic=0.990276575088501, pvalue=3.1005161872599274e-05)
RestingBP: Not Gaussian  ShapiroResult(statistic=0.9544223546981812, pvalue=3.1177614133524705e-15)
Cholesterol: Not Gaussian  ShapiroResult(statistic=0.8702148795127869, pvalue=1.7137967420666714e-25)
FastingBS: Not Gaussian  ShapiroResult(statisti

In [None]:
# Step 1 - check for NULL/NaN values
# info() prints the total number of entries and, for every column, its non-null count and dtype.
heartData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 818 entries, 0 to 817
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PatientId       818 non-null    int64  
 1   Age             818 non-null    int64  
 2   Sex             818 non-null    object 
 3   ChestPainType   818 non-null    object 
 4   RestingBP       818 non-null    int64  
 5   Cholesterol     818 non-null    int64  
 6   FastingBS       818 non-null    int64  
 7   RestingECG      818 non-null    object 
 8   MaxHR           818 non-null    int64  
 9   ExerciseAngina  818 non-null    object 
 10  Oldpeak         818 non-null    float64
 11  ST_Slope        818 non-null    object 
 12  HeartDisease    818 non-null    int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 83.2+ KB


In [None]:
heartData.isna().sum() # number of missing (null/NaN) values in each column

PatientId         0
Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [None]:
#Step 2 - remove duplicate rows
# drop_duplicates() returns a NEW frame and leaves the original untouched, so the
# result must be assigned back for the duplicates to actually be removed.
# The index is rebuilt so it stays contiguous after any rows are dropped.
# NOTE(review): whole rows are compared, including PatientId; if the ids are unique no
# row can be an exact duplicate - confirm whether PatientId should be excluded via subset=.
heartData = heartData.drop_duplicates().reset_index(drop=True)
heartData

Unnamed: 0,PatientId,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,0,34,M,ATA,150,214,0,ST,168,N,0.0,Up,0
1,1,59,M,ASY,178,0,1,LVH,120,Y,0.0,Flat,1
2,2,58,M,ASY,115,0,1,Normal,138,N,0.5,Up,1
3,3,60,M,ASY,130,253,0,Normal,144,Y,1.4,Up,1
4,4,52,M,ASY,165,0,1,Normal,122,Y,1.0,Up,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,813,52,M,ASY,135,0,1,Normal,128,Y,2.0,Flat,1
814,814,48,M,ASY,120,260,0,Normal,115,N,2.0,Flat,1
815,815,47,M,ASY,110,0,1,ST,149,N,2.1,Up,1
816,816,57,F,ASY,140,241,0,Normal,123,Y,0.2,Flat,1


In [None]:
# STEP 3 - Normalization
# Rescale every selected feature column with MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows here, before the train/test split that
# happens in a later cell, so the held-out rows influence the scaling parameters.
normalizedHeartData = heartData.iloc[:, colsValues]
minmax = MinMaxScaler().fit(normalizedHeartData)
heartDataMinMax = minmax.transform(normalizedHeartData)


# Objective #3 & 4
## Extract features to use from the table

In [None]:
# Target labels as a 1-D Series. Selecting with a single label (not a list of labels)
# avoids the (n_samples, 1) column vector that makes scikit-learn emit
# "DataConversionWarning: A column-vector y was passed when a 1d array was expected" on fit.
heartData_HeartDisease_extracted = heartData['HeartDisease']

# Objective #5 - 8 
### 1. Split the dataset into train and test using the train_test_split function provided
### 2. Create a Logistic Regression model and fit it to your train data.
### 3. Test the results on your test data. Report on the percentage accuracy.


In [None]:
# Feature matrix: the min-max scaled inputs.
X = heartDataMinMax

# Target: the HeartDisease labels the model should predict.
Y = heartData_HeartDisease_extracted

# Train/test split with a fixed seed.
# NOTE(review): test_size=0.60 holds out 60% of the rows and trains on only 40% - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.60, random_state=1
)

# Repeated 10-fold splitter (3 repeats).
# NOTE(review): cv is only constructed here; nothing in this cell uses it.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Shape of the training data before outlier removal.
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)

# Flag outliers in the training rows; rows labelled -1 are treated as outliers and dropped.
lof = LocalOutlierFactor()
mask = lof.fit_predict(X_train) != -1
X_train = X_train[mask]
y_train = y_train[mask]

# Shape of the training data after outlier removal.
print("\nsummarize the shape of the updated training dataset (outliers removed)")
print(X_train.shape, y_train.shape)

# Fit logistic regression on the cleaned training rows and predict the held-out rows.
model = LogisticRegression()
model.fit(X_train, y_train)
yhat = model.predict(X_test)

# Report the mean absolute error, the per-class metrics and the confusion matrix.
print('\nMAE: %.3f' % mean_absolute_error(y_test, yhat))
print(classification_report(y_test, yhat))
print(confusion_matrix(y_test, yhat))
#skplt.metrics.plot_confusion_matrix(y_test, yhat, figsize=(6,6), title = "Confusion matrix")


summarize the shape of the training dataset
(327, 11) (327, 1)

summarize the shape of the updated training dataset (outliers removed)
(319, 11) (319, 1)

MAE: 0.157
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       212
           1       0.86      0.86      0.86       279

    accuracy                           0.84       491
   macro avg       0.84      0.84      0.84       491
weighted avg       0.84      0.84      0.84       491

[[174  38]
 [ 39 240]]
  y = column_or_1d(y, warn=True)


# Objective #8 & 9
- Create a simple neural network to fit your train data.
- Test the results on your test data. Report on the percentage accuracy.

In [None]:
#First Test
# Shape of the training data fed to the network.
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)

# Multi-layer perceptron with three hidden layers of 100, 6 and 6 neurons.
# NOTE(review): the two 6-neuron layers presumably follow the "mean of input and
# output layer sizes" rule of thumb; the 100-neuron first layer does not.
modelNN = MLPClassifier(
    hidden_layer_sizes=(100, 6, 6),
    activation='relu',
    solver='adam',
    learning_rate_init=0.002,
    max_iter=300,
    random_state=1,
)

# Train on the training rows, then predict the held-out rows.
modelNN.fit(X_train, y_train)
modelNNPred = modelNN.predict(X_test)

# Report the mean absolute error, the per-class metrics and the confusion matrix.
print('\nMAE: %.3f' % mean_absolute_error(y_test, modelNNPred))
print(classification_report(y_test, modelNNPred))
print(confusion_matrix(y_test, modelNNPred))
#skplt.metrics.plot_confusion_matrix(y_test, modelNNPred, figsize=(6,6), title = "Confusion matrix")


summarize the shape of the training dataset
(319, 11) (319, 1)
  y = column_or_1d(y, warn=True)

MAE: 0.167
              precision    recall  f1-score   support

           0       0.80      0.82      0.81       212
           1       0.86      0.84      0.85       279

    accuracy                           0.83       491
   macro avg       0.83      0.83      0.83       491
weighted avg       0.83      0.83      0.83       491

[[174  38]
 [ 44 235]]


# Objective 10
## Normalization is important as it ensures that all inputs are in a comparable range. One of the best practices for training a Neural Network is to normalize your data to obtain a mean close to 0. Normalizing the data generally speeds up learning and leads to faster convergence, which also contributes to increasing the accuracy of the model's predictions.

# Second Test
### 100 more neurons

This test proved to be a success with the addition of 100 more neurons. The overall accuracy increased from 83% to 84%.

In [None]:
#Second Test - 100 more neurons
# Shape of the training data fed to the network.
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)

# Same network as the first test, but the first hidden layer has 200 neurons instead of 100.
modelNN2 = MLPClassifier(
    hidden_layer_sizes=(200, 6, 6),
    activation='relu',
    solver='adam',
    learning_rate_init=0.002,
    max_iter=300,
    random_state=1,
)

# Train on the training rows, then predict the held-out rows.
modelNN2.fit(X_train, y_train)
modelNNPred2 = modelNN2.predict(X_test)

# Report the mean absolute error, the per-class metrics and the confusion matrix.
print('\nMAE: %.3f' % mean_absolute_error(y_test, modelNNPred2))
print(classification_report(y_test, modelNNPred2))
print(confusion_matrix(y_test, modelNNPred2))
#skplt.metrics.plot_confusion_matrix(y_test, modelNNPred2, figsize=(6,6), title = "Confusion matrix")


summarize the shape of the training dataset
(319, 11) (319, 1)
  y = column_or_1d(y, warn=True)

MAE: 0.163
              precision    recall  f1-score   support

           0       0.82      0.80      0.81       212
           1       0.85      0.87      0.86       279

    accuracy                           0.84       491
   macro avg       0.83      0.83      0.83       491
weighted avg       0.84      0.84      0.84       491

[[169  43]
 [ 37 242]]


In [None]:
# Load the unlabeled patients whose HeartDisease value has to be predicted.
testDF = pd.read_csv('/work/heart_predict.csv')

# Encode the categorical columns with the mapping learned from the TRAINING data.
# A LabelEncoder numbers the sorted unique values it was fitted on, so re-fitting it on
# the prediction file could assign different integers to the same category whenever that
# file does not contain every category. Fitting on heartData keeps the codes identical to
# the ones the model was trained with; an unseen category raises a ValueError instead of
# being silently mis-encoded.
le2 = LabelEncoder()
for categoricalName in ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']:
    le2.fit(heartData[categoricalName])
    testDF[categoricalName + '01'] = le2.transform(testDF[categoricalName])

# Same features, in the same order, as the columns the scaler was fitted on. The DataFrame
# is passed as-is (no .values) so its column names match the fitted feature names and
# scikit-learn does not warn "X does not have valid feature names".
xNew = testDF[['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak', 'Sex01', 'ChestPainType01', 'RestingECG01', 'ExerciseAngina01', 'ST_Slope01']]
xNew = minmax.transform(xNew)

# Predict with the logistic regression model trained earlier.
predictions = model.predict(xNew)

  "X does not have valid feature names, but"


In [None]:
# Pair every patient id with its predicted label and write the result to CSV without the index column.
submission = testDF[['PatientId']].assign(HeartDisease=predictions)
submission.to_csv('submission11.csv', index=False)

# Bonus
- Support Vector Machine (SVM)
- Bagged Decision Trees (Bagging)



In [None]:
#SVM TEST
from sklearn.svm import SVC

# Shape of the training data fed to the classifier.
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)

# Support vector classifier with a polynomial kernel and regularization parameter C=10.
modelSVM = SVC(C=10, kernel='poly')

# Train on the training rows, then predict the held-out rows.
modelSVM.fit(X_train, y_train)
modelSVMPred = modelSVM.predict(X_test)

# Report the mean absolute error, the per-class metrics and the confusion matrix.
print('\nMAE: %.3f' % mean_absolute_error(y_test, modelSVMPred))
print(classification_report(y_test, modelSVMPred))
print(confusion_matrix(y_test, modelSVMPred))


summarize the shape of the training dataset
(319, 11) (319, 1)

MAE: 0.165
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       212
           1       0.87      0.84      0.85       279

    accuracy                           0.84       491
   macro avg       0.83      0.84      0.83       491
weighted avg       0.84      0.84      0.84       491

[[177  35]
 [ 46 233]]
  y = column_or_1d(y, warn=True)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e279e481-5592-4b23-988f-b9d98935e6dc' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>