In [79]:
# import library functions

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [80]:
# import the .csv file into Python

df = pd.read_csv (r'C:\Users\e135634\OneDrive - Blue Cross Blue Shield of Michigan\Desktop\Test Folder\heart.csv')

In [81]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [82]:
# from previous assignment, I know that this dataset has 11 variables and one binary classifier
# of the 11 variables, there are 5 that are categorical and will need to be changed to numbers for the models to work

print("Sex:" , df['Sex'].unique())
print("Exercise Angina:" , df['ExerciseAngina'].unique())
print("Chest Pain Type:" , df['ChestPainType'].unique())
print("Resting ECG:" , df['RestingECG'].unique())
print("ST Slope:" , df['ST_Slope'].unique())

Sex: ['M' 'F']
Exercise Angina: ['N' 'Y']
Chest Pain Type: ['ATA' 'NAP' 'ASY' 'TA']
Resting ECG: ['Normal' 'ST' 'LVH']
ST Slope: ['Up' 'Flat' 'Down']


In [83]:
# getting a quick count of classifier column (1 = Yes, 0 = No)

print(df['HeartDisease'].value_counts())

1    508
0    410
Name: HeartDisease, dtype: int64


In [84]:
# as before, I will convert these columns into dummy variables

df_dummies = pd.get_dummies(df, columns = ['Sex' , 'ExerciseAngina' , 'ChestPainType' , 'RestingECG' , 'ST_Slope'])
df_dummies = df_dummies.reindex(columns = [col for col in df_dummies.columns if col != 'HeartDisease'] + ['HeartDisease'])

In [85]:
df = df_dummies.astype({"Age" : 'float' , "RestingBP" : 'float' ,"Cholesterol" : 'float' ,"FastingBS" : 'float' ,
                                "MaxHR" : 'float' ,"Sex_F" : 'float' ,"Sex_M" : 'float' ,"ExerciseAngina_N" : 'float' ,
                                "ExerciseAngina_Y" : 'float' ,"ChestPainType_ASY" : 'float' ,"ChestPainType_ATA" : 'float' ,
                                "ChestPainType_NAP" : 'float' ,"ChestPainType_TA" : 'float' ,"RestingECG_LVH" : 'float' ,
                                "RestingECG_Normal" : 'float' ,"RestingECG_ST" : 'float' ,"ST_Slope_Down" : 'float' ,
                                "ST_Slope_Flat" : 'float' ,"ST_Slope_Up" : 'float',"HeartDisease" : 'int'})

In [86]:
# Noticed that several patients have incomplete data in Cholestrol- filtering out those incomplete records

df = df[df.Cholesterol > 0]
df.shape

(746, 21)

In [74]:
print(df.dtypes)

Age                  float64
RestingBP            float64
Cholesterol          float64
FastingBS            float64
MaxHR                float64
Oldpeak              float64
Sex_F                float64
Sex_M                float64
ExerciseAngina_N     float64
ExerciseAngina_Y     float64
ChestPainType_ASY    float64
ChestPainType_ATA    float64
ChestPainType_NAP    float64
ChestPainType_TA     float64
RestingECG_LVH       float64
RestingECG_Normal    float64
RestingECG_ST        float64
ST_Slope_Down        float64
ST_Slope_Flat        float64
ST_Slope_Up          float64
HeartDisease           int32
dtype: object


In [87]:
# Adding a 8-2 split of the data into training and testing sets

X = df.iloc[:, :-1]
y = df.iloc[: , -1]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [96]:
# Had to add in a scaler to my training and testing varioables to get function to work

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [97]:
# creating custon Naive Bayes algorythm from scratch using class function for modeling

class Naive_Bayes:
    
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(np.log(self._pdf(idx, x)))
            posterior = prior + posterior
            posteriors.append(posterior)

        # return class with highest posterior probability
        return self._classes[np.argmax(posteriors)]

    def _pdf(self, class_idx, x):
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

In [98]:
# Evaluate the model generated above

model = Naive_Bayes()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [99]:
print(confusion_matrix(y_test, predictions))
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

[[71 14]
 [ 7 58]]
0.86
              precision    recall  f1-score   support

           0       0.91      0.84      0.87        85
           1       0.81      0.89      0.85        65

    accuracy                           0.86       150
   macro avg       0.86      0.86      0.86       150
weighted avg       0.86      0.86      0.86       150



In [100]:
# running data through pre-built sklearn function just to see how my formula compares

from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train , y_train)
predict = gnb.predict(X_test)

print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))
accuracy1 = accuracy_score(y_test, predict)
print("accuracy:" , accuracy1)

[[71 14]
 [ 7 58]]
              precision    recall  f1-score   support

           0       0.91      0.84      0.87        85
           1       0.81      0.89      0.85        65

    accuracy                           0.86       150
   macro avg       0.86      0.86      0.86       150
weighted avg       0.86      0.86      0.86       150

accuracy: 0.86
