In [1]:
from sklearn.naive_bayes import GaussianNB
import gzip
from collections import defaultdict
import math
import scipy.optimize
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import numpy as np
from sklearn import linear_model
import pandas as pd 
from ydata_profiling import ProfileReport
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Load Dataset and Error Measurement

In [2]:
file_path_train = "C:/Users/jeyas/OneDrive/Documents/CSE158/HeartAttackPrediction/Datasets/smote_diabetes_train.csv"
file_path_test = "C:/Users/jeyas/OneDrive/Documents/CSE158/HeartAttackPrediction/Datasets/diabetes_test.csv"
dataTrain = pd.read_csv(file_path_train) #, index_col = "Unnamed:0")
dataTest = pd.read_csv(file_path_test)
dataTest.rename(columns = {"Unnamed: 0": "index"}, inplace = True)
dataTrain.rename(columns = {"Unnamed: 0": "index"}, inplace = True)

In [3]:
def accuracy(predictions, y):
    correct = [1 if dataTrain.loc[row]['diabetes'] == pred else 0 for pred,row in zip(predictions, dataTrain.index)]
    return sum(correct)/len(correct)

In [4]:
def BER(pred, y):
    TP = np.sum([(p and l) for (p,l) in zip(pred, y)])
    FP = np.sum([(p and not l) for (p,l) in zip(pred, y)])
    TN = np.sum([(not p and not l) for (p,l) in zip(pred, y)])
    FN = np.sum([(not p and l) for (p,l) in zip(pred, y)])
    TPR = TP / (TP + FN)
    TNR = TN / (TN + FP)
    BER = 1 - 1/2 * (TPR + TNR)
    return BER

In [5]:
def Recall(pred, y):
    TP = np.sum([(p and l) for (p,l) in zip(pred, y)])
    FN = np.sum([(not p and l) for (p,l) in zip(pred, y)])
    return TP/(TP + FN)

In [6]:
def feature(index, dataTrain = dataTrain): 
    f1 = dataTrain.loc[index]["gender"]
    f2 = dataTrain.loc[index]["age"]
    f3 = dataTrain.loc[index]["hypertension"]
    f4 = dataTrain.loc[index]["heart_disease"]
    f5 = dataTrain.loc[index]["bmi"]
    f6 = dataTrain.loc[index]["HbA1c_level"]
    f7 = dataTrain.loc[index]["blood_glucose_level"]
    return [f1] +[f2] + [f3] + [f4] + [f5] +[f6] +[f7]
 

In [10]:
def feature2(index, dataTrain = dataTrain): 
    f1 = dataTrain.loc[index]["gender"]
    f2 = dataTrain.loc[index]["age"]
    # f3 = dataTrain.loc[index]["hypertension"]
    # f4 = dataTrain.loc[index]["heart_disease"]
    f5 = dataTrain.loc[index]["bmi"]
    f6 = dataTrain.loc[index]["HbA1c_level"]
    f7 = dataTrain.loc[index]["blood_glucose_level"]
    return [f1] +[f2] + [f5] +[f6] +[f7] #+ [f3] + [f4] 
 

In [24]:
X_train = [feature(index) for index in range(len(dataTrain))]
y_train = [dataTrain.loc[row]['diabetes'] for row in range(len(dataTrain))]
X_test = [feature(row) for row in range(len(dataTest))]
Y_test = [dataTest.loc[row]['diabetes'] for row in range(len(dataTest))]

# Logistic Regression

In [12]:
mod_log = linear_model.LogisticRegression(fit_intercept=True)
mod_log.fit(X_train,y_train)
pred_log = mod_log.predict(X_test)

In [13]:
print("accuracy " + str(accuracy(pred_log,Y_test)) + "\n")
print("BER " + str(BER(pred_log,Y_test)) + "\n")
print("balanced accuracy " + str(balanced_accuracy_score(pred_log,Y_test)) + "\n")
print("F1-Score " + str(f1_score(pred_log,Y_test)) + "\n")
print("Recall Score " + str(Recall(pred_log,Y_test)))


accuracy 0.8841

BER 0.23550000000000004

balanced accuracy 0.786688310604424

F1-Score 0.7264490649320479

Recall Score 0.6254


# Naive Bayes

In [14]:
gnb = GaussianNB()
pred_gnb = gnb.fit(X_train, y_train).predict(X_test)

In [15]:
print("accuracy " + str(accuracy(pred_gnb,Y_test)) + "\n")
print("BER " + str(BER(pred_gnb,Y_test)) + "\n")
print("balanced accuracy " + str(balanced_accuracy_score(pred_gnb,Y_test)) + "\n")
print("F1-Score " + str(f1_score(pred_gnb,Y_test)) + "\n")
print("Recall Score " + str(Recall(pred_gnb,Y_test)))


accuracy 0.8695

BER 0.24609999999999999

balanced accuracy 0.7744115973366144

F1-Score 0.7149310784200162

Recall Score 0.6172


# Decision Tree Classifier 

In [16]:
mod_dtc = DecisionTreeClassifier()
pred_dtc = mod_dtc.fit(X_train, y_train).predict(X_test)

In [17]:
print("accuracy " + str(accuracy(pred_dtc,Y_test)) + "\n")
print("BER " + str(BER(pred_dtc,Y_test)) + "\n")
print("balanced accuracy " + str(balanced_accuracy_score(pred_dtc,Y_test)) + "\n")
print("F1-Score " + str(f1_score(pred_dtc,Y_test)) + "\n")
print("Recall Score " + str(Recall(pred_dtc,Y_test)))


accuracy 0.987

BER 0.16300000000000003

balanced accuracy 0.8770739064856712

F1-Score 0.8052568697729988

Recall Score 0.674


# Random Forest Classifier:

In [None]:
mod_rfc = RandomForestClassifier(class_weight="balanced")
pred_rfc = mod_rfc.fit(X_train, y_train).predict(X_test)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    9.9s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s


In [34]:
print("accuracy " + str(accuracy(pred_rfc,Y_test)) + "\n")
print("BER " + str(BER(pred_rfc,Y_test)) + "\n")
print("balanced accuracy " + str(balanced_accuracy_score(pred_rfc,Y_test)) + "\n")
print("F1-Score " + str(f1_score(pred_rfc,Y_test)) + "\n")
print("Recall Score " + str(Recall(pred_rfc,Y_test)))


accuracy 0.9925

BER 0.15749999999999997

balanced accuracy 0.8801218618526392

F1-Score 0.8131007475970096

Recall Score 0.6852


In [23]:
dataTrain['hypertension'].value_counts()

hypertension
0    161659
1     11341
Name: count, dtype: int64

In [22]:
dataTrain['heart_disease'].value_counts()

heart_disease
0    168004
1      4996
Name: count, dtype: int64