In [43]:
from sklearn.naive_bayes import GaussianNB
import gzip
from collections import defaultdict
import math
import scipy.optimize
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import numpy as np
from sklearn import linear_model
import pandas as pd 
from ydata_profiling import ProfileReport
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier

In [3]:
file_path_train = "C:/Users/jeyas/OneDrive/Documents/CSE158/HeartAttackPrediction/Datasets/smote_diabetes_train.csv"
file_path_test = "C:/Users/jeyas/OneDrive/Documents/CSE158/HeartAttackPrediction/Datasets/diabetes_test.csv"
dataTrain = pd.read_csv(file_path_train) #, index_col = "Unnamed:0")
dataTest = pd.read_csv(file_path_test)
dataTest.rename(columns = {"Unnamed: 0": "index"}, inplace = True)
dataTrain.rename(columns = {"Unnamed: 0": "index"}, inplace = True)

Load Dataset and Error Measurement

In [26]:
def accuracy(predictions, y):
    correct = [1 if dataTrain.loc[row]['diabetes'] == pred else 0 for pred,row in zip(predictions, dataTrain.index)]
    return sum(correct)/len(correct)

In [27]:
def BER(pred, y):
    TP = np.sum([(p and l) for (p,l) in zip(pred, y)])
    FP = np.sum([(p and not l) for (p,l) in zip(pred, y)])
    TN = np.sum([(not p and not l) for (p,l) in zip(pred, y)])
    FN = np.sum([(not p and l) for (p,l) in zip(pred, y)])
    TPR = TP / (TP + FN)
    TNR = TN / (TN + FP)
    BER = 1 - 1/2 * (TPR + TNR)
    return BER

In [29]:
def Recall(pred, y):
    TP = np.sum([(p and l) for (p,l) in zip(pred, y)])
    FN = np.sum([(not p and l) for (p,l) in zip(pred, y)])
    return TP/(TP + FN)

In [33]:
def feature(index, dataTrain = dataTrain): 
    f1 = dataTrain.loc[index]["gender"]
    f2 = dataTrain.loc[index]["age"]
    f3 = dataTrain.loc[index]["hypertension"]
    f4 = dataTrain.loc[index]["heart_disease"]
    f5 = dataTrain.loc[index]["bmi"]
    f6 = dataTrain.loc[index]["HbA1c_level"]
    f7 = dataTrain.loc[index]["blood_glucose_level"]
    return [f1] +[f2] + [f3] + [f4] + [f5] +[f6] +[f7]
 

In [34]:
X_train = [feature(index) for index in range(len(dataTrain))]
y_train = [dataTrain.loc[row]['diabetes'] for row in range(len(dataTrain))]
X_test = [feature(row) for row in range(len(dataTest))]
Y_test = [dataTest.loc[row]['diabetes'] for row in range(len(dataTest))]

Logistic Regression

In [None]:
mod_log = linear_model.LogisticRegression(fit_intercept=True)
mod_log.fit(X_train,y_train)
pred_log = mod_log.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
print("accuracy " + str(accuracy(pred_log,Y_test)) + "\n")
print("BER " + str(BER(pred_log,Y_test)) + "\n")
print("balanced accuracy " + str(balanced_accuracy_score(pred_log,Y_test)) + "\n")
print("F1-Score " + str(f1_score(pred_log,Y_test)) + "\n")
print("Recall Score " + str(Recall(pred_log,Y_test)))


accuracy 0.8784

BER 0.24140000000000006

balanced accuracy 0.7827045180248727

F1-Score 0.7173302107728338

Recall Score 0.6126


Naive Bayes

In [36]:
gnb = GaussianNB()
pred_gnb = gnb.fit(X_train, y_train).predict(X_test)

In [37]:
print("accuracy " + str(accuracy(pred_gnb,Y_test)) + "\n")
print("BER " + str(BER(pred_gnb,Y_test)) + "\n")
print("balanced accuracy " + str(balanced_accuracy_score(pred_gnb,Y_test)) + "\n")
print("F1-Score " + str(f1_score(pred_gnb,Y_test)) + "\n")
print("Recall Score " + str(Recall(pred_gnb,Y_test)))


accuracy 0.8683

BER 0.24469999999999992

balanced accuracy 0.7724072749323719

F1-Score 0.7202469418086201

Recall Score 0.63


Decision Tree Classifier 

In [41]:
mod_dtc = DecisionTreeClassifier()
pred_dtc = mod_dtc.fit(X_train, y_train).predict(X_test)

In [42]:
print("accuracy " + str(accuracy(pred_dtc,Y_test)) + "\n")
print("BER " + str(BER(pred_dtc,Y_test)) + "\n")
print("balanced accuracy " + str(balanced_accuracy_score(pred_dtc,Y_test)) + "\n")
print("F1-Score " + str(f1_score(pred_dtc,Y_test)) + "\n")
print("Recall Score " + str(Recall(pred_dtc,Y_test)))


accuracy 0.992

BER 0.15799999999999992

balanced accuracy 0.8799392097264438

F1-Score 0.8123515439429929

Recall Score 0.684
