In [None]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
import matplotlib.pyplot as plt
from sklearn import svm
import numpy as np
from sklearn import linear_model
import pandas as pd 
from ydata_profiling import ProfileReport
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score

In [29]:
file_path_train = "C:/Users/jeyas/OneDrive/Documents/CSE158/HeartAttackPrediction/Datasets/smote_diabetes_train.csv"
file_path_test = "C:/Users/jeyas/OneDrive/Documents/CSE158/HeartAttackPrediction/Datasets/diabetes_test.csv"
dataTrain = pd.read_csv(file_path_train) #, index_col = "Unnamed:0")
dataTest = pd.read_csv(file_path_test)
dataTest.rename(columns = {"Unnamed: 0": "index"}, inplace = True)
dataTrain.rename(columns = {"Unnamed: 0": "index"}, inplace = True)

# Error Metrics

In [55]:
def accuracy(predictions, y, dataTrain = dataTest):
    correct = [1 if dataTrain.loc[row]['diabetes'] == pred else 0 for pred,row in zip(predictions, dataTrain.index)]
    return sum(correct) / len(correct)

In [61]:
def BER(pred, y):
    pred = list(pred)
    TP = np.sum([(p and l) for (p,l) in zip(pred, y)])
    FP = np.sum([(p and not l) for (p,l) in zip(pred, y)])
    TN = np.sum([(not p and not l) for (p,l) in zip(pred, y)])
    FN = np.sum([(not p and l) for (p,l) in zip(pred, y)])
    TPR = TP / (TP + FN)
    TNR = TN / (TN + FP)
    BER = 1 - 1/2 * (TPR + TNR)
    return BER

In [65]:
def Recall(pred, y):
    pred = list(pred)
    TP = np.sum([(p and l) for (p,l) in zip(pred, y)])
    FN = np.sum([(not p and l) for (p,l) in zip(pred, y)])
    return TP/(TP + FN)

# Features 

In [33]:
def feature(row, dataTrain = dataTrain):
    feat1 = dataTrain.loc[row]['HbA1c_level']
    feat2 = dataTrain.loc[row]['blood_glucose_level']
    return [1] + [feat1] + [feat2]

In [34]:
X_train = [feature(row) for row in range(len(dataTrain))]
y_train = [dataTrain.loc[row]['diabetes'] for row in range(len(dataTrain))]
X_test = [feature(row) for row in range(len(dataTest))]
Y_test = [dataTest.loc[row]['diabetes'] for row in range(len(dataTest))]

# Logistic Regression Model

In [66]:
mod = linear_model.LogisticRegression(class_weight='balanced')
mod.fit(X_train,y_train)
pred = mod.predict(X_test)
print("BER " + str(BER(pred, Y_test)))
print("balanced_accuracy_score " + str(balanced_accuracy_score(pred,Y_test,adjusted = True))) #,sample_weight = [86500,3500])
print("F1-Score " + str(f1_score(pred,Y_test)))
print("Recall Score " + str(Recall(pred,Y_test)))

BER 0.27480000000000004
balanced_accuracy_score 0.4804266666666668
F1-Score 0.6859428571428572
Recall Score 0.6002


# Ridge Classifier

In [68]:
mod = linear_model.RidgeClassifier(class_weight='balanced')
mod.fit(X_train,y_train)
pred = mod.predict(X_test)
print("BER: " + str(BER(pred, Y_test)))
print("balanced_accuracy_score: " + str(balanced_accuracy_score(pred,Y_test,adjusted = True))) #,sample_weight = [86500,3500])
print("F1-Score: " + str(f1_score(pred,Y_test)))
print("Recall Score " + str(Recall(pred,Y_test)))

BER: 0.27160000000000006
balanced_accuracy_score: 0.5046755404712662
F1-Score: 0.6789598108747045
Recall Score 0.5744


# Baseline Model Based on Threshold with HbA1c Levels

In [43]:
best_accuracy = 0 
best_threshold = 0
for threshold in np.arange(6.5, 6.7, 0.1):
    ypred = [1 if dataTrain.loc[row]['HbA1c_level'] > threshold else 0 for row in dataTrain.index]
    correct = [1 if dataTrain.loc[row]['diabetes'] == pred else 0 for pred,row in zip(ypred, dataTrain.index)]
    accuracy = sum(correct)/len(correct)
    if accuracy > best_accuracy:
        best_accuracy = accuracy 
        best_threshold = threshold 
    print(best_threshold)


6.5
6.6
6.6


In [69]:
# best_threshold = 6.609999999999998
ypred = [1 if dataTest.loc[row]['HbA1c_level'] > best_threshold else 0 for row in dataTest.index]
correct = [1 if dataTest.loc[row]['diabetes'] == pred else 0 for pred,row in zip(ypred, dataTest.index)]
y = [dataTest.loc[row]['diabetes'] for row in dataTest.index]
accuracy = sum(correct)/len(correct)
print("BER: " + str(BER(pred, Y_test)))
print("balanced_accuracy_score: " + str(balanced_accuracy_score(pred,Y_test,adjusted = True))) #,sample_weight = [86500,3500])
print("F1-Score: " + str(f1_score(pred,Y_test)))
print("Recall Score " + str(Recall(pred,Y_test)))

BER: 0.27160000000000006
balanced_accuracy_score: 0.5046755404712662
F1-Score: 0.6789598108747045
Recall Score 0.5744
