# Predicting Heart Disease with K-NN and Logistic Regression

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix,plot_confusion_matrix, plot_roc_curve
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import KNeighborsClassifier

# Data

In [None]:
# Load the heart-disease dataset from a CSV file (the path is relative to the
# working directory) and preview its first rows.
df = pd.read_csv('Heart_Disease_Prediction.csv')
df.head()

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric
# columns, and .T transposes the table so each dataset column becomes a row.
df.describe(include='all').T

In [None]:
# Dimensions of the dataset as (rows, columns).
df.shape

In [None]:
# True if at least one cell anywhere in the dataset is missing.
df.isna().to_numpy().any()

In [None]:
# Count how many rows fall under each value of the 'Sex' column to see how
# evenly that attribute is represented in the data.
df['Sex'].value_counts()

In [None]:
# Count how many rows belong to each 'Heart Disease' outcome to check whether
# the target classes are imbalanced.
df['Heart Disease'].value_counts()

In [None]:
# Bar chart of the number of rows per 'Heart Disease' outcome.
sns.countplot(x='Heart Disease', data=df)

In [None]:
# Checking for any correlations.
# Only numeric columns can be correlated. Selecting them explicitly keeps this
# cell working when the frame also holds text columns (the 'Heart Disease'
# outcome is presumably stored as text labels, since it is label-encoded
# later): recent pandas versions raise on non-numeric data in DataFrame.corr()
# instead of silently skipping it.
plt.figure(figsize=(30, 25))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap="YlGnBu")
plt.show()

# K-Nearest Neighbours 

In [None]:
# Show the first rows again as a reminder of the column layout before the
# features and the target are selected by position.
df.head()

In [None]:
# Splitting the dataset into training and testing sets.
# The target is the last column; every other column is used as a feature.
# (A `:-2` slice here would also discard the final feature column.)
# NOTE(review): if the second-to-last column is meant to be excluded, drop it
# explicitly by name so the intent is visible.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# 80/20 split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.20)

In [None]:
# Standardise the features. The scaler learns its statistics from the training
# split only, and that same fitted scaler is then applied to both splits.
sc_x = StandardScaler().fit(X_train)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)

In [None]:
# Convert the outcome labels into integer class codes. The encoder is fitted
# on the training labels and then reused unchanged for the test labels.
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

# Show which original label each of the two class codes stands for.
for i in (0, 1):
    print("Class Label: {0}, --> Heart Disease: {1}".format(i, le.inverse_transform([i])))

In [None]:
# Creating KNN Model.
# The classifier is configured with 3 neighbours and the Euclidean metric,
# then fitted on the scaled training features and encoded training labels.
# NOTE(review): p = 2 looks redundant alongside metric = 'euclidean' - confirm
# against the KNeighborsClassifier documentation before removing it.
knn_clf = KNeighborsClassifier(n_neighbors = 3, p = 2, metric = 'euclidean')
knn_clf.fit(X_train,y_train)

In [None]:
# Predict the encoded class label for every test sample and display the result.
y_pred = knn_clf.predict(X_test)
y_pred

In [None]:
# Confusion matrix of the true test labels (y_test) against the KNN
# predictions (y_pred).
cm = confusion_matrix(y_test,y_pred)
print(cm)

In [None]:
# Accuracy score of the KNN predictions on the test set.
print(accuracy_score(y_test,y_pred))

In [None]:
# Define function to perform model result analysis
def getModelPerf(X_train, y_train, X_test, y_test, clf):
    """Fit ``clf`` on the training data and report how it performs on the test data.

    Prints and plots, in order: the confusion matrix, the accuracy /
    precision / recall / F1 scores (all expressed as percentages rounded to
    two decimals), and the ROC curve.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; ``clf`` is (re)fitted on them.
    X_test, y_test
        Held-out features and labels used for every reported metric.
    clf
        Classifier exposing ``fit`` and ``predict``. It is fitted in place,
        so any earlier fit on the same object is replaced.

    Returns
    -------
    None
        The results are only printed and plotted.
    """
    # Tick labels for the confusion-matrix plot.
    # NOTE(review): assumes class code 0 is 'Absence' and 1 is 'Presence' -
    # confirm against the label-encoder mapping printed earlier.
    class_label = ['Absence', 'Presence']
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Performance metrics, all on the same 0-100 scale so the printed report
    # is consistent (F1 included).
    accuracy = round(100 * accuracy_score(y_test, y_pred), 2)
    precision = round(100 * precision_score(y_test, y_pred), 2)
    recall = round(100 * recall_score(y_test, y_pred), 2)
    f1 = round(100 * f1_score(y_test, y_pred), 2)

    print(" Confusion Matrix: ")
    print("----------------------")
    # NOTE(review): plot_confusion_matrix / plot_roc_curve are the helpers
    # imported at the top of the file; newer scikit-learn releases replace
    # them with ConfusionMatrixDisplay / RocCurveDisplay. Migrate the import
    # and these two calls together when upgrading.
    plot_confusion_matrix(clf, X_test, y_test, display_labels=class_label, cmap='viridis')
    plt.show()
    print("\n Accuracy metrics: ")
    print("----------------------")
    print("Accuracy:  {0}".format(accuracy))
    print("Precision: {0}".format(precision))
    print("Recall:    {0}".format(recall))
    print("F1:        {0}".format(f1))
    print("\n ROC Curve: ")
    print("----------------------")
    plot_roc_curve(clf, X_test, y_test)
    plt.show()

In [None]:
# Full evaluation report for the KNN model. getModelPerf fits the classifier
# again on the training data before scoring it on the test data.
getModelPerf(X_train, y_train, X_test,y_test,knn_clf)

In [None]:
#TODO: Try different values of 'k' and check the accuracy. Select optimal 'k' value based on this.

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression classifier with random_state fixed at 42; every other
# setting is left at its default.
log_clf = LogisticRegression(random_state=42)

In [None]:
# Fit the logistic regression model and produce the same evaluation report
# that was generated for KNN, on the same train/test split.
getModelPerf(X_train, y_train, X_test,y_test,log_clf)