In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#for visualization of Dataset
import pandas
from pandas.plotting import scatter_matrix
import pylab

# importing necessary libraries
from sklearn import datasets
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

## **Load Dataset from Local**

In [None]:
# Upload the CSV through the Colab file picker when running in Colab.
# Outside Colab the `google.colab` package is not installed; in that case
# skip the upload so the notebook still runs top-to-bottom, and expect the
# data file to already be in the working directory.
try:
    from google.colab import files
except ImportError:
    uploaded = {}
else:
    uploaded = files.upload()

## **Store the dataset in a pandas DataFrame**

In [None]:
# Load the dataset (local path). Column names are passed explicitly via
# `names`, so every line of the file is read as data.
# NOTE(review): the UCI distribution of this file marks missing entries with
# '?'; `na_values` maps that marker to NaN so the isna()/mean() cells below
# work. It has no effect if the file already uses empty fields.
dataset = pd.read_csv(
    "processed.cleveland.data.csv",
    names=[
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
        'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'output',
    ],
    na_values="?",
)
dataset

In [None]:
# Preview the first five rows of the loaded frame
dataset.head(n=5)

## **Checking Missing Values**

In [None]:
# Boolean mask: True wherever a cell is missing (isnull is an alias of isna)
dataset.isnull()

In [None]:
# Same missing-value mask, as a plain NumPy boolean array
dataset.isna().to_numpy()

In [None]:
# Count of missing cells in each column
dataset.isnull().sum(axis=0)

## **Fill the missing values with mean**

In [None]:
# Working frame for mean imputation. Copy the frame loaded above instead of
# parsing the same CSV a second time with a duplicated column list; the copy
# is independent, so `dataset` keeps the original missing values.
dataset_mean = dataset.copy()
dataset_mean

## **Missing Values replaced with mean**

In [None]:
# Show the rows named in the banner before any imputation, using the same
# row selection as the "After Fill" cell so the two outputs can be compared.
print("*****Before Fill Missing values Row 166,192,287,302********")
print(dataset_mean.loc[[166, 192, 287, 302]])

In [None]:
# `dataset1` stays a plain second name for `dataset_mean`.
dataset1 = dataset_mean
# Take an explicit copy for imputation: pd.DataFrame(existing_frame) does not
# copy the data by default, so in-place fills on the result could also change
# the "before" frame.
df1 = dataset1.copy()
print(df1)

In [None]:
# Report the mean of the 'ca' column (NaN entries are skipped by .mean())
print("-------- Mean of Column 11 'ca' --------")
ca_mean = df1['ca'].mean()
print(ca_mean)

In [None]:
# Replace every NaN with the mean of its own column, modifying df1 in place
column_means = df1.mean()
df1.fillna(value=column_means, inplace=True)

# Re-display the rows listed in the banner, now with filled values
rows_to_show = [166, 192, 287, 302]
print("*****After Fill Missing values Row 166,192,287,302********")
print(df1.loc[rows_to_show])

In [None]:
# Report the mean of the 'thal' column and show the rows named in the banner.
# No second fillna is needed here: the previous cell already filled every
# column of df1 with its column mean, so repeating it changes nothing.
print("-------- Mean of Column 12 'thal' --------")
print(df1['thal'].mean())
print("*****After Fill Missing values Row 87,266********")
print(df1.loc[[87,266]])

## **Dataset after filling all missing values**

In [None]:
# `dataset_new` is a second name for the same frame object as `df1`
# (plain assignment, no copy), so in-place changes made through either
# name are visible through both.
dataset_new=df1
print(dataset_new)

In [None]:
# (number of rows, number of columns) of the filled frame
dataset_new.shape

## **Dataset Summary - Number of Samples and Features**

In [None]:
# Number of patients (rows)
n_patients = dataset_new.shape[0]

# Number of features (every column except the target 'output')
n_features = dataset_new.shape[1] - 1

# Collapse the target to binary: values 1-4 all become 1, 0 stays 0.
# Assign the result back to the column. Calling
# `dataset_new["output"].replace(..., inplace=True)` acts on a temporary
# Series and leaves the frame unchanged under pandas Copy-on-Write.
dataset_new["output"] = dataset_new["output"].replace(to_replace=[1, 2, 3, 4], value=1)

# Patients with heart disease (output == 1)
heart_disease = dataset_new[dataset_new['output'] == 1].shape[0]

# Patients without heart disease (output == 0)
no_heart_disease = dataset_new[dataset_new['output'] == 0].shape[0]

# Result output
print("Total number of patients: {} ".format(n_patients))
print("Number of features: {}".format(n_features))
print("Number of patients with heart disease: {}".format(heart_disease))
print("Number of patients without heart disease: {}".format(no_heart_disease))

In [None]:
# Number of rows for each distinct value of the 'output' column
dataset_new.groupby(['output']).size()

In [None]:
# The first 13 columns are the features; the remaining column is the target
feature_cols = dataset_new.columns[:13].tolist()

# Display the selected column names
print("Feature columns:\n{}".format(feature_cols))

In [None]:
# Split the frame into the feature matrix X and the target vector y
y = dataset_new['output'].to_numpy()
X = dataset_new.loc[:, feature_cols]

# Preview the first five feature rows
print("\nFeature values:")
X.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state fixes the shuffle so
# the split is the same on every run
split = train_test_split(X, y, test_size=0.30, random_state=5)
X_train, X_test, y_train, y_test = split
print(X_train)

In [None]:
# Print the shape of each split, in the order: train X, train y, test X, test y
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

In [None]:
# Display the training labels produced by train_test_split
y_train

In [None]:
# Wrap the training labels in a single-column DataFrame for tabular display
df_label_train = pd.DataFrame(data=y_train)
df_label_train

## **Normalization - Z-score**

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, and standardise the training data with them.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
print("----After Z-score Normalization on X_train-------")
print(X_train)

# Apply the SAME training statistics to the test split. Re-fitting the scaler
# on X_test would put train and test on different scales and leak test-set
# statistics into preprocessing.
X_test = scaler.transform(X_test)
print("----After Z-score Normalization on X_test-------")
print(X_test)

## **Classification**

### **k-NN Classifier**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

print("Knn results on Test data")

# Build a 5-neighbour classifier and train it (fit returns the estimator)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict labels for the held-out test split
knn_predictions = knn.predict(X_test)

# Confusion matrix of true vs. predicted test labels
cm = confusion_matrix(y_test, knn_predictions)
print(cm)

# Performance measures, printed in the order accuracy, precision, recall, F1
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(label, metric(y_test, knn_predictions))

### **Naive-Bayes Classifier**

In [None]:
# training and prediction through a Naive Bayes classifier
# Gaussian Naive Bayes: train on the training split, evaluate on the test split
from sklearn.naive_bayes import GaussianNB

print("Naive Bayes results on Test data")

# Build and train the classifier (fit returns the estimator)
gnb = GaussianNB().fit(X_train, y_train)

# Predict labels for the held-out test split
gnb_predictions = gnb.predict(X_test)

# Confusion matrix of true vs. predicted test labels
cm = confusion_matrix(y_test, gnb_predictions)
print(cm)

# Performance measures, printed in the order accuracy, precision, recall, F1
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(label, metric(y_test, gnb_predictions))

### **k-NN: How to choose the best K?**

In [None]:
# training and prediction through a KNN classifier
from sklearn.neighbors import KNeighborsClassifier

# Train one k-NN classifier per K in 1..9 and report its test accuracy and
# confusion matrix.
# NOTE(review): K is compared on the test split here, so the best accuracy
# is an optimistic estimate; consider choosing K by cross-validation on the
# training split instead.
for i in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(X_train, y_train)
    # Predict once and reuse the predictions for both metrics
    # (the previous `if i%1 == 0` guard was always true and has been dropped)
    knn_predictions = knn.predict(X_test)
    accuracy = accuracy_score(y_test, knn_predictions)
    print("Accuracy for K="+str(i)+":",accuracy)
    # creating a confusion matrix
    cm = confusion_matrix(y_test, knn_predictions)
    print(cm)

In [None]:
# Test-set misclassification rate for each K from 1 to 9
error = []

for k in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    # Fraction of test samples whose predicted label differs from the truth
    error.append(np.mean(knn.predict(X_test) != y_test))

In [None]:
# Plot the error rate against K using an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    range(1, 10),
    error,
    color='red',
    linestyle='dashed',
    marker='o',
    markerfacecolor='blue',
    markersize=10,
)
ax.set_title('Error Rate K Value')
ax.set_xlabel('K Value')
ax.set_ylabel('Mean Error')