In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('HeartAttack.csv')
df.head()

Unnamed: 0,Age,Gender,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Result
0,63,1,66,160,83,160.0,1.8,0.012,negative
1,20,1,94,98,46,296.0,6.75,1.06,positive
2,56,1,64,160,77,270.0,1.99,0.003,negative
3,66,1,70,120,55,270.0,13.87,0.122,positive
4,54,1,64,112,65,300.0,1.08,0.003,negative


In [3]:
df.shape

(1319, 9)

In [4]:
df.isnull().sum()

Age                         0
Gender                      0
Heart rate                  0
Systolic blood pressure     0
Diastolic blood pressure    0
Blood sugar                 0
CK-MB                       0
Troponin                    0
Result                      0
dtype: int64

In [5]:
df.duplicated().sum()

0

In [6]:
le = LabelEncoder()
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = le.fit_transform(df[column])

In [7]:
columns_to_scale = ['Age', 'Heart rate', 'Systolic blood pressure', 'Diastolic blood pressure', 'Blood sugar', 'CK-MB']

scaler = StandardScaler()
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

df.head()

Unnamed: 0,Age,Gender,Heart rate,Systolic blood pressure,Diastolic blood pressure,Blood sugar,CK-MB,Troponin,Result
0,0.499279,1,-0.239032,1.257215,0.764927,0.178459,-0.290962,0.012,0
1,-2.654832,1,0.303491,-1.117098,-1.872542,1.994344,-0.184072,1.06,1
2,-0.014181,1,-0.277784,1.257215,0.337229,1.647189,-0.286859,0.003,0
3,0.719333,1,-0.161529,-0.2746,-1.230995,1.647189,-0.030324,0.122,1
4,-0.160884,1,-0.277784,-0.580963,-0.518166,2.047752,-0.306509,0.003,0


In [8]:
X = df.drop(columns=['Result']) 
y = df['Result'] 

In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=0)

In [10]:
model = GaussianNB()

In [11]:
def perform_kfold_cv(model, X, Y, k):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, Y, cv=kf)
    mean_score = np.mean(scores)
    print(f"Scores for k={k}: {scores}")
    print(f"Mean score for k={k}: {mean_score}")
    return mean_score

# Function to find the best k and fit the model
def find_best_k_and_fit(model, X, Y, k_values):
    mean_scores = {}
    for k in k_values:
        mean_scores[k] = perform_kfold_cv(model, X, Y, k)
    
    best_k = max(mean_scores, key=mean_scores.get)
    print(f"Best k is {best_k} with a mean score of {mean_scores[best_k]}")
    
    # Fit the model using all the training data
    model.fit(X, Y)
    print("Model fitted with the entire dataset.")
    return model


In [12]:
k_values=[5,7,10]
print("Gaussian Naive Bayes Model:")
best_nb_model = find_best_k_and_fit(model, x_train, y_train, k_values)

Gaussian Naive Bayes Model:
Scores for k=5: [0.94339623 0.75471698 0.75471698 0.73076923 0.69230769]
Mean score for k=5: 0.7751814223512337
Scores for k=7: [0.78947368 0.94736842 0.86842105 0.73684211 0.7027027  0.75675676
 0.7027027 ]
Mean score for k=7: 0.7863239179028654
Scores for k=10: [0.74074074 0.96296296 0.66666667 0.84615385 0.73076923 0.76923077
 0.65384615 0.84615385 0.69230769 0.69230769]
Mean score for k=10: 0.7601139601139602
Best k is 7 with a mean score of 0.7863239179028654
Model fitted with the entire dataset.


In [13]:
y_pred = model.predict(x_test) 
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print(f"Accuracy:{accuracy}")
print(f"Precision:{precision}")
print(f"Recall:{recall}")
print(f"F1-Score:{f1}")


Accuracy:0.7660984848484849
Precision:0.8135019635019636
Recall:0.8041130270130976
F1-Score:0.7658698421161765


In [14]:
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

array([[421,   2],
       [245, 388]], dtype=int64)

In [15]:
dt_model = DecisionTreeClassifier(random_state=42)
print("Decision Tree Classifier:")
best_dt_model = find_best_k_and_fit(dt_model, x_train, y_train, k_values)

Decision Tree Classifier:
Scores for k=5: [0.98113208 1.         0.98113208 0.92307692 0.94230769]
Mean score for k=5: 0.9655297532656023
Scores for k=7: [0.97368421 0.97368421 1.         1.         0.89189189 0.97297297
 0.94594595]
Mean score for k=7: 0.9654541759804918
Scores for k=10: [0.96296296 0.96296296 1.         1.         1.         0.96153846
 0.84615385 0.92307692 1.         0.92307692]
Mean score for k=10: 0.9579772079772081
Best k is 5 with a mean score of 0.9655297532656023
Model fitted with the entire dataset.


In [16]:
y_pred = dt_model.predict(x_test) 
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print(f"Accuracy:{accuracy}")
print(f"Precision:{precision}")
print(f"Recall:{recall}")
print(f"F1-Score:{f1}")
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

Accuracy:0.9763257575757576
Precision:0.9784404502541757
Recall:0.9724098909840566
F1-Score:0.9751998805085152


array([[403,  20],
       [  5, 628]], dtype=int64)

In [17]:
rf_model = RandomForestClassifier(random_state=42)
print("Random Forest Classifier:")
best_rf_model = find_best_k_and_fit(rf_model, x_train, y_train, k_values)

Random Forest Classifier:
Scores for k=5: [0.96226415 0.98113208 0.96226415 0.94230769 0.98076923]
Mean score for k=5: 0.9657474600870828
Scores for k=7: [0.97368421 0.97368421 0.97368421 1.         0.94594595 0.97297297
 1.        ]
Mean score for k=7: 0.9771387929282666
Scores for k=10: [0.96296296 0.96296296 1.         0.96153846 1.         1.
 0.92307692 0.96153846 1.         0.96153846]
Mean score for k=10: 0.9733618233618234
Best k is 7 with a mean score of 0.9771387929282666
Model fitted with the entire dataset.


In [18]:
y_pred = rf_model.predict(x_test) 
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
print(f"Accuracy:{accuracy}")
print(f"Precision:{precision}")
print(f"Recall:{recall}")
print(f"F1-Score:{f1}")
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

Accuracy:0.9753787878787878
Precision:0.9772090861338584
Recall:0.9716200015685748
F1-Score:0.9742187500000001


array([[403,  20],
       [  6, 627]], dtype=int64)