In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn import metrics

In [3]:
# Load the heart-attack dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('Heart Attack.csv')

In [4]:
# Preview the first five rows to check that the columns parsed as expected.
data.head()

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin,class
0,64,1,66,160,83,160.0,1.8,0.012,negative
1,21,1,94,98,46,296.0,6.75,1.06,positive
2,55,1,64,160,77,270.0,1.99,0.003,negative
3,64,1,70,120,55,270.0,13.87,0.122,positive
4,55,1,64,112,65,300.0,1.08,0.003,negative


In [5]:
# Per-column count of missing entries; all zeros means no imputation is needed.
missing_value = data.isna().sum()
missing_value

age              0
gender           0
impluse          0
pressurehight    0
pressurelow      0
glucose          0
kcm              0
troponin         0
class            0
dtype: int64

In [6]:
# Column dtypes and non-null counts; 'class' is the only non-numeric column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            1319 non-null   int64  
 1   gender         1319 non-null   int64  
 2   impluse        1319 non-null   int64  
 3   pressurehight  1319 non-null   int64  
 4   pressurelow    1319 non-null   int64  
 5   glucose        1319 non-null   float64
 6   kcm            1319 non-null   float64
 7   troponin       1319 non-null   float64
 8   class          1319 non-null   object 
dtypes: float64(3), int64(5), object(1)
memory usage: 92.9+ KB


In [7]:
# Summary statistics for the numeric columns.
# NOTE(review): the maximum of `impluse` (1111) is far above its 75th percentile (85) —
# possibly a data-entry outlier; confirm before relying on this column.
data.describe()

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin
count,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0
mean,56.191812,0.659591,78.336619,127.170584,72.269143,146.634344,15.274306,0.360942
std,13.647315,0.474027,51.63027,26.12272,14.033924,74.923045,46.327083,1.154568
min,14.0,0.0,20.0,42.0,38.0,35.0,0.321,0.001
25%,47.0,0.0,64.0,110.0,62.0,98.0,1.655,0.006
50%,58.0,1.0,74.0,124.0,72.0,116.0,2.85,0.014
75%,65.0,1.0,85.0,143.0,81.0,169.5,5.805,0.0855
max,103.0,1.0,1111.0,223.0,154.0,541.0,300.0,10.3


In [8]:
# Display the frame before encoding (pandas renders a truncated head/tail view).
data

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin,class
0,64,1,66,160,83,160.0,1.80,0.012,negative
1,21,1,94,98,46,296.0,6.75,1.060,positive
2,55,1,64,160,77,270.0,1.99,0.003,negative
3,64,1,70,120,55,270.0,13.87,0.122,positive
4,55,1,64,112,65,300.0,1.08,0.003,negative
...,...,...,...,...,...,...,...,...,...
1314,44,1,94,122,67,204.0,1.63,0.006,negative
1315,66,1,84,125,55,149.0,1.33,0.172,positive
1316,45,1,85,168,104,96.0,1.24,4.250,positive
1317,54,1,58,117,68,443.0,5.80,0.359,positive


In [9]:
# Encode the string target as integers. LabelEncoder orders classes by sorted
# value, so 'negative' maps to 0 and 'positive' maps to 1.
label_encoder = LabelEncoder().fit(data['class'])
data['class'] = label_encoder.transform(data['class'])

In [10]:
# Re-display the frame to confirm the target column now holds 0/1 instead of strings.
data

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin,class
0,64,1,66,160,83,160.0,1.80,0.012,0
1,21,1,94,98,46,296.0,6.75,1.060,1
2,55,1,64,160,77,270.0,1.99,0.003,0
3,64,1,70,120,55,270.0,13.87,0.122,1
4,55,1,64,112,65,300.0,1.08,0.003,0
...,...,...,...,...,...,...,...,...,...
1314,44,1,94,122,67,204.0,1.63,0.006,0
1315,66,1,84,125,55,149.0,1.33,0.172,1
1316,45,1,85,168,104,96.0,1.24,4.250,1
1317,54,1,58,117,68,443.0,5.80,0.359,1


In [11]:
# Class balance after encoding: the positive class (1) is the majority.
data['class'].value_counts()

class
1    810
0    509
Name: count, dtype: int64

In [12]:
# Separate the encoded target from the predictor columns.
Y = data['class']
X = data.drop(columns='class')

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Base estimator for the grid search. It is seeded so the search is reproducible:
# the grid includes splitter='random' and max_features values below the feature
# count, both of which draw random numbers during tree construction. The seed
# matches the random_state=42 used when the tuned tree is refit.
model = DecisionTreeClassifier(random_state=42)

In [15]:
# Hyper-parameter search space for the decision tree.
# max_features stops at 8 because X has eight predictor columns (the nine
# columns of `data` minus the target). A value of 9 is out of range: depending
# on the scikit-learn version it is either rejected or behaves exactly like 8,
# so it only adds failed or duplicate candidates to the search.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [12, 14, 15],
    'min_samples_split': [5, 7, 8],
    'min_samples_leaf': [1, 2, 5],
    'splitter': ['random', 'best'],
    'max_features': [5, 3, 8],
}

In [16]:
# Exhaustive 5-fold cross-validated search over param_grid, ranked by accuracy
# and run with n_jobs=-1 (all available cores).
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [17]:
# Parameter combination with the best mean cross-validated accuracy.
best_param = grid_search.best_params_

In [18]:
# Refit a seeded decision tree on the full training split using the winning
# hyper-parameters from the grid search.
best_clf = DecisionTreeClassifier(**best_param, random_state=42)
best_clf.fit(X_train, y_train)

In [19]:
# Predict labels for the held-out rows with the tuned decision tree.
y_pred = best_clf.predict(X_test)

In [20]:
# Score the tuned decision tree on the held-out split. Precision, recall and F1
# use support-weighted averaging across the two classes.
accuracy, precision, recall, f1_score = (
    metrics.accuracy_score(y_test, y_pred),
    metrics.precision_score(y_test, y_pred, average='weighted'),
    metrics.recall_score(y_test, y_pred, average='weighted'),
    metrics.f1_score(y_test, y_pred, average='weighted'),
)

print(
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1_score:.2f}",
    sep="\n",
)

Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1 Score: 0.98


In [21]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

Random Forest

In [65]:
# Random forest: 100 trees limited to depth 5, seeded for repeatability.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)
rf_clf.fit(X_train, y_train)

In [66]:
# Predict labels for the held-out rows with the random forest.
rf_y_pred = rf_clf.predict(X_test)

In [67]:
# Held-out scores for the random forest; the averaged metrics are support-weighted.
rf_accuracy = metrics.accuracy_score(y_test, rf_y_pred)
rf_precision, rf_recall, rf_f1_score = (
    score_fn(y_test, rf_y_pred, average='weighted')
    for score_fn in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
)

# Report each metric on its own line
print("\n".join([
    f"Accuracy: {rf_accuracy:.2f}",
    f"Precision: {rf_precision:.2f}",
    f"Recall: {rf_recall:.2f}",
    f"F1 Score: {rf_f1_score:.2f}",
]))

Accuracy: 0.97
Precision: 0.97
Recall: 0.97
F1 Score: 0.97


AdaBoost

In [68]:
# AdaBoost with the default base learner and 50 boosting rounds, seeded for
# repeatability.
# The explicit algorithm='SAMME.R' argument is intentionally omitted: it was the
# default wherever it existed, it emits a FutureWarning from scikit-learn 1.4,
# and it is rejected from 1.6 onwards. Leaving it out keeps behaviour unchanged
# on older releases and lets the cell run on newer ones.
# NOTE(review): learning_rate=2 is unusually large for boosting — confirm it was
# chosen deliberately rather than left over from experimentation.
ab_cl = AdaBoostClassifier(n_estimators=50, learning_rate=2, random_state=42)
ab_cl.fit(X_train, y_train)



In [69]:
# Predict labels for the held-out rows with the AdaBoost ensemble.
ab_y_pred = ab_cl.predict(X_test)

In [70]:
# Held-out scores for AdaBoost; the averaged metrics are support-weighted.
ab_accuracy = metrics.accuracy_score(y_test, ab_y_pred)
ab_precision, ab_recall, ab_f1_score = (
    metrics.precision_score(y_test, ab_y_pred, average='weighted'),
    metrics.recall_score(y_test, ab_y_pred, average='weighted'),
    metrics.f1_score(y_test, ab_y_pred, average='weighted'),
)

# Emit the four-line report in a single call
print(
    f"Accuracy: {ab_accuracy:.2f}\n"
    f"Precision: {ab_precision:.2f}\n"
    f"Recall: {ab_recall:.2f}\n"
    f"F1 Score: {ab_f1_score:.2f}"
)

Accuracy: 0.95
Precision: 0.95
Recall: 0.95
F1 Score: 0.95


Gradient Boosting

In [71]:
# Gradient boosting: 100 stages at learning rate 0.1, with splits scored by
# squared error and a fixed seed.
gb_cl = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    criterion='squared_error',
    random_state=42,
)
gb_cl.fit(X_train, y_train)

In [72]:
# Predict labels for the held-out rows with the gradient-boosting model.
gb_y_pred = gb_cl.predict(X_test)

In [73]:
# Held-out scores for gradient boosting; the averaged metrics are support-weighted.
gb_accuracy, gb_precision, gb_recall, gb_f1_score = (
    metrics.accuracy_score(y_test, gb_y_pred),
    metrics.precision_score(y_test, gb_y_pred, average='weighted'),
    metrics.recall_score(y_test, gb_y_pred, average='weighted'),
    metrics.f1_score(y_test, gb_y_pred, average='weighted'),
)

# One metric per output line
print(
    *[
        f"Accuracy: {gb_accuracy:.2f}",
        f"Precision: {gb_precision:.2f}",
        f"Recall: {gb_recall:.2f}",
        f"F1 Score: {gb_f1_score:.2f}",
    ],
    sep="\n",
)

Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1 Score: 0.98


In [74]:
# (train accuracy, test accuracy) for the tuned decision tree — a quick overfitting check.
tuple(
    best_clf.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(0.995260663507109, 0.9810606060606061)

In [75]:
# (train accuracy, test accuracy) for the random forest — a quick overfitting check.
tuple(map(rf_clf.score, (X_train, X_test), (y_train, y_test)))

(0.9924170616113744, 0.9734848484848485)

In [76]:
# Side-by-side held-out accuracy for the four models.
print(
    f"Decision Tree Accuracy: {accuracy:.2f}",
    f"Random Forest Accuracy: {rf_accuracy:.2f}",
    f"Adaboost Accuracy: {ab_accuracy:.2f}",
    f"Gradient Boosting Accuracy: {gb_accuracy:.2f}",
    sep="\n",
)

Decision Tree Accuracy: 0.98
Random Forest Accuracy: 0.97
Adaboost Accuracy: 0.95
Gradient Boosting Accuracy: 0.98
