In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

In [22]:
data = pd.read_csv("../dataset/Heart Attack.csv")
data.head()

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin,class
0,64,1,66,160,83,160.0,1.8,0.012,negative
1,21,1,94,98,46,296.0,6.75,1.06,positive
2,55,1,64,160,77,270.0,1.99,0.003,negative
3,64,1,70,120,55,270.0,13.87,0.122,positive
4,55,1,64,112,65,300.0,1.08,0.003,negative


In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            1319 non-null   int64  
 1   gender         1319 non-null   int64  
 2   impluse        1319 non-null   int64  
 3   pressurehight  1319 non-null   int64  
 4   pressurelow    1319 non-null   int64  
 5   glucose        1319 non-null   float64
 6   kcm            1319 non-null   float64
 7   troponin       1319 non-null   float64
 8   class          1319 non-null   object 
dtypes: float64(3), int64(5), object(1)
memory usage: 92.9+ KB


In [25]:

# Step 2: Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'class' column
data['class'] = label_encoder.fit_transform(data['class'])

# Step 3: Display the DataFrame
print(data.head())

   age  gender  impluse  pressurehight  pressurelow  glucose    kcm  troponin  \
0   64       1       66            160           83    160.0   1.80     0.012   
1   21       1       94             98           46    296.0   6.75     1.060   
2   55       1       64            160           77    270.0   1.99     0.003   
3   64       1       70            120           55    270.0  13.87     0.122   
4   55       1       64            112           65    300.0   1.08     0.003   

   class  
0      0  
1      1  
2      0  
3      1  
4      0  


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            1319 non-null   int64  
 1   gender         1319 non-null   int64  
 2   impluse        1319 non-null   int64  
 3   pressurehight  1319 non-null   int64  
 4   pressurelow    1319 non-null   int64  
 5   glucose        1319 non-null   float64
 6   kcm            1319 non-null   float64
 7   troponin       1319 non-null   float64
 8   class          1319 non-null   int64  
dtypes: float64(3), int64(6)
memory usage: 92.9 KB


In [27]:
data.head()

Unnamed: 0,age,gender,impluse,pressurehight,pressurelow,glucose,kcm,troponin,class
0,64,1,66,160,83,160.0,1.8,0.012,0
1,21,1,94,98,46,296.0,6.75,1.06,1
2,55,1,64,160,77,270.0,1.99,0.003,0
3,64,1,70,120,55,270.0,13.87,0.122,1
4,55,1,64,112,65,300.0,1.08,0.003,0


In [11]:
data.isnull().sum()

age              0
gender           0
impluse          0
pressurehight    0
pressurelow      0
glucose          0
kcm              0
troponin         0
class            0
dtype: int64

In [28]:
data.dropna(inplace=True)


In [29]:
X = data.drop('class', axis=1)
y = data['class']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [31]:
clf = LogisticRegression(random_state=42)


In [32]:
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')


In [33]:
clf.fit(X_train, y_train)


In [34]:
y_pred = clf.predict(X_test)


In [35]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [36]:
print("K-Fold Cross-Validation Results:")
for i, score in enumerate(cv_scores, 1):
    print(f"Fold {i}: {score:.2f}")

K-Fold Cross-Validation Results:
Fold 1: 0.77
Fold 2: 0.83
Fold 3: 0.80
Fold 4: 0.78
Fold 5: 0.85


In [38]:
# param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],         # Inverse of regularization strength
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Type of regularization
    'solver': ['liblinear', 'saga']               # Algorithms for optimization
}
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [40]:
best_params = grid_search.best_params_
best_params


{'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}

In [41]:
best_clf = LogisticRegression(random_state=42, **best_params)
best_clf.fit(X_train, y_train)

In [43]:
y_pred_tuned = best_clf.predict(X_test)


In [44]:
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)
print("\nGrid Search Results:")
print(f"Best Hyperparameters: {best_params}")
print(f"Test Set Accuracy with Tuned Model: {accuracy_tuned:.2f}")


Grid Search Results:
Best Hyperparameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Test Set Accuracy with Tuned Model: 0.91


In [45]:
# Perform K-Fold Cross-Validation and store the accuracy scores
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')

# Calculate the accuracy of the tuned model
accuracy_tuned = accuracy_score(y_test, y_pred_tuned)

# Print and compare the results side by side
print("K-Fold Cross-Validation Results:")
print(f"Mean Accuracy: {np.mean(cv_scores):.2f}")
print(f"Test Set Accuracy: {accuracy:.2f}")

print("\nGrid Search Results:")
print(f"Best Hyperparameters: {best_params}")
print(f"Test Set Accuracy with Tuned Model: {accuracy_tuned:.2f}")

K-Fold Cross-Validation Results:
Mean Accuracy: 0.80
Test Set Accuracy: 0.80

Grid Search Results:
Best Hyperparameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Test Set Accuracy with Tuned Model: 0.91
