In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score, log_loss

from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import zscore

import pickle

In [2]:
# Load Data
df = pd.read_csv('heart.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# EDA
df.info() # No Null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [5]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [6]:
x = df.drop('target', axis = 1)
y = df['target']

In [7]:
# Check Outliers
z_scores = np.abs(zscore(x))
outliers = x[(z_scores > 3)]
outliers

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
28,65,0,2,140,417,1,0,157,0,0.8,2,1,2
48,53,0,2,128,216,0,0,115,0,0.0,2,0,0
85,67,0,2,115,564,0,0,160,0,1.6,1,0,3
92,52,1,2,138,223,0,1,169,0,0.0,2,4,2
158,58,1,1,125,220,0,1,144,0,0.4,1,4,3
163,38,1,2,138,175,0,1,173,0,0.0,2,4,2
164,38,1,2,138,175,0,1,173,0,0.0,2,4,2
204,62,0,0,160,164,0,0,145,0,6.2,0,3,3
220,63,0,0,150,407,0,0,154,0,4.0,1,3,3
221,55,1,0,140,217,0,1,111,1,5.6,0,0,3


In [8]:
# Handling Outliers using Standardization

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled = pd.DataFrame(x_scaled, columns = x.columns)
x_scaled.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,0.952197,0.681005,1.973123,0.763956,-0.256334,2.394438,-1.005832,0.015443,-0.696631,1.087338,-2.274579,-0.714429,-2.148873
1,-1.915313,0.681005,1.002577,-0.092738,0.072199,-0.417635,0.898962,1.633471,-0.696631,2.122573,-2.274579,-0.714429,-0.512922
2,-1.474158,-1.468418,0.032031,-0.092738,-0.816773,-0.417635,-1.005832,0.977514,-0.696631,0.310912,0.976352,-0.714429,-0.512922
3,0.180175,0.681005,0.032031,-0.663867,-0.198357,-0.417635,0.898962,1.239897,-0.696631,-0.206705,0.976352,-0.714429,-0.512922
4,0.290464,-1.468418,-0.938515,-0.663867,2.08205,-0.417635,0.898962,0.583939,1.435481,-0.379244,0.976352,-0.714429,-0.512922


In [9]:
# Check Multicollinearity
vif_df = pd.DataFrame()
vif_df['Features'] = x_scaled.columns
vif_df['VIF'] = [variance_inflation_factor(x_scaled.values, i) for i in range(x.shape[1])]
print("Variance Inflation Factor:")
vif_df

# No multicollinearity as VIF is not greater than 5 for any of the features

Variance Inflation Factor:


Unnamed: 0,Features,VIF
0,age,1.443474
1,sex,1.161866
2,cp,1.284456
3,trestbps,1.170591
4,chol,1.150174
5,fbs,1.087379
6,restecg,1.060998
7,thalach,1.613726
8,exang,1.402001
9,oldpeak,1.705857


In [10]:
# Split training and testing data
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size = 0.2, random_state = 42, stratify = y)

In [11]:
log_reg_model = LogisticRegression()
log_reg_model.fit(x_train, y_train)

In [12]:
# Model Evaluation
y_test_pred = log_reg_model.predict(x_test)

In [13]:
y_test_pred[:5]

array([0, 0, 0, 1, 1])

In [14]:
y_test[:5]

179    0
197    0
285    0
194    0
188    0
Name: target, dtype: int64

In [15]:
y_train_pred = log_reg_model.predict(x_train)

In [16]:
y_train_pred[:5]

array([1, 0, 0, 0, 1])

In [17]:
y_train[:5]

19     1
247    0
289    0
288    0
60     1
Name: target, dtype: int64

In [20]:
# Evaluation Metrics
# Testing Data Evaluation
print("Confusion Matrix :")
print(confusion_matrix(y_test, y_test_pred))

print("Classification Report:")
print(classification_report(y_test, y_test_pred))

print(f"F1 Score: {f1_score(y_test, y_test_pred)}")

y_test_prob = log_reg_model.predict_log_proba(x_test)[:,1]
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_test_prob)}")
print(f"Log Loss: {log_loss(y_test, y_test_prob)}")

Confusion Matrix :
[[19  9]
 [ 3 30]]
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.68      0.76        28
           1       0.77      0.91      0.83        33

    accuracy                           0.80        61
   macro avg       0.82      0.79      0.80        61
weighted avg       0.81      0.80      0.80        61

F1 Score: 0.8333333333333334
ROC-AUC Score: 0.8690476190476191
Log Loss: 19.49902560394863


In [21]:
#Training Data Evaluation
y_train_prob = log_reg_model.predict_log_proba(x_train)[:,1]
print(f"F1 Score: {f1_score(y_train, y_train_pred)}")
print(f"ROC-AUC Score: {roc_auc_score(y_train, y_train_prob)}")
print(f"Log Loss: {log_loss(y_train, y_train_prob)}")

F1 Score: 0.8623188405797102
ROC-AUC Score: 0.9278236914600552
Log Loss: 19.660174575882078


In [22]:
# Save Model
with open ('log_reg_heart.pkl', 'wb') as f:
    pickle.dump(log_reg_model, f)

# Save Standardization
with open ('std_scaler_heart.pkl', 'wb') as f:
    pickle.dump(scaler, f)