In [11]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, confusion_matrix, classification_report
from scipy.stats import ttest_ind, chi2_contingency, zscore
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

data = pd.read_csv('heart_disease_prediction.csv')
data


Unnamed: 0,gender,age,educationLevel,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,tenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.10,85.0,85.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4233,1,50,1.0,1,1.0,0.0,0,1,0,313.0,179.0,92.0,25.97,66.0,86.0,1
4234,1,51,3.0,1,43.0,0.0,0,0,0,207.0,126.5,80.0,19.71,65.0,68.0,0
4235,0,48,2.0,1,20.0,,0,0,0,248.0,131.0,72.0,22.00,84.0,86.0,0
4236,0,44,1.0,1,15.0,0.0,0,0,0,210.0,126.5,87.0,19.16,86.0,,0


In [12]:
#check for missing values
print(data.isnull().sum())

gender               0
age                  0
educationLevel     105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
tenYearCHD           0
dtype: int64


In [13]:
#fill missing values
data.fillna(data.median(), inplace=True)

#check for missing values again
print(data.isnull().sum())

gender             0
age                0
educationLevel     0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
tenYearCHD         0
dtype: int64


In [14]:
desc_stats = data.describe()
desc_stats

Unnamed: 0,gender,age,educationLevel,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,tenYearCHD
count,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0,4238.0
mean,0.429212,49.584946,1.979471,0.494101,8.941482,0.029259,0.005899,0.310524,0.02572,236.689476,132.352407,82.893464,25.800205,75.878716,81.603587,0.151958
std,0.495022,8.57216,1.007081,0.500024,11.902399,0.168552,0.076587,0.462763,0.158316,44.327427,22.038097,11.91085,4.071041,12.025185,22.865246,0.359023
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.08,68.0,72.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,262.0,144.0,89.875,28.0375,83.0,85.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


In [15]:
import statsmodels.api as sm

X = data.drop(columns=['tenYearCHD'])
y = data['tenYearCHD']

X1 = sm.add_constant(X)
model = sm.OLS(y, X1)
model = model.fit()
model.tvalues

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:             tenYearCHD   R-squared:                       0.097
Model:                            OLS   Adj. R-squared:                  0.094
Method:                 Least Squares   F-statistic:                     30.31
Date:                Wed, 25 Jun 2025   Prob (F-statistic):           9.97e-83
Time:                        00:15:43   Log-Likelihood:                -1454.9
No. Observations:                4238   AIC:                             2942.
Df Residuals:                    4222   BIC:                             3044.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              -0.5592      0.074     

In [16]:
# initialize the new data set
X = data[['gender', 'age', 'cigsPerDay', 'prevalentStroke', 'sysBP', 'glucose']] # with outliers
y = data['tenYearCHD']

In [17]:
# Split data into 90% and 10%(unseen data)
X_main, X_unseen, y_main, y_unseen = train_test_split(X, y, test_size=0.1, random_state=10)

#Split MAIN data into 80%(training) and 20%(testing)
X_train, X_test, y_train, y_test = train_test_split(X_main, y_main, test_size=0.2, random_state=10)

# Instantiate and fit logistic regression model
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

# Print coefficients and intercept
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)

Intercept: [-8.67907324]
Coefficients: [[0.58372393 0.0634818  0.02119323 0.77535933 0.0182996  0.00853343]]


In [18]:
#Make predictions on test data
y_pred = model.predict(X_test)

#Compute Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

#Compute Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

#Compute R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared:", r_squared)


Mean Squared Error: 0.13892529488859764
Mean Absolute Error: 0.13892529488859764
R-squared: -0.10113002042205577


In [19]:
#check the performance values
model = LogisticRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)

model.score(X_test, y_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[644   6]
 [100  13]]
              precision    recall  f1-score   support

           0       0.87      0.99      0.92       650
           1       0.68      0.12      0.20       113

    accuracy                           0.86       763
   macro avg       0.77      0.55      0.56       763
weighted avg       0.84      0.86      0.82       763



In [20]:
#Performance Metrics using Confusion Matrix on the 10% unseen data
y_predict_unseen = model.predict(X_unseen)

model.score(X_unseen, y_unseen)

print(confusion_matrix(y_unseen, y_predict_unseen))
print(classification_report(y_unseen, y_predict_unseen))

[[353   6]
 [ 61   4]]
              precision    recall  f1-score   support

           0       0.85      0.98      0.91       359
           1       0.40      0.06      0.11        65

    accuracy                           0.84       424
   macro avg       0.63      0.52      0.51       424
weighted avg       0.78      0.84      0.79       424

