In [26]:
# Basic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# For model building and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import statsmodels.api as sm
from statsmodels.tools import add_constant

In [45]:
df = pd.read_csv("heart_disease_prediction.csv")

# Display the first few rows
df.head()

Unnamed: 0,gender,age,educationLevel,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,tenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [28]:
# Check column info and missing values
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   gender           4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   educationLevel   4133 non-null   float64
 3   currentSmoker    4238 non-null   int64  
 4   cigsPerDay       4209 non-null   float64
 5   BPMeds           4185 non-null   float64
 6   prevalentStroke  4238 non-null   int64  
 7   prevalentHyp     4238 non-null   int64  
 8   diabetes         4238 non-null   int64  
 9   totChol          4188 non-null   float64
 10  sysBP            4238 non-null   float64
 11  diaBP            4238 non-null   float64
 12  BMI              4219 non-null   float64
 13  heartRate        4237 non-null   float64
 14  glucose          3850 non-null   float64
 15  tenYearCHD       4238 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 529.9 KB


gender               0
age                  0
educationLevel     105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
tenYearCHD           0
dtype: int64

In [29]:
# Drop rows with missing values
df_clean = df.dropna()

# Re-check
df_clean.isnull().sum()

gender             0
age                0
educationLevel     0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
tenYearCHD         0
dtype: int64

In [30]:
# Identify numeric columns only
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()

#  to detect outliers using IQR
def detect_outliers_iqr(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return data[(data[column] < lower) | (data[column] > upper)]

# Detect and count outliers for each column
for col in numeric_cols:
    outliers = detect_outliers_iqr(df_clean, col)
    print(f"{col}: {len(outliers)} outliers")

gender: 0 outliers
age: 0 outliers
educationLevel: 0 outliers
currentSmoker: 0 outliers
cigsPerDay: 10 outliers
BPMeds: 111 outliers
prevalentStroke: 21 outliers
prevalentHyp: 0 outliers
diabetes: 99 outliers
totChol: 46 outliers
sysBP: 108 outliers
diaBP: 69 outliers
BMI: 85 outliers
heartRate: 80 outliers
glucose: 175 outliers
tenYearCHD: 557 outliers


In [31]:
# Remove outliers
for col in numeric_cols:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]

# Confirm that all outliers are removed
print("Remaining outliers after filtering (should be 0):")
for col in numeric_cols:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outliers = df_clean[(df_clean[col] < lower) | (df_clean[col] > upper)]
    print(f"{col}: {len(outliers)} outliers")

Remaining outliers after filtering (should be 0):
gender: 0 outliers
age: 0 outliers
educationLevel: 0 outliers
currentSmoker: 0 outliers
cigsPerDay: 0 outliers
BPMeds: 0 outliers
prevalentStroke: 0 outliers
prevalentHyp: 578 outliers
diabetes: 0 outliers
totChol: 5 outliers
sysBP: 31 outliers
diaBP: 13 outliers
BMI: 16 outliers
heartRate: 63 outliers
glucose: 0 outliers
tenYearCHD: 0 outliers


In [48]:
# Prepare the data
X = df_clean.drop('tenYearCHD', axis=1)
y = df_clean['tenYearCHD']

# Add constant for statsmodels
X_const = sm.add_constant(X)

# Fit logistic regression model to get p-values
logit_model = sm.Logit(y, X_const)
result = logit_model.fit()

# Get p-values and select significant features (p < 0.05)
p_values = result.pvalues[1:]  # Exclude the constant term
significant_features = p_values[p_values < 0.05].index.tolist()

print("Significant features (p < 0.05):")
print(significant_features)


         Current function value: 0.000000
         Iterations: 35




LinAlgError: Singular matrix

In [49]:
# Use only the significant features
X_selected = df_clean[significant_features]

# Split data (maintain 90% for training/testing, 10% unseen)
X_main, X_unseen, y_main, y_unseen = train_test_split(
    X_selected, y, test_size=0.1, random_state=42, stratify=y
)

# Split main data into train and test (80-20)
X_train, X_test, y_train, y_test = train_test_split(
    X_main, y_main, test_size=0.2, random_state=42, stratify=y_main
)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to training data only
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

print("\nClass distribution before SMOTE:")
print(y_train.value_counts())
print("Class distribution after SMOTE:")
print(pd.Series(y_train_balanced).value_counts())

ValueError: at least one array or dtype is required