In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_csv('heart_disease_prediction.csv')

# Display basic info
print(df.info())
print("\nMissing values per column:")
print(df.isna().sum())
print("\nDescriptive statistics:")
print(df.describe())
print("\nTarget variable distribution:")
print(df['tenYearCHD'].value_counts(normalize=True))

ModuleNotFoundError: No module named 'imblearn'

In [2]:
!pip install numpy matplotlib seaborn scikit-learn imbalanced-learn


Collecting matplotlib
  Downloading matplotlib-3.10.3-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp310-cp310-win_amd64.whl.metadata (14 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.2-cp310-cp310-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.58.4-cp310-cp310-win_amd64.whl.metadata (108 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.8-cp310-cp310-win_amd64.whl.metadata (6.3 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Collecting pandas>=1.2 (from seaborn)

In [9]:
# Handle missing values
# For numerical columns, fill with median
num_cols = ['educationLevel', 'cigsPerDay', 'totChol', 'BMI', 'glucose']
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)

# For BPMeds (binary), fill with mode
df['BPMeds'].fillna(df['BPMeds'].mode()[0], inplace=True)

# Feature selection using ANOVA F-value
X = df.drop('tenYearCHD', axis=1)
y = df['tenYearCHD']

selector = SelectKBest(f_classif, k=10)
selector.fit(X, y)

# Get selected features
selected_features = X.columns[selector.get_support()]
print("\nSelected features based on ANOVA F-value:")
print(selected_features)

# Update dataframe with selected features
df = df[list(selected_features) + ['tenYearCHD']]

# Handle class imbalance with SMOTE
X = df.drop('tenYearCHD', axis=1)
y = df['tenYearCHD']

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Scale numerical features
scaler = StandardScaler()
X_res_scaled = scaler.fit_transform(X_res)
X_res = pd.DataFrame(X_res_scaled, columns=X_res.columns)


Selected features based on ANOVA F-value:
Index(['gender', 'age', 'BPMeds', 'prevalentHyp', 'diabetes', 'totChol',
       'sysBP', 'diaBP', 'BMI', 'glucose'],
      dtype='object')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [5]:
print("\nRemaining NaNs after fill:")
print(df.isna().sum())



Remaining NaNs after fill:
gender             0
age                0
educationLevel     0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          1
glucose            0
tenYearCHD         0
dtype: int64


In [6]:
# Fill all remaining NaNs
for col in df.columns:
    if df[col].isna().sum() > 0:
        if df[col].dtype in ['float64', 'int64']:
            df[col].fillna(df[col].median(), inplace=True)
        else:
            df[col].fillna(df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [7]:
assert df.isna().sum().sum() == 0, "There are still missing values!"


In [8]:
# Feature selection using ANOVA F-value
X = df.drop('tenYearCHD', axis=1)
y = df['tenYearCHD']
selector = SelectKBest(f_classif, k=10)
selector.fit(X, y)


In [10]:
# First split: separate 10% for final validation
X_train_val, X_test, y_train_val, y_test = train_test_split(X_res, y_res, test_size=0.1, random_state=42)

# Second split: 80-20 split of remaining 90%
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Train logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Display model coefficients
coefficients = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model.coef_[0],
    'Odds Ratio': np.exp(model.coef_[0])
})
print("\nModel coefficients:")
print(coefficients.sort_values('Coefficient', ascending=False))


Model coefficients:
        Feature  Coefficient  Odds Ratio
1           age     0.535381    1.708099
6         sysBP     0.466968    1.595150
9       glucose     0.318672    1.375300
7         diaBP     0.152500    1.164742
2        BPMeds     0.097831    1.102776
5       totChol     0.029968    1.030422
8           BMI    -0.018133    0.982030
0        gender    -0.085000    0.918512
4      diabetes    -0.138577    0.870596
3  prevalentHyp    -0.352902    0.702646


In [11]:
# Predict on validation set
y_pred = model.predict(X_val)

# Evaluate performance
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

# Calculate metrics
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

print(f"\nAccuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Confusion Matrix:
[[418 244]
 [189 443]]

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.63      0.66       662
           1       0.64      0.70      0.67       632

    accuracy                           0.67      1294
   macro avg       0.67      0.67      0.67      1294
weighted avg       0.67      0.67      0.67      1294


Accuracy: 0.67
Precision: 0.64
Recall: 0.70
F1 Score: 0.67


In [12]:
# Predict on the held-out 10% test set
final_pred = model.predict(X_test)

# Evaluate final performance
print("\nFinal Test Set Performance:")
print(confusion_matrix(y_test, final_pred))
print(classification_report(y_test, final_pred))

# Create a dataframe with actual and predicted values
results = pd.DataFrame({
    'Actual': y_test,
    'Predicted': final_pred,
    'Probability': model.predict_proba(X_test)[:, 1]
})

# Display some sample predictions
print("\nSample predictions:")
print(results.sample(10))


Final Test Set Performance:
[[234 135]
 [123 227]]
              precision    recall  f1-score   support

           0       0.66      0.63      0.64       369
           1       0.63      0.65      0.64       350

    accuracy                           0.64       719
   macro avg       0.64      0.64      0.64       719
weighted avg       0.64      0.64      0.64       719


Sample predictions:
      Actual  Predicted  Probability
2133       0          1     0.783669
6487       1          0     0.289491
5018       1          1     0.649788
2955       0          0     0.366740
1544       0          0     0.326829
3671       1          1     0.703136
6486       1          0     0.433003
2718       1          1     0.658910
5152       1          1     0.519506
5954       1          1     0.823034


In [13]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_csv('heart_disease_prediction.csv')

# Data preprocessing
# Handle missing values
num_cols = ['educationLevel', 'cigsPerDay', 'totChol', 'BMI', 'glucose']
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)
df['BPMeds'].fillna(df['BPMeds'].mode()[0], inplace=True)

# Feature selection
X = df.drop('tenYearCHD', axis=1)
y = df['tenYearCHD']
selector = SelectKBest(f_classif, k=10)
selector.fit(X, y)
selected_features = X.columns[selector.get_support()]
df = df[list(selected_features) + ['tenYearCHD']]

# Handle class imbalance
X = df.drop('tenYearCHD', axis=1)
y = df['tenYearCHD']
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Scale features
scaler = StandardScaler()
X_res_scaled = scaler.fit_transform(X_res)
X_res = pd.DataFrame(X_res_scaled, columns=X_res.columns)

# Split data
X_train_val, X_test, y_train_val, y_test = train_test_split(X_res, y_res, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Evaluate on validation set
y_pred = model.predict(X_val)
print("Validation Set Performance:")
print(confusion_matrix(y_val, y_pred))
print(classification_report(y_val, y_pred))

# Final evaluation on test set
final_pred = model.predict(X_test)
print("\nTest Set Performance:")
print(confusion_matrix(y_test, final_pred))
print(classification_report(y_test, final_pred))

# Show coefficients
coefficients = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model.coef_[0],
    'Odds Ratio': np.exp(model.coef_[0])
})
print("\nModel Coefficients:")
print(coefficients.sort_values('Coefficient', ascending=False))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

ValueError: Input X contains NaN.
SelectKBest does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [14]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Load the dataset
df = pd.read_csv('heart_disease_prediction.csv')

# Separate features and target
X = df.drop('tenYearCHD', axis=1)
y = df['tenYearCHD']

# Create a preprocessing pipeline that handles missing values first
preprocessor = ImbPipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Handle missing values
    ('selector', SelectKBest(f_classif, k=10)),     # Feature selection
    ('scaler', StandardScaler())                    # Feature scaling
])

# Apply preprocessing
X_processed = preprocessor.fit_transform(X, y)

# Get selected feature names
selected_features = X.columns[preprocessor.named_steps['selector'].get_support()]
print("Selected features:", selected_features)

# Create new dataframe with processed data
processed_df = pd.DataFrame(X_processed, columns=selected_features)
processed_df['tenYearCHD'] = y.values

# Handle class imbalance with SMOTE
X_res, y_res = SMOTE(random_state=42).fit_resample(processed_df.drop('tenYearCHD', axis=1), 
                                                 processed_df['tenYearCHD'])

# Split data
X_train_val, X_test, y_train_val, y_test = train_test_split(X_res, y_res, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Evaluate on validation set
y_pred = model.predict(X_val)
print("\nValidation Set Performance:")
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

# Final evaluation on test set
final_pred = model.predict(X_test)
print("\nTest Set Performance:")
print("Confusion Matrix:")
print(confusion_matrix(y_test, final_pred))
print("\nClassification Report:")
print(classification_report(y_test, final_pred))

# Show coefficients
coefficients = pd.DataFrame({
    'Feature': selected_features,
    'Coefficient': model.coef_[0],
    'Odds Ratio': np.exp(model.coef_[0])
})
print("\nModel Coefficients:")
print(coefficients.sort_values('Coefficient', ascending=False))

Selected features: Index(['gender', 'age', 'BPMeds', 'prevalentHyp', 'diabetes', 'totChol',
       'sysBP', 'diaBP', 'BMI', 'glucose'],
      dtype='object')

Validation Set Performance:
Confusion Matrix:
[[423 239]
 [203 429]]

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.64      0.66       662
           1       0.64      0.68      0.66       632

    accuracy                           0.66      1294
   macro avg       0.66      0.66      0.66      1294
weighted avg       0.66      0.66      0.66      1294


Test Set Performance:
Confusion Matrix:
[[235 134]
 [110 240]]

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.64      0.66       369
           1       0.64      0.69      0.66       350

    accuracy                           0.66       719
   macro avg       0.66      0.66      0.66       719
weighted avg       0.66      0.66      0.66       719


Model