In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [49]:
# Load the dataset
df = pd.read_csv("heart.csv")

In [50]:
# Basic dataset inspection
print("Dataset Shape:", df.shape)
print("Missing Values:")
print(df.isnull().sum())

Dataset Shape: (1025, 14)
Missing Values:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [51]:
# Splitting features and target
target = 'target'  # Adjust based on dataset
y = df[target]
X = df.drop(columns=[target])

In [52]:
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2


In [91]:
X["ca"].value_counts()

ca
0    578
1    226
2    134
3     69
4     18
Name: count, dtype: int64

In [53]:
X.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
dtype: object

In [54]:
y.value_counts()


target
1    526
0    499
Name: count, dtype: int64

In [55]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [56]:
model = LogisticRegression(max_iter=1000)

In [76]:
# Get the mask of selected features
selected_feature_mask = rfe.support_

In [78]:
# Get the names of the selected features
selected_features = X.columns[selected_feature_mask]

In [80]:
# Filter the training data to only include the selected features
X_train_selected = X_train[selected_features]

In [81]:
scaler = StandardScaler().fit(X_train_selected)
rfe = RFE(estimator=model, n_features_to_select=3).fit(X_train_selected, y_train)

In [82]:
# Initialize the model
model = LogisticRegression(max_iter=1000)

# Apply RFE on the scaled training data
rfe = RFE(estimator=model, n_features_to_select=3)
X_train_rfe = rfe.fit_transform(X_train_scaled, y_train)

# Get the names of the selected features
selected_feature_mask = rfe.support_
selected_features = X.columns[selected_feature_mask]

print("Selected Features:", selected_features.tolist())

Selected Features: ['cp', 'oldpeak', 'ca']


In [83]:
# Transform the test data to include only selected features
X_test_rfe = rfe.transform(X_test_scaled)

In [84]:
model.fit(X_train_rfe, y_train)

In [85]:
# Evaluate the model on the test data using the selected features
X_test_rfe = rfe.transform(X_test)  # Transform test data to the same features as training data
y_pred = model.predict(X_test_rfe)



In [86]:
# Print classification report and confusion matrix
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.90      0.79       100
           1       0.87      0.63      0.73       105

    accuracy                           0.76       205
   macro avg       0.78      0.76      0.76       205
weighted avg       0.79      0.76      0.76       205



In [87]:
joblib.dump(scaler, "scaler.pkl")
joblib.dump(model, "logistic_model.pkl")
joblib.dump(rfe, "rfe.pkl")

['rfe.pkl']

In [88]:
# Save selected features for Streamlit app
joblib.dump(selected_features.tolist(), "selected_features.pkl")

['selected_features.pkl']