In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
import pickle

In [2]:
heart_data = pd.read_csv('Datasets/heart.csv')
heart_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
heart_data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [4]:
numerical_columns = ['age','trestbps','chol','thalach','oldpeak']
z_scores = heart_data[numerical_columns].apply(zscore)
z_scores

Unnamed: 0,age,trestbps,chol,thalach,oldpeak
0,0.952197,0.763956,-0.256334,0.015443,1.087338
1,-1.915313,-0.092738,0.072199,1.633471,2.122573
2,-1.474158,-0.092738,-0.816773,0.977514,0.310912
3,0.180175,-0.663867,-0.198357,1.239897,-0.206705
4,0.290464,-0.663867,2.082050,0.583939,-0.379244
...,...,...,...,...,...
298,0.290464,0.478391,-0.101730,-1.165281,-0.724323
299,-1.033002,-1.234996,0.342756,-0.771706,0.138373
300,1.503641,0.706843,-1.029353,-0.378132,2.036303
301,0.290464,-0.092738,-2.227533,-1.515125,0.138373


In [9]:
threshold = 3
outliers = (z_scores.abs() > threshold).any(axis=1)
outliers

0      False
1      False
2      False
3      False
4      False
       ...  
298    False
299    False
300    False
301    False
302    False
Length: 303, dtype: bool

In [11]:
data_without_outliers = heart_data[~outliers]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data_without_outliers[numerical_columns])
scaled_data = pd.DataFrame(scaled_features, columns = [col + '_scaled' for col in numerical_columns])
categorical_columns = [col for col in heart_data.columns if col not in numerical_columns]
final_data = pd.concat([data_without_outliers[categorical_columns].reset_index(drop =True), scaled_data], axis=1)
final_data

Unnamed: 0,sex,cp,fbs,restecg,exang,slope,ca,thal,target,age_scaled,trestbps_scaled,chol_scaled,thalach_scaled,oldpeak_scaled
0,1,3,1,0,0,0,0,1,1,0.969920,0.830906,-0.232573,0.004999,1.223393
1,1,2,0,1,0,0,0,2,1,-1.888241,-0.063043,0.138801,1.652914,2.340694
2,0,1,0,0,0,2,0,2,1,-1.448524,-0.063043,-0.866093,0.984840,0.385418
3,1,1,0,1,0,2,0,2,1,0.200415,-0.659009,-0.167037,1.252070,-0.173232
4,0,0,0,1,1,2,0,2,1,0.310345,-0.659009,2.410735,0.583996,-0.359449
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,0,0,0,1,1,1,0,3,0,0.310345,0.532923,-0.057809,-1.197533,-0.731883
290,1,3,0,1,0,1,0,3,0,-1.008807,-1.254974,0.444638,-0.796689,0.199201
291,1,0,1,1,0,1,2,3,0,1.519566,0.771310,-1.106394,-0.395845,2.247586
292,1,0,0,1,1,1,1,3,0,0.310345,-0.063043,-2.460816,-1.553839,0.199201


In [13]:
X = final_data.drop(columns ='target',axis=1)
Y=final_data['target']
X,Y

(     sex  cp  fbs  restecg  exang  slope  ca  thal  age_scaled  \
 0      1   3    1        0      0      0   0     1    0.969920   
 1      1   2    0        1      0      0   0     2   -1.888241   
 2      0   1    0        0      0      2   0     2   -1.448524   
 3      1   1    0        1      0      2   0     2    0.200415   
 4      0   0    0        1      1      2   0     2    0.310345   
 ..   ...  ..  ...      ...    ...    ...  ..   ...         ...   
 289    0   0    0        1      1      1   0     3    0.310345   
 290    1   3    0        1      0      1   0     3   -1.008807   
 291    1   0    1        1      0      1   2     3    1.519566   
 292    1   0    0        1      1      1   1     3    0.310345   
 293    0   1    0        0      0      1   1     2    0.310345   
 
      trestbps_scaled  chol_scaled  thalach_scaled  oldpeak_scaled  
 0           0.830906    -0.232573        0.004999        1.223393  
 1          -0.063043     0.138801        1.652914      

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y, random_state=2)

In [17]:
print("Class Distribution Before SMOTE:", np.unique(Y_train, return_counts=True))
smote = SMOTE(random_state=42)
X_train_resampled, Y_train_resampled = smote.fit_resample(X_train, Y_train)
print("Class Distribution After SMOTE:", np.unique(Y_train_resampled, return_counts=True))

Class Distribution Before SMOTE: (array([0, 1], dtype=int64), array([105, 130], dtype=int64))
Class Distribution After SMOTE: (array([0, 1], dtype=int64), array([130, 130], dtype=int64))


In [19]:
scaler= StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

In [21]:
base_estimators =[
    ('rf',RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)),
    ('svm', SVC(kernel='rbf', C=100, gamma=0.001, probability=True, class_weight='balanced'))
]

model = StackingClassifier(estimators=base_estimators, final_estimator=LogisticRegression(class_weight='balanced', random_state=42))
model.fit(X_train_scaled, Y_train_resampled)
print("Train Accuracy:", model.score(X_train_scaled, Y_train_resampled))
print("Test Accuracy:", model.score(X_test_scaled, Y_test))
print("Classification Report:\n", classification_report(Y_test, model.predict(X_test_scaled)))

Train Accuracy: 0.9538461538461539
Test Accuracy: 0.8983050847457628
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.88      0.88        26
           1       0.91      0.91      0.91        33

    accuracy                           0.90        59
   macro avg       0.90      0.90      0.90        59
weighted avg       0.90      0.90      0.90        59



In [23]:
def predict_heart_disease(input_data):
    np_array = np.asarray(input_data)
    input_data_reshaped = np_array.reshape(1, -1)
    scaled_data = scaler.transform(input_data_reshaped)
    prediction = model.predict(scaled_data)
    probabilities = model.predict_proba(scaled_data)
    print(f"Prediction: {'Heart Disease' if prediction[0] == 1 else 'No Heart Disease'}")
    print(f"Prediction Probabilities: {probabilities}")

In [25]:
input_data_1 = (1,2,1,2,1,2,1,3,65,150,280,120,2.5)
predict_heart_disease(input_data_1)

Prediction: Heart Disease
Prediction Probabilities: [[0.37791201 0.62208799]]




In [27]:
input_data_2 = (0,0,0,1,1,1,0,3,57,140,241,123,0.2)
predict_heart_disease(input_data_2)

Prediction: No Heart Disease
Prediction Probabilities: [[0.51336237 0.48663763]]




In [29]:
filename = 'Models/heart_disease_model.sav'
pickle.dump(model, open(filename,'wb'))

In [31]:
loaded_model = pickle.load(open('Models/heart_disease_model.sav','rb'))