In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [10]:
heart_data = pd.read_csv('Datasets/heart.csv')
heart_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [12]:
heart_data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [14]:
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler

In [16]:
numerical_columns = ['age','trestbps','chol','thalach','oldpeak']
z_scores = heart_data[numerical_columns].apply(zscore)

In [18]:
threshold = 3
outliers = (z_scores.abs() > threshold).any(axis=1)

In [20]:
data_without_outliers = heart_data[~outliers]

In [22]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data_without_outliers[numerical_columns])

In [24]:
scaled_data = pd.DataFrame(scaled_features, columns = [col + '_scaled' for col in numerical_columns])
categorical_columns = [col for col in heart_data.columns if col not in numerical_columns]
final_data = pd.concat([data_without_outliers[categorical_columns].reset_index(drop=True), scaled_data], axis =1)

In [26]:
X= final_data.drop(columns='target', axis =1)
Y=final_data['target']

In [28]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y, random_state=2)

In [30]:
model = LogisticRegression()
model.fit(X_train, Y_train)

In [32]:
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model.coef_[0]
}).sort_values(by='Coefficient', ascending=False)
print(feature_importance)

            Feature  Coefficient
1                cp     0.690385
5             slope     0.600146
3           restecg     0.436675
11   thalach_scaled     0.407591
8        age_scaled    -0.057269
9   trestbps_scaled    -0.162498
10      chol_scaled    -0.223754
2               fbs    -0.320205
12   oldpeak_scaled    -0.518883
6                ca    -0.760129
4             exang    -0.767884
7              thal    -0.905907
0               sex    -1.261647


In [34]:
model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')
model.fit(X_train, Y_train)

# Evaluate
print("Training Accuracy:", model.score(X_train, Y_train))
print("Testing Accuracy:", model.score(X_test, Y_test))

Training Accuracy: 0.8468085106382979
Testing Accuracy: 0.8813559322033898


In [36]:
from sklearn import svm
classifier = svm.SVC(kernel = 'linear')

In [38]:
classifier.fit(X_train, Y_train)

In [40]:
print("Training Accuracy:", classifier.score(X_train, Y_train))
print("Testing Accuracy:", classifier.score(X_test, Y_test))

Training Accuracy: 0.8468085106382979
Testing Accuracy: 0.864406779661017


In [42]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [44]:
param_grid = {
    'C': [1,10,100,1000],
    'gamma': [0.0001,0.001,0.01,0.1],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2,3]
}

In [46]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
grid_search = GridSearchCV(SVC(), param_grid, cv=9, scoring='accuracy')
grid_search.fit(X_train_scaled, Y_train)

In [47]:
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Train Accuracy:", best_model.score(X_train_scaled, Y_train))
print("Test Accuracy:", best_model.score(X_test_scaled, Y_test))

Best Parameters: {'C': 100, 'degree': 2, 'gamma': 0.001, 'kernel': 'rbf'}
Train Accuracy: 0.8425531914893617
Test Accuracy: 0.8813559322033898


In [48]:
##Combining SVM with other models using Stacking

In [49]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

base_estimators = [
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('svm', SVC(kernel='rbf', C=100, gamma=0.001))
]

model = StackingClassifier(estimators=base_estimators, final_estimator=LogisticRegression())
model.fit(X_train_scaled, Y_train)
print("Train Accuracy:", model.score(X_train_scaled, Y_train))
print("Test Accuracy:", model.score(X_test_scaled, Y_test))

Train Accuracy: 0.9148936170212766
Test Accuracy: 0.8983050847457628


In [50]:

rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf_grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1
)
rf_grid_search.fit(X_train_scaled, Y_train)
print("Best Random Forest Parameters:", rf_grid_search.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Random Forest Parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}


In [51]:
svm_param_grid = {
    'C': [1, 10, 100],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['rbf']
}
svm_grid_search = GridSearchCV(
    SVC(),
    svm_param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1
)
svm_grid_search.fit(X_train_scaled, Y_train)
print("Best SVM Parameters:", svm_grid_search.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best SVM Parameters: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}


In [52]:
optimized_rf = rf_grid_search.best_estimator_
optimized_svm = svm_grid_search.best_estimator_

base_estimators = [
    ('rf', optimized_rf),
    ('svm', optimized_svm)
]

stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression()
)

stacking_model.fit(X_train_scaled, Y_train)
print("Train Accuracy:", stacking_model.score(X_train_scaled, Y_train))
print("Test Accuracy:", stacking_model.score(X_test_scaled, Y_test))

Train Accuracy: 0.8680851063829788
Test Accuracy: 0.8813559322033898


In [53]:
input_data = (62,0,0,140,268,0,0,160,0,3.6,0,2,2)
np_array=np.asarray(input_data)

input_data_reshaped= np_array.reshape(1,-1)
prediction = model.predict(input_data_reshaped)
print(prediction)

if(prediction[0]==0):
    print("The person does not have a Heart Disease")
else:
    print("The Person has Heart Disease")


[1]
The Person has Heart Disease


In [54]:
input_data = (45,1,3,110,264,0,1,132,0,1.2,1,0,3)
np_array=np.asarray(input_data)

input_data_reshaped= np_array.reshape(1,-1)
prediction = model.predict(input_data_reshaped)
print(prediction)

if(prediction[0]==0):
    print("The person does not have a Heart Disease")
else:
    print("The Person has Heart Disease")

[1]
The Person has Heart Disease


In [99]:
for col in X.columns:
    print(col)

sex
cp
fbs
restecg
exang
slope
ca
thal
age_scaled
trestbps_scaled
chol_scaled
thalach_scaled
oldpeak_scaled
