In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings




In [None]:
warnings.filterwarnings('ignore')

df = pd.read_csv('heart_disease.csv')
print(df.head())




   HeartDiseaseorAttack  HighBP  HighChol  CholCheck  BMI  Smoker  Stroke  \
0                     0       1         1          1   40       1       0   
1                     0       0         0          0   25       1       0   
2                     0       1         1          1   28       0       0   
3                     0       1         0          1   27       0       0   
4                     0       1         1          1   24       0       0   

   Diabetes  PhysActivity  Fruits  ...  PhysHlth  DiffWalk  Sex  Age  \
0         0             0       0  ...        15         1    0    9   
1         0             1       0  ...         0         0    0    7   
2         0             0       1  ...        30         1    0    9   
3         0             1       1  ...         0         0    0   11   
4         0             1       1  ...         0         0    0   11   

   Education  Income  HTIC  PRMC  SFC  MCD  
0          4       3     1     0    1    0  
1          6  

In [None]:
df = pd.DataFrame(df)
df.dropna(inplace=True)
data = df.drop(['CholCheck','Fruits','Veggies','AnyHealthcare','NoDocbcCost','GenHlth','MentHlth','PhysHlth','DiffWalk','Education','Income','HTIC','PRMC','SFC','MCD'], axis=1)

print(data.head())


   HeartDiseaseorAttack  HighBP  HighChol  BMI  Smoker  Stroke  Diabetes  \
0                     0       1         1   40       1       0         0   
1                     0       0         0   25       1       0         0   
2                     0       1         1   28       0       0         0   
3                     0       1         0   27       0       0         0   
4                     0       1         1   24       0       0         0   

   PhysActivity  HvyAlcoholConsump  Sex  Age  
0             0                  0    0    9  
1             1                  0    0    7  
2             0                  0    0    9  
3             1                  0    0   11  
4             1                  0    0   11  


In [None]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data.drop('HeartDiseaseorAttack', axis=1))

In [None]:
# Splitting the dataset into features and target variable
X = pd.DataFrame(scaled_features, columns=data.columns[:-1])
y = data['HeartDiseaseorAttack']


selector = SelectKBest(score_func=f_classif, k='all')
selector.fit(X, y)
selected_features = selector.get_support(indices=True)
X_selected = X.iloc[:, selected_features]

In [None]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)


In [None]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC()
}


In [None]:
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f'{model_name} Performance:')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}\n')


Logistic Regression Performance:
Accuracy: 0.9065
Precision: 0.5158
Recall: 0.0822
F1 Score: 0.1418

Decision Tree Performance:
Accuracy: 0.8952
Precision: 0.3121
Recall: 0.0954
F1 Score: 0.1462

Random Forest Performance:
Accuracy: 0.8968
Precision: 0.3476
Recall: 0.1124
F1 Score: 0.1699

Support Vector Machine Performance:
Accuracy: 0.9068
Precision: 0.5615
Recall: 0.0373
F1 Score: 0.0700



In [None]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
# Best parameters and model evaluation
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)



accuracy = accuracy_score(y_test, y_pred_best_rf)
precision = precision_score(y_test, y_pred_best_rf)
recall = recall_score(y_test, y_pred_best_rf)
f1 = f1_score(y_test, y_pred_best_rf)


In [None]:
print('Best Random Forest Performance:')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}\n')



Best Random Forest Performance:
Accuracy: 0.9070
Precision: 0.5686
Recall: 0.0426
F1 Score: 0.0792



In [None]:
# Cross-validation
cv_scores = cross_val_score(best_rf, X_selected, y, cv=5)
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Score: {cv_scores.mean():.4f}')

# # Save the model
# import joblib
# joblib.dump(best_rf, 'disease_prediction_model.pkl')

Cross-Validation Scores: [0.90704825 0.90687086 0.90610218 0.90622044 0.90663434]
Mean CV Score: 0.9066
