In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

#load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
columns = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", 
           "exang", "oldpeak", "slope", "ca", "thal", "target"]

df = pd.read_csv(url, names=columns)
df.replace('?', pd.NA, inplace=True)
df = df.apply(pd.to_numeric)
print(df.info())
print(df.describe())
print(df['target'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  target    303 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 33.3 KB
None
              age         sex          cp    trestbps        chol         fbs  \
count  303.000000  303.000000  303.000000  303.000000  303.000000  303.000000   
mean    54.438944    0.679868    3.158416  131.68976

In [14]:
X = df.drop('target', axis=1)
y = df['target'].apply(lambda x: 1 if x > 0 else 0)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  
numeric_features = ["age", "trestbps", "chol", "thalach", "oldpeak"]
categorical_features = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)
X_test_preprocessed = preprocessor.transform(X_test)


In [None]:
define supervised learning.

In [15]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

param_grid = {
    'Logistic Regression': {'C': [0.1, 1, 10]},
    'Decision Tree': {'max_depth': [None, 10, 20, 30]},
    'Random Forest': {'n_estimators': [10, 50, 100]}
}

best_models = {}
for name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[name], cv=5, scoring='accuracy')
    grid_search.fit(X_train_preprocessed, y_train)
    best_models[name] = grid_search.best_estimator_
    print(f"{name}: Best Parameters -> {grid_search.best_params_}")


Logistic Regression: Best Parameters -> {'C': 0.1}
Decision Tree: Best Parameters -> {'max_depth': 10}
Random Forest: Best Parameters -> {'n_estimators': 10}


In [16]:
for name, model in best_models.items():
    y_val_pred = model.predict(X_val_preprocessed)
    print(f"{name} Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")

best_model_name = max(best_models, key=lambda name: accuracy_score(y_val, best_models[name].predict(X_val_preprocessed)))
best_model = best_models[best_model_name]

y_test_pred = best_model.predict(X_test_preprocessed)
print(f"Best Model: {best_model_name}")
print("Test Set Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"ROC AUC: {roc_auc_score(y_test, y_test_pred)}")
print(classification_report(y_test, y_test_pred))


Logistic Regression Validation Accuracy: 0.8524590163934426
Decision Tree Validation Accuracy: 0.6885245901639344
Random Forest Validation Accuracy: 0.7377049180327869
Best Model: Logistic Regression
Test Set Performance:
Accuracy: 0.8360655737704918
ROC AUC: 0.8372844827586207
              precision    recall  f1-score   support

           0       0.81      0.86      0.83        29
           1       0.87      0.81      0.84        32

    accuracy                           0.84        61
   macro avg       0.84      0.84      0.84        61
weighted avg       0.84      0.84      0.84        61

