# Exercise 15 – Heart Disease Prediction
- Removes outliers using Z-scores.  
- Encodes categorical features.  
- Scales features for model consistency.  
- PCA reduces dimensions (keeps 95% variance).  
- Compares Logistic, SVM, and Random Forest.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from scipy import stats

In [2]:
# Load dataset
df = pd.read_csv("/Users/jacobfrancis/dev/csc180/csv/heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
# Remove outliers using Z-score
z_scores = np.abs(stats.zscore(df.select_dtypes(include=np.number)))
df_no_outliers = df[(z_scores < 3).all(axis=1)]
print("Before:", df.shape)
print("After removing outliers:", df_no_outliers.shape)

Before: (918, 12)
After removing outliers: (899, 12)


In [4]:
# Encode categorical variables
df_encoded = df_no_outliers.copy()
label_cols = df_encoded.select_dtypes(include='object').columns
for col in label_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])

In [5]:
# Split features and target
X = df_encoded.drop("HeartDisease", axis=1)
y = df_encoded["HeartDisease"]

In [6]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [9]:
# Train multiple classifiers
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")

Logistic Regression Accuracy: 0.8556
SVM Accuracy: 0.8778
Random Forest Accuracy: 0.8722


In [10]:
# Apply PCA (retain 95% of variance)
pca = PCA(0.95)
X_pca = pca.fit_transform(X_scaled)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca, y, test_size=0.2, random_state=42
)

In [11]:
# Evaluate models after PCA
for name, model in models.items():
    model.fit(X_train_pca, y_train_pca)
    y_pred_pca = model.predict(X_test_pca)
    acc_pca = accuracy_score(y_test_pca, y_pred_pca)
    print(f"{name} Accuracy after PCA: {acc_pca:.4f}")

Logistic Regression Accuracy after PCA: 0.8556
SVM Accuracy after PCA: 0.8722
Random Forest Accuracy after PCA: 0.8111
