In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


In [5]:
spambase = fetch_openml("spambase", version=1)
X_spam, y_spam = spambase.data, spambase.target.astype(int)

scaler = StandardScaler()
X_spam = scaler.fit_transform(X_spam)

X_train_spam, X_test_spam, y_train_spam, y_test_spam = train_test_split(X_spam, y_spam, test_size=0.2, random_state=42)

print("Training set size: ", X_train_spam.shape)
print("Test set size: ", X_test_spam.shape)



Training set size:  (3680, 57)
Test set size:  (921, 57)


In [6]:
clfWithoutPCA = DecisionTreeClassifier(random_state=42)
clfWithoutPCA.fit(X_train_spam, y_train_spam)
y_pred_spam_baseline = clfWithoutPCA.predict(X_test_spam)
acc_spam_baseline = accuracy_score(y_test_spam, y_pred_spam_baseline)
print(f"Baseline Accuracy (Without PCA): {acc_spam_baseline:.4f}")


Baseline Accuracy (Without PCA): 0.9175


In [9]:
for dimensions in range(5, 58, 5):
    pcaObj = PCA(n_components=dimensions)
    X_train_pca_spam = pcaObj.fit_transform(X_train_spam)
    X_test_pca_spam = pcaObj.transform(X_test_spam)

    clfWithPCA = DecisionTreeClassifier(random_state=42)
    clfWithPCA.fit(X_train_pca_spam, y_train_spam)
    yPredSpam = clfWithPCA.predict(X_test_pca_spam)
    accSpam = accuracy_score(y_test_spam, yPredSpam)
    
    print(f"Accuracy with D={dimensions}: {accSpam:.4f}")

    if accSpam >= acc_spam_baseline - 0.03:
        print(f"Comparable test accuracy achieved with D={dimensions}")
        break


Accuracy with D=5: 0.8664
Accuracy with D=10: 0.8849
Accuracy with D=15: 0.8882
Comparable test accuracy achieved with D=15


## Analysis


- We get a PCA accuracy closest to the baseline accuracy when the number of features are reduced to 15.