In [15]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


In [16]:
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist.data, mnist.target.astype(int) 

X = X / 255.0

X = X - np.mean(X, axis=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training Shape: ", X_train.shape)
print("Testing Shape: ", X_test.shape)


Training Shape:  (56000, 784)
Testing Shape:  (14000, 784)


In [17]:
pca_5 = PCA(n_components=5)
X_train_pca_5 = pca_5.fit_transform(X_train)
X_test_pca_5 = pca_5.transform(X_test)

pca_20 = PCA(n_components=20)
X_train_pca_20 = pca_20.fit_transform(X_train)
X_test_pca_20 = pca_20.transform(X_test)

print("Reduced shape (D=5):", X_train_pca_5.shape) 
print("Reduced shape (D=20):", X_train_pca_20.shape)


Reduced shape (D=5): (56000, 5)
Reduced shape (D=20): (56000, 20)


In [18]:
clfWithoutPCA = DecisionTreeClassifier(random_state=42)
clfWithoutPCA.fit(X_train, y_train)
y_pred_baseline = clfWithoutPCA.predict(X_test)
acc_baseline = accuracy_score(y_test, y_pred_baseline)
print(f"Baseline Accuracy (Without PCA) Decision Tree: {acc_baseline:.4f}")


Baseline Accuracy (Without PCA) Decision Tree: 0.8704


In [19]:
clfWithoutPCALogistic = LogisticRegression(random_state=42)
clfWithoutPCALogistic.fit(X_train, y_train)
y_pred_baseline_logistic = clfWithoutPCALogistic.predict(X_test)
acc_baseline_logistic = accuracy_score(y_test, y_pred_baseline_logistic)
print(f"Baseline Accuracy (Without PCA) Logistic Regression: {acc_baseline_logistic:.4f}")


Baseline Accuracy (Without PCA) Logistic Regression: 0.9191


In [None]:
clfWith5Features = DecisionTreeClassifier(random_state=42)
clfWith5Features.fit(X_train_pca_5, y_train)
yPred5 = clfWith5Features.predict(X_test_pca_5)
acc5 = accuracy_score(y_test, yPred5)
print(f"Accuracy with D=5: {acc5:.4f}")

clfWith20Features = DecisionTreeClassifier(random_state=42)
clfWith20Features.fit(X_train_pca_20, y_train)
yPred20 = clfWith20Features.predict(X_test_pca_20)
acc20 = accuracy_score(y_test, yPred20)
print(f"Accuracy with D=20 Decision Tree: {acc20:.4f}")


Accuracy with D=5: 0.6663
Accuracy with D=20: 0.8496


In [20]:
clfWith5FeaturesLogistic = LogisticRegression(random_state=42)
clfWith5FeaturesLogistic.fit(X_train_pca_5, y_train)
yPred5Logistic = clfWith5FeaturesLogistic.predict(X_test_pca_5)
acc5Logistic = accuracy_score(y_test, yPred5Logistic)
print(f"Accuracy with D=5 Logistic Regression: {acc5Logistic:.4f}")

clfWith20FeaturesLogistic = LogisticRegression(random_state=42)
clfWith20FeaturesLogistic.fit(X_train_pca_20, y_train)
yPred20Logistic = clfWith20FeaturesLogistic.predict(X_test_pca_20)
acc20Logistic = accuracy_score(y_test, yPred20Logistic)
print(f"Accuracy with D=20 Logistic Regression: {acc20Logistic:.4f}")

Accuracy with D=5 Logistic Regression: 0.6795
Accuracy with D=20 Logistic Regression: 0.8785
