## PROBLEM 2 : PCA library on MNIST

In [1]:
import os
import warnings

import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from mnist_parser import MNISTDataParser

In [2]:
seed = 42

np.random.seed(seed=seed)
warnings.filterwarnings("ignore")

### 1. MNIST

In [3]:
data_parser = MNISTDataParser()
X_train, y_train, X_test, y_test = data_parser.parse_data(verbose=0, sample_data=False)

In [4]:
print(f"Shape of Training Data = {X_train.shape}")
print(f"Shape of Training Labels = {y_train.shape}")

print(f"Shape of Testing Data = {X_test.shape}")
print(f"Shape of Testing Labels = {y_test.shape}")

Shape of Training Data = (60000, 784)
Shape of Training Labels = (60000,)
Shape of Testing Data = (10000, 784)
Shape of Testing Labels = (10000,)


In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 1. PCA with 5 Principal Components

In [6]:
pca = PCA(n_components=5, random_state=seed)

In [7]:
pca.fit(X_train_scaled)

In [8]:
pca.explained_variance_ratio_

array([0.05646717, 0.04078272, 0.0373938 , 0.02885115, 0.02521107])

In [9]:
X_train_proj = pca.transform(X_train_scaled)
X_test_proj = pca.transform(X_test_scaled)

#### A. Logistic Regression

In [10]:
lr = LogisticRegression(random_state=seed)
lr.fit(X_train_proj, y_train)

In [11]:
y_train_pred = lr.predict(X_train_proj)

training_acc = accuracy_score(y_train, y_train_pred) * 100

print(f"Training Accuracy: {training_acc:.2f}%")

Training Accuracy: 67.66%


In [12]:
y_test_pred = lr.predict(X_test_proj)

testing_acc = accuracy_score(y_test, y_test_pred) * 100

print(f"Testing Accuracy: {testing_acc:.2f}%")

Testing Accuracy: 68.00%


#### B. Decision Tree Classifier

In [13]:
clf = DecisionTreeClassifier(random_state=seed)

In [14]:
clf.fit(X_train_proj, y_train)

In [15]:
y_train_pred = clf.predict(X_train_proj)

training_acc = accuracy_score(y_train, y_train_pred) * 100

print(f"Training Accuracy: {training_acc:.2f}%")

Training Accuracy: 100.00%


In [16]:
y_test_pred = clf.predict(X_test_proj)

testing_acc = accuracy_score(y_test, y_test_pred) * 100

print(f"Testing Accuracy: {testing_acc:.2f}%")

Testing Accuracy: 66.95%


### 2. PCA with 20 Principal Components

In [17]:
pca = PCA(n_components=20, random_state=seed)

In [18]:
pca.fit(X_train_scaled)

In [19]:
pca.explained_variance_ratio_

array([0.05646717, 0.04078272, 0.0373938 , 0.02885115, 0.02521109,
       0.0219427 , 0.01923344, 0.01745799, 0.0153509 , 0.01401712,
       0.01341739, 0.01203719, 0.01114549, 0.01089905, 0.01028369,
       0.00994295, 0.00935621, 0.00919891, 0.00892169, 0.00867491])

In [20]:
X_train_proj = pca.transform(X_train_scaled)
X_test_proj = pca.transform(X_test_scaled)

#### A. Logistic Regression

In [21]:
lr = LogisticRegression(random_state=seed)
lr.fit(X_train_proj, y_train)

In [22]:
y_train_pred = lr.predict(X_train_proj)

training_acc = accuracy_score(y_train, y_train_pred) * 100

print(f"Training Accuracy: {training_acc:.2f}%")

Training Accuracy: 86.80%


In [23]:
y_test_pred = lr.predict(X_test_proj)

testing_acc = accuracy_score(y_test, y_test_pred) * 100

print(f"Testing Accuracy: {testing_acc:.2f}%")

Testing Accuracy: 87.15%


#### B. Decision Tree Classifier

In [24]:
clf = DecisionTreeClassifier(random_state=seed)

In [25]:
clf.fit(X_train_proj, y_train)

In [26]:
y_train_pred = clf.predict(X_train_proj)

training_acc = accuracy_score(y_train, y_train_pred) * 100

print(f"Training Accuracy: {training_acc:.2f}%")

Training Accuracy: 100.00%


In [27]:
y_test_pred = clf.predict(X_test_proj)

testing_acc = accuracy_score(y_test, y_test_pred) * 100

print(f"Testing Accuracy: {testing_acc:.2f}%")

Testing Accuracy: 83.95%


### 2. Spambase

In [28]:
BASE_DIR = os.path.abspath('..')
DATA_DIR = os.path.join(BASE_DIR, "data", "spambase")

DATA_URL = os.path.join(DATA_DIR, "spambase.data")
COLS_URL = os.path.join(DATA_DIR, "spambase.names")

In [29]:
columns = []

with open(COLS_URL, "r") as file:
    for line in file:
        if line[0] in {'w', 'c'}:
            columns.append(line.split(':')[0])

columns.append("is_spam")

In [30]:
df = pd.read_csv(DATA_URL, header=None)
df.columns = columns

In [31]:
X = df.drop(columns=["is_spam"]).values
y = df["is_spam"].values

#### A. Logistic Regression

In [32]:
n = X.shape[1]

tolerance = 5
best_acc = 92.62

print(f"Accuracy with the original dimension: {best_acc}%")

for n_component in range(1, n + 1):
    pca = PCA(n_components=n_component, random_state=seed)
    X_proj = pca.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_proj, y, test_size=0.2,
                                                    shuffle=True, random_state=seed)

    lr = LogisticRegression(random_state=seed)
    lr.fit(X_train, y_train)
    
    y_train_pred = lr.predict(X_train)
    training_acc = accuracy_score(y_train, y_train_pred) * 100    
    
    y_test_pred = lr.predict(X_test)
    testing_acc = accuracy_score(y_test, y_test_pred) * 100

    print(f"Dimension: {n_component}, Train. Acc: {training_acc:.2f}%, Test. Acc: {testing_acc:.2f}%")
    
    if best_acc - testing_acc <= tolerance:
        print(f"Min. Dimension(D) required to achieve a comparable performance ({tolerance}% tolerance) is {n_component}")
        break

Accuracy with the original dimension: 92.62%
Dimension: 1, Train. Acc: 67.15%, Test. Acc: 62.87%
Dimension: 2, Train. Acc: 73.51%, Test. Acc: 70.58%
Dimension: 3, Train. Acc: 74.08%, Test. Acc: 71.34%
Dimension: 4, Train. Acc: 79.43%, Test. Acc: 76.66%
Dimension: 5, Train. Acc: 79.84%, Test. Acc: 77.52%
Dimension: 6, Train. Acc: 82.99%, Test. Acc: 81.43%
Dimension: 7, Train. Acc: 84.08%, Test. Acc: 83.71%
Dimension: 8, Train. Acc: 84.67%, Test. Acc: 84.26%
Dimension: 9, Train. Acc: 85.11%, Test. Acc: 85.12%
Dimension: 10, Train. Acc: 85.27%, Test. Acc: 85.02%
Dimension: 11, Train. Acc: 85.30%, Test. Acc: 84.69%
Dimension: 12, Train. Acc: 85.62%, Test. Acc: 85.45%
Dimension: 13, Train. Acc: 89.62%, Test. Acc: 89.14%
Min. Dimension(D) required to achieve a comparable performance (5% tolerance) is 13


#### B. Decision Tree Classifier

In [33]:
n = X.shape[1]

tolerance = 5
best_acc = 91.86

print(f"Accuracy with the original dimension: {best_acc}%")

for n_component in range(1, n + 1):
    pca = PCA(n_components=n_component, random_state=seed)
    X_proj = pca.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_proj, y, test_size=0.2,
                                                    shuffle=True, random_state=seed)

    clf = DecisionTreeClassifier(random_state=seed)
    clf.fit(X_train, y_train)
    
    y_train_pred = clf.predict(X_train)
    training_acc = accuracy_score(y_train, y_train_pred) * 100    
    
    y_test_pred = clf.predict(X_test)
    testing_acc = accuracy_score(y_test, y_test_pred) * 100

    print(f"Dimension: {n_component}, Train. Acc: {training_acc:.2f}%, Test. Acc: {testing_acc:.2f}%")
    
    if best_acc - testing_acc <= tolerance:
        print(f"Min. Dimension(D) required to achieve a comparable performance ({tolerance}% tolerance) is {n_component}")
        break

Accuracy with the original dimension: 91.86%
Dimension: 1, Train. Acc: 99.84%, Test. Acc: 66.56%
Dimension: 2, Train. Acc: 99.95%, Test. Acc: 75.79%
Dimension: 3, Train. Acc: 99.95%, Test. Acc: 75.35%
Dimension: 4, Train. Acc: 99.95%, Test. Acc: 81.11%
Dimension: 5, Train. Acc: 99.95%, Test. Acc: 82.95%
Dimension: 6, Train. Acc: 99.95%, Test. Acc: 84.91%
Dimension: 7, Train. Acc: 99.95%, Test. Acc: 81.98%
Dimension: 8, Train. Acc: 99.95%, Test. Acc: 85.34%
Dimension: 9, Train. Acc: 99.95%, Test. Acc: 84.58%
Dimension: 10, Train. Acc: 99.95%, Test. Acc: 85.02%
Dimension: 11, Train. Acc: 99.95%, Test. Acc: 85.45%
Dimension: 12, Train. Acc: 99.95%, Test. Acc: 86.75%
Dimension: 13, Train. Acc: 99.95%, Test. Acc: 86.10%
Dimension: 14, Train. Acc: 99.95%, Test. Acc: 85.99%
Dimension: 15, Train. Acc: 99.95%, Test. Acc: 87.84%
Min. Dimension(D) required to achieve a comparable performance (5% tolerance) is 15
