In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
from numpy.linalg import eig
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
import time

In [None]:
# Load the MNIST train and test splits. Each image goes through ToTensor and
# is then flattened to a 1-D tensor with `view(-1)`.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Lambda(lambda x: x.view(-1)),
    ]
)
mnist_train, mnist_test = (
    torchvision.datasets.MNIST(root="./data", train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

In [None]:
# Walk the test split once and collect images and labels together. Fetching
# them in two separate passes would load (and transform) every image twice.
test_images, test_labels = zip(*mnist_test)
X_test = torch.stack(test_images).numpy()
y_test = np.array(test_labels)
del test_images, test_labels  # the per-sample tensors are no longer needed

# Check the shape
print("Test Images Shape:", X_test.shape)  # (10000, 784)
print("Test Labels Shape:", y_test.shape)  # (10000,)

In [None]:
# Convert dataset to numpy arrays.
# A single pass yields (image, label) pairs, so the images are not fetched a
# second time just to read their labels.
train_images, train_labels = zip(*mnist_train)
X = torch.stack(train_images).numpy()
y = np.array(train_labels)
del train_images, train_labels  # the per-sample tensors are no longer needed

In [None]:
# Report the shapes of the training arrays: (60000, 784) and (60000,)
for caption, array in (("Train Images Shape:", X), ("Train Labels Shape:", y)):
    print(caption, array.shape)

In [None]:
# Function to sample 1000 images uniformly across classes
def sample_uniform(X, y, num_samples=1000, random_state=None):
    """Draw the same number of rows from every class, without replacement.

    Parameters
    ----------
    X : array-like
        Data indexed along its first axis, aligned row-for-row with ``y``.
    y : array-like
        1-D collection of class labels.
    num_samples : int, default 1000
        Total number of rows requested. Every class contributes
        ``num_samples // n_classes`` rows, so fewer rows than requested are
        returned when the division is not exact.
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``. With ``None`` the draw
        uses NumPy's global random state, exactly as before this parameter
        existed (so ``np.random.seed`` still controls it).

    Returns
    -------
    tuple of np.ndarray
        ``(sampled_X, sampled_y)``; rows are grouped by class, classes
        ordered by their first appearance in ``y``.

    Raises
    ------
    ValueError
        If ``y`` is empty, or a class has fewer rows than the per-class quota.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    class_counts = Counter(y.tolist())
    if not class_counts:
        raise ValueError("Cannot sample from an empty label array.")
    samples_per_class = num_samples // len(class_counts)

    # Both np.random (module) and RandomState expose the same `.choice` API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    chosen_per_class = []
    for label, available in class_counts.items():
        if available < samples_per_class:
            raise ValueError(
                f"Class {label!r} has only {available} samples, "
                f"but {samples_per_class} per class were requested."
            )
        indices = np.flatnonzero(y == label)
        chosen_per_class.append(rng.choice(indices, samples_per_class, replace=False))

    # One fancy-indexing step instead of growing Python lists row by row.
    chosen_indices = np.concatenate(chosen_per_class)
    return X[chosen_indices], y[chosen_indices]

In [None]:


# Sample 1000 images.
# Seed NumPy's global random state first so the same sample (and therefore the
# same PCA plots below) is drawn on every Restart & Run All.
np.random.seed(42)
X_sampled, y_sampled = sample_uniform(X, y)

# PCA Implementation
def pca(X, num_components=None):
    """Principal component analysis via eigendecomposition of the covariance.

    Parameters
    ----------
    X : np.ndarray
        Data matrix with one sample per row and one feature per column.
    num_components : int or None, default None
        Number of leading components to keep. ``None`` keeps all of them.

    Returns
    -------
    X_projected : np.ndarray
        Mean-centered data expressed in the kept principal directions.
    eigenvectors : np.ndarray
        Principal directions as columns, ordered by decreasing eigenvalue and
        truncated to ``num_components`` columns when it is given.
    X_mean : np.ndarray
        Per-feature mean that was subtracted from ``X``.
    eigenvalues : np.ndarray
        All eigenvalues of the covariance matrix in decreasing order (not
        truncated, so explained-variance ratios can be computed from them).
    """
    X_mean = np.mean(X, axis=0)
    X_centered = X - X_mean
    # atleast_2d: np.cov collapses to a 0-d array for single-feature input.
    covariance_matrix = np.atleast_2d(np.cov(X_centered, rowvar=False))

    # The covariance matrix is symmetric, so use eigh: it returns real
    # eigenvalues and orthonormal eigenvectors even when eigenvalues repeat
    # (e.g. the zero-variance directions of a rank-deficient covariance). The
    # general-purpose eig gives neither guarantee, and the reconstruction
    # `X_projected @ eigenvectors.T` relies on orthonormality.
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

    # eigh sorts ascending; flip so that the leading component comes first.
    eigenvalues = eigenvalues[::-1]
    eigenvectors = np.ascontiguousarray(eigenvectors[:, ::-1])

    if num_components is not None:
        eigenvectors = eigenvectors[:, :num_components]
    X_projected = np.dot(X_centered, eigenvectors)
    return X_projected, eigenvectors, X_mean, eigenvalues





In [None]:
# Run PCA on the sampled images and turn the eigenvalues into a cumulative
# explained-variance curve.
X_projected, eigenvectors, X_mean, eigenvalues = pca(X_sampled)
explained_variance_ratio = eigenvalues / np.sum(eigenvalues)
cumulative_variance = np.cumsum(explained_variance_ratio)

# Draw the curve against the number of retained components (1-based).
component_counts = range(1, len(cumulative_variance) + 1)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_counts, cumulative_variance, marker='o', linestyle='-')
ax.set_xlabel("Number of Principal Components")
ax.set_ylabel("Cumulative Explained Variance")
ax.set_title("Explained Variance vs. Number of Principal Components")
ax.grid()
plt.show()

In [None]:
# Scatter the samples on their first two principal components, colored by digit.
pc1 = X_projected[:, 0]
pc2 = X_projected[:, 1]

fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(pc1, pc2, c=y_sampled, cmap='viridis', alpha=0.6)
fig.colorbar(points, ax=ax, label="Digit Label")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.set_title("2D Projection of MNIST using PCA")
ax.grid()
plt.show()

# **Observations on the PCA Projection of MNIST**

### 1. **Separation of Digit Clusters**
   - Some digits (e.g., **0 and 1**) form relatively distinct clusters, while others (e.g., **3, 5, and 8**) are more spread out and overlapping.
   - This suggests that certain digits have more unique features, making them easier to separate in a lower-dimensional space.

### 2. **Overlap of Multiple Digits**
   - Many points from different digit classes overlap, indicating that a simple **2D projection does not fully separate the digits**.
   - This suggests that some digits share similar pixel distributions, making them harder to distinguish using only two principal components.

### 3. **Distribution of Data**
   - The data appears to be **denser near the origin**, with most points clustered around **(0,0)**.
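   - This is expected because PCA **mean-centers** the data, so the projected points scatter around the origin. **The first two principal components capture more variance than any other two-dimensional linear projection**, but additional components may be needed for better separation.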

### 4. **Non-linearity of the Data**
   - The overlapping regions suggest that **digits in MNIST might not be fully linearly separable in 2D**.
   - This aligns with the need for **non-linear techniques like t-SNE or UMAP** for better visualization.

### 5. **Insights for Classification**
   - A classifier using only **PC1 and PC2 as features would struggle** due to the overlapping digit regions.
   - More principal components or other feature extraction techniques would be needed for accurate **digit classification**.


In [None]:
print(f"Original shape: {X_sampled.shape}")

n_preview = 5  # number of digits shown in each row of images

# Display the first `n_preview` original images only once
fig, axes = plt.subplots(1, n_preview, figsize=(10, 2))
for ax, image in zip(axes, X_sampled[:n_preview]):
    ax.imshow(image.reshape(28, 28), cmap='gray')
    ax.axis('off')
plt.show()

# pca() orders components by decreasing eigenvalue, so the top-`dim` basis is
# just the first `dim` columns of the full basis. Run the eigendecomposition
# once here instead of repeating it for every value of `dim`.
X_projected_full, eigenvectors_full, X_mean, _ = pca(X_sampled)

for dim in [500, 300, 150, 30]:
    eigenvectors = eigenvectors_full[:, :dim]
    X_reduced = X_projected_full[:, :dim]
    print(f"Projected dataset to {dim} dimensions. Shape: {X_reduced.shape}")

    # Map the first `n_preview` samples back to pixel space in one product:
    # x_hat = z @ V^T + mean
    reconstructed = np.dot(X_reduced[:n_preview], eigenvectors.T) + X_mean

    # Display the reconstructions of the same images shown above
    fig, axes = plt.subplots(1, n_preview, figsize=(10, 2))
    for ax, image in zip(axes, reconstructed):
        ax.imshow(image.reshape(28, 28), cmap='gray')
        ax.axis('off')
    plt.show()

# **Observations on Dimensionality Reduction and Reconstruction of MNIST Digits**

### **1. Impact of Dimensionality Reduction on Image Quality**
   - As the number of principal components decreases, the reconstructed images progressively lose finer details and become more **blurred or distorted**.
   - The original images (784 dimensions) retain **sharp edges and clear digit structures**, whereas the images reconstructed from **30 dimensions appear highly distorted**.

### **2. Retaining Important Features**
   - **500 dimensions:** The reconstructed images are **almost identical** to the originals, indicating that most critical features are preserved.
   - **300 dimensions:** The images remain recognizable, but slight blurring and a **loss of sharpness** at the boundaries between gray and white regions start to appear.
   - **150 dimensions:** Some strokes appear **less defined**, and minor distortions can be seen, but the digits are still distinguishable.
   - **30 dimensions:** The images are **heavily distorted**, with some digits losing their basic structure, making them **harder to recognize**.

### **3. Conclusion**
   - **500-300 dimensions** provide a **good balance** between compression and reconstruction quality.
   - **150 dimensions** can still capture recognizable digit features but may struggle in fine-grained classification tasks.
   - **30 dimensions** are insufficient for reconstruction, highlighting the **limits of PCA in preserving spatial details** at extreme reductions.


## 4.2

In [None]:

# Step 3: Select a random subset of 40K samples from train set.
# A dedicated seeded generator keeps the subset (and the metrics computed from
# it) reproducible, independent of any other use of NumPy's global RNG.
subset_rng = np.random.default_rng(42)
subset_indices = subset_rng.choice(X.shape[0], 40000, replace=False)
X_train = X[subset_indices]
y_train = y[subset_indices]

# Step 4: Standardize the data
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)  # Apply same transformation to test set



In [None]:
# Report the shapes of the training subset: (40000, 784) and (40000,)
for caption, array in (("Train Images Shape:", X_train), ("Train Labels Shape:", y_train)):
    print(caption, array.shape)

### Original Data

In [None]:
# Time the whole train/predict/evaluate cycle. perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals; time.time() follows the
# wall clock and can jump if the system time is adjusted mid-run.
start = time.perf_counter()

# Step 5: Train MLP Classifier
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', max_iter=50, random_state=42)
mlp.fit(X_train, y_train)

# Step 6: Predict on test set
y_pred = mlp.predict(X_test)

# Step 7: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')  # 'macro' averages across all classes
recall = recall_score(y_test, y_pred, average='macro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

end = time.perf_counter()
print(f"time took :{(end-start):.4f}")

In [None]:
# Fit PCA once on the training subset. pca() returns components sorted by
# decreasing eigenvalue, so every `dim` below only needs the first `dim`
# columns of this basis rather than a fresh eigendecomposition per iteration.
_, eigenvectors_full, X_mean_train, _ = pca(X_train)

# Center both splits with the TRAIN mean. The test set must be projected with
# the basis learned from the training data, never with a PCA fitted on itself.
X_centered_train = X_train - X_mean_train
X_centered_test = X_test - X_mean_train

for dim in [500, 300, 150, 30]:
    eigenvectors = eigenvectors_full[:, :dim]
    X_reduced_train = np.dot(X_centered_train, eigenvectors)
    X_reduced_test = np.dot(X_centered_test, eigenvectors)

    mlp = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', solver='adam', max_iter=50, random_state=42)
    mlp.fit(X_reduced_train, y_train)

    y_pred = mlp.predict(X_reduced_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')  # 'macro' averages across all classes
    recall = recall_score(y_test, y_pred, average='macro')

    print(f"Dimensions:{dim}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}\n\n")
    

# **Observations on MLP Classifier Performance Before and After PCA**

## **1. Baseline Model Performance (Before PCA)**
- The model trained on the original dataset (without dimensionality reduction) achieved:
  - **Accuracy:** 97.73%
  - **Precision:** 97.72%
  - **Recall:** 97.71%
- This shows that the MLP classifier performs **very well** on the raw MNIST dataset with all 784 features.

## **2. Impact of Dimensionality Reduction on Performance**
### **(a) PCA with 500 Dimensions**
- **Accuracy:** 96.90%  
- **Precision:** 96.88%  
- **Recall:** 96.85%  
- The model performance drops slightly, indicating that PCA **removes some minor details** but still preserves most of the relevant information.

### **(b) PCA with 300 Dimensions**
- **Accuracy:** 97.28%  
- **Precision:** 97.25%  
- **Recall:** 97.24%  
- Performance improves compared to **500 dimensions**, showing that PCA helps **remove redundant features** while keeping important details.

### **(c) PCA with 150 Dimensions**
- **Accuracy:** 97.34%  
- **Precision:** 97.31%  
- **Recall:** 97.31%  
- Similar to **300 dimensions**, with a **slight improvement** in performance, suggesting that PCA effectively **captures essential features** with fewer dimensions.

### **(d) PCA with 30 Dimensions**
- **Accuracy:** 97.36%  
- **Precision:** 97.37%  
- **Recall:** 97.33%  
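- Surprisingly, the model performs **better than PCA with 500 dimensions**, meaning that **only 30 principal components are enough** to retain the information the classifier needs to tell the digits apart. Note that the gaps between the PCA variants are only a few tenths of a percentage point, so they may partly reflect run-to-run variation rather than a real effect.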

## **3. Key Insights**   
1. **PCA helps remove redundant information:**  
   - The **slight improvement in accuracy** for 150 and 30 dimensions suggests that removing unnecessary features **reduces noise** and enhances generalization.

2. **Too many dimensions do not necessarily improve results:**  
   - The **500-dimension model performed slightly worse** than models trained with 150 and 30 dimensions, likely due to retaining some **irrelevant or redundant** features.

## **4. Conclusion**
- **PCA is effective for dimensionality reduction** in MNIST without compromising accuracy.
- **30 dimensions are sufficient** for high classification performance.
- **Higher dimensions (e.g., 500) retain unnecessary details** that slightly reduce generalization.
- PCA can **improve computational efficiency** without major losses in classification quality.



# 4.3 Report

# **How PCA Helps Mitigate the Curse of Dimensionality**

## **What is the Curse of Dimensionality?**
The **curse of dimensionality** refers to the challenges that arise when working with high-dimensional data, such as:
- Increased computational complexity.
- Overfitting due to too many features.
- Data sparsity, making it harder to find meaningful patterns.

## **How PCA Helps Mitigate These Issues**
### 1. **Reduces Redundant Features**
- In high-dimensional spaces, many features may be correlated and not contribute much new information.
- PCA projects data into a lower-dimensional space, **removing redundancy** and keeping only the most important features.

### 2. **Improves Model Generalization**
- High-dimensional models often **overfit**, capturing noise instead of meaningful patterns.
- PCA helps models **focus on the most significant features**, improving performance on unseen data.

### 3. **Enhances Computational Efficiency**
- Training models in high dimensions requires **more memory and computation**.
- PCA reduces dimensions, making learning algorithms **run faster** and with lower memory usage.

### 4. **Improves Data Visualization & Interpretation**
- High-dimensional data is hard to visualize.
- PCA projects data onto **2D or 3D spaces**, making it easier to analyze clusters and patterns.

---

# **When PCA Might Not Be Effective in High-Dimensional Spaces**
While PCA is useful, it **may not always work well**, especially in certain cases:

###  **1. Non-Linear Data Distributions**
- PCA **assumes linear relationships** between features.
- If data has complex **non-linear structures** (e.g., curved manifolds), PCA fails to capture essential information.
- 🔹 **Alternative:** Use **t-SNE, UMAP, or Kernel PCA** for non-linear transformations.

###  **2. When Variance Does Not Correlate with Importance**
- PCA selects components based on **variance**, assuming that high variance features are more important.
- In some cases, **low-variance features may be critical** (e.g., medical diagnosis where small variations indicate disease presence).
- 🔹 **Alternative:** Use **Lasso regression** or **mutual information-based feature selection**.

###  **3. Sparse Data Issues**
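- If the dataset has many **zero or near-zero values** (e.g., sparse text representations in NLP such as bag-of-words or TF-IDF vectors), PCA may not work well: mean-centering turns the sparse matrix into a dense one, which is costly in memory and yields components that are hard to interpret.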
- 🔹 **Alternative:** Use **Autoencoders** or **factorization techniques (e.g., NMF)**.

---


**PCA is highly effective for mitigating the curse of dimensionality**, particularly when linear relationships exist and variance is a useful indicator of importance.  
**However, it may not work well in non-linear, sparse, or highly structured data**, where other dimensionality reduction methods might be better suited.


# **Does PCA Always Capture the Most Informative Directions?**

## **PCA Assumption**
PCA assumes that **the directions of maximum variance contain the most important information** about the data.  
This works well in many cases, but **this assumption is not always valid**.  

---

# **When PCA's Assumption Might Fail**
PCA may fail when:
1. **Low-Variance Features are Important**  
2. **Non-Linear Relationships Exist**  
3. **Class Labels Are Not Captured by Variance**  

### **Example: Handwritten Digit Classification (MNIST)**
Consider **MNIST**, a dataset of handwritten digits (0-9).  

🔹 PCA will find the **principal components** based on pixel intensity variance.  
🔹 However, **digit identity (class information) is not always aligned with variance**.  
🔹 Some **low-variance features** (e.g., stroke thickness, subtle curve differences) may be critical for distinguishing similar digits (like 3 vs. 8).  

**PCA may discard these features, reducing classification performance.**  

---

### **Example: XOR Problem (Non-Linear Data)**
The **XOR dataset** is a classic example where PCA fails.  

#### **XOR Dataset Structure**
- Two classes are **non-linearly separable** (they form an "XOR" pattern in feature space).
- Variance alone **does not separate the classes**.

#### **Why PCA Fails?**
- PCA finds directions **maximizing variance**, but **this does not separate the XOR classes**.
- PCA projects XOR data in a way that **still makes it non-separable**.  
- A **non-linear method** (e.g., **Kernel PCA or t-SNE**) is needed instead.

---

**PCA is powerful, but its assumption can fail when:**
- Important information is in **low-variance directions** (e.g., digit classification).
- The data has **non-linear relationships** (e.g., XOR problem).

 **Alternative methods like Kernel PCA, t-SNE, or deep learning may be better in these cases.**
