In [None]:
### ASSUMING THE DATA HAS BEEN LOADED WITH CAPSTONE_LOADDATA.IPYNB ###

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### Build the standardized pixel matrix: one flattened image per row

# How many dataset items to read; too high is difficult for the PC to handle
NUM_IMAGES = 1000

# Take the first element of each dataset item (the image), convert it to a
# NumPy array and flatten it into a 1-D vector
images = [
    np.array(celebA_dataset[idx][0]).flatten()
    for idx in range(NUM_IMAGES)
]

# Combine the per-image vectors into a single array
images_np = np.array(images)

# Report the shape before standardization
print(f"Image data shape: {images_np.shape}")

# Fit a default StandardScaler on the pixel matrix, then apply it to the
# same matrix
scaler = StandardScaler().fit(images_np)
images_np_scaled = scaler.transform(images_np)

# Report the shape after standardization
print(f"Standardized image data shape: {images_np_scaled.shape}")

### Fit an unrestricted PCA and measure how the variance accumulates

# n_components is left at its default here, so no target size is imposed
pca_full = PCA()
image_pca_full = pca_full.fit_transform(images_np_scaled)

# Per-component share of the variance, and its running total
explained_variance = pca_full.explained_variance_ratio_
cumulative_variance = explained_variance.cumsum()

# argmax over a boolean array returns the index of the first True, i.e. the
# first position where the running total is at least 0.95; adding 1 turns
# that 0-based index into a component count
reached_95 = cumulative_variance >= 0.95
num_components_needed = reached_95.argmax() + 1

print(f"Number of components needed to retain 95% variance: {num_components_needed}")

### Plot outputs (explicit Figure/Axes API)

# Explained variance: one bar per component, x-axis numbered from 1
fig_bar, ax_bar = plt.subplots(figsize=(8, 4))
bar_positions = range(1, len(explained_variance) + 1)
ax_bar.bar(bar_positions, explained_variance, alpha=0.7)
ax_bar.set_xlabel("PCA Component")
ax_bar.set_ylabel("Explained Variance Ratio")
ax_bar.set_title("Explained Variance")
plt.show()

# Cumulative variance: running total against the number of components kept,
# with a dashed red reference line at the 0.95 level
fig_cum, ax_cum = plt.subplots(figsize=(8, 4))
component_counts = range(1, len(cumulative_variance) + 1)
ax_cum.plot(component_counts, cumulative_variance, marker="o")
ax_cum.axhline(y=0.95, color="r", linestyle="--", label="95% Variance")
ax_cum.set_xlabel("Number of Components")
ax_cum.set_ylabel("Cumulative Explained Variance")
ax_cum.set_title("Cumulative Explained Variance")
ax_cum.legend()
plt.show()

### Apply PCA with optimal components for future ML models

# Keep only as many components as were found necessary for 95% variance.
# random_state is pinned because, with svd_solver left at "auto",
# scikit-learn may pick the randomized solver for a truncated fit on a
# matrix this large; that solver draws random numbers, so an unseeded fit
# would hand slightly different features to downstream models on every run.
# The seed is simply ignored if a deterministic solver ends up being used.
pca_optimal = PCA(n_components=num_components_needed, random_state=0)
image_pca_optimal = pca_optimal.fit_transform(images_np_scaled)

# Prints final reduced shape
print(f"Reduced image data shape: {image_pca_optimal.shape}")

### Find top two components in PCA and print associated variance

# Both 2-component fits below pin random_state to the same seed: with
# svd_solver left at "auto", scikit-learn may use the randomized solver for
# a truncated fit like this, and without a seed the printed ratios could
# shift slightly between runs, blurring the with/without-standardization
# comparison.

# Apply PCA to reduce the standardized data to 2 components
pca_2 = PCA(n_components=2, random_state=0)
image_pca_2 = pca_2.fit_transform(images_np_scaled)

# Print the explained variance ratio of each of the two components
print(f"Explained variance (2 components): {pca_2.explained_variance_ratio_}")

# Print cumulative variance (last entry of the running total = sum of both)
cumulative_variance_2 = np.cumsum(pca_2.explained_variance_ratio_)
print(f"Cumulative variance (2 components): {cumulative_variance_2[-1]}")

### Check if standardization affects variance of first two components

# Same reduction, but fitted on the raw (unstandardized) pixel matrix
pca_2b = PCA(n_components=2, random_state=0)
image_pca_2b = pca_2b.fit_transform(images_np)

# Print the explained variance ratio of each of the two components
print(f"Explained variance (2 components w/o standardization): {pca_2b.explained_variance_ratio_}")

# Print cumulative variance (last entry of the running total = sum of both)
cumulative_variance_2b = np.cumsum(pca_2b.explained_variance_ratio_)
print(f"Cumulative variance (2 components w/o standardization): {cumulative_variance_2b[-1]}")