# Dimensionality Reduction
### on synthetic and image data

Dimensionality Reduction, in Python with Scikit-learn

<br>
@Ricardo Almeida

Sources:
- [Dimensionality Reduction with Scikit-Learn: PCA Theory and Implementation](https://medium.com/towards-data-science/dimensionality-reduction-with-scikit-learn-pca-theory-and-implementation-aa224e6ee1f6), by Riccardo Andreoni
- [Using PCA for Data Reduction](https://python.plainenglish.io/using-pca-for-data-reduction-and-face-recognition-on-lfw-dataset), by Ahmed Mujtaba Butt

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_lfw_people
from sklearn.model_selection import train_test_split
import warnings

# Silence every FutureWarning so these messages do not show up in the
# output of the cells below
warnings.simplefilter("ignore", FutureWarning)

### Principal Component Analysis (PCA)

In [None]:
from sklearn.datasets import make_swiss_roll

# Swiss-roll toy dataset: X holds the 3-D points, t the position of each
# point along the roll (used as the colour value in the plots)
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)

# Build a one-component PCA and fit it in a single step
# (fit() returns the estimator itself)
pca = PCA(n_components=1).fit(X)

# Project the points onto that single principal component
X_transformed = pca.transform(X)

print("Original data shape: ", X.shape)
print("Transformed data shape: ", X_transformed.shape)

In [None]:
from sklearn.datasets import make_swiss_roll

# Same swiss-roll sample as before (identical parameters and seed)
X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)

# One-component PCA, created and fitted in a single chained call
pca = PCA(n_components=1).fit(X)

# Explained variance ratio: the share of the total variance carried by
# each retained principal component (one value here, as n_components=1)
eva = pca.explained_variance_ratio_

print(eva)

In [None]:
# 3-D scatter of the swiss roll; the three columns of X are the x, y and z
# coordinates and t sets the colour of each point
fig, ax = plt.subplots(subplot_kw={"projection": "3d"})
ax.scatter(*X.T, c=t, cmap="viridis")
plt.show()

In [None]:
from sklearn.datasets import fetch_openml

# Fetch MNIST ('mnist_784', version 1) from OpenML as plain arrays
# (as_frame=False) rather than as a DataFrame
mnist = fetch_openml("mnist_784", version=1, as_frame=False, parser="auto")
X = mnist.data

# PCA without n_components: no component limit is requested
pca = PCA().fit(X)

# Share of the total variance explained by each principal component
eva = pca.explained_variance_ratio_

# Uncomment to inspect the ratios:
#print(np.round(eva, decimals=4))

In [None]:
# A float n_components between 0 and 1 asks PCA to keep just enough
# components to explain that fraction of the variance (95% here)
pca = PCA(n_components=0.95).fit(X)
X_transformed = pca.transform(X)

# The second entry of the shape is the number of components kept
print(X_transformed.shape)

### Kernel PCA (kPCA)

In [None]:
# Data generator and the kernel variant of PCA used in this section
from sklearn.decomposition import KernelPCA
from sklearn.datasets import make_swiss_roll

# Seeded swiss-roll sample: X_swiss are the 3-D points, t the position of
# each point along the roll
X_swiss, t = make_swiss_roll(
    n_samples=1500,
    noise=0.3,
    random_state=2,
)

In [None]:
# Kernel PCA with a sigmoid kernel, reducing the swiss roll to 2 dimensions.
# fit_inverse_transform=True additionally learns the mapping back to the
# original space.
pca_swiss = KernelPCA(
    n_components=2,
    kernel="sigmoid",
    gamma=1e-3,
    coef0=1,
    fit_inverse_transform=True,
)

# Fit on the 3-D points and get their 2-D representation in one call
X_pca_swiss = pca_swiss.fit_transform(X_swiss)

In [None]:
# 3-D scatter of the original swiss-roll points, coloured by t
fig, ax = plt.subplots(figsize=(8, 6), subplot_kw={"projection": "3d"})
ax.scatter(*X_swiss.T, c=t, cmap="viridis")
plt.show()

In [None]:
# 2-D scatter of the Kernel PCA output: first component on the x axis,
# second on the y axis, coloured by t
fig, ax = plt.subplots()
ax.scatter(*X_pca_swiss.T, c=t, cmap="viridis")
plt.show()

### Locally Linear Embedding (LLE) 

In [None]:
from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.decomposition import KernelPCA

# Same swiss-roll sample as in the Kernel PCA section (identical parameters
# and seed): X_swiss are the 3-D points, t the position along the roll
X_swiss, t = make_swiss_roll(n_samples=1500, noise=0.3, random_state=2)

# Locally Linear Embedding down to 2 dimensions, reconstructing each point
# from its 15 nearest neighbours.
# random_state is fixed so the embedding is reproducible from run to run,
# like the seeded data generation above: the ARPACK eigen-solver otherwise
# starts from a random vector.
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=15, random_state=42)

# NOTE: the variable keeps its "pca" name because the plotting cell reads it,
# although the embedding comes from LLE
X_pca_swiss_lle = lle.fit_transform(X_swiss)

In [None]:
# 2-D scatter of the LLE embedding, coloured by t
fig, ax = plt.subplots()
ax.scatter(*X_pca_swiss_lle.T, c=t, cmap="viridis")
plt.show()

## Dimensionality Reduction

In [None]:
# Load Labeled Faces in the Wild, keeping only the people who have at
# least 60 pictures in the dataset
lfw_people = fetch_lfw_people(min_faces_per_person=60)

# Flattened pixel vectors (one row per picture) and the person id of each row
X, y = lfw_people.data, lfw_people.target

# Person names, indexed by the ids stored in y
target_names = lfw_people.target_names
# Shape of the whole image stack, not of a single picture
image_shape = lfw_people.images.shape

# Summary of what was loaded
print(f"Image shape: {image_shape}")
print(f"Shape of image data (X): {X.shape}")
print(f"Number of classes (unique persons): {len(target_names)}")

In [None]:
# Header, then one indented line per person found in the dataset
print("Class names (persons):\n")
for person in target_names:
    print(f"   {person}")

In [None]:
# Display the first row of X: the flattened pixel values of the first picture
X[0]

In [None]:
# Number of values (pixels) stored for each picture
X.shape[1]

In [None]:
# Take the picture size from the dataset instead of hard-coding 62x47, so
# the cell keeps working if the faces are loaded at another resolution.
# lfw_people.images is the stack of 2-D pictures; its shape after the first
# axis is (height, width).
img_height, img_width = lfw_people.images.shape[1:]

# Show the first 16 pictures in a 4x4 grid, each titled with the person's name
plt.figure(figsize=(12, 12))
for i in range(16):
    plt.subplot(4, 4, i + 1)
    # Each row of X is a flattened picture: restore its 2-D layout to draw it
    plt.imshow(X[i].reshape(img_height, img_width), cmap='gray')
    plt.title(f'{target_names[y[i]]}:')
plt.show()

In [None]:
# Hold out 25% of the pictures for testing; the fixed seed makes the split
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=6868,
)

Implement PCA and fit it to the training data

In [None]:
from sklearn.decomposition import PCA

# A float n_components between 0 and 1 asks PCA to keep just enough
# components to explain that fraction of the variance (97% here)
pca = PCA(n_components=0.97)
# Fit on the training pictures only, so the test set stays unseen
pca.fit(X_train)

In [None]:
# Project both splits onto the components learned from the training set
X_train_pca, X_test_pca = (pca.transform(part) for part in (X_train, X_test))

In [None]:
# Display the shape of the reduced training data: (pictures, components kept)
X_train_pca.shape

In [None]:
# Number of principal components kept (one row of components_ per component)
print(pca.components_.shape[0])

In [None]:
# Display the first principal component: one weight per pixel of a picture
pca.components_[0]

Show the resulting components, as images

In [None]:
import matplotlib.pyplot as plt

# Take the picture size from the dataset instead of hard-coding 62x47, so
# the cell keeps working if the faces are loaded at another resolution
img_height, img_width = lfw_people.images.shape[1:]

# Show up to 9 components; capping at the number actually kept avoids an
# IndexError when a lower variance threshold leaves fewer than 9
n_shown = min(9, len(pca.components_))

plt.figure(figsize=(12, 12))
for i in range(n_shown):
    plt.subplot(3, 3, i + 1)
    # A component has one weight per pixel, so it can be drawn as a picture
    plt.imshow(pca.components_[i].reshape(img_height, img_width), cmap='gray')
    plt.title(f'Component {i+1}')
plt.show()

#### Task #1

**Exercise**:

Choosing to keep 0.97 of variance has allowed us to reduce the dimensions from **2914** to **242**.

Check the reduction of dimensions (PCA components) for other variance thresholds: 0.99, 0.98, ... 0.90

In [None]:
### Write a loop that gets the number of components kept for each of the 10 variance values from 0.99 down to 0.90
### printing Variance Kept, Components
### such as:
### Variance Kept, Components
### ...
### 0.97 Nr. Components: 242
### ...
### YOUR CODE HERE:




## Clustering faces

Consider a subset with a single picture per person (8 persons):
- X_unique: train dataset with faces
- y_unique: label vector

In [None]:
# np.unique with return_index=True gives the sorted distinct labels together
# with the position of the first training picture carrying each label
y_unique, first_occurance_indices = np.unique(
    y_train,
    return_index=True,
)

# Pick those rows: exactly one picture per person
X_unique = X_train[first_occurance_indices, :]

**Exercise**:

Using the `X_unique` dataset, use a clustering algorithm to split the 8 persons (faces) into 3 groups.

In [None]:
### Cluster the faces dataset, and check the elements (person names) of each cluster
### YOUR CODE HERE:


