# **Autoencoders in Computer Vision - Jupyter Notebook Tutorial**

# Section 1: Introduction to Autoencoders


NOTES:
Autoencoders are neural networks designed to learn compressed representations of data.
They work by encoding the input into a latent space and then decoding it back to reconstruct the input.

This is useful in scenarios where labeled data is scarce but we still want to learn meaningful features.
The learned features (latent vectors) are useful for:
- Image compression
- Noise reduction (denoising autoencoders)
- Anomaly detection (reconstruction error)
- Pretraining for classification or generative models
- Transfer learning


# Section 2: Setup and Imports

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import matplotlib.pyplot as plt
import numpy as np

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cpu


# Section 3: Load Dataset (MNIST)

In [None]:
# The only preprocessing step is converting each MNIST image to a tensor.
transform = transforms.Compose([transforms.ToTensor()])

# MNIST training split, downloaded into ./data on first use.
dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

# Hold out 10,000 of the 60,000 images for validation.
train_data, val_data = random_split(dataset, [50000, 10000])

# Only the training batches are shuffled; validation keeps a fixed order.
train_loader = DataLoader(train_data, shuffle=True, batch_size=64)
val_loader = DataLoader(val_data, shuffle=False, batch_size=64)

100%|██████████| 9.91M/9.91M [00:01<00:00, 5.51MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 160kB/s]
100%|██████████| 1.65M/1.65M [00:01<00:00, 1.52MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 4.94MB/s]


# Section 4: Define Basic Autoencoder

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder for single-channel 28x28 images.

    The encoder flattens each image and compresses it through one hidden
    layer into a latent vector. The decoder mirrors that path, ends in a
    sigmoid so every reconstructed pixel lies in [0, 1], and reshapes the
    result back to ``(1, 28, 28)``.

    Args:
        latent_dim: Size of the latent vector produced by the encoder.
        hidden_dim: Width of the hidden layer in both encoder and decoder.

    The defaults (64 and 128) give the same architecture as the original
    hard-coded network, so ``Autoencoder()`` is unchanged.
    """

    def __init__(self, latent_dim: int = 64, hidden_dim: int = 128):
        super().__init__()
        input_dim = 28 * 28

        self.encoder = nn.Sequential(
            nn.Flatten(),
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid(),
            nn.Unflatten(1, (1, 28, 28)),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Encode ``x`` into the latent space and return its reconstruction."""
        return self.decoder(self.encoder(x))

# Build the autoencoder on the selected device, then pair it with a
# mean-squared-error reconstruction loss and an Adam optimizer (lr = 1e-3).
model = Autoencoder()
model = model.to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Section 5: Training Autoencoder

In [None]:
epochs = 5
for epoch in range(epochs):
    # Make sure the model is in training mode even if an earlier cell
    # (e.g. the visualisation) switched it to eval mode.
    model.train()
    running_loss, seen = 0.0, 0

    # Labels are ignored: the target of the reconstruction is the input itself.
    for images, _ in train_loader:
        images = images.to(device)
        outputs = model(images)
        loss = criterion(outputs, images)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch by its size so a smaller final batch does not
        # skew the epoch average.
        batch_size = images.size(0)
        running_loss += loss.item() * batch_size
        seen += batch_size

    # Report the mean loss over the whole epoch rather than the loss of the
    # last mini-batch only, which is noisy and can rise between epochs.
    epoch_loss = running_loss / max(seen, 1)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

Epoch 1/5, Loss: 0.0189
Epoch 2/5, Loss: 0.0109
Epoch 3/5, Loss: 0.0077
Epoch 4/5, Loss: 0.0091
Epoch 5/5, Loss: 0.0095


# Section 6: Visualize Reconstruction

In [None]:
def show_reconstruction(loader=None):
    """Plot one input image next to the autoencoder's reconstruction of it.

    Args:
        loader: Iterable of ``(images, labels)`` batches to take the example
            from. Defaults to ``train_loader``, which is what the function
            always used before; pass ``val_loader`` to inspect held-out data.

    Side effects:
        Puts ``model`` into eval mode and displays a matplotlib figure.
    """
    if loader is None:
        loader = train_loader

    model.eval()
    # Only the first batch is needed, so fetch it directly instead of
    # starting a loop and breaking out of it.
    images, _ = next(iter(loader))
    images = images.to(device)
    with torch.no_grad():
        outputs = model(images)

    fig, axes = plt.subplots(1, 2)
    panels = (("Original", images[0]), ("Reconstructed", outputs[0]))
    for ax, (title, image) in zip(axes, panels):
        # squeeze() drops the channel dimension so imshow receives a 2-D image.
        ax.imshow(image.cpu().squeeze(), cmap='gray')
        ax.set_title(title)
    plt.show()


After visualizing the reconstructed images, the goal is to evaluate how well the model learned the compressed representation.
If the reconstruction is good, it means the encoder has captured the essential information.

These latent features can now be reused:
- For clustering similar images
- As inputs to a classifier
- As building blocks for generative models like Variational Autoencoders or GANs

In [None]:
# Display one original image side by side with its reconstruction.
show_reconstruction()

# Section 7: Classification Using Latent Features

We now use the encoder's output (latent vector) to train a simple classifier.
This demonstrates the power of unsupervised representation learning.

In [None]:
class LatentClassifier(nn.Module):
    """Small MLP head that maps latent vectors to class logits.

    Args:
        latent_dim: Size of the incoming latent vectors.
        hidden_dim: Width of the single hidden layer.
        num_classes: Number of output logits.

    The defaults (64, 32, 10) give the same network as the original
    hard-coded version, so ``LatentClassifier()`` is unchanged.
    """

    def __init__(self, latent_dim: int = 64, hidden_dim: int = 32,
                 num_classes: int = 10):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, z: torch.Tensor) -> torch.Tensor:
        """Return unnormalised class scores (logits) for latent batch ``z``."""
        return self.fc(z)

# Classifier head on the selected device, trained with cross-entropy on the
# class logits and its own Adam optimizer (lr = 1e-3).
classifier = LatentClassifier()
classifier = classifier.to(device)
clf_criterion = nn.CrossEntropyLoss()
clf_optimizer = optim.Adam(params=classifier.parameters(), lr=1e-3)

# Train the classifier using frozen encoder

In [None]:
# Put the autoencoder in inference mode and turn off gradient tracking for
# every one of its parameters, so its weights stay fixed from here on.
model.eval()
model.requires_grad_(False)

# Classification training loop

In [None]:
epochs = 5
for epoch in range(epochs):
    classifier.train()
    running_loss, seen = 0.0, 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        # The encoder only supplies features here, so its forward pass runs
        # without building an autograd graph.
        with torch.no_grad():
            z = model.encoder(images)
        preds = classifier(z)
        loss = clf_criterion(preds, labels)

        clf_optimizer.zero_grad()
        loss.backward()
        clf_optimizer.step()

        # Weight each batch by its size so a smaller final batch does not
        # skew the epoch average.
        batch_size = labels.size(0)
        running_loss += loss.item() * batch_size
        seen += batch_size

    # Report the mean loss over the whole epoch rather than the loss of the
    # last mini-batch only, which is a noisy indicator of progress.
    epoch_loss = running_loss / max(seen, 1)
    print(f"[Classifier] Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

# Evaluate classifier

In [None]:
# Measure top-1 accuracy of the classifier on the validation loader,
# feeding it the encoder's latent vectors.
classifier.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in val_loader:
        images = images.to(device)
        labels = labels.to(device)

        # Encode, classify, and take the highest-scoring class per sample.
        preds = classifier(model.encoder(images))
        predicted = preds.argmax(dim=1)

        correct += predicted.eq(labels).sum().item()
        total += labels.shape[0]

accuracy = 100 * correct / total
print(f"Validation Accuracy using latent features: {accuracy:.2f}%")

# Final Notes:

We explored a basic autoencoder in this notebook; the same idea extends to variants such as denoising and masked autoencoders.
They enable learning rich visual representations without any labels.

By training to reconstruct input images or masked portions,
autoencoders learn **meaningful features** that capture underlying structure in the data.
These features can be used as a foundation for other computer vision tasks:
- Classification (as demonstrated)
- Clustering
- Generative Modeling (e.g. VAEs, GANs)

# Visualizing the Latent Space of Autoencoders

This section visualizes the latent space of the autoencoder trained on the MNIST dataset. We will try to develop some intuition about the gaps that prevent plain autoencoders from being generative in nature.

In [None]:
import numpy as np
from sklearn.decomposition import PCA

imgs_visualize = 5000
figsize = 10

# Walk the validation subset once (instead of once per tensor) and split it
# into stacked images and their digit labels.
val_samples = list(val_data)
val_images = torch.stack([image for image, _ in val_samples])  # shape [10000, 1, 28, 28]
val_labels = torch.tensor([label for _, label in val_samples])

# Randomly sample indices from the validation set without replacement. The
# count is capped so a smaller set does not make np.random.choice raise.
n_points = min(imgs_visualize, len(val_images))
indices = np.random.choice(len(val_images), n_points, replace=False)
images_sample = val_images[indices].to(device)
labels_sample = val_labels[indices].numpy()

# Get latent embeddings (encoder output)
model.eval()
with torch.no_grad():
    latent_vectors = model.encoder(images_sample)  # shape [n_points, latent_dim]

# The latent space has more than two dimensions, so project it onto its first
# two principal components for plotting.
pca = PCA(n_components=2)
latent_2d = pca.fit_transform(latent_vectors.cpu().numpy())

# Plot, colouring every point by its digit so that class structure (or the
# lack of it) in the latent space is visible.
plt.figure(figsize=(figsize, figsize))
points = plt.scatter(
    latent_2d[:, 0],
    latent_2d[:, 1],
    c=labels_sample,
    cmap="tab10",
    vmin=-0.5,  # centre each of the ten colours on an integer label
    vmax=9.5,
    alpha=0.5,
    s=2,
)
plt.colorbar(points, ticks=list(range(10)), label="Digit")
plt.xlabel("Dimension-1", size=20)
plt.ylabel("Dimension-2", size=20)
plt.xticks(size=15)
plt.yticks(size=15)
# The plot is a 2-D PCA projection; the latent space itself is not 2-D.
plt.title("2D PCA Projection of the Latent Space (MNIST)", size=20)
plt.grid(True)
plt.show()

Here, we will:
- Randomly sample points in the latent space (from a normal distribution),

- Feed those points into the trained decoder,

- Visualize the reconstructed images.

This will demonstrate that the vanilla autoencoder latent space is irregular and unstructured, so random latent vectors produce meaningless/noisy images.

In [None]:
import torch
import matplotlib.pyplot as plt

# Number of random latent samples to generate
num_samples = 10

# Random latent vectors drawn from a standard normal distribution
# (mean=0, std=1); 64 is the same latent size as the encoder output.
random_latents = torch.randn(num_samples, 64).to(device)

# Decode the random latent vectors
model.eval()
with torch.no_grad():
    generated_images = model.decoder(random_latents)  # output shape [num_samples, 1, 28, 28]

# squeeze=False makes plt.subplots always return a 2-D array of axes, so the
# loop also works for num_samples == 1 (otherwise a bare Axes is returned and
# indexing it fails).
fig, axes = plt.subplots(
    1, num_samples, figsize=(num_samples * 2, 2), squeeze=False
)
for i, (ax, image) in enumerate(zip(axes[0], generated_images.cpu()), start=1):
    ax.imshow(image.squeeze(), cmap='gray')
    ax.axis('off')
    ax.set_title(f'Sample {i}')

plt.show()

Since our autoencoder's latent space is not regularized, vectors sampled from a standard normal distribution generally do not correspond to the encodings of meaningful images.

The decoder tries to decode these random latent vectors but produces noisy, uninterpretable images.

This illustrates the motivation for models like Variational Autoencoders (VAEs) that enforce latent space priors to enable meaningful generation.