# Introduction to Convolutional Neural Networks

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt


I want to create a model to distinguish between circles and squares in images. First, I create a dataset containing images of both classes.

In [None]:
class ShapesDataset(Dataset):
    """Synthetic single-channel dataset of filled squares (label 0) and circles (label 1).

    Every image is generated once, in ``__init__``, from NumPy's global random
    state, so calling ``np.random.seed`` beforehand makes the dataset
    reproducible.

    Args:
        num_samples: Number of images to generate.
        image_size: Height and width, in pixels, of each image.

    Attributes:
        data: float32 tensor of shape ``(num_samples, 1, image_size, image_size)``;
            background pixels are 0.0 and shape pixels are 1.0.
        labels: int64 tensor of shape ``(num_samples,)``.

    Raises:
        ValueError: May be raised by ``np.random.randint`` when ``image_size``
            is too small for the size ranges used below (for example, the
            circle radius range ``[5, image_size // 4)`` is empty when
            ``image_size < 24``).
    """

    def __init__(self, num_samples, image_size=28):
        self.num_samples = num_samples
        self.image_size = image_size
        self.data, self.labels = self.generate_data(num_samples, image_size)

    def generate_data(self, num_samples, image_size):
        """Draw ``num_samples`` random shapes and return ``(images, labels)`` tensors."""
        # One contiguous buffer instead of a Python list of arrays:
        # torch.tensor(list_of_ndarrays) is the slow path and emits a UserWarning.
        images = np.zeros((num_samples, image_size, image_size), dtype=np.float32)
        labels = np.zeros(num_samples, dtype=np.int64)
        # Pixel coordinate grids do not depend on the sample, so build them once.
        y, x = np.ogrid[:image_size, :image_size]

        for index in range(num_samples):
            label = np.random.randint(0, 2)
            image = images[index]  # view: writes land directly in `images`
            if label == 0:
                # Draw a square that fits entirely inside the image
                side = np.random.randint(5, image_size // 2)
                top_left_x = np.random.randint(0, image_size - side)
                top_left_y = np.random.randint(0, image_size - side)
                image[top_left_x:top_left_x + side, top_left_y:top_left_y + side] = 1.0
            else:
                # Draw a circle whose centre keeps it fully inside the image
                radius = np.random.randint(5, image_size // 4)
                center_x = np.random.randint(radius, image_size - radius)
                center_y = np.random.randint(radius, image_size - radius)
                mask = (x - center_x) ** 2 + (y - center_y) ** 2 <= radius ** 2
                image[mask] = 1.0
            labels[index] = label

        # unsqueeze(1) adds the channel axis expected by Conv2d: (N, H, W) -> (N, 1, H, W)
        return torch.from_numpy(images).unsqueeze(1), torch.from_numpy(labels)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Generate the train/test splits and wrap each one in a DataLoader.
# Only the training loader shuffles; the test loader keeps generation order.
train_dataset = ShapesDataset(1000)
test_dataset = ShapesDataset(200)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=10)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=10)

Let's display some examples.

In [None]:
# Preview the first five training images side by side, titled with their class.
fig, axes = plt.subplots(nrows=1, ncols=5, figsize=(15, 3))
for i, ax in enumerate(axes):
    image, label = train_dataset[i]
    # squeeze() drops the channel axis so imshow receives a 2-D array
    ax.imshow(image.squeeze(), cmap='gray')
    ax.set_title(f"Label: {'Square' if label.item() == 0 else 'Circle'}")
    ax.axis('off')
plt.show()

Let's create a network that uses the raw pixels as features.

In [None]:
image, label = train_dataset[0]

print("Original", image.shape)
# The first step is flattening the image so it can enter the network.
# `image` is a single unbatched (1, 28, 28) sample, so start_dim=1 keeps the
# leading axis and merges the two spatial axes into one of length 784.
flatten = nn.Flatten(start_dim=1)
flatten(image).shape

In [None]:
# Fully connected baseline: every pixel is treated as an independent input feature.
dense_layers = [
    nn.Flatten(start_dim=1),                       # (N, 1, 28, 28) -> (N, 784)
    nn.Linear(in_features=784, out_features=20),
    nn.Tanh(),
    nn.Linear(in_features=20, out_features=2),
    nn.Softmax(dim=1),                             # per-sample class probabilities
]
model = nn.Sequential(*dense_layers)
# Total number of learnable values (weights and biases) in the baseline.
sum(param.numel() for param in model.parameters())

There are some problems with this solution:
- Every pixel is connected to every hidden unit of the first linear layer. This creates a very large number of parameters to learn (784 × 20 weights in that layer alone).
- The model is very sensitive to image translations, because each pixel position is considered a separate feature.
- Locality is completely missed: the model has no notion of which pixels are neighbors, so local structures such as borders cannot be detected directly.

## Convolutional Neural Networks
A convolution is a mathematical operation that combines two sets of information.

In computer vision, a convolution transforms an image using a kernel. It turns pixels into features that take into account, in a particular way, the relation of each pixel to its neighbors.

In [None]:
def manual_convolution(input_data, kernel):
    """Slide ``kernel`` over ``input_data`` and return the map of window responses.

    The kernel is applied as given (it is not flipped), with stride 1 and no
    padding, so only positions where the kernel fits entirely are evaluated.

    Args:
        input_data: 2-D tensor of shape ``(H, W)``.
        kernel: 2-D tensor of shape ``(kh, kw)``.

    Returns:
        float32 tensor of shape ``(H - kh + 1, W - kw + 1)`` on the same device
        as ``input_data``; each entry is the sum of the element-wise product
        of the kernel and the window whose top-left corner is at that position.
    """
    rows, cols = input_data.shape
    k_rows, k_cols = kernel.shape

    # Number of positions at which the kernel fits along each axis
    out_rows = rows - k_rows + 1
    out_cols = cols - k_cols + 1

    result = torch.zeros((out_rows, out_cols), dtype=torch.float32, device=input_data.device)

    for top in range(out_rows):
        bottom = top + k_rows
        for left in range(out_cols):
            # Window currently covered by the kernel
            window = input_data[top:bottom, left:left + k_cols]
            # Weighted sum of the window is the response at (top, left)
            result[top, left] = (window * kernel).sum()

    return result


In [None]:
def display_image(image, title='',*, cmap='gray', **kwargs):
    """Show ``image`` as a standalone figure with the axes hidden.

    Args:
        image: Array-like accepted by ``plt.imshow``.
        title: Text placed above the image.
        cmap: Colormap name passed to ``plt.imshow`` (keyword-only).
        **kwargs: Any further options, forwarded to ``plt.imshow`` unchanged.
    """
    imshow_options = dict(kwargs, cmap=cmap)
    plt.imshow(image, **imshow_options)
    plt.axis('off')
    plt.title(title)
    plt.show()

# Toy input: a 10x10 black canvas with a white 4x4 square in the middle.
image = np.zeros(shape=(10, 10), dtype=np.float32)
image[3:7, 3:7] = 1.0

# Show the untouched input for reference
display_image(image, title='Original Image')

# conv2d works on (batch, channel, height, width) input, so prepend two
# singleton axes: (10, 10) -> (1, 1, 10, 10)
image_tensor = torch.tensor(image)[None, None, :, :]

In [None]:
def show_by_kernel(image_tensor, kernel, title):
    """Filter ``image_tensor`` with ``kernel`` and plot the result under ``title``.

    ``padding='same'`` makes the filtered image the same height and width as
    the input. The singleton batch and channel axes are squeezed away before plotting.
    """
    filtered = F.conv2d(image_tensor, kernel, padding='same')
    display_image(filtered.squeeze().numpy(), title)

# 3x3 filter whose weights sum to zero: a window of constant intensity gives 0,
# so only places where the intensity changes produce a response.
edge_detection_kernel = torch.tensor(
    [[-1, -1, -1],
     [-1, 8, -1],
     [-1, -1, -1]],
    dtype=torch.float32,
).reshape(1, 1, 3, 3)  # (out_channels, in_channels, kH, kW), as conv2d expects
show_by_kernel(image_tensor, edge_detection_kernel, 'Edge Detection Output')


In [None]:
# Columns are weighted -1 / 0 / +1, so the response is the difference between
# the right-hand and left-hand neighbour columns: it is non-zero where the
# intensity changes horizontally, i.e. along vertical edges.
vertical_edge_kernel = torch.tensor([[
    [-1, 0, 1], 
    [-1, 0, 1], 
    [-1, 0, 1]]], dtype=torch.float32).unsqueeze(0)
# Title fixed: this cell previously reused the "Horizontal" title of the next one.
show_by_kernel(image_tensor, vertical_edge_kernel, 'Vertical Edge Detection Output')


In [None]:
# Rows are weighted +1 / 0 / -1, so the response is the difference between the
# row above and the row below: it is non-zero along horizontal edges.
horizontal_edge_kernel = torch.tensor(
    [[1, 1, 1],
     [0, 0, 0],
     [-1, -1, -1]],
    dtype=torch.float32,
).reshape(1, 1, 3, 3)  # (out_channels, in_channels, kH, kW)
show_by_kernel(image_tensor, horizontal_edge_kernel, 'Horizontal Edge Detection Output')


In [None]:
# Centre pixel weighted 4, its four direct neighbours weighted -1. On the
# filled square this gives 0 inside, 1 along straight edges (three neighbours
# lit) and 2 at the corners (two neighbours lit), so corners stand out most.
corner_kernel = torch.tensor(
    [[0, -1, 0],
     [-1, 4, -1],
     [0, -1, 0]],
    dtype=torch.float32,
).reshape(1, 1, 3, 3)  # (out_channels, in_channels, kH, kW)
show_by_kernel(image_tensor, corner_kernel, 'Corners')

Convolutions produce new features that contain information about a region of pixels, so they can now include information about borders.

If we train a network on convolved values, we can give it more informative features in a more compact form:
- Learn the kernels in the first layers

Additionally, we need to reduce the size of the images at each layer (pooling). This provides:
- Lower dimensionality, and therefore fewer parameters
- Translation invariance, by summarizing features over regions
- Hierarchical features, by producing more abstract features at each layer
- Noise reduction

CNNs are neural networks that use convolutions to create higher-level features based on the input images:
- In convolutional layers, the kernels are learnt
- All operations are differentiable, so the kernels can be learned by gradient descent and backpropagation

In [None]:
class SimpleCNN(nn.Module):
    """Minimal CNN for 1x28x28 images: two conv -> tanh -> max-pool stages and a linear head.

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it already-normalized probabilities
    would apply softmax twice and flatten the gradients. Use ``predict_proba``
    when probabilities are needed; the arg-max class is the same either way.

    Spatial sizes for a 28x28 input: 28 -> pool(3) -> 9 -> pool(3) -> 3, hence
    the ``3 * 3`` input features of the linear layer.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=3, stride=3)
        self.conv2 = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, padding=1)
        self.fc = nn.Linear(3 * 3, 2)  # 1 channel * 3 * 3 pixels after the second pooling
        self.tanh = nn.Tanh()
        # Not used by forward(); kept as an attribute and used by predict_proba().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return logits of shape ``(N, 2)`` for a batch ``x`` of shape ``(N, 1, 28, 28)``."""
        x = self.pool(self.tanh(self.conv1(x)))
        x = self.pool(self.tanh(self.conv2(x)))
        x = x.view(x.shape[0], -1)  # Flatten everything except the batch axis
        return self.fc(x)

    def predict_proba(self, x):
        """Return class probabilities (softmax of the logits), shape ``(N, 2)``."""
        return self.softmax(self.forward(x))

# Fresh, untrained instance of the SimpleCNN class defined above.
model = SimpleCNN()


In [None]:
# Learnable parameter count: conv1 (9 weights + 1 bias) + conv2 (9 + 1)
# + fc (3*3*2 weights + 2 biases) = 40, versus 15,742 for the dense baseline.
sum(p.nelement() for p in model.parameters())

Let's train the model.

In [None]:
# Define the loss function and optimizer.
# NOTE: nn.CrossEntropyLoss applies log-softmax internally, so it is designed
# to receive the model's raw, unnormalized scores.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 30
# Log about ten times per run. max(1, ...) keeps the modulo below valid when
# num_epochs < 10, where num_epochs // 10 would be 0 and raise ZeroDivisionError.
log_every = max(1, num_epochs // 10)
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:

        # forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # backward pass (clear the gradients left over from the previous batch first)
        optimizer.zero_grad()
        loss.backward()

        # update
        optimizer.step()
        running_loss += loss.item()
    if epoch % log_every == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader)}")

print("Training complete")

# Evaluation loop: accuracy on the held-out test set, with gradients disabled
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        # Index of the highest score per sample is the predicted class
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy: {100 * correct / total}%")


Let's now explore the convolutions the model learned during training.

In [None]:
# First square (label 0) and first circle (label 1) of the training set,
# selected with boolean masks over the dataset's label tensor.
square_1 = train_dataset.data[train_dataset.labels == 0][0]
circle_1 = train_dataset.data[train_dataset.labels == 1][0]
for example in (square_1, circle_1):
    display_image(example.squeeze())


In [None]:
# Learned kernel of the first convolution; conv1 has one input and one output
# channel with a 3x3 kernel, so the tensor has shape (1, 1, 3, 3).
model.conv1.weight

In [None]:
# Raw output of the first convolution (before tanh and pooling) for each example.
with torch.no_grad():
    for sample in (square_1, circle_1):
        image = model.conv1(sample).squeeze()
        display_image(image)


In [None]:
# Learned kernel of the second convolution, also of shape (1, 1, 3, 3).
model.conv2.weight

Let's apply the pooling and the second convolution to both examples.

In [None]:
# Output of the second convolution (before its tanh and pooling) for each example.
with torch.no_grad():
    for sample in (square_1, circle_1):
        first_stage = model.pool(torch.tanh(model.conv1(sample)))
        image = model.conv2(first_stage).squeeze()
        display_image(image)

And then we apply the last pooling

In [None]:
# Final feature map: both conv -> tanh -> pool stages applied to each example.
with torch.no_grad():
    for sample in (square_1, circle_1):
        first_stage = model.pool(torch.tanh(model.conv1(sample)))
        second_stage = model.pool(torch.tanh(model.conv2(first_stage)))
        image = second_stage.squeeze()
        display_image(image)

Let's show the results for some of the circles and squares.

In [None]:
# Up to six examples of each class, taken in dataset order.
squares = list(train_dataset.data[train_dataset.labels == 0][:6])
circles = list(train_dataset.data[train_dataset.labels == 1][:6])

# One 2x3 grid per class showing the final feature map of each example.
with torch.no_grad():
    for group in (squares, circles):
        fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(5, 5))
        for i, ax in enumerate(axes.flat):
            first_stage = model.pool(torch.tanh(model.conv1(group[i])))
            image = model.pool(torch.tanh(model.conv2(first_stage))).squeeze()
            ax.imshow(image, cmap='gray')
        plt.show()
        

In real, more complex applications, we usually apply several different convolutions in parallel to the same input. The result of each convolution is stored in a different channel of the output.

In [None]:
# Single convolution kernel: one 3x3 filter over one input channel.
# Conv2d weights are laid out as (out_channels, in_channels, kH, kW) -> (1, 1, 3, 3).
nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, padding=1).weight.shape


In [None]:
# Batch of 10 single-channel 28x28 images. padding=1 with a 3x3 kernel keeps
# the spatial size, so the output shape is (10, 1, 28, 28).
image = torch.randn((10, 1, 28, 28))
nn.Conv2d(in_channels=1, out_channels=1, kernel_size=3, padding=1)(image).shape

In [None]:
# Multiple convolution kernels: five independent 3x3 filters over the same
# single input channel -> weight shape (5, 1, 3, 3).
nn.Conv2d(in_channels=1, out_channels=5, kernel_size=3, padding=1).weight.shape


In [None]:
# Each of the five filters writes its own output channel: (10, 5, 28, 28).
image = torch.randn((10, 1, 28, 28))
nn.Conv2d(in_channels=1, out_channels=5, kernel_size=3, padding=1)(image).shape

For multiple input channels, the actual kernels have one more dimension.

In [None]:
# One filter spanning all three input channels -> weight shape (1, 3, 3, 3).
nn.Conv2d(in_channels=3, out_channels=1, kernel_size=3, padding=1).weight.shape

In [None]:
# The three input channels are combined into a single output channel: (10, 1, 28, 28).
image = torch.randn((10, 3, 28, 28))
nn.Conv2d(in_channels=3, out_channels=1, kernel_size=3, padding=1)(image).shape

In [None]:
# Final case, multiple inputs and multiple outputs: five filters, each spanning
# three input channels -> weight shape (5, 3, 3, 3).
nn.Conv2d(in_channels=3, out_channels=5, kernel_size=3, padding=1).weight.shape

In [None]:
# Three input channels in, five output channels out: (10, 5, 28, 28).
image = torch.randn((10, 3, 28, 28))
nn.Conv2d(in_channels=3, out_channels=5, kernel_size=3, padding=1)(image).shape

Now, let's explore the kernel size.

In [None]:
# No padding: a 7x7 kernel only fits at 28 - 7 + 1 = 22 positions per axis,
# so the output shrinks to (1, 5, 22, 22).
image = torch.randn((1, 1, 28, 28))
nn.Conv2d(in_channels=1, out_channels=5, kernel_size=7)(image).shape

In [None]:
# Explicit padding: padding=3 on each side restores the 7 - 1 = 6 pixels a 7x7
# kernel would otherwise trim, so the output stays (1, 5, 28, 28).

image = torch.randn((1, 1, 28, 28))
nn.Conv2d(in_channels=1, out_channels=5, kernel_size=7, padding=3)(image).shape

In [None]:
# Automatic padding: padding='same' picks the padding that keeps the output the
# same height and width as the input, whatever the kernel size.
# Both lines print torch.Size([1, 5, 28, 28]).

image = torch.randn((1, 1, 28, 28))
print(nn.Conv2d(in_channels=1, out_channels=5, kernel_size=3, padding='same')(image).shape)
print(nn.Conv2d(in_channels=1, out_channels=5, kernel_size=7, padding='same')(image).shape)

Let's change the model to use multiple kernels per layer.

In [None]:
class SimpleCNN(nn.Module):
    """CNN for 1x28x28 images with several kernels per layer (8, then 16 channels).

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it already-normalized probabilities
    would apply softmax twice and flatten the gradients. Use ``predict_proba``
    when probabilities are needed; the arg-max class is the same either way.

    Spatial sizes for a 28x28 input: 28 -> pool(3) -> 9 -> pool(3) -> 3, hence
    the ``16 * 3 * 3`` input features of the linear layer.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=3, stride=3)
        self.conv2 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3, padding=1)
        self.fc = nn.Linear(16 * 3 * 3, 2)  # out_channels * (28 // 3 // 3) * (28 // 3 // 3)
        self.tanh = nn.Tanh()
        # Not used by forward(); kept as an attribute and used by predict_proba().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return logits of shape ``(N, 2)`` for a batch ``x`` of shape ``(N, 1, 28, 28)``."""
        x = self.pool(self.tanh(self.conv1(x)))
        x = self.pool(self.tanh(self.conv2(x)))
        # Flatten per sample. Keeping the batch axis explicit (instead of
        # view(-1, 16 * 3 * 3)) means a wrongly sized input fails in self.fc
        # rather than being silently folded into a different batch size.
        x = x.view(x.shape[0], -1)
        return self.fc(x)

    def predict_proba(self, x):
        """Return class probabilities (softmax of the logits), shape ``(N, 2)``."""
        return self.softmax(self.forward(x))

# Rebind `model` to a fresh instance of the multi-kernel SimpleCNN defined in this cell.
model = SimpleCNN()


In [None]:
# Loss and optimizer for the current model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Optimize for a fixed number of passes over the training set,
# reporting the mean batch loss after every epoch.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader)}")

print("Training complete")

# Measure accuracy on the test set with gradients disabled
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        # Predicted class is the index of the largest score per sample
        predicted = outputs.argmax(dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.shape[0]

print(f"Accuracy: {100 * correct / total}%")


In color images, the input has three channels, so we need to adjust the Conv2d parameters.

In this example, I will differentiate between two classes:
- class 0: yellow squares
- class 1: red squares and yellow circles

In [None]:
# channels are R, G, B. yellow = R+G

def create_square(image_size, color='rgb'):
    """Return a ``(3, image_size, image_size)`` float32 image of one filled square.

    The side length and position are drawn from NumPy's global random state.
    The square is set to 1.0 in every channel whose letter ('r', 'g' or 'b')
    appears in ``color``; all other pixels stay at 0.0.
    """
    canvas = np.zeros((3, image_size, image_size), dtype=np.float32)
    side = np.random.randint(5, image_size // 2)
    start_row = np.random.randint(0, image_size - side)
    start_col = np.random.randint(0, image_size - side)
    rows = slice(start_row, start_row + side)
    cols = slice(start_col, start_col + side)
    for channel, letter in enumerate('rgb'):
        if letter not in color:
            continue
        canvas[channel, rows, cols] = 1.0
    return canvas

def create_circle(image_size, color='rgb'):
    """Return a ``(3, image_size, image_size)`` float32 image of one filled circle.

    The radius and centre are drawn from NumPy's global random state, with the
    centre kept at least one radius away from the low edge so the disc stays
    inside the image. The disc is set to 1.0 in every channel whose letter
    ('r', 'g' or 'b') appears in ``color``; all other pixels stay at 0.0.
    """
    canvas = np.zeros((3, image_size, image_size), dtype=np.float32)
    radius = np.random.randint(5, image_size // 4)
    center_x = np.random.randint(radius, image_size - radius)
    center_y = np.random.randint(radius, image_size - radius)
    # Open grids of row (y) and column (x) coordinates; they broadcast to a full
    # (image_size, image_size) boolean disc in the comparison below.
    y, x = np.ogrid[:image_size, :image_size]
    squared_distance = (x - center_x)**2 + (y - center_y)**2
    inside_disc = squared_distance <= radius**2
    for channel, letter in enumerate('rgb'):
        if letter not in color:
            continue
        canvas[channel, inside_disc] = 1.0
    return canvas

class ShapesDataset2(Dataset):
    """Synthetic RGB dataset where the class depends on both shape and colour.

    * label 0: yellow squares (red + green channels lit)
    * label 1: yellow circles or red squares, chosen with equal probability

    Every image is generated once, in ``__init__``, from NumPy's global random
    state via ``create_square`` / ``create_circle``.

    Args:
        num_samples: Number of images to generate.
        image_size: Height and width, in pixels, of each image.

    Attributes:
        data: float32 tensor of shape ``(num_samples, 3, image_size, image_size)``.
        labels: int64 tensor of shape ``(num_samples,)``.
    """

    def __init__(self, num_samples, image_size=28):
        self.num_samples = num_samples
        self.image_size = image_size
        self.data, self.labels = self.generate_data(num_samples, image_size)

    def generate_data(self, num_samples, image_size):
        """Draw ``num_samples`` random images and return ``(images, labels)`` tensors."""
        # One contiguous buffer instead of a Python list of arrays:
        # torch.tensor(list_of_ndarrays) is the slow path and emits a UserWarning.
        images = np.zeros((num_samples, 3, image_size, image_size), dtype=np.float32)
        labels = np.zeros(num_samples, dtype=np.int64)
        for index in range(num_samples):
            label = np.random.randint(0, 2)
            if label == 0:
                # Draw a yellow square
                image = create_square(image_size, 'rg')
            elif np.random.rand() < 0.5:
                # Same colour as class 0, different shape
                image = create_circle(image_size, 'rg')
            else:
                # Same shape as class 0, different colour
                image = create_square(image_size, 'r')
            images[index] = image
            labels[index] = label
        return torch.from_numpy(images), torch.from_numpy(labels)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Regenerate the train/test splits with the colour dataset and rebuild both loaders.
# Only the training loader shuffles; the test loader keeps generation order.
train_dataset = ShapesDataset2(1000)
test_dataset = ShapesDataset2(200)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=10)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=10)


In [None]:
# Display the first 15 training samples in a 3x5 grid.
fig, axes = plt.subplots(3, 5, figsize=(10, 6))
# axes.flat walks the grid row by row, matching sample order 0..14.
for i, ax in enumerate(axes.flat):
    image, label = train_dataset[i]
    # imshow expects channels last: (3, H, W) -> (H, W, 3).
    # No cmap is passed because matplotlib ignores it for RGB data.
    ax.imshow(image.permute(1, 2, 0).numpy())
    # Show the actual label value so titles match the "class 0 / class 1"
    # naming used in the text (previously shown as "Class 1" / "Class 2").
    ax.set_title(f"Label: Class {label.item()}")
    ax.axis('off')
plt.show()

In [None]:
class SimpleCNNColor(nn.Module):
    """Small CNN for 3x28x28 RGB images with two kernels per convolutional layer.

    ``forward`` returns raw class scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so feeding it already-normalized probabilities
    would apply softmax twice and flatten the gradients. Use ``predict_proba``
    when probabilities are needed; the arg-max class is the same either way.

    Spatial sizes for a 28x28 input: 28 -> pool(3) -> 9 -> pool(3) -> 3, hence
    the ``2 * 3 * 3`` input features of the linear layer.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=2, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=3, stride=3)
        self.conv2 = nn.Conv2d(in_channels=2, out_channels=2, kernel_size=3, padding=1)
        self.fc = nn.Linear(2 * 3 * 3, 2)  # out_channels * (28 // 3 // 3) * (28 // 3 // 3)
        self.tanh = nn.Tanh()
        # Not used by forward(); kept as an attribute and used by predict_proba().
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return logits of shape ``(N, 2)`` for a batch ``x`` of shape ``(N, 3, 28, 28)``."""
        x = self.pool(self.tanh(self.conv1(x)))
        x = self.pool(self.tanh(self.conv2(x)))
        # Flatten per sample. Keeping the batch axis explicit (instead of
        # view(-1, 2 * 3 * 3)) means a wrongly sized input fails in self.fc
        # rather than being silently folded into a different batch size.
        x = x.view(x.shape[0], -1)
        return self.fc(x)

    def predict_proba(self, x):
        """Return class probabilities (softmax of the logits), shape ``(N, 2)``."""
        return self.softmax(self.forward(x))

# Rebind `model` to a fresh, untrained instance of the colour network.
model = SimpleCNNColor()


In [None]:
# Loss and optimizer for the colour model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Optimize for a fixed number of passes over the training set,
# reporting the mean batch loss after every epoch.
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader)}")

print("Training complete")

# Measure accuracy on the test set with gradients disabled
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        # Predicted class is the index of the largest score per sample
        predicted = outputs.argmax(dim=1)
        correct += (predicted == labels).sum().item()
        total += labels.shape[0]

print(f"Accuracy: {100 * correct / total}%")


In [None]:
# Weight layout is (out_channels, in_channels, h, w); for SimpleCNNColor.conv1
# (3 input channels, 2 filters, 3x3 kernels) that is (2, 3, 3, 3).
model.conv1.weight.shape

In [None]:
# First learned filter: one 3x3 kernel per input channel (R, G, B).
# `display` is not defined in this file — presumably the IPython/Jupyter built-in.
display(model.conv1.weight[0])
# Summing over the two spatial axes leaves one value per input channel, a rough
# indication of how strongly this filter weights each colour.
torch.sum(model.conv1.weight[0], dim=(1, 2))

In [None]:
# Second learned filter: one 3x3 kernel per input channel (R, G, B).
# `display` is not defined in this file — presumably the IPython/Jupyter built-in.
display(model.conv1.weight[1])
# One summed weight per input channel, as for the first filter.
torch.sum(model.conv1.weight[1], dim=(1, 2))

Let's now explore the results of applying those convolutions to the three types of objects we have in the dataset.

In [None]:
def display_image_color(image, title='',*, cmap='gray'):
    """Show a channels-first colour image tensor as a standalone figure.

    ``image`` is squeezed and then permuted from ``(C, H, W)`` to ``(H, W, C)``,
    the layout ``plt.imshow`` expects, before being converted to NumPy.

    Args:
        image: Tensor that is 3-D after squeezing, channels first.
        title: Text placed above the image.
        cmap: Colormap name passed to ``plt.imshow`` (keyword-only).
    """
    channels_last = image.squeeze().permute(1, 2, 0)
    plt.imshow(channels_last.numpy(), cmap=cmap)
    plt.axis('off')
    plt.title(title)
    plt.show()


In [None]:
# Yellow square (class 0): the input next to the two conv1 feature maps.
obj = torch.tensor(create_square(28, 'rg'))
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 6))
# imshow expects channels last: (3, H, W) -> (H, W, 3)
axes[0].imshow(obj.squeeze().permute(1, 2, 0).numpy(), cmap='gray')
with torch.no_grad():
    c1 = model.conv1(obj)
# conv1 has two output channels, one feature map each
first_map, second_map = c1[0], c1[1]
axes[1].imshow(first_map, cmap='gray')
axes[2].imshow(second_map, cmap='gray')

In [None]:
# Red square (class 1): the input next to the two conv1 feature maps.
obj = torch.tensor(create_square(28, 'r'))
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 6))
# imshow expects channels last: (3, H, W) -> (H, W, 3)
axes[0].imshow(obj.squeeze().permute(1, 2, 0).numpy(), cmap='gray')
with torch.no_grad():
    c1 = model.conv1(obj)
# conv1 has two output channels, one feature map each
first_map, second_map = c1[0], c1[1]
axes[1].imshow(first_map, cmap='gray')
axes[2].imshow(second_map, cmap='gray')

In [None]:
# Yellow circle (class 1): the input next to the two conv1 feature maps.
obj = torch.tensor(create_circle(28, 'rg'))
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 6))
# imshow expects channels last: (3, H, W) -> (H, W, 3)
axes[0].imshow(obj.squeeze().permute(1, 2, 0).numpy(), cmap='gray')
with torch.no_grad():
    c1 = model.conv1(obj)
# conv1 has two output channels, one feature map each
first_map, second_map = c1[0], c1[1]
axes[1].imshow(first_map, cmap='gray')
axes[2].imshow(second_map, cmap='gray')

Remember the classes
- class 0: yellow squares
- class 1: red squares and yellow circles

Here you can see that on red objects no borders are extracted, because the shape is not necessary to determine the class: every red object belongs to class 1.