<a href="https://colab.research.google.com/github/institutohumai/cursos-python/blob/master/CV/2_Convoluciones/ejercicios/ejercicios_solucion.ipynb"> <img src='https://colab.research.google.com/assets/colab-badge.svg' /> </a>

# Ejercicio 1

## Mejorando LeNet

Sabemos que LeNet fue un gran hito en su momento, pero también sabemos que hay modificaciones que podemos hacer a LeNet para mejorar su rendimiento.

Aplíquelas, usando como referencia el pipeline visto en la clase teórica.



In [None]:
import torch
from torch import nn


In [None]:
def init_cnn(module):
    """Apply Xavier-uniform initialization to the weight of a CNN layer.

    Only modules whose exact type is ``nn.Linear`` or ``nn.Conv2d`` are
    touched; any other module (including subclasses of those two) is left
    unchanged. Biases are never modified.
    """
    # Exact type match on purpose: subclasses are skipped.
    if type(module) in (nn.Linear, nn.Conv2d):
        nn.init.xavier_uniform_(module.weight)

In [None]:
# Layer widths of the network defined below.
NUM_CHANNEL1 = 6    # output channels of the first convolution
NUM_CHANNEL2 = 16   # output channels of the second convolution
NUM_MLP1 = 120      # units of the first dense layer
NUM_MLP2 = 84       # units of the second dense layer
num_classes = 10    # units of the output layer

# Define your model here.
# Lazy layers infer their input size on the first forward pass.
model = nn.Sequential(
    # Convolutional stage 1: padded 5x5 convolution, ReLU, 2x2 max pooling.
    nn.LazyConv2d(NUM_CHANNEL1, kernel_size=5, padding=2),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Convolutional stage 2: unpadded 5x5 convolution, ReLU, 2x2 max pooling.
    nn.LazyConv2d(NUM_CHANNEL2, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Classifier head: flatten, two hidden dense layers, output layer.
    nn.Flatten(),
    nn.LazyLinear(NUM_MLP1),
    nn.ReLU(),
    nn.LazyLinear(NUM_MLP2),
    nn.ReLU(),
    nn.LazyLinear(num_classes),
)

In [None]:
def layer_summary(net, X_shape):
    """Print the tensor shape produced by every layer of ``net``.

    A random tensor of shape ``X_shape`` is pushed through the layers of
    ``net`` one by one; the input shape and the shape after each layer are
    printed. Nothing is returned.
    """
    activation = torch.randn(*X_shape)
    print("Entrada original:\t", activation.shape)
    for module in net:
        activation = module(activation)
        print(f"Salida tras {module.__class__.__name__}:\t", activation.shape)

# Trace a dummy batch of one single-channel 28x28 input through the model.
# This forward pass also materializes the lazy layers' parameters.
layer_summary(model, (1, 1, 28, 28))

Entrada original:	 torch.Size([1, 1, 28, 28])
Salida tras Conv2d:	 torch.Size([1, 6, 28, 28])
Salida tras ReLU:	 torch.Size([1, 6, 28, 28])
Salida tras MaxPool2d:	 torch.Size([1, 6, 14, 14])
Salida tras Conv2d:	 torch.Size([1, 16, 10, 10])
Salida tras ReLU:	 torch.Size([1, 16, 10, 10])
Salida tras MaxPool2d:	 torch.Size([1, 16, 5, 5])
Salida tras Flatten:	 torch.Size([1, 400])
Salida tras Linear:	 torch.Size([1, 120])
Salida tras ReLU:	 torch.Size([1, 120])
Salida tras Linear:	 torch.Size([1, 84])
Salida tras ReLU:	 torch.Size([1, 84])
Salida tras Linear:	 torch.Size([1, 10])


## Cargando los datos.

Como dijimos, vamos a trabajar con Fashion MNIST. Para ello cargaremos el dataset desde la biblioteca de torch.

In [None]:
import torchvision
from torchvision import transforms
from torch.utils import data

def load_data_fashion_mnist(batch_size):
    """Build train, validation and test loaders for Fashion-MNIST.

    The official training split is downloaded to ``../data`` and cut in
    order: the first 70% of the samples are used for training and the
    remaining 30% for validation. Both parts are materialized as in-memory
    lists of already-transformed samples. The official test split is used
    as is.

    Args:
        batch_size: batch size passed to every ``DataLoader``.

    Returns:
        A ``(train, validation, test)`` tuple of ``DataLoader`` objects;
        train and validation are shuffled, test is not.
    """
    to_tensor = transforms.Compose([transforms.ToTensor()])
    full_train = torchvision.datasets.FashionMNIST(
        root="../data", train=True, transform=to_tensor, download=True)

    total = len(full_train)
    split_at = int(total * 0.7)
    train_part = [full_train[idx] for idx in range(split_at)]
    val_part = [full_train[idx] for idx in range(split_at, total)]

    test_part = torchvision.datasets.FashionMNIST(
        root="../data", train=False, transform=to_tensor, download=True)

    def make_loader(dataset, shuffle):
        # All three loaders share the batch size and the single worker.
        return data.DataLoader(dataset, batch_size, shuffle=shuffle,
                               num_workers=1)

    return (make_loader(train_part, True),
            make_loader(val_part, True),
            make_loader(test_part, False))


batch_size = 1024
iter_train, iter_val, iter_test = load_data_fashion_mnist(batch_size)


También calcularemos el accuracy de nuestro modelo.

In [None]:
def binary_accuracy(preds, y):
    """Return the fraction of samples whose top-scoring class matches ``y``.

    Despite its name, the function is not limited to two classes: the
    predicted class is the argmax of ``preds`` along dimension 1.

    Args:
        preds: tensor of per-class scores with the class axis at dim 1.
        y: tensor of target class indices, compared element-wise with the
            predicted classes.

    Returns:
        A 0-dim float tensor with the accuracy of the batch.
    """
    predicted_class = preds.argmax(dim=1)
    # Cast the boolean matches to float so they can be averaged.
    hits = (predicted_class == y).float()
    return hits.sum() / len(hits)

Definiremos una función de entrenamiento y evaluación como las que habíamos usado antes.

In [None]:
def train(model, iterator, optimizer, criterion, device):
    """Run one training pass over ``iterator`` and return mean metrics.

    The model is switched to training mode. For every ``(image, label)``
    batch, both tensors are moved to ``device``, gradients are reset, the
    loss is back-propagated and the optimizer takes one step.

    Returns:
        ``(loss, accuracy)``: the per-batch loss and accuracy averaged over
        the number of batches, ``len(iterator)``.
    """
    model.train()

    running_loss, running_acc = 0.0, 0.0

    for image, label in iterator:
        image = image.to(device)
        label = label.to(device)

        optimizer.zero_grad()
        output = model(image)
        loss = criterion(output, label)
        acc = binary_accuracy(output, label)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
def evaluate(model, iterator, criterion, device):
    """Evaluate ``model`` on ``iterator`` without updating any weights.

    The model is switched to evaluation mode and every ``(image, label)``
    batch is moved to ``device`` and scored with gradient tracking disabled.

    Args:
        model: network to evaluate.
        iterator: sized iterable of ``(image, label)`` batches.
        criterion: loss function called as ``criterion(output, label)``.
        device: device the batches are moved to before the forward pass.

    Returns:
        ``(loss, accuracy)``: the per-batch loss and accuracy averaged over
        the number of batches, ``len(iterator)``.
    """
    model.eval()

    epoch_loss = 0.0
    epoch_acc = 0.0

    # Evaluation only needs forward passes: no gradients are computed, so
    # no optimizer is involved and the function depends only on its
    # arguments.
    with torch.no_grad():
        for image, label in iterator:
            image, label = image.to(device), label.to(device)
            output = model(image)
            loss = criterion(output, label)
            acc = binary_accuracy(output, label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

y una función para calcular el tiempo de cálculo

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into minutes and seconds.

    Args:
        start_time: start timestamp, in seconds.
        end_time: end timestamp, in seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    total_seconds = end_time - start_time
    minutes = int(total_seconds / 60)
    # Whatever is left after removing the whole minutes.
    seconds = int(total_seconds - 60 * minutes)
    return minutes, seconds

## Entrenamiento

In [None]:
import torch.optim as optim

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = model.to(device)

# Cross-entropy loss, optimized by Adam with its default hyper-parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())


In [None]:
import math
# Number of full passes over the training data.
N_EPOCHS = 20

# Lowest validation loss seen so far; starting at +inf guarantees that the
# first epoch always writes a checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    # One training pass followed by one validation pass; both are timed.
    train_loss, train_acc = train(model, iter_train, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, iter_val, criterion, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves, so the saved file
    # always holds the best weights seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')

    # Report with a 1-based epoch number.
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train acc.: {train_acc:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. acc.: {valid_acc:.3f}')

Epoch: 01 | Time: 0m 1s
	Train Loss: 1.804 | Train acc.: 0.411
	 Val. Loss: 0.981 |  Val. acc.: 0.611
Epoch: 02 | Time: 0m 0s
	Train Loss: 0.796 | Train acc.: 0.700
	 Val. Loss: 0.723 |  Val. acc.: 0.715
Epoch: 03 | Time: 0m 0s
	Train Loss: 0.687 | Train acc.: 0.733
	 Val. Loss: 0.639 |  Val. acc.: 0.755
Epoch: 04 | Time: 0m 0s
	Train Loss: 0.612 | Train acc.: 0.761
	 Val. Loss: 0.598 |  Val. acc.: 0.767
Epoch: 05 | Time: 0m 0s
	Train Loss: 0.563 | Train acc.: 0.782
	 Val. Loss: 0.558 |  Val. acc.: 0.790
Epoch: 06 | Time: 0m 0s
	Train Loss: 0.529 | Train acc.: 0.798
	 Val. Loss: 0.535 |  Val. acc.: 0.800
Epoch: 07 | Time: 0m 0s
	Train Loss: 0.508 | Train acc.: 0.812
	 Val. Loss: 0.510 |  Val. acc.: 0.812
Epoch: 08 | Time: 0m 0s
	Train Loss: 0.491 | Train acc.: 0.818
	 Val. Loss: 0.544 |  Val. acc.: 0.781
Epoch: 09 | Time: 0m 0s
	Train Loss: 0.482 | Train acc.: 0.821
	 Val. Loss: 0.519 |  Val. acc.: 0.789
Epoch: 10 | Time: 0m 0s
	Train Loss: 0.460 | Train acc.: 0.833
	 Val. Loss: 0.472 

In [None]:

# Restore the checkpoint written by the training loop, i.e. the weights
# with the lowest validation loss.
model.load_state_dict(torch.load('tut2-model.pt'))

test_loss, test_acc = evaluate(model, iter_test, criterion, device)

# The first value is the loss, so it is labelled as such.
print(f'\t Test. loss: {test_loss:.3f} |  test. acc: {test_acc:.3f}')

	 Test. acc: 0.393 |  test. acc: 0.860


# Ejercicio 2

## Cambiando el dataset.

Dados los resultados que tenemos con Fashion MNIST, es una buena idea probar con otro dataset.


In [None]:
import torch
from torch import nn


In [None]:
def init_cnn(module):
    """Re-initialize the weight of a dense or convolutional layer.

    Modules whose exact type is ``nn.Linear`` or ``nn.Conv2d`` get a
    Xavier-uniform weight; all other modules, subclasses included, are
    ignored. Biases are left as they are.
    """
    layer_type = type(module)
    is_dense = layer_type == nn.Linear
    is_conv = layer_type == nn.Conv2d
    if not (is_dense or is_conv):
        return
    nn.init.xavier_uniform_(module.weight)

Como vamos a trabajar con imágenes en colores (3 canales RGB: rojo, verde y azul), lo primero que haremos será aumentar el número de canales. Además, dado que CIFAR10 tiene imágenes de $32×32$, eliminaremos el padding de la primera capa convolucional para obtener mapas de características de tamaño similar a los de FashionMNIST. También cambiaremos el número de salidas de la primera capa densa.

In [None]:
# Layer widths of the network defined below.
NUM_CHANNEL1 = 20   # output channels of the first convolution
NUM_CHANNEL2 = 50   # output channels of the second convolution
NUM_MLP1 = 200      # units of the first dense layer
NUM_MLP2 = 80       # units of the second dense layer
num_classes = 10    # units of the output layer

# Define your model here.
# Lazy layers infer their input size on the first forward pass.
model = nn.Sequential(
    # Convolutional stage 1: unpadded 5x5 convolution, ReLU, 2x2 max pooling.
    nn.LazyConv2d(NUM_CHANNEL1, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Convolutional stage 2: unpadded 5x5 convolution, ReLU, 2x2 max pooling.
    nn.LazyConv2d(NUM_CHANNEL2, kernel_size=5),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Classifier head: flatten, two hidden dense layers, output layer.
    nn.Flatten(),
    nn.LazyLinear(NUM_MLP1),
    nn.ReLU(),
    nn.LazyLinear(NUM_MLP2),
    nn.ReLU(),
    nn.LazyLinear(num_classes),
)


In [None]:
def layer_summary(net, X_shape):
    """Report how the tensor shape evolves through the layers of ``net``.

    Feeds a random tensor of shape ``X_shape`` through ``net`` layer by
    layer, printing the input shape first and then the output shape of
    every layer together with the layer's class name. Returns nothing.
    """
    tensor = torch.randn(*X_shape)
    print("Entrada original:\t", tensor.shape)
    for stage in net:
        tensor = stage(tensor)
        label = "Salida tras {}:\t".format(stage.__class__.__name__)
        print(label, tensor.shape)

# Trace a dummy batch of one 3-channel 32x32 input through the model.
# This forward pass also materializes the lazy layers' parameters.
layer_summary(model, (1, 3, 32, 32))

Entrada original:	 torch.Size([1, 3, 32, 32])
Salida tras Conv2d:	 torch.Size([1, 20, 28, 28])
Salida tras ReLU:	 torch.Size([1, 20, 28, 28])
Salida tras MaxPool2d:	 torch.Size([1, 20, 14, 14])
Salida tras Conv2d:	 torch.Size([1, 50, 10, 10])
Salida tras ReLU:	 torch.Size([1, 50, 10, 10])
Salida tras MaxPool2d:	 torch.Size([1, 50, 5, 5])
Salida tras Flatten:	 torch.Size([1, 1250])
Salida tras Linear:	 torch.Size([1, 200])
Salida tras ReLU:	 torch.Size([1, 200])
Salida tras Linear:	 torch.Size([1, 80])
Salida tras ReLU:	 torch.Size([1, 80])
Salida tras Linear:	 torch.Size([1, 10])


# Ejercicio 3



## Cargando los datos.

Ahora trabajaremos con CIFAR10, un dataset que tiene imágenes en colores de $32\times32$. Defina una función que genere iteradores de entrenamiento, validación y prueba para este dataset.

In [None]:
import torchvision
from torchvision import transforms
from torch.utils import data

def load_data_fashion_mnist(batch_size):
    """Build train, validation and test loaders for CIFAR10.

    NOTE: the function keeps its ``load_data_fashion_mnist`` name, but the
    dataset it loads is CIFAR10.

    The official training split is downloaded to ``../data`` and cut in
    order: the first 70% of the samples are used for training and the
    remaining 30% for validation. Both parts are materialized as in-memory
    lists of already-transformed samples. The official test split is used
    as is.

    Args:
        batch_size: batch size passed to every ``DataLoader``.

    Returns:
        A ``(train, validation, test)`` tuple of ``DataLoader`` objects;
        train and validation are shuffled, test is not.
    """
    # Insert your code here.
    to_tensor = transforms.Compose([transforms.ToTensor()])
    cifar_full = torchvision.datasets.CIFAR10(
        root="../data", train=True, transform=to_tensor, download=True)

    n_samples = len(cifar_full)
    cut = int(n_samples * 0.7)
    cifar_train = [cifar_full[k] for k in range(cut)]
    cifar_val = [cifar_full[k] for k in range(cut, n_samples)]

    cifar_test = torchvision.datasets.CIFAR10(
        root="../data", train=False, transform=to_tensor, download=True)

    loader_specs = ((cifar_train, True), (cifar_val, True), (cifar_test, False))
    return tuple(
        data.DataLoader(dataset, batch_size, shuffle=shuffled, num_workers=1)
        for dataset, shuffled in loader_specs
    )


batch_size = 1024
iter_train, iter_val, iter_test = load_data_fashion_mnist(batch_size)


Files already downloaded and verified
Files already downloaded and verified


También calcularemos el accuracy de nuestro modelo.

In [None]:
def binary_accuracy(preds, y):
    """Compute the share of correct predictions in a batch.

    The name is historical: the predicted class is the argmax of ``preds``
    along dimension 1, so any number of classes is supported.

    Args:
        preds: tensor of per-class scores with the class axis at dim 1.
        y: tensor of target class indices.

    Returns:
        A 0-dim float tensor: number of matches divided by batch length.
    """
    matches = torch.eq(torch.argmax(preds, dim=1), y).float()
    n_samples = len(matches)
    return matches.sum() / n_samples

Definiremos una función de entrenamiento y evaluación como las que habíamos usado antes.

In [None]:
def train(model, iterator, optimizer, criterion, device):
    """Train ``model`` for one epoch and return the mean loss and accuracy.

    Puts the model in training mode, then for each ``(image, label)`` batch
    moves the tensors to ``device``, clears the gradients, computes the
    loss, back-propagates it and applies one optimizer step.

    Returns:
        ``(loss, accuracy)``: sums of the per-batch values divided by
        ``len(iterator)``.
    """
    model.train()

    loss_total = 0.0
    acc_total = 0.0

    for image, label in iterator:
        image = image.to(device)
        label = label.to(device)

        optimizer.zero_grad()
        output = model(image)
        loss = criterion(output, label)
        acc = binary_accuracy(output, label)
        loss.backward()
        optimizer.step()

        loss_total += loss.item()
        acc_total += acc.item()

    num_batches = len(iterator)
    return loss_total / num_batches, acc_total / num_batches

In [None]:
def evaluate(model, iterator, criterion, device):
    """Evaluate ``model`` on ``iterator`` without updating any weights.

    The model is switched to evaluation mode and every ``(image, label)``
    batch is moved to ``device`` and scored with gradient tracking disabled.

    Args:
        model: network to evaluate.
        iterator: sized iterable of ``(image, label)`` batches.
        criterion: loss function called as ``criterion(output, label)``.
        device: device the batches are moved to before the forward pass.

    Returns:
        ``(loss, accuracy)``: the per-batch loss and accuracy averaged over
        the number of batches, ``len(iterator)``.
    """
    model.eval()

    epoch_loss = 0.0
    epoch_acc = 0.0

    # Evaluation only needs forward passes: no gradients are computed, so
    # no optimizer is involved and the function depends only on its
    # arguments.
    with torch.no_grad():
        for image, label in iterator:
            image, label = image.to(device), label.to(device)
            output = model(image)
            loss = criterion(output, label)
            acc = binary_accuracy(output, label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

y una función para calcular el tiempo de cálculo

In [None]:
import time

def epoch_time(start_time, end_time):
    """Convert a time interval into whole minutes and leftover seconds.

    Args:
        start_time: start timestamp, in seconds.
        end_time: end timestamp, in seconds.

    Returns:
        ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    return whole_minutes, int(duration - (whole_minutes * 60))

## Entrenamiento

In [None]:
import torch.optim as optim

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = model.to(device)

# Cross-entropy loss, optimized by Adam with its default hyper-parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())


In [None]:
import math
# Number of full passes over the training data.
N_EPOCHS = 50

# Lowest validation loss seen so far; starting at +inf guarantees that the
# first epoch always writes a checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    # One training pass followed by one validation pass; both are timed.
    train_loss, train_acc = train(model, iter_train, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, iter_val, criterion, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves, so the saved file
    # always holds the best weights seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')

    # Report with a 1-based epoch number.
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train acc.: {train_acc:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. acc.: {valid_acc:.3f}')

Epoch: 01 | Time: 0m 1s
	Train Loss: 2.062 | Train acc.: 0.230
	 Val. Loss: 1.851 |  Val. acc.: 0.321
Epoch: 02 | Time: 0m 1s
	Train Loss: 1.757 | Train acc.: 0.364
	 Val. Loss: 1.701 |  Val. acc.: 0.373
Epoch: 03 | Time: 0m 1s
	Train Loss: 1.639 | Train acc.: 0.403
	 Val. Loss: 1.620 |  Val. acc.: 0.409
Epoch: 04 | Time: 0m 1s
	Train Loss: 1.560 | Train acc.: 0.432
	 Val. Loss: 1.526 |  Val. acc.: 0.445
Epoch: 05 | Time: 0m 1s
	Train Loss: 1.501 | Train acc.: 0.453
	 Val. Loss: 1.516 |  Val. acc.: 0.452
Epoch: 06 | Time: 0m 1s
	Train Loss: 1.440 | Train acc.: 0.480
	 Val. Loss: 1.464 |  Val. acc.: 0.477
Epoch: 07 | Time: 0m 1s
	Train Loss: 1.401 | Train acc.: 0.494
	 Val. Loss: 1.417 |  Val. acc.: 0.488
Epoch: 08 | Time: 0m 1s
	Train Loss: 1.373 | Train acc.: 0.504
	 Val. Loss: 1.371 |  Val. acc.: 0.508
Epoch: 09 | Time: 0m 1s
	Train Loss: 1.385 | Train acc.: 0.505
	 Val. Loss: 1.355 |  Val. acc.: 0.511
Epoch: 10 | Time: 0m 1s
	Train Loss: 1.300 | Train acc.: 0.535
	 Val. Loss: 1.368 

In [None]:

# Restore the checkpoint written by the training loop, i.e. the weights
# with the lowest validation loss.
model.load_state_dict(torch.load('tut2-model.pt'))

# Score the restored model on the held-out test loader.
test_loss, test_acc = evaluate(model, iter_test, criterion ,device)

print(f'\t Test. loss: {test_loss:.3f} |  test. acc: {test_acc:.3f}')

	 Test. acc: 0.986 |  test. acc: 0.661


# Ejercicio 4


# Formas, Tamaños y Salidas

Los siguientes ejercicios están hechos para practicar, entender y aprender cómo cambian los tamaños de las salidas y entradas al aplicar padding y strides.

In [None]:
import torch
from torch import nn

1. Defina un tensor de $15 \times 15$ y aplique una convolución con kernel $5 \times 5$ de tal manera que a la salida tenga un solo mapa de características de $3 \times 3$. No debe utilizar padding
> NOTA: Recuerde que las convoluciones esperan a la entrada tensores de la forma:
`X = [tamaño de minilote, numero de canales, alto en píxeles, ancho en píxeles]`

In [None]:
# Inserte su código

# A 5x5 kernel moved in steps of 5 over a 15x15 input, without padding,
# fits in (15 - 5) / 5 + 1 = 3 positions per axis.
X1 = torch.randn(1, 1, 15, 15)
net1 = nn.Sequential(
    nn.LazyConv2d(out_channels=1, kernel_size=5, stride=5),
)
Y1 = net1(X1)
print(Y1.size())

torch.Size([1, 1, 3, 3])




2. Defina un tensor de $7 \times 7$ y aplique una convolución con kernel $3 \times 3$ de tal manera que a la salida tenga un solo mapa de características de $3 \times 3$.

In [None]:
# Inserte su código

# With one pixel of padding the 7x7 input becomes 9x9; a 3x3 kernel moved
# in steps of 3 then fits in (9 - 3) / 3 + 1 = 3 positions per axis.
X1 = torch.randn(1, 1, 7, 7)
net1 = nn.Sequential(
    nn.LazyConv2d(out_channels=1, kernel_size=3, stride=3, padding=1),
)
Y1 = net1(X1)
print(Y1.size())

torch.Size([1, 1, 3, 3])


3. Dado un mapa de características de la forma $100 \times 100$, obtenido con una convolución con kernel $7 \times 7$ (sin padding y con stride 1), ¿cuál era el tamaño original de la imagen de entrada?

In [None]:
# Inserte su código

# An unpadded, stride-1 convolution shrinks each axis by kernel_size - 1,
# so a 100x100 output from a 7x7 kernel needs a 100 + 7 - 1 = 106 input.
X1 = torch.randn(1, 1, 106, 106)
net1 = nn.Sequential(
    nn.LazyConv2d(out_channels=1, kernel_size=7),
)
Y1 = net1(X1)
print(Y1.size())

torch.Size([1, 1, 100, 100])


4. Para una imagen de $16\times16$, al aplicar una ventana de pooling de $2\times2$, ¿cuál es el tamaño esperado a la salida?

In [None]:
# Inserte su código

# A 2x2 pooling window with the default stride halves each axis: 16 -> 8.
X1 = torch.randn(1, 1, 16, 16)
net1 = nn.Sequential(
    nn.MaxPool2d(2),
)
Y1 = net1(X1)
print(Y1.size())

torch.Size([1, 1, 8, 8])


5. Para una imagen de $16\times16$, si aplicamos una ventana de pooling de $2\times2$, ¿podemos obtener una salida de $4\times4$ usando strides?

In [None]:
# Inserte su código

# Moving the 2x2 window in steps of 4 gives floor((16 - 2) / 4) + 1 = 4
# positions per axis, so a 4x4 output is possible.
X1 = torch.randn(1, 1, 16, 16)
net1 = nn.Sequential(
    nn.MaxPool2d(2, stride=4),
)
Y1 = net1(X1)
print(Y1.size())

torch.Size([1, 1, 4, 4])


# Ejercicio 5

Entrenando kernels preexistentes

En la clase presentamos un pequeño pipeline para mostrar cómo entrenar un kernel. Nuestra intención en este ejercicio es replicar esos resultados para los siguientes ejemplos:

## Operador Laplaciano

In [None]:
# Random single-channel 64x64 input drawn uniformly from [0, 1), already in
# the 4-D (batch, channels, height, width) layout used by the convolution.
X = torch.rand(1, 1, 64, 64)

# 3x3 Laplacian kernel; float literals give it the default floating dtype.
K = torch.tensor([[0.0, 1.0, 0.0],
                  [1.0, -4.0, 1.0],
                  [0.0, 1.0, 0.0]])

def corr2d(X, K):
    """Compute the 2D cross-correlation of one image plane with a kernel.

    Args:
        X: tensor indexed as ``X[0][0]`` to obtain the 2-D image plane; its
            last two dimensions are the image height and width.
        K: 2-D kernel tensor.

    Returns:
        A 2-D tensor with ``height - h + 1`` rows and ``width - w + 1``
        columns, where ``(h, w)`` is the kernel shape.
    """
    kernel_h, kernel_w = K.shape
    out_h = X.shape[-2] - kernel_h + 1
    out_w = X.shape[-1] - kernel_w + 1
    Y = torch.zeros((out_h, out_w))
    for row in range(out_h):
        rows = slice(row, row + kernel_h)
        for col in range(out_w):
            patch = X[0][0][rows, slice(col, col + kernel_w)]
            # Element-wise (Hadamard) product with the kernel, then sum.
            Y[row, col] = (patch * K).sum()
    return Y

# Target output: the reference kernel applied to the random input.
Y = corr2d(X,K)

# Insert your code here.
# Single-channel 3x3 convolution without bias whose weights are learned.
conv2d = nn.LazyConv2d(1, kernel_size=3, bias=False)

lr = 3e-1  # Learning rate

for i in range(100):
    Y_hat = conv2d(X)
    l = (Y_hat - Y) ** 2 # squared error (least squares)
    conv2d.zero_grad()
    # Gradients come from the mean of the squared error.
    l.mean().backward()
    # Update the weights with one plain gradient-descent step.
    conv2d.weight.data[:] -= lr * conv2d.weight.grad
    # Every 20 iterations report the summed (not averaged) squared error.
    if (i + 1) % 20 == 0:
        print(f'epoch {i + 1}, loss {l.sum():.3f}')

# Show the learned kernel next to the reference kernel.
print(conv2d.weight.data)
print(K)

epoch 20, loss 918.581
epoch 40, loss 121.710
epoch 60, loss 16.184
epoch 80, loss 2.160
epoch 100, loss 0.289
tensor([[[[ 8.4668e-04,  9.9256e-01, -1.0056e-03],
          [ 9.9378e-01, -3.9745e+00,  9.9316e-01],
          [-3.8226e-04,  9.9379e-01,  1.7304e-03]]]])
tensor([[ 0.,  1.,  0.],
        [ 1., -4.,  1.],
        [ 0.,  1.,  0.]])


## Suavizador Gaussiano

In [None]:
# Random single-channel 64x64 input drawn uniformly from [0, 1), already in
# the 4-D (batch, channels, height, width) layout used by the convolution.
X = torch.rand(1, 1, 64, 64)

# 3x3 Gaussian smoothing kernel; float literals give it the default
# floating dtype.
K = torch.tensor([[1.0, 2.0, 1.0],
                  [2.0, 4.0, 2.0],
                  [1.0, 2.0, 1.0]])
# Normalize so that the kernel entries add up to 1.
K /= 16.0

def corr2d(X, K):
    """Slide kernel ``K`` over the image plane ``X[0][0]`` and correlate.

    Args:
        X: tensor indexed as ``X[0][0]`` to obtain the 2-D image plane; its
            last two dimensions are the image height and width.
        K: 2-D kernel tensor.

    Returns:
        A 2-D tensor with ``height - h + 1`` rows and ``width - w + 1``
        columns, where ``(h, w)`` is the kernel shape.
    """
    k_rows, k_cols = K.shape
    result_shape = (X.shape[-2] - k_rows + 1, X.shape[-1] - k_cols + 1)
    Y = torch.zeros(result_shape)
    for top in range(result_shape[0]):
        bottom = top + k_rows
        for left in range(result_shape[1]):
            right = left + k_cols
            window = X[0][0][top:bottom, left:right]
            # Element-wise (Hadamard) product with the kernel, then sum.
            Y[top, left] = torch.mul(window, K).sum()
    return Y

# Target output: the reference kernel applied to the random input.
Y = corr2d(X,K)

# Insert your code here.
# Single-channel 3x3 convolution without bias whose weights are learned.
conv2d = nn.LazyConv2d(1, kernel_size=3, bias=False)

lr = 3e-1  # Learning rate

for i in range(100):
    Y_hat = conv2d(X)
    l = (Y_hat - Y) ** 2 # squared error (least squares)
    conv2d.zero_grad()
    # Gradients come from the mean of the squared error.
    l.mean().backward()
    # Update the weights with one plain gradient-descent step.
    conv2d.weight.data[:] -= lr * conv2d.weight.grad
    # Every 20 iterations report the summed (not averaged) squared error.
    if (i + 1) % 20 == 0:
        print(f'epoch {i + 1}, loss {l.sum():.3f}')

# Show the learned kernel next to the reference kernel.
print(conv2d.weight.data)
print(K)

epoch 20, loss 12.960
epoch 40, loss 1.627
epoch 60, loss 0.205
epoch 80, loss 0.026
epoch 100, loss 0.003
tensor([[[[0.0628, 0.1258, 0.0610],
          [0.1244, 0.2505, 0.1245],
          [0.0647, 0.1242, 0.0622]]]])
tensor([[0.0625, 0.1250, 0.0625],
        [0.1250, 0.2500, 0.1250],
        [0.0625, 0.1250, 0.0625]])


# Espacio de colores CMY