## Images

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # Define the three linear layers
        self.fc1 = nn.Linear(9, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)
        
    def forward(self, x):
        # Pass x through linear layers adding activations
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        x = nn.functional.sigmoid(self.fc3(x))
        return x

In [None]:
import torch
from torchmetrics import Accuracy

# Set up binary accuracy metric
acc = Accuracy(task="binary", num_classes=2)

net.eval()
with torch.no_grad():
    for features, labels in dataloader_test:
        # Get predicted probabilities for test data batch
        outputs = net(features)
        preds = (outputs >= 0.5).float()
        acc(preds, labels.view(-1, 1))

# Compute total test accuracy
test_accuracy = acc.compute()
print(f"Test accuracy: {test_accuracy}")

In [None]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(9, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)
        
        # Apply He initialization
        init.kaiming_uniform_(self.fc1.weight)
        init.kaiming_uniform_(self.fc2.weight)
        init.kaiming_uniform_(self.fc3.weight, nonlinearity="sigmoid")

    def forward(self, x):
        # Update ReLU activation to ELU
        x = nn.functional.elu(self.fc1(x))
        x = nn.functional.elu(self.fc2(x))
        x = nn.functional.sigmoid(self.fc3(x))
        return x

In [None]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(9, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)
        # Add two batch normalization layers
        self.bn1 = nn.BatchNorm1d(16)
        self.bn2 = nn.BatchNorm1d(8)
        
        init.kaiming_uniform_(self.fc1.weight)
        init.kaiming_uniform_(self.fc2.weight)
        init.kaiming_uniform_(self.fc3.weight, nonlinearity="sigmoid") 
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = nn.functional.elu(x)

        # Pass x through the second set of layers
        x = self.fc2(x)
        x = self.bn2(x)
        x = nn.functional.elu(x)

        x = nn.functional.sigmoid(self.fc3(x))
        return x

In [None]:
from torchvision.datasets import ImageFolder
from torchvision import transforms

train_transforms = transforms.Compose([
    # Add horizontal flip and rotation
# Define transforms
    train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(45),
    transforms.RandomAutocontrast(),
    transforms.ToTensor(),
    transforms.Resize((128, 128)),
])

dataset_train = ImageFolder(
  "clouds_train",
  transform=train_transforms,
)
dataloader_train = DataLoader(
  dataset_train, shuffle=True, batch_size=16
)

dataset_train = ImageFolder(
  "clouds_train",
  transform=train_transforms,
)

dataloader_train = DataLoader(
  dataset_train, shuffle=True, batch_size=1
)

image, label = next(iter(dataloader_train))
# Reshape the image tensor
image = image.squeeze().permute(1, 2, 0) 
# Display the image
plt.imshow(image)
plt.show()

In [None]:
class Net(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        # Define feature extractor
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Flatten(),
        )
        # Define classifier
        self.classifier = nn.Linear(64*16*16, num_classes)
    
    def forward(self, x):  
        # Pass input through feature extractor and classifier
        x = self.feature_extractor(x)
        x = self.classifier(x)
        return x

In [None]:
# Define the model
net = Net(num_classes=7)
# Define the loss function
criterion = nn.CrossEntropyLoss()
# Define the optimizer
optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(3):
    running_loss = 0.0
    # Iterate over training batches
    for images, labels in dataloader_train:
        optimizer.zero_grad()
        outputs = net(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    epoch_loss = running_loss / len(dataloader_train)
    print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

In [None]:
# Define metrics
metric_precision = Precision(task='multiclass', num_classes=7, average='micro')
metric_recall = Recall(task='multiclass', num_classes=7, average='micro')

net.eval()
with torch.no_grad():
    for images, labels in dataloader_test:
        outputs = net(images)
        _, preds = torch.max(outputs, 1)
        metric_precision(preds, labels)
        metric_recall(preds, labels)

precision = metric_precision.compute()
recall = metric_recall.compute()
print(f"Precision: {precision}")
print(f"Recall: {recall}")

In [None]:
# Define precision metric
metric_precision = Precision(
    task="multiclass", num_classes=7, average=None
)

net.eval()
with torch.no_grad():
    for images, labels in dataloader_test:
        outputs = net(images)
        _, preds = torch.max(outputs, 1)
        metric_precision(preds, labels)
precision = metric_precision.compute()

# Get precision per class
precision_per_class = {
    k: precision[v].item()
    for k, v 
    in dataset_test.class_to_idx.items()
}
print(precision_per_class)

### 1. Handling sequences with PyTorch
We've learned to handle tabular and image data. Let's now discuss sequential data.

### 2. Sequential data
Sequential data is ordered in time or space, where the order of the data points is important and can contain temporal or spatial dependencies between them. Time series, data recorded over time like stock prices, weather, or daily sales is sequential. So is text, in which the order of words in a sentence determines its meaning. Another example is audio waves, where the order of data points is crucial to the sound reproduced when the audio file is played.

### 3. Electricity consumption prediction
In this chapter, we will tackle the problem of predicting electricity consumption based on past patterns. We will use a subset of the electricity consumption dataset from the UC Irvine Machine Learning Repository. It contains electricity consumption in kilowatts, or kW, for a certain user recorded every 15 minutes for four years.

### 4. Train-test split
In many machine learning applications, one randomly splits the data into training and testing sets. However, with sequential data, there are better approaches. If we split the data randomly, we risk creating a look-ahead bias, where the model has information about the future when making forecasts. In practice, we won't have information about the future when making predictions, so our test set should reflect this reality. To avoid the look-ahead bias, we should split the data by time. We will train on the first three years of data, and test on the fourth year.

### 5. Creating sequences
To feed the training data to the model, we need to chunk it first to create sequences that the model can use as training examples. First, we need to select the sequence length, which is the number of data points in one training example. Let's make each forecast based on the previous 24 hours. Because data is at 15 minute intervals, we need to use 24 times 4 which is 96 data points. In each example, the data point right after the input sequence will be the target to predict.

### 6. Creating sequences in Python
Let's implement a Python function to create sequences. It takes the DataFrame and the sequence length as inputs. We start with initializing two empty lists, xs for inputs and ys for targets. Next, we iterate over the DataFrame. The loop only goes up to "len(df) - seq_length", ensuring that for every iteration, there are always seq_length data points available in the DataFrame for creating the sequence and a subsequent data point to serve as the target. For each considered data point, we define inputs x as the electricity consumption values starting from this point plus the next sequence length points, and the target y as the subsequent electricity consumption value. The 1 passed to the iloc method stands for the second DataFrame column, which stores the electricity consumption data. Finally, we append the inputs and the target to pre-initialized lists, and after the loop, return them as NumPy arrays.

### 7. TensorDatasetet
Let's use our function to create sequences from the training data. This gives us almost 35 thousand training examples. To convert them to a torch Dataset, we will use the TensorDataset function. We pass it two arguments, the inputs and the targets. Each argument is the NumPy array converted to a tensor with torch.from_numpy and parsed to float. The TensorDataset behaves just like all other torch Datasets and it can be passed to a DataLoader in the same way.

### 8. Applicability to other sequential datat
Everything we have learned here can also be applied to other sequential data. For example, Large Language Models are trained to predict the next word in a sentence, a problem similar to predicting the next amount of electricity used. For speech recognition, which means transcribing an audio recording of someone speaking to text, one would typically use the same sequence-processing model architectures we will learn about soon.

In [None]:
import numpy as np

def create_sequences(df, seq_length):
    xs, ys = [], []
    # Iterate over data indices
    for i in range(len(df) - seq_length):
      	# Define inputs
        x = df.iloc[i : (i+seq_length), 1]
        # Define target
        y = df.iloc[(i+seq_length), 1]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

In [None]:
import torch
from torch.utils.data import TensorDataset

# Use create_sequences to create inputs and targets
X_train, y_train = create_sequences(train_data, 24*4)
print(X_train.shape, y_train.shape)

# Create TensorDataset
dataset_train = TensorDataset(
    torch.from_numpy(X_train).float(),
    torch.from_numpy(y_train).float(),
)
print(len(dataset_train))

### 2. Recurrent neuron
So far, we built feed-forward neural networks where data is passed in one direction: from inputs, through all the layers, to the outputs. Recurrent neural networks, or RNNs, are similar, but also have connections pointing back. At each time step, a recurrent neuron receives some input x, multiplied by the weights and passed through an activation. Out come two values: the main output y, and the hidden state, h, that is fed back to the same neuron. In PyTorch, a recurrent neuron is available as nn.RNN.

### 3. Unrolling recurrent neuron through time
We can represent the same neuron once per time step, a visualization known as unrolling a neuron through time. At a given time step, the neuron represented as a gray circle receives input data x-zero and the previous hidden state h0 and produces output y-zero and a hidden state h1.

### 4. Unrolling recurrent neuron through time
At the next time step, it takes the next value x1 as input and its last hidden state, h1.

### 5. Unrolling recurrent neuron through time
And so it continues until the end of the input sequence. Since at the first time step there is no previous hidden state, h0 is typically set to zero. Notice that the output at each time step depends on all the previous inputs. This allows recurrent networks to maintain memory through time, which allows them to handle sequential data well.

### 6. Deep RNNs
We can also stack multiple layers of recurrent cells on top of each other to get a deep recurrent neural network. In this case, each input will pass through multiple neurons one after another, just like in dense and convolutional networks we have discussed before.

### 7. Sequence-to-sequence architecture
Depending on the lengths of input and output sequences, we distinguish four different architecture types. Let's look at them one by one. In a sequence-to-sequence architecture, we pass the sequence as input and make use of the output produced at every time step. For example, a real-time speech recognition model could receive audio at each time step and output the corresponding text.

### 8. Sequence-to-vector architecture
In a sequence-to-vector architecture, we pass a sequence as input, but ignore all the outputs but the last one. In other words, we let the model process the entire input sequence before it produces the output. We can use this architecture to classify text as one of multiple topics. It's a good idea to let the model "read" the whole text before it decides what it's about. We will also use the sequence-to-vector architecture for electricity consumption prediction.

### 9. Vector-to-sequence architecture
One can also build a vector-to-sequence architecture where we pass a single input and replace all other inputs with zeros but make use of all the outputs from each time step. This architecture can be used for text generation: given a single vector representing a specific topic, style, or sentiment, a model can generate a sequence of words or sentences.

### 10. Encoder-decoder architecture
Finally, in an encoder-decoder architecture, we pass the input sequences, and only then start using the output sequence. This is different from sequence-to-sequence in which outputs are generated while the inputs are still being received. A canonical use case is machine translation. One cannot translate word by word; rather the entire input must be processed before output generation can start.

### 11. RNN in PyTorch
Let's build a sequence-to-vector RNN in PyTorch. We define a model class with the init method as usual. Inside it, we assign the nn.RNN layer to self.rnn, passing it an input size of 1 since we only have one feature, the electricity consumption, an arbitrarily chosen hidden size of 32 and 2 layers, and we set batch_first to True since our data will have the batch size as its first dimension. We also define a linear layer mapping from the hidden size of 32 to the output of 1. In the forward method, we initialize the first hidden state to zeros using torch.zeros and assign it to h0. Its shape is the number of layers (2) by input size, which we extract from x as x.size-zero, by hidden state size (32). Next, we pass the input x and the first hidden state through the RNN layer. Then, we select only the last output by indexing the middle dimension with -1, pass the result through the linear layer, and return.

In [None]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        # Define RNN layer
        self.rnn = nn.RNN(
            input_size=1,
            hidden_size=32,
            num_layers=2,
            batch_first=True,
        )
        self.fc = nn.Linear(32, 1)

    def forward(self, x):
        # Initialize first hidden state with zeros
        h0 = torch.zeros(2, x.size(0), 32)
        # Pass x and h0 through recurrent layer
        out, _ = self.rnn(x, h0)  
        # Pass recurrent layer's last output through linear layer
        out = self.fc(out[:, -1, :])
        return out

### 1. LSTM and GRU cells
Let's discuss recurrent architectures more powerful than a plain RNN.

### 2. Short-term memory problem
Because RNN neurons pass the hidden state from one time step to the next, they can be said to maintain some sort of memory. That's why they are often called RNN memory cells, or just cells for short. However, this memory is very short-term: by the time a long sentence is processed, the hidden state doesn't have much information about its beginning. Imagine trying to translate a sentence between languages; as soon as we have read it, we don't remember how it started. To solve this short-term memory problem, two more powerful types of cells have been proposed: the Long Short-Term Memory or LSTM cell and the Gated Recurrent Unit or GRU cell.

### 3. RNN cell
Before we look at LSTM and GRU cells, let's visualize the plain RNN cell. At each time step t, it takes two inputs, the current input data x and the previous hidden state h. It multiplies these inputs with the weights, applies activation, and outputs two things: the current outputs y and the next hidden state.

### 4. LSTM cell
The LSTM cell has three inputs and outputs. Next to the input data x, there are two hidden states: h represents the short-term memory and c the long-term memory. At each time step, h and x are passed through some linear layers called gate controllers which determine what is important enough to keep in the long-term memory. The gate controllers first erase some parts of the long-term memory in the forget gate. Then, they analyze x and h and store their most important parts in the long-term memory in the input gate. This long-term memory, c, is one of the outputs of the cell. At the same time, another gate called the output gate determines what the current output y should be. The short-term memory output h is the same as y.

### 5. LSTM in PyTorch
Building an LSTM network in PyTorch is very similar to the plain RNN we have already seen. In the init method, we only need to use the nn.LSTM layer instead of nn.RNN. The arguments that the layer takes as inputs are the same. In the forward method, we add the long-term hidden state c and initialize both h and c with zeros. Then, we pass h and c as a tuple to the LSTM layer. Finally, we take the last output, pass it through the linear layer and return just like before.

### 6. GRU cell
The GRU cell is a simplified version of the LSTM cell. It merges the long-term and short-term memories into a single hidden state. It also doesn't use an output gate: the entire hidden state is returned at each time step.

### 7. GRU in PyTorch
Building a GRU network in PyTorch is almost identical to the plain RNN. All we need to do is replace the nn.rnn with nn.gru when defining the layer in the init method, and then call the new gru layer in the forward method.

### 8. Should I use RNN, LSTM, or GRU?
So, which type of recurrent network should we use: the plain RNN, LSTM, or GRU? There is no single answer, but consider the following. Although plain RNNs have revolutionized modeling of sequential data and are important to understand, they are not used much these days because of the short-term memory problem. Our choice will likely be between LSTM and GRU. GRU's advantage is that it's less complex than LSTM, which means less computation. Other than that, the relative performance of GRU and LSTM varies per use case, so it's often a good idea to try both and compare the results. We will learn how to evaluate these models soon.

In [None]:
class Net(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        # Define lstm layer
        self.lstm = nn.LSTM(
            input_size=1,
            hidden_size=32,
            num_layers=2,
            batch_first=True,
        )
        self.fc = nn.Linear(32, 1)

    def forward(self, x):
        h0 = torch.zeros(2, x.size(0), 32)
        # Initialize long-term memory
        c0 = torch.zeros(2, x.size(0), 32)
        # Pass all inputs to lstm layer
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [None]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        # Define RNN layer
        self.gru = nn.GRU(
            input_size=1,
            hidden_size=32,
            num_layers=2,
            batch_first=True,
        )
        self.fc = nn.Linear(32, 1)

    def forward(self, x):
        h0 = torch.zeros(2, x.size(0), 32)
        out, _ = self.gru(x, h0)  
        out = self.fc(out[:, -1, :])
        return out

### 2. Mean Squared Error Loss
Up to now, we have been solving classification tasks using cross-entropy losses. Forecasting of electricity consumption is a regression task, for which we will use a different loss function: Mean Squared Error. Here is how it's calculated. The difference between the predicted value and the target is the error. We then square it, and finally average over the batch of examples. Squaring the errors plays two roles. First, it ensures positive and negative errors don't cancel out, and second, it penalizes large errors more than small ones. Mean Squared Error loss is available in PyTorch as nn.MSELoss.

### 3. Expanding tensors
Before we take a look at the model training and evaluation, we need to discuss two useful concepts: expanding and squeezing tensors. Let's tackle expanding first. All recurrent layers, RNNs, LSTMs, and GRUs, expect input in the shape: batch size, sequence length, number of features. But as we loop over the DataLoader, we can see that we got the shape batch size of 32 by the sequence length of 96. Since we are dealing with only one feature, the electricity consumption, the last dimension is dropped. We can add it, or expand the tensor, by calling view on the sequence and passing the desired shape.

### 4. Squeezing tensors
Conversely, as we evaluate the model, we will need to revert the expansion we have applied to the model inputs which can be achieved through squeezing. Let's see why that's the case and how to do it. As we iterate through test data batches, we get labels in shape batch size. Model outputs, however, are of shape batch size by 1, our number of features. We will be passing the labels and the model outputs to the loss function, and each PyTorch loss requires its inputs to be of the same shape. To achieve that, we can apply the squeeze method to the model outputs. This will reshape them to match the labels' shape.

### 5. Training loop
The training loop is similar to what we have already seen. We instantiate the model and define the loss and the optimizer. Then, we iterate over epochs and training data batches. For each batch, we reshape the input sequence as we have just discussed. The rest of the training loop is the same as before.

### 6. Evaluation loop
Let's look at the evaluation loop. We start by setting up the Mean Squared Error metric from torchmetrics. Then, we iterate through test data batches without computing the gradients. Next, we reshape the model inputs just like during training, pass them to the model, and squeeze the outputs. Finally, we update the metric. After the loop, we can print the final metric value by calling compute on it, just like we did before.

### 7. LSTM vs. GRU
Here is our LSTM's test Mean Squared Error again. Let's see how it compares to a GRU network. It seems that for our electricity consumption dataset, with the task defined as predicting the next value based on the previous 24 hours of data, both models perform similarly, with GRU achieving even a slightly lower error. In this case, GRU might be preferred as it achieves the same or better results while requiring less processing power.

In [None]:
net = Net()
# Set up MSE loss
criterion = nn.MSELoss()
optimizer = optim.Adam(
  net.parameters(), lr=0.0001
)

for epoch in range(3):
    for seqs, labels in dataloader_train:
        # Reshape model inputs (batch size, sequence length, num features)
        seqs = seqs.view(16, 96, 1)
        # Get model outputs
        outputs = net(seqs)#.squeeze()
        # Compute loss
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

In [None]:
# Define MSE metric
mse = torchmetrics.MeanSquaredError()

net.eval()
with torch.no_grad():
    for seqs, labels in dataloader_test:
        seqs = seqs.view(32, 96, 1)
        # Pass seqs to net and squeeze the result
        outputs = net(seqs).squeeze()
        mse(outputs, labels)

# Compute final metric value
test_mse = mse.compute()
print(f"Test MSE: {test_mse}")

# Multi Input

### 2. Why multi-input?
Multi-input models, or models that accept more than one source of data, have many applications. First, we might want the model to use multiple information sources, such as two images of the same car to predict its model. Second, multi-modal models can work on different input types such as image and text to answer a question about the image. Next, in metric learning, the model learns whether two inputs represent the same object. Think about an automated passport control where the system compares our passport photo with a picture it takes of us. Finally, in self-supervised learning, the model learns data representation by learning that two augmented versions of the same input represent the same object. Multi-input models are everywhere!

### 3. Omniglot dataset
Throughout the chapter, we will be using the Omniglot dataset, a collection of images of 964 different handwritten characters from 30 different alphabets.

### 4. Character classification
Let's use the Omniglot dataset to build a two-input model to classify handwritten characters. The first input will be the image of the character, such as this Latin letter "k".

### 5. Character classification
The second input will the the alphabet that it comes from expressed as a one-hot vector.

### 6. Character classification
Both inputs will be processed separately, then we concatenate their representations.

### 7. Character classification
Finally a classification layer predicts one of the 964 classes. We need two elements to build such a model: a custom Dataset and an appropriate model architecture.

### 8. Two-input Dataset
Let's start with the custom Omniglot dataset. We set it up as a class based on torch Dataset. In the init method, we store transform and samples provided when instantiating the dataset as class attributes. Samples are tuples of three: image file path, alphabet as a one-hot vector, and target label as the character class index. In the exercises, samples will be provided. For personal projects, we would need to create them from data file paths. Next, we need to implement the len method that returns the number of samples. Finally, the getitem method returns one sample based on the index it receives as input. For the given index, we retrieve the sample and load the image using Image.open from PIL. The convert method with the argument "L" makes sure that the image is read as grayscale. Then, we transform the image and return a triplet: the transformed image, the alphabet vector, and the target label.

### 9. Tensor concatenation
Before we proceed to building the model, we need to understand tensor concatenation. torch.cat concatenates tensors along a specified dimension. We pass it the tensors and the dimension: for 2D tensors, 0 stands for "horizontal" and 1 stands for "vertical" concatenation.

### 10. Two-input architecture
It's time to define our two-input model. We start with defining a sub-network or layer to process our first input, the image. It should look familiar: a convolution, max pool, elu activation, flattened to a linear layer of shape 128 in the end. Next, we define a layer to process our second input, the alphabet vector. Its input size is 30, the number of alphabets, and we map it to an arbitrarily chosen output size of 8. Then, a classifier would accept input of size 128 plus 8 (image and alphabet outputs concatenated) and produce the output of size 964, the number of classes.

### 11. Two-input architecture
In the forward method, we pass each input through its corresponding layer. Then, we concatenate the outputs with torch.cat. Finally, we pass the result through the classifier layer and return.

### 12. Training loop
The training loop looks just like all the ones we have seen so far. The only difference is that now the training data consists of three items: the image, the alphabet vector, and the labels, and we pass the images and alphabets to the model.

In [1]:
class OmniglotDataset(Dataset):
    def __init__(self, transform, samples):
		# Assign transform and samples to class attributes
        self.transform = transform
        self.samples = samples
                    
    def __len__(self):
		# Return number of samples
        return len(self.samples)

    def __getitem__(self, idx):
      	# Unpack the sample at index idx
        img_path, alphabet, label = self.samples[idx]
        img = Image.open(img_path).convert('L')
        # Transform the image 
        img_transformed = self.transform(img)
        return img_transformed, alphabet, label

NameError: name 'Dataset' is not defined

In [None]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # Define sub-networks as sequential models
        self.image_layer = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.MaxPool2d(kernel_size=2),
            nn.ELU(),
            nn.Flatten(),
            nn.Linear(16*32*32, 128)
        )
        self.alphabet_layer = nn.Sequential(
            nn.Linear(30, 8),
            nn.ELU(), 
        )
        self.classifier = nn.Sequential(
            nn.Linear(128 + 8, 964), 
        )
        
    def forward(self, x_image, x_alphabet):
		# Pass the x_image and x_alphabet through appropriate layers
        x_image = self.image_layer(x_image)
        x_alphabet = self.alphabet_layer(x_alphabet)
        # Concatenate x_image and x_alphabet
        x = torch.cat((x_image, x_alphabet), dim=1)
        return self.classifier(x)

### 2. Why multi-output?
Just like multi-input models, multi-output architectures are everywhere. Their simplest use-case is for multi-task learning, where we want to predict two things from the same input, such as a car's make and model from its picture. In multi-label classification problem, the input can belong to multiple classes simultaneously. For instance, an image can depict both a beach and people. For each of these labels, a separate output from the model is needed. Finally, in very deep models built of blocks of layers, it is a common practice to add extra outputs predicting the same targets after each block. These additional outputs ensure that the early parts of the model are learning features useful for the task at hand while also serving as a form of regularization to boost the robustness of the network.

### 3. Character and alphabet classification
Let's use the Omniglot dataset again to build a model to predict both the character and the alphabet it comes from based on the image. First, we will pass the image through some layers to obtain its embedding.

### 4. Character and alphabet classification
Then we add two independent classifiers on top, one for each output.

### 5. Two-output Dataset
The good news is that we have already done much of the work needed. We can reuse the OmniglotDataset we built before, with just one small difference in the samples we pass it. When the alphabet was an input to the model, we represented it as a one-hot vector. Now that it is an output, all we need is the integer representing the class label, just like with the other output, the character. This will be a number between 0 and 29 since we have 30 alphabets in the Dataset.

### 6. Two-output architecture
Let's look at the model's architecture. We start with defining a sub-network for processing the image identical to the one we used before. Then, we define two classifier layers, one for each output, with the output shape corresponding to the number of alphabets (30) and characters (964), respectively. In the forward method, we first pass the image through its dedicated sub-network, and then feed the result separately to each of the two classifiers. Finally, we return the two outputs.

### 7. Training loop
Let's examine the training loop. The beginning should look familiar, except for the fact that now the model produces two outputs instead of one. Having produced these outputs, we calculate the loss for each of them separately using the appropriate target labels. Next, we need to define the total loss for the model to optimize. Here, we just sum the two partial losses together, indicating that the accuracy of predicting the alphabet and the character is equally important. If that is not the case, we can weigh the partial losses with some weights to reflect their relative importance. We will explore this idea later in the next video. Finally, we run backpropagation and the optimization step as always.

In [None]:
# Print the sample at index 100
print(samples[100])

# Create dataset_train
dataset_train = OmniglotDataset(
    transform=transforms.Compose([
        transforms.ToTensor(),
      	transforms.Resize((64, 64)),
    ]),
    samples=samples,
)

# Create dataloader_train
dataloader_train = DataLoader(
    dataset_train, batch_size=64, shuffle=True,
)

In [None]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.image_layer = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.MaxPool2d(kernel_size=2),
            nn.ELU(),
            nn.Flatten(),
            nn.Linear(16*32*32, 128)
        )
        # Define the two classifier layers corresponding to the number of alphabets (30) and the number of characters (964)
        self.classifier_alpha = nn.Linear(128, 30)
		self.classifier_char = nn.Linear(128, 964)

    def forward(self, x):
        x_image = self.image_layer(x)
        # Pass x_image through the classifiers and return both results
        output_alpha = self.classifier_alpha(x_image)
        output_char = self.classifier_char(x_image)
        return output_alpha, output_char

In [None]:
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.05)

for epoch in range(1):
    for images, labels_alpha, labels_char in dataloader_train:
        optimizer.zero_grad()
        outputs_alpha, outputs_char = net(images)
        # Compute alphabet classification loss
        loss_alpha = criterion(outputs_alpha, labels_alpha)
        # Compute character classification loss
        loss_char = criterion(outputs_char, labels_char)
        # Compute total loss
        loss = loss_alpha + loss_char
        loss.backward()
        optimizer.step()

# Multi-Model evaluation
Let's start with the evaluation of a multi-output model. It's very similar to what we have done before. However, with two different outputs, we need to set up two accuracy metrics: one for alphabet classification and one for character classification. We iterate over the test DataLoader and get the model's predictions as usual. Finally, we update the accuracy metrics, and after the loop, we can calculate their final values. The accuracy is higher for alphabets than for characters, which is not surprising: predicting the alphabet is an easier task with just 30 classes to choose from; for characters, there are 964 possible labels. The difference in accuracy scores is not very large, however: 31 versus 24 percent. This is because learning to recognize the alphabets helped the model recognize individual characters: there is a combined positive effect from solving these two tasks at once.

### 3. Multi-output training loop revisited
Let's now take a look at the training loop for our last model predicting characters and alphabets. Because the model solves two classification tasks at the same time, we have two losses: one for alphabets, and another one for characters. However, since the optimizer can only handle one objective, we had to combine the two losses somehow. We chose to define the final loss as the sum of the two partial losses. By doing so, we are telling the model that recognizing characters and recognizing alphabets are equally important to us. If that is not the case, we can combine the two losses differently.

### 4. Varying task importance
Let's say that correct classification of characters is twice as important for us as the classification of alphabets. To pass this information to the model, we can multiply the character loss by two to force the model to optimize it more. Another approach is to assign weights to both losses that sum up to one. This is equivalent from the optimization perspective, but arguably easier to read for humans, especially with more than two loss components.

### 5. Warning: losses on different scales
There is just one caveat: when assigning loss weights, we must be aware of the magnitudes of the loss values. If the losses are not on the same scale, one loss could dominate the other, causing the model to effectively ignore the smaller loss. Consider a scenario where we're building a model to predict house prices, and use MSE loss. If we also want to use the same model to provide a quality assessment of the house, categorized as "Low", "Medium", or "High", we would use cross-entropy loss. Cross-entropy is typically in the single-digit range, while MSE can reach tens of thousands. Combining these two would result in the model ignoring the quality assessment task completely. A solution is to scale each loss by dividing it by the maximum value in the batch. This brings them to the same range, allowing us to weight them if desired and add together.