# Chapter 1

## PyTorch and object-oriented programming

In [None]:
# Object-Oriented Programming (OOP)
class BankAccount:
    """Minimal account object that only remembers its balance."""

    def __init__(self, balance):
        # Store the opening amount on the instance as the ``balance`` attribute.
        self.balance = balance

# __init__ is called when BankAccount object is created
# balance is the attribute of the BankAccount object
account = BankAccount(100)
print(account.balance)

In [None]:
# Object-Oriented Programming (OOP)
# Methods: Python functions to perform tasks
class BankAccount:
    """Account with a stored balance and a ``deposit`` method that grows it."""

    def __init__(self, balance):
        # Opening amount, kept as the ``balance`` attribute
        self.balance = balance

    def deposit(self, amount):
        """Increase the stored balance by ``amount``."""
        self.balance += amount

account = BankAccount(100)
account.deposit(50)
print(account.balance)
# 150

In [None]:
import torch

In [None]:
torch.cuda.is_available()

In [None]:
# current_device() raises on machines without a usable CUDA setup,
# so only query it when a GPU is actually available.
torch.cuda.current_device() if torch.cuda.is_available() else None

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
torch.set_default_device(device)

In [None]:
# Water potability dataset
# PyTorch Dataset
import pandas as pd
from torch.utils.data import Dataset
class WaterDataset(Dataset):
    """Map-style dataset over the rows of a CSV file.

    Every column except the last is returned as the features of a sample;
    the last column is returned as its label.
    """

    def __init__(self, csv_path):
        # Initialise the torch Dataset base class before loading anything.
        super().__init__()
        # Read the whole CSV once and keep it as a NumPy array.
        self.data = pd.read_csv(csv_path).to_numpy()

    def __len__(self):
        """Return the number of rows (samples) in the loaded array."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the single sample at index ``idx``."""
        return self.data[idx, :-1], self.data[idx, -1]


In [None]:
# PyTorch DataLoader
dataset_train = WaterDataset(
    "./data/water_potability/water_train.csv"
)

In [None]:
from torch.utils.data import DataLoader
dataloader_train = DataLoader(
    dataset_train,
    batch_size=2,
    shuffle=True,
    generator=torch.Generator(device=device),
)

In [None]:
features, labels = next(iter(dataloader_train))
print(f"Features: {features},\nLabels: {labels}")

In [None]:
# PyTorch Model
# Sequential model definition:
import torch.nn as nn

net = nn.Sequential(
    nn.Linear(9, 16),
    nn.ReLU(),
    nn.Linear(16, 8),
    nn.ReLU(),
    nn.Linear(8, 1),
    nn.Sigmoid(),
)

# Class-based model definition:
class Net(nn.Module):
    """Fully connected classifier with layers 9 -> 16 -> 8 -> 1.

    All linear layers are created in float64 (presumably so they accept the
    float64 arrays coming from the NumPy-backed dataset; confirm).
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(9, 16, dtype=torch.float64)
        self.fc2 = nn.Linear(16, 8, dtype=torch.float64)
        self.fc3 = nn.Linear(8, 1, dtype=torch.float64)

    def forward(self, x):
        """ReLU after the two hidden layers, sigmoid on the single output."""
        hidden = nn.functional.relu(self.fc1(x))
        hidden = nn.functional.relu(self.fc2(hidden))
        return nn.functional.sigmoid(self.fc3(hidden))

net = Net()

In [None]:
# net.to(device)

In [None]:
next(net.parameters()).is_cuda

## Optimizers, training, and evaluation

In [None]:
# Training loop
import torch.nn as nn
import torch.optim as optim
# Binary cross-entropy loss, applied to the model's sigmoid outputs
criterion = nn.BCELoss()
# Plain SGD over every parameter of the network
optimizer = optim.SGD(net.parameters(), lr=0.01)

for epoch in range(1000):
    for features, labels in dataloader_train:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        # Forward pass
        outputs = net(features)
        # Reshape labels to a column so they line up with the one-column outputs
        loss = criterion(
            outputs, labels.view(-1, 1)
        )
        # Backward pass, then parameter update
        loss.backward()
        optimizer.step()

### Optimizers

#### Stochastic Gradient Descent (SGD)
```
optimizer = optim.SGD(net.parameters(), lr=0.01)
```

* Update depends on learning rate
* Simple and efficient, for basic models
* Rarely used in practice

#### Adaptive Gradient (Adagrad)
```
optimizer = optim.Adagrad(net.parameters(), lr=0.01)
```

* Adapts learning rate for each parameter
* Good for sparse data
* May decrease the learning rate too fast

#### Root Mean Square Propagation (RMSprop)
```
optimizer = optim.RMSprop(net.parameters(), lr=0.01)
```

* Update for each parameter based on the size of its previous gradients

#### Adaptive Moment Estimation (Adam)
```
optimizer = optim.Adam(net.parameters(), lr=0.01)
```

* Arguably the most versatile and widely used
* RMSprop + gradient momentum
* Often used as the go-to optimizer


In [None]:
# PyTorch DataLoader
# Test split of the water potability data
dataset_test = WaterDataset(
    "./data/water_potability/water_test.csv"
)
from torch.utils.data import DataLoader
# Evaluation metrics do not depend on sample order, so the test loader
# is not shuffled.
dataloader_test = DataLoader(
    dataset_test,
    batch_size=2,
    shuffle=False,
    generator=torch.Generator(device=device),
)

In [None]:
# Model evaluation
from torchmetrics import Accuracy

# Set up accuracy metric
acc = Accuracy(task="binary")
# Put model in eval mode and iterate over
# test data batches with no gradients
net.eval()
with torch.no_grad():
    for features, labels in dataloader_test:
        # Pass data to model to get predicted probabilities
        outputs = net(features)
        # Compute predicted labels by thresholding at 0.5
        preds = (outputs >= 0.5).float()
        # Update accuracy metric
        acc(preds, labels.view(-1, 1))

# Aggregate the metric over all batches seen above
accuracy = acc.compute()
print(f"Accuracy: {accuracy}")

## Vanishing and exploding gradients

Vanishing gradients
* Gradients get smaller and smaller during backward pass
* Earlier layers get small parameter updates
* Model doesn't learn

Exploding gradients
* Gradients get bigger and bigger
* Parameter updates are too large
* Training diverges

Solution to unstable gradients
1. Proper weights initialization
2. Good activations
3. Batch normalization

In [None]:
# Weights initialization
layer = nn.Linear(8, 1)
print(layer.weight)

Weights initialization
Good initialization ensures:
* Variance of layer inputs = variance of layer outputs
* Variance of gradients the same before and after a layer

How to achieve this depends on the activation:
For ReLU and similar, we can use He/Kaiming initialization

In [None]:
# Weights initialization
import torch.nn.init as init
init.kaiming_uniform_(layer.weight)
print(layer.weight)


In [None]:
# He / Kaiming initialization
import torch.nn as nn
import torch.nn.init as init

class Net(nn.Module):
    """Fully connected 9 -> 16 -> 8 -> 1 classifier with He/Kaiming-initialised weights."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(9, 16)
        self.fc2 = nn.Linear(16, 8)
        self.fc3 = nn.Linear(8, 1)

        # The two hidden layers use kaiming_uniform_ with its default settings.
        for hidden_layer in (self.fc1, self.fc2):
            init.kaiming_uniform_(hidden_layer.weight)
        # The output layer feeds a sigmoid, so that nonlinearity is passed explicitly.
        init.kaiming_uniform_(self.fc3.weight, nonlinearity="sigmoid")

    def forward(self, x):
        """ReLU after the two hidden layers, sigmoid on the single output."""
        activations = nn.functional.relu(self.fc1(x))
        activations = nn.functional.relu(self.fc2(activations))
        return nn.functional.sigmoid(self.fc3(activations))

Activation functions
* `nn.functional.relu()` (ReLU): often used as the default activation
* `nn.functional.elu()` (ELU): an alternative to ReLU that keeps non-zero gradients for negative inputs

#### Batch normalization
After a layer:
1. Normalize the layer's outputs by:
   * Subtracting the mean
   * Dividing by the standard deviation
2. Scale and shift normalized outputs using learned parameters

The model learns the optimal input distribution for each layer, which gives:
* Faster loss decrease
* Protection against unstable gradients


In [None]:
# Batch normalization
# Simple one layer example
class Net(nn.Module):
    """One-layer batch normalization example: Linear -> BatchNorm1d -> ELU."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(9, 16)
        # Normalizes the 16 outputs of fc1
        self.bn1 = nn.BatchNorm1d(16)

    def forward(self, x):
        """Apply the linear layer, normalize its outputs, then activate.

        Returns:
            The ELU-activated tensor with 16 features per sample.
        """
        x = self.fc1(x)
        x = self.bn1(x)
        x = nn.functional.elu(x)
        # Without an explicit return, calling the model evaluates to None.
        return x


# Chapter 2

## Handling images with PyTorch

[Clouds dataset](https://www.kaggle.com/competitions/cloud-type-classification2/data)

In [None]:
# Loading images to PyTorch
from torchvision.datasets import ImageFolder
from torchvision import transforms
train_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((128, 128)),
])
dataset_train = ImageFolder(
    "data/clouds_train",
    transform=train_transforms,
)

In [None]:
# Displaying images
dataloader_train = DataLoader(
    dataset_train,
    shuffle=True,
    batch_size=1,
)
image, label = next(iter(dataloader_train))
print(image.shape)

# torch.Size([1, 3, 128, 128])

image = image.squeeze().permute(1, 2, 0)
print(image.shape)

# torch.Size([128, 128, 3])

import matplotlib.pyplot as plt
plt.imshow(image)
plt.show()



In [None]:
# Data augmentation
train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(45),
    transforms.ToTensor(),
    transforms.Resize((128, 128)),
])
dataset_train = ImageFolder(
    "data/clouds/train",
    transform=train_transforms,
)



## Convolutional Neural Networks

In [None]:
# Zero-padding
# Add a frame of zeros around the convolutional layer's input
# nn.Conv2d(
#     3, 32, kernel_size=3, padding=1
# )

In [None]:
# Max Pooling
# Slide non-overlapping window over input
# At each position, retain only the maximum value
# Used after convolutional layers to reduce spatial dimensions
# nn.MaxPool2d(kernel_size=2)

In [None]:
# Convolutional Neural Network
class Net(nn.Module):
    """Small CNN image classifier: two conv/ELU/max-pool stages and a linear head.

    Args:
        num_classes: Number of output classes (size of the returned logits).
        image_size: Height and width of the square input images. Defaults to
            64, which gives the ``64*16*16`` classifier input size.
    """

    def __init__(self, num_classes, image_size=64):
        super().__init__()
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Flatten(),
        )
        # kernel_size=3 with padding=1 keeps height/width unchanged; each
        # MaxPool2d(kernel_size=2) halves them (rounding down).
        pooled_size = image_size // 2 // 2
        self.classifier = nn.Linear(64 * pooled_size * pooled_size, num_classes)

    def forward(self, x):
        """Extract flattened convolutional features and map them to class logits."""
        x = self.feature_extractor(x)
        x = self.classifier(x)
        return x

In [None]:
# Feature extractor output size
# Excerpt from Net.__init__ (not runnable on its own because it uses `self`).
# Shapes below assume a 3 x 64 x 64 input image -- TODO confirm against the transforms in use.
self.feature_extractor = nn.Sequential(
    nn.Conv2d(3, 32, kernel_size=3, padding=1),  # -> 32 x 64 x 64 (padding keeps H, W)
    nn.ELU(),
    nn.MaxPool2d(kernel_size=2),  # -> 32 x 32 x 32
    nn.Conv2d(32, 64, kernel_size=3, padding=1),  # -> 64 x 32 x 32
    nn.ELU(),
    nn.MaxPool2d(kernel_size=2),  # -> 64 x 16 x 16
    nn.Flatten(),  # -> 64*16*16 values per image
)
# The classifier input size equals the flattened feature extractor output
self.classifier = nn.Linear(64*16*16, num_classes)


## Training image classifiers

In [None]:
# Augmentations for cloud classification
train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(45),
    transforms.RandomAutocontrast(),
    transforms.ToTensor(),
    # NOTE(review): Net.classifier is sized 64*16*16, which corresponds to
    # 64x64 inputs after two 2x2 poolings; 128x128 images would flatten to
    # 64*32*32. Confirm which size is intended (later cells resize to (64, 64)).
    transforms.Resize((128, 128))
])

In [None]:
# Image classifier training loop
net = Net(num_classes=7)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(10):
    for images, labels in dataloader_train:
        optimizer.zero_grad()
        outputs = net(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()


## Evaluating image classifiers

In [None]:
# Data augmentation at test time

# Data augmentation for training data:
train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(45),
    transforms.RandomAutocontrast(),
    transforms.ToTensor(),
    transforms.Resize((64, 64)),
])

dataset_train = ImageFolder(
    "clouds_train",
    transform=train_transforms,
)


test_transforms = transforms.Compose([
    #
    # NO DATA AUGMENTATION AT TEST TIME
    #
    transforms.ToTensor(),
    transforms.Resize((64, 64)),
])

dataset_test = ImageFolder(
    "clouds_test",
    transform=test_transforms,
)

In [None]:
# Averaging multi-class metrics
from torchmetrics import Recall
recall_per_class = Recall(task="multiclass", num_classes=7, average=None)
recall_micro = Recall(task="multiclass", num_classes=7, average="micro")
recall_macro = Recall(task="multiclass", num_classes=7, average="macro")
recall_weighted = Recall(task="multiclass", num_classes=7, average="weighted")

In [None]:
# Evaluation loop
from torchmetrics import Precision, Recall

metric_precision = Precision(
    task="multiclass", num_classes=7, average="macro"
)

metric_recall = Recall(
    task="multiclass", num_classes=7, average="macro"
)

net.eval()
with torch.no_grad():
    for images, labels in dataloader_test:
        outputs = net(images)
        _, preds = torch.max(outputs, 1)
        metric_precision(preds, labels)
        metric_recall(preds, labels)
precision = metric_precision.compute()
recall = metric_recall.compute()

print(f"Precision: {precision}")
print(f"Recall: {recall}")

In [None]:
# Analyzing performance per class
metric_recall = Recall(
    task="multiclass", num_classes=7, average=None
)
net.eval()
with torch.no_grad():
    for images, labels in dataloader_test:
        outputs = net(images)
        _, preds = torch.max(outputs, 1)
        metric_recall(preds, labels)
recall = metric_recall.compute()

print(recall)

dataset_test.class_to_idx


In [None]:
# Analyzing performance per class
# k = class name, e.g. cirriform clouds
{
    k: recall[v].item()
    for k, v
    in dataset_test.class_to_idx.items()
}

### Examples

In [None]:
# Define metrics
metric_precision = Precision(task="multiclass", num_classes=7, average="macro")
metric_recall = Recall(task="multiclass", num_classes=7, average="macro")

net.eval()
with torch.no_grad():
    for images, labels in dataloader_test:
        outputs = net(images)
        _, preds = torch.max(outputs, 1)
        metric_precision(preds, labels)
        metric_recall(preds, labels)

precision = metric_precision.compute()
recall = metric_recall.compute()
print(f"Precision: {precision}")
print(f"Recall: {recall}")

In [None]:
# Define precision metric
metric_precision = Precision(
    task="multiclass", num_classes=7, average=None
)

net.eval()
with torch.no_grad():
    for images, labels in dataloader_test:
        outputs = net(images)
        _, preds = torch.max(outputs, 1)
        metric_precision(preds, labels)
precision = metric_precision.compute()

# Get precision per class
precision_per_class = {
    k: precision[v].item()
    for k, v 
    in dataset_test.class_to_idx.items()
}
print(precision_per_class)

# Chapter 3

## Handling sequences with PyTorch

In [1]:
import torch

In [2]:
import pandas as pd
train_data = pd.read_csv("./data/electricity_consump/electricity_train.csv")

In [3]:
# 1 Trindade,Artur. (2015). ElectricityLoadDiagrams20112014. UCI Machine Learning Repository.
# https://doi.org/10.24432/C58C86.

# Creating sequences in Python
import numpy as np
def create_sequences(df, seq_length):
    """Build sliding-window examples from the column at position 1 of ``df``.

    Args:
        df: DataFrame whose column at position 1 holds the series values.
        seq_length: Number of consecutive values in each input window.

    Returns:
        A tuple ``(xs, ys)`` of NumPy arrays. ``xs[i]`` holds the values of
        rows ``i`` to ``i + seq_length - 1`` and ``ys[i]`` the value of row
        ``i + seq_length``. There are ``len(df) - seq_length`` examples; both
        arrays are empty when the frame has no more than ``seq_length`` rows.
    """
    # Pull the column out once instead of slicing the DataFrame on every iteration.
    values = df.iloc[:, 1].to_numpy()
    n_samples = len(values) - seq_length
    xs = [values[i:i + seq_length] for i in range(n_samples)]
    ys = [values[i + seq_length] for i in range(n_samples)]
    return np.array(xs), np.array(ys)

In [4]:
train_data

Unnamed: 0,timestamp,consumption
0,2011-01-01 00:15:00,-0.704319
1,2011-01-01 00:30:00,-0.704319
2,2011-01-01 00:45:00,-0.678983
3,2011-01-01 01:00:00,-0.653647
4,2011-01-01 01:15:00,-0.704319
...,...,...
105210,2013-12-31 22:45:00,-0.932595
105211,2013-12-31 23:00:00,-0.907259
105212,2013-12-31 23:15:00,-0.932595
105213,2013-12-31 23:30:00,-0.932595


In [5]:
# TensorDataset
# Create training examples
seq_length = 96
X_train, y_train = create_sequences(train_data, seq_length)
print(X_train.shape, y_train.shape)

# The course slides show (34944, 96) (34944,); this CSV yields the shapes printed below.

(105119, 96) (105119,)


In [6]:
# Convert them to a Torch Dataset
from torch.utils.data import TensorDataset
dataset_train = TensorDataset(
    torch.from_numpy(X_train).float(),
    torch.from_numpy(y_train).float(),
)

## Recurrent Neural Networks

In [None]:
# RNN in PyTorch
class Net(nn.Module):
    """Two-layer RNN followed by a linear layer giving one value per sequence."""

    def __init__(self):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=1,
            hidden_size=32,
            num_layers=2,
            batch_first=True,
        )
        self.fc = nn.Linear(32, 1)

    def forward(self, x):
        """Run the batch through the RNN and map the last time step to a prediction.

        Args:
            x: Batch-first input of shape (batch, seq_len, 1).

        Returns:
            Tensor of shape (batch, 1).
        """
        # Zero initial hidden state, sized from the RNN's own configuration and
        # created with x's device and dtype so it always matches the input.
        h0 = torch.zeros(
            self.rnn.num_layers, x.size(0), self.rnn.hidden_size,
            device=x.device, dtype=x.dtype,
        )
        out, _ = self.rnn(x, h0)
        # Only the output at the final time step feeds the linear layer
        out = self.fc(out[:, -1, :])
        return out

## LSTM and GRU cells

In [13]:
import torch.nn as nn
import torch.optim as optim

In [17]:
# LSTM in PyTorch
class Net(nn.Module):
    """Two-layer LSTM followed by a linear layer giving one value per sequence."""

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=1,
            hidden_size=32,
            num_layers=2,
            batch_first=True,
        )
        self.fc = nn.Linear(32, 1)

    def forward(self, x):
        """Run the batch through the LSTM and map the last time step to a prediction.

        Args:
            x: Batch-first input of shape (batch, seq_len, 1).

        Returns:
            Tensor of shape (batch, 1).
        """
        # Zero initial hidden and cell states, sized from the LSTM's own
        # configuration and created with x's device and dtype so they always
        # match the input.
        h0 = torch.zeros(
            self.lstm.num_layers, x.size(0), self.lstm.hidden_size,
            device=x.device, dtype=x.dtype,
        )
        c0 = torch.zeros_like(h0)
        out, _ = self.lstm(x, (h0, c0))
        # Only the output at the final time step feeds the linear layer
        out = self.fc(out[:, -1, :])
        return out

In [18]:
# GRU in PyTorch
class Net(nn.Module):
    """Two-layer GRU followed by a linear layer giving one value per sequence."""

    def __init__(self):
        super().__init__()
        self.gru = nn.GRU(
            input_size=1,
            hidden_size=32,
            num_layers=2,
            batch_first=True,
        )
        self.fc = nn.Linear(32, 1)

    def forward(self, x):
        """Run the batch through the GRU and map the last time step to a prediction.

        Args:
            x: Batch-first input of shape (batch, seq_len, 1).

        Returns:
            Tensor of shape (batch, 1).
        """
        # Zero initial hidden state, sized from the GRU's own configuration and
        # created with x's device and dtype so it always matches the input.
        h0 = torch.zeros(
            self.gru.num_layers, x.size(0), self.gru.hidden_size,
            device=x.device, dtype=x.dtype,
        )
        out, _ = self.gru(x, h0)
        # Only the output at the final time step feeds the linear layer
        out = self.fc(out[:, -1, :])
        return out

## Training and evaluating RNNs

In [20]:
# NOTE (JS): Is this the way to go for time series data
from torch.utils.data import DataLoader
dataloader_train = DataLoader(
    dataset_train,
    batch_size=32,
    shuffle=True,
    # generator=torch.Generator(device=device),
)

In [27]:
# Training loop
for seqs, labels in dataloader_train:
    # seqs = seqs.view(32, 96, 1)
    print(seqs.shape == torch.Size([32, 96]))
    break

True


In [28]:
# Training loop
net = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(
    net.parameters(), lr=0.001
)

num_epochs = 3
for epoch in range(num_epochs):
    for seqs, labels in dataloader_train:
        # Add the feature dimension the recurrent layer expects:
        # (batch, 96) -> (batch, 96, 1). unsqueeze works for any batch size,
        # so the final, smaller batch does not have to be skipped.
        seqs = seqs.unsqueeze(-1)
        # Drop the trailing dimension so outputs (batch, 1) match labels
        # (batch,); otherwise MSELoss broadcasts the pair to (batch, batch)
        # and the loss value is wrong.
        outputs = net(seqs).squeeze(-1)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Evaluation loop
import torchmetrics

# Set up MSE metric
mse = torchmetrics.MeanSquaredError()

# Iterate through test data with no gradients
net.eval()
with torch.no_grad():
    # NOTE(review): test_loader is not defined in the cells shown here; it is
    # assumed to be a DataLoader over test sequences built like dataloader_train.
    for seqs, labels in test_loader:
        # Reshape model inputs to (batch, seq_len, 1); also works for a short final batch
        seqs = seqs.unsqueeze(-1)
        # Squeeze model outputs from (batch, 1) to (batch,)
        outputs = net(seqs).squeeze(-1)
        # Update the metric
        mse(outputs, labels)
# Compute final metric value
print(f"Test MSE: {mse.compute()}")

# Test MSE: 0.13292162120342255

# LSTM vs. GRU
# LSTM:
# Test MSE: 0.13292162120342255

# GRU:
# Test MSE: 0.12187089771032333

# GRU preferred: same or better results with less processing power

# Chapter 4: Multi-Input & Multi-Output Architectures
Omniglot dataset  
Lake, B. M., Salakhutdinov, R., and Tenenbaum, J. B. (2015). Human-level concept learning through probabilistic program induction. Science, 350(6266), 1332-1338.

## Multi-input models

In [None]:
# Two-input Dataset
from PIL import Image

# Assign samples and transforms
class OmniglotDataset(Dataset):
    """Dataset yielding a transformed image together with its alphabet and label."""

    def __init__(self, transform, samples):
        # Each entry of samples is unpacked as (img_path, alphabet, label) in __getitem__.
        self.samples = samples
        self.transform = transform

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load the image of entry ``idx`` in mode 'L' and apply the transform."""
        path, alphabet, label = self.samples[idx]
        grayscale = Image.open(path).convert('L')
        return self.transform(grayscale), alphabet, label

In [None]:
# Tensor concatenation
x = torch.tensor([
    [1, 2, 3],
])
y = torch.tensor([
    [4, 5, 6],
])

# Concatenation along axis 0
torch.cat((x, y), dim=0)
# [[1, 2, 3],
# [4, 5, 6]]

# Concatenation along axis 1
torch.cat((x, y), dim=1)
# [[1, 2, 3, 4, 5, 6]]

In [None]:
# Two-input architecture
class Net(nn.Module):
    """Two-input classifier: an image branch and an alphabet branch joined before the head."""

    def __init__(self):
        super().__init__()
        # Image branch: convolution, pooling and a linear projection to 128 features
        image_branch = [
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.MaxPool2d(kernel_size=2),
            nn.ELU(),
            nn.Flatten(),
            nn.Linear(16*32*32, 128),
        ]
        self.image_layer = nn.Sequential(*image_branch)
        # Alphabet branch: 30 input values projected to 8 features
        self.alphabet_layer = nn.Sequential(nn.Linear(30, 8), nn.ELU())
        # Head over the concatenated 128 + 8 features
        self.classifier = nn.Sequential(nn.Linear(128 + 8, 964))

    def forward(self, x_image, x_alphabet):
        """Process both inputs separately, concatenate along dim 1, then classify."""
        image_features = self.image_layer(x_image)
        alphabet_features = self.alphabet_layer(x_alphabet)
        combined = torch.cat((image_features, alphabet_features), dim=1)
        return self.classifier(combined)

In [None]:
# Training loop
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)
for epoch in range(10):
    for img, alpha, labels in dataloader_train:
        optimizer.zero_grad()
        outputs = net(img, alpha)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# Training data consists of three items:
# Image
# Alphabet vector
# Labels
# We pass the model images and alphabets

## Multi-output models

In [None]:
# Two-output Dataset
# We can use the same Dataset...
class OmniglotDataset(Dataset):
    """Dataset yielding a transformed image together with its alphabet and label."""

    def __init__(self, transform, samples):
        self.transform = transform
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        # Each sample is unpacked as (image path, alphabet, label)
        img_path, alphabet, label = self.samples[idx]
        img = Image.open(img_path).convert('L')
        img = self.transform(img)
        return img, alphabet, label

# ...with updated samples:
# print(samples[0])
# [(
#     'omniglot_train/.../0459_14.png',
#     0,
#     0,
# )]

In [None]:
# Two-output architecture
class Net(nn.Module):
    """Shared image sub-network with two classification heads.

    Args:
        num_alpha: Number of alphabet classes (size of the first output).
        num_char: Number of character classes (size of the second output).
    """

    def __init__(self, num_alpha, num_char):
        super().__init__()
        # Define image-processing sub-network
        self.image_layer = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.MaxPool2d(kernel_size=2),
            nn.ELU(),
            nn.Flatten(),
            nn.Linear(16*32*32, 128)
        )
        # Define output-specific classifiers, sized from the constructor arguments
        self.classifier_alpha = nn.Linear(128, num_alpha)
        self.classifier_char = nn.Linear(128, num_char)

    def forward(self, x):
        """Return ``(output_alpha, output_char)`` for a batch of images."""
        # Pass image through dedicated sub-network
        x_image = self.image_layer(x)
        # Pass the result through each output layer
        output_alpha = self.classifier_alpha(x_image)
        output_char = self.classifier_char(x_image)
        # Return both outputs
        return output_alpha, output_char

In [None]:
# Training loop
for epoch in range(10):
    for images, labels_alpha, labels_char in dataloader_train:
        optimizer.zero_grad()
        # Model produces two outputs
        outputs_alpha, outputs_char = net(images)
        # Calculate loss for each output
        loss_alpha = criterion(
            outputs_alpha, labels_alpha
        )
        loss_char = criterion(
            outputs_char, labels_char
        )
        # Combine the losses to one total loss
        loss = loss_alpha + loss_char
        # Backprop and optimize with the total loss
        loss.backward()
        optimizer.step()

## Evaluation of multioutput models and loss weighting

In [None]:
# Model evaluation
# Set up metric for each output
acc_alpha = Accuracy(
    task="multiclass", num_classes=30
)
acc_char = Accuracy(
    task="multiclass", num_classes=964
)

net.eval()
with torch.no_grad():
    # Iterate over test loader and get outputs
    for images, labels_alpha, labels_char in dataloader_test:
        out_alpha, out_char = net(images)
        # Calculate prediction for each output
        _, pred_alpha = torch.max(out_alpha, 1)
        _, pred_char = torch.max(out_char, 1)
        # Update accuracy metrics
        acc_alpha(pred_alpha, labels_alpha)
        acc_char(pred_char, labels_char)

# Calculate final accuracy scores
print(f"Alphabet: {acc_alpha.compute()}")
print(f"Character: {acc_char.compute()}")

# Alphabet: 0.3166305720806122
# Character: 0.24064336717128754

In [None]:
# Multi-output training loop revisited
# Two losses: for alphabets and characters
# Final loss defined as sum of alphabet and character losses:
#     loss = loss_alpha + loss_char
# Both classification tasks deemed equally important
for epoch in range(10):
    for images, labels_alpha, labels_char in dataloader_train:
        optimizer.zero_grad()
        outputs_alpha, outputs_char = net(images)
        loss_alpha = criterion(
            outputs_alpha, labels_alpha
        )
        loss_char = criterion(
            outputs_char, labels_char
        )
        loss = loss_alpha + loss_char
        loss.backward()
        optimizer.step()

In [None]:
# Varying task importance
# Character classification 2 times more important than alphabet classification
# Approach 1: Scale more important loss by a factor of 2
loss = loss_alpha + loss_char * 2
# Approach 2: Assign weights that sum to 1
loss = 0.33 * loss_alpha + 0.67 * loss_char

# Warning: losses on different scales
# Losses must be on the same scale before they are weighted and added
# Example tasks:
#   * Predict house price -> MSE loss
#   * Predict quality: low, medium, high -> CrossEntropy loss
# CrossEntropy is typically in the single-digits
# MSE loss can reach tens of thousands
# Model would ignore quality assessment task
# Solution: Normalize both losses before weighting and adding
# NOTE(review): torch.max of a scalar loss is the loss itself, so the division
# would always give 1; presumably loss_price / loss_quality are per-sample
# loss tensors here -- confirm.
loss_price = loss_price / torch.max(loss_price)
loss_quality = loss_quality / torch.max(loss_quality)
loss = 0.7 * loss_price + 0.3 * loss_quality

What you learned
1. Training robust neural networks
   * PyTorch and OOP
   * Optimizers
   * Vanishing and exploding gradients
2. Images and convolutional neural networks
   * Handling images with PyTorch
   * Training and evaluating convolutional networks
   * Data augmentation
3. Sequences and recurrent neural networks
   * Handling sequences with PyTorch
   * Training and evaluating recurrent networks (LSTM and GRU)
4. Multi-input and multi-output architectures
   * Multi-input models
   * Multi-output models
   * Loss weighting

What's next?
What you might consider learning next:
Transformers
Self-supervised learning

Courses:
Deep Learning for Text with PyTorch
Deep Learning for Images with PyTorch
Efficient AI Model Training with PyTorch
