# Short Introduction to Neural Networks and Deep Learning with Pytorch

In [None]:
%%javascript
// Fetch and run a third-party helper script; presumably it fills the "toc"
// element defined in the markdown cell below with a table of contents.
// NOTE(review): needs network access and a frontend that exposes jQuery as `$` — confirm.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<h1 id="tocheading">Table of Contents</h1>
<div id="toc"></div>

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from tqdm.auto import tqdm

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Resize, Compose

In [None]:
# Use the interactive widget backend (ipympl) for matplotlib figures.
%matplotlib widget

# How to define a Neural Network Architecture in Torch

To declare a new network architecture, we create a new class inheriting from `torch.nn.Module`.

The simplest way to declare a network architecture is to declare the sequence of layers using `torch.nn.Sequential`
in `__init__`; we also have to implement the `forward` pass. The rest (gradients, backpropagation, ...) is taken care of by torch automagically.

Torch builds a computational graph, that can be executed (on different devices) and transformed (e.g. calculate the gradient).

In [None]:
class FullyConnected(nn.Module):
    """Fully connected classifier with two hidden layers.

    The input is flattened to ``(batch, input_size)`` and passed through two
    ``Linear -> Dropout -> ReLU`` stages followed by a final ``Linear`` layer.

    The network returns raw, unnormalized class scores (logits). There is
    deliberately no trailing ``Softmax``: ``nn.CrossEntropyLoss`` applies
    ``log_softmax`` to its input itself, so normalizing here as well would
    squash the scores twice and weaken the gradients. ``argmax`` over the
    logits selects the same class as ``argmax`` over the probabilities; use
    ``torch.softmax(output, dim=1)`` when probabilities are needed.

    Parameters
    ----------
    input_size : int
        Number of values per sample after flattening.
    n_classes : int
        Number of output scores per sample.
    dropout : float
        Dropout probability used after each hidden linear layer.
    n_hidden : int
        Width of both hidden layers.
    """

    def __init__(self, input_size, n_classes, dropout=0.25, n_hidden=256):
        super().__init__()

        self.flatten = nn.Flatten()
        self.fc_stack = nn.Sequential(
            nn.Linear(input_size, n_hidden),
            nn.Dropout(dropout),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden),
            nn.Dropout(dropout),
            nn.ReLU(),
            # final layer emits logits; the loss function handles normalization
            nn.Linear(n_hidden, n_classes),
        )

    def forward(self, x):
        """Return the logits of shape ``(batch, n_classes)`` for the batch ``x``."""
        x = self.flatten(x)
        x = self.fc_stack(x)
        return x


# display the layer structure of an example instance
FullyConnected(input_size=3 * 50 * 50, n_classes=10)

# Training

Unfortunately, training the network is not as simple as calling `fit` like in sklearn.
Torch is a very flexible framework, so we have to choose the data loader, the loss function, the optimizer, the model, the device, and how we evaluate the performance on the test data set.

In the end, we are going to write our own `fit` function, to make it simpler.

In [None]:
# Run on the GPU when CUDA is available and fall back to the CPU otherwise.
# For easier debugging, force the CPU by assigning DEVICE = "cpu" instead.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using {DEVICE} device")

In [None]:
def train(dataloader, model, loss_fn, optimizer, device=DEVICE):
    """Train ``model`` for one pass over ``dataloader``.

    The model is moved to ``device`` and put into training mode. For every
    batch the loss is computed, backpropagated, and one optimizer step is taken.

    Parameters
    ----------
    dataloader : iterable
        Yields ``(X, y)`` batches of tensors.
    model : torch.nn.Module
        Network to train; its parameters must be the ones ``optimizer`` updates.
    loss_fn : callable
        Called as ``loss_fn(prediction, y)``; must return a scalar tensor.
    optimizer : torch.optim.Optimizer
        Optimizer performing the parameter updates.
    device : str or torch.device
        Device the model and the batches are moved to.

    Returns
    -------
    list of float
        Loss of every batch, in iteration order.
    """
    model = model.to(device)
    model.train()

    batch_losses = []
    for features, targets in dataloader:
        features = features.to(device)
        targets = targets.to(device)

        # forward pass and loss of this batch
        loss = loss_fn(model(features), targets)

        # backpropagation: clear old gradients, compute new ones, update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # keep the plain float for plotting
        batch_losses.append(loss.item())
    return batch_losses

            
def test(dataloader, model, loss_fn, device=DEVICE):
    """Compute the loss of ``model`` on every batch of ``dataloader``.

    The model is moved to ``device`` and switched to evaluation mode (it is not
    switched back afterwards). No gradients are recorded and no parameters
    are updated.

    Returns
    -------
    list of float
        Loss of every batch, in iteration order.
    """
    model = model.to(device)
    with torch.no_grad():
        model.eval()
        batch_losses = [
            loss_fn(model(X.to(device)), y.to(device)).item()
            for X, y in dataloader
        ]
    return batch_losses


def fit_one_epoch(train_dataloader, test_dataloader, model, loss_fn, optimizer, device=DEVICE):
    """Run one training pass followed by one evaluation pass.

    Returns
    -------
    tuple of (list of float, list of float)
        Per-batch training losses and per-batch test losses of this epoch.
    """
    # the tuple is evaluated left to right: train first, then evaluate
    return (
        train(train_dataloader, model, loss_fn, optimizer, device),
        test(test_dataloader, model, loss_fn, device),
    )


def accuracy(dataloader, model, device=DEVICE):
    """Return the fraction of samples that ``model`` classifies correctly.

    The predicted class of a sample is the index of its highest score along
    dimension 1 of the model output. The model is moved to ``device`` and
    switched to evaluation mode; no gradients are recorded.

    Raises
    ------
    ZeroDivisionError
        If ``dataloader`` yields no samples.
    """
    model = model.to(device)
    n_correct = 0
    n_total = 0
    with torch.no_grad():
        model.eval()
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            predicted_class = model(X).argmax(1)
            hits = predicted_class == y
            n_total += len(y)
            n_correct += hits.type(torch.float).sum().item()
    return n_correct / n_total

# MNIST

In [None]:
# Training and test split share the same preprocessing: every image is
# resized to 8x8 pixels and converted to a tensor. Files are stored in "data".
mnist_train, mnist_test = (
    datasets.MNIST(
        root="data",
        train=is_train,
        transform=Compose([Resize((8, 8)), ToTensor()]),
        download=True,
    )
    for is_train in (True, False)
)

In [None]:
batch_size = 64

# one loader per split, both with the same batch size
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=batch_size)
    for split in (mnist_train, mnist_test)
)

# X, y hold the first test batch; later cells use them for plotting and
# for deriving the network input size
X, y = next(iter(test_dataloader))

print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)

In [None]:
fig, axs = plt.subplots(2, 5, figsize=(9, 3), constrained_layout=True)

# show channel 0 of the first images of the batch, one per axes
for ax, image in zip(axs.flat, X[:, 0]):
    ax.imshow(image, cmap='gray')

In [None]:
epochs = 10

# input_size is the number of values in a single sample of the batch
model = FullyConnected(
    input_size=X[0].numel(),
    n_classes=len(mnist_train.classes),
    n_hidden=512,
    dropout=0.75,
)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# one entry per epoch, each entry is the list of per-batch losses
test_losses, train_losses = [], []

# accuracy of the untrained network as a baseline
print(f'Test Accuracy: {accuracy(test_dataloader, model):.1%}')

for t in tqdm(range(epochs)):
    epoch_loss_train, epoch_loss_test = fit_one_epoch(
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
    )
    train_losses.append(epoch_loss_train)
    test_losses.append(epoch_loss_test)

    test_accuracy = accuracy(test_dataloader, model)
    print(f'Test Accuracy: {test_accuracy:.1%}')

print("Done!")

In [None]:
def plot_losses(train_losses, test_losses):
    """Plot per-batch and per-epoch mean losses into a new figure.

    Parameters
    ----------
    train_losses, test_losses : sequence of sequences of float
        One inner sequence of per-batch losses per epoch. All inner sequences
        of one argument need the same length, because the data is converted
        to a 2D array and averaged along axis 1.
    """
    plt.figure()

    series = {"Train": train_losses, "Test": test_losses}
    for color_idx, (label, losses) in enumerate(series.items()):
        color = f'C{color_idx}'
        per_batch = np.array(losses)

        # spread all batch losses evenly over the range [0, number of epochs]
        batch_x = np.linspace(0, len(per_batch), per_batch.size)
        plt.plot(batch_x, per_batch.ravel(), label=f'Loss {label}', color=color, alpha=0.5)

        # epoch means are drawn at the center of each epoch (0.5, 1.5, ...)
        per_epoch = per_batch.mean(axis=1)
        epoch_x = np.arange(0.5, len(per_epoch))
        plt.plot(epoch_x, per_epoch, label=f'Mean Epoch Loss {label}', color=color, zorder=3)

    plt.xlabel('Epoch')
    plt.legend()

In [None]:
# loss curves of the MNIST training above
plot_losses(train_losses=train_losses, test_losses=test_losses)

# CIFAR-10

In [None]:
# Training and test split of CIFAR-10, stored in "data"; images are only
# converted to tensors, without resizing.
cifar10_train, cifar10_test = (
    datasets.CIFAR10(
        root="data",
        train=is_train,
        transform=ToTensor(),
        download=True,
    )
    for is_train in (True, False)
)

In [None]:
batch_size = 64

# one loader per split, both with the same batch size
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=batch_size)
    for split in (cifar10_train, cifar10_test)
)

# X, y hold the first test batch; later cells use them for plotting and
# for deriving the network input size
X, y = next(iter(test_dataloader))

print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)

In [None]:
fig, axs = plt.subplots(4, 4, figsize=(9, 9), constrained_layout=True)

for idx, ax in enumerate(axs.flat):
    # imshow expects the color channel as the last axis, while the batch stores
    # it first. Move axis 0 to the end with an explicit permute instead of
    # `np.swapaxes(...).T`: `.T` on a tensor with more than two dimensions is
    # deprecated in PyTorch.
    img = X[idx].permute(1, 2, 0).numpy()

    ax.set_title(cifar10_train.classes[y[idx]])
    ax.imshow(img)
    ax.set_axis_off()

In [None]:
# fully connected baseline: every value of a flattened image is one input
n_inputs = X[0].numel()
model = FullyConnected(input_size=n_inputs, n_classes=len(cifar10_train.classes))

In [None]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# one entry per epoch, each entry is the list of per-batch losses
test_losses, train_losses = [], []

# accuracy of the untrained network as a baseline
print(f'Test Accuracy: {accuracy(test_dataloader, model):.1%}')

epochs = 15
for t in tqdm(range(epochs)):
    epoch_loss_train, epoch_loss_test = fit_one_epoch(
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
    )
    train_losses.append(epoch_loss_train)
    test_losses.append(epoch_loss_test)

    test_accuracy = accuracy(test_dataloader, model)
    print(f'Test Accuracy: {test_accuracy:.1%}')

print("Done!")

We do not get much better than 40 % with a fully connected network.

Let's try a deep learning network with convolutional layers. The architecture follows the one proposed here:
https://arxiv.org/abs/1409.1556

> Very Deep Convolutional Networks for Large-Scale Image Recognition  
> Karen Simonyan, Andrew Zisserman

> In this work we investigate the effect of the convolutional network depth on its accuracy in the large-scale image recognition setting. Our main contribution is a thorough evaluation of networks of increasing depth using an architecture with very small (3x3) convolution filters, which shows that a significant improvement on the prior-art configurations can be achieved by pushing the depth to 16-19 weight layers. These findings were the basis of our ImageNet Challenge 2014 submission, where our team secured the first and the second places in the localisation and classification tracks respectively. We also show that our representations generalise well to other datasets, where they achieve state-of-the-art results. We have made our two best-performing ConvNet models publicly available to facilitate further research on the use of deep visual representations in computer vision. 

In [None]:
class ConvolutionalNetwork(nn.Module):
    """Convolutional classifier for 3-channel images with 10 output classes.

    Three convolutional stages (two 3x3 convolutions, batch norm, ReLU,
    2x2 max pooling, dropout) with 32, 64 and 128 channels are followed by a
    small fully connected head. Each pooling step halves height and width, and
    the head expects ``128 * 4 * 4`` flattened features, which corresponds to
    32x32 input images.

    The network returns raw, unnormalized class scores (logits). There is
    deliberately no trailing ``Softmax``: ``nn.CrossEntropyLoss`` applies
    ``log_softmax`` to its input itself, so normalizing here as well would
    squash the scores twice and weaken the gradients. Use
    ``torch.softmax(output, dim=1)`` when probabilities are needed.
    """

    def __init__(self):
        super().__init__()

        # NOTE(review): there is no activation between the two convolutions of
        # a stage, whereas VGG applies a ReLU after every convolution — confirm
        # this is intended.
        self.conv_stack = nn.Sequential(
            # 1st stack of conv layers
            nn.Conv2d(3, 32, kernel_size=(3, 3), padding='same'),
            nn.Conv2d(32, 32, kernel_size=(3, 3), padding='same'),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(0.25),
            # 2nd stack
            nn.Conv2d(32, 64, kernel_size=(3, 3), padding='same'),
            nn.Conv2d(64, 64, kernel_size=(3, 3), padding='same'),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(0.25),
            # 3rd stack
            nn.Conv2d(64, 128, kernel_size=(3, 3), padding='same'),
            nn.Conv2d(128, 128, kernel_size=(3, 3), padding='same'),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(0.25),
        )

        self.flatten = nn.Flatten()

        self.linear_relu_stack = nn.Sequential(
            nn.Linear(128 * 4 * 4, 128),
            nn.Dropout(0.25),
            nn.ReLU(),
            # final layer emits logits; the loss function handles normalization
            nn.Linear(128, 10),
        )

    def forward(self, x):
        """Return the logits of shape ``(batch, 10)`` for the image batch ``x``."""
        x = self.conv_stack(x)
        x = self.flatten(x)
        x = self.linear_relu_stack(x)
        return x


In [None]:
# fresh, untrained convolutional network; replaces the fully connected `model`
model = ConvolutionalNetwork()

In [None]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Kept in their own cell: re-running the training cell below appends to these
# lists instead of resetting them.
test_losses, train_losses = [], []

In [None]:
epochs = 25

# accuracy before this round of training
print(f'Test Accuracy: {accuracy(test_dataloader, model):.1%}')

for t in tqdm(range(epochs)):
    epoch_loss_train, epoch_loss_test = fit_one_epoch(
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
    )
    train_losses.append(epoch_loss_train)
    test_losses.append(epoch_loss_test)

    test_accuracy = accuracy(test_dataloader, model)
    print(f'Epoch {t:02d}: test accuracy = {test_accuracy:.1%}')

print("Done!")

In [None]:
# loss curves of the convolutional network training above
plot_losses(train_losses, test_losses)

## FACT

In [None]:
# Download the FACT data file read by the dataset class below into the
# working directory (-L follows redirects, -O keeps the remote file name).
!curl -LO https://factdata.app.tu-dortmund.de/smd/smd_deeplearning_gammas.hdf5

### Dataset

We need to implement our own data reader for the FACT data

In [None]:
from torch.utils.data import Dataset
import tables

class FACTData(Dataset):
    """In-memory dataset of event images with log10(energy) as target.

    Reads ``events.photons`` and ``events.corsika_event_header_total_energy``
    from ``smd_deeplearning_gammas.hdf5`` in the working directory, keeps the
    events whose brightest pixel exceeds ``min_charge`` and randomly assigns
    about 75 % of them to the training part and the rest to the test part.

    Parameters
    ----------
    min_charge : number
        An event is kept only if its maximum pixel value is larger than this.
    train : bool
        ``True`` selects the training part, ``False`` the test part.

    Attributes
    ----------
    X : numpy.ndarray of float32
        Images with an inserted channel axis: ``(n_events, 1, rows, columns)``.
    y : numpy.ndarray of float32
        log10 of the total energy, shape ``(n_events, 1)``.
    """

    def __init__(self, min_charge=20, train=True):
        with tables.open_file('smd_deeplearning_gammas.hdf5') as f:
            events = f.root.events
            images = events.photons[:].astype(np.float32)
            energy = events.corsika_event_header_total_energy[:].astype(np.float32)
        log_energy = np.log10(energy)

        # NaN pixels become 0; the array is modified in place
        np.nan_to_num(images, copy=False)

        # keep only bright events, judged by the brightest pixel of each image
        bright = images.max(axis=(1, 2)) > min_charge

        # add the channel axis torch expects and make the target a column
        images = images[bright][:, np.newaxis, :, :]
        log_energy = log_energy[bright][:, np.newaxis]

        # The fixed seed makes every instance draw the same mask, so the
        # train=True and train=False instances get complementary events.
        rng = np.random.default_rng(0)
        in_train = rng.binomial(1, 0.75, len(images)).astype(bool)
        selected = in_train if train else ~in_train

        self.X = images[selected]
        self.y = log_energy[selected]

    def __len__(self):
        """Number of events in this part of the data."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(image, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

In [None]:
# the two parts are complementary selections of the same file
fact_data_train, fact_data_test = (FACTData(train=is_train) for is_train in (True, False))

n_train, n_test = len(fact_data_train), len(fact_data_test)
print(n_train, n_test)

In [None]:
from ipywidgets import interact


# Create the figure once; the callback below only swaps the image data.
fig = plt.figure(constrained_layout=True)
ax = fig.add_subplot(1, 1, 1)
img = ax.imshow(fact_data_test[0][0][0], cmap='inferno')
cbar = fig.colorbar(img, ax=ax)


def plot(event=0):
    """Show test event number ``event`` in the existing figure.

    Replaces the image data, rescales the color limits to the new image,
    refreshes the colorbar and writes the event energy into the title.
    """
    image, log_energy = fact_data_test[event]
    img.set_array(image[0])
    img.autoscale()
    cbar.update_normal(img)
    # the dataset stores log10(energy), so undo the logarithm for display
    ax.set_title(f'E = {10**log_energy[0]:.1f} GeV')


# A (min, max) tuple creates an integer slider whose upper bound is inclusive,
# so the largest valid event index is len - 1; using len would raise an
# IndexError at the last slider position.
interact(plot, event=(0, len(fact_data_test) - 1));

In [None]:
# number of events per batch for the FACT data loaders
batch_size = 256

In [None]:
# Create data loaders.
train_dataloader = DataLoader(fact_data_train, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(fact_data_test, batch_size=batch_size, shuffle=True)

X, y = next(iter(train_dataloader))
X.shape, y.shape

In [None]:
def layer(out_channels, dropout):
    """Build the modules of one convolutional stage.

    The stage consists of two 3x3 'same'-padded lazy convolutions (their input
    channel count is inferred on first use), batch normalization, ReLU,
    2x2 max pooling with stride 2 and dropout.

    Parameters
    ----------
    out_channels : int
        Number of output channels of both convolutions.
    dropout : float
        Dropout probability at the end of the stage.

    Returns
    -------
    list of torch.nn.Module
        The six modules in execution order, ready to be unpacked into
        ``nn.Sequential``.
    """
    conv_options = dict(kernel_size=(3, 3), padding='same')
    stage = [nn.LazyConv2d(out_channels, **conv_options) for _ in range(2)]
    stage.extend([
        nn.BatchNorm2d(out_channels),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
        nn.Dropout(dropout),
    ])
    return stage


class CNN(nn.Module):
    """Convolutional regression network built from lazily shaped stages.

    Three stages created by ``layer`` with 32, 64 and 128 channels are followed
    by a fully connected head with two hidden layers of 256 neurons and a
    single output value. The input sizes of the first convolution of every
    stage and of the first linear layer are inferred on the first forward pass.
    """

    def __init__(self):
        super().__init__()

        stages = []
        for out_channels in (32, 64, 128):
            stages.extend(layer(out_channels, 0.25))
        self.conv_stack = nn.Sequential(*stages)

        self.flatten = nn.Flatten()

        n_neurons = 256
        head = [
            nn.LazyLinear(n_neurons),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(n_neurons, n_neurons),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(n_neurons, 1),
        ]
        self.linear_relu_stack = nn.Sequential(*head)

    def forward(self, x):
        """Return one predicted value per sample, shape ``(batch, 1)``."""
        features = self.flatten(self.conv_stack(x))
        return self.linear_relu_stack(features)

In [None]:
class ConvolutionalNetwork(nn.Module):
    """Convolutional regression network for single-channel images.

    Three stages (two 3x3 convolutions, batch norm, ReLU, 2x2 max pooling,
    dropout) with 32, 64 and 128 channels feed a fully connected head with one
    output value. The head expects ``128 * 6 * 6`` flattened features; with
    three halving pooling steps this corresponds to e.g. 48x48 input images.

    NOTE: this class reuses the name of the 3-channel, 10-class
    ``ConvolutionalNetwork`` defined earlier in the notebook and replaces it
    once this cell has run.
    """

    def __init__(self):
        super().__init__()

        conv_layers = []
        in_channels = 1
        for out_channels in (32, 64, 128):
            conv_layers += [
                nn.Conv2d(in_channels, out_channels, kernel_size=(3, 3), padding='same'),
                nn.Conv2d(out_channels, out_channels, kernel_size=(3, 3), padding='same'),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2),
                nn.Dropout(0.25),
            ]
            # the next stage consumes what this stage produced
            in_channels = out_channels
        self.conv_stack = nn.Sequential(*conv_layers)

        self.flatten = nn.Flatten()

        self.linear_relu_stack = nn.Sequential(
            nn.Linear(128 * 6 * 6, 128),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(128, 1),
        )

    def forward(self, x):
        """Return one predicted value per sample, shape ``(batch, 1)``."""
        features = self.flatten(self.conv_stack(x))
        return self.linear_relu_stack(features)

    
class SimpleConvolutionalNetwork(nn.Module):
    """Small convolutional regression network for single-channel images.

    Two stages of 5x5 convolutions with 10 channels, each ending in batch
    norm, ReLU and 2x2 max pooling, feed a fully connected head with one
    output value. The head expects ``10 * 12 * 12`` flattened features; with
    two halving pooling steps this corresponds to e.g. 48x48 input images.
    """

    def __init__(self):
        super().__init__()

        def conv5x5(in_channels):
            # 'same' padding keeps height and width unchanged
            return nn.Conv2d(in_channels, 10, kernel_size=(5, 5), padding='same')

        def norm_relu_pool():
            return [
                nn.BatchNorm2d(10),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2),
            ]

        self.conv_stack = nn.Sequential(
            # 1st stack of conv layers
            conv5x5(1),
            conv5x5(10),
            *norm_relu_pool(),
            # 2nd stack
            conv5x5(10),
            *norm_relu_pool(),
        )

        self.flatten = nn.Flatten()

        self.linear_relu_stack = nn.Sequential(
            nn.Linear(10 * 12 * 12, 128),
            nn.Dropout(0.25),
            nn.ReLU(),
            nn.Linear(128, 1),
        )

    def forward(self, x):
        """Return one predicted value per sample, shape ``(batch, 1)``."""
        features = self.flatten(self.conv_stack(x))
        return self.linear_relu_stack(features)


In [None]:
# in_features of the first linear layer of SimpleConvolutionalNetwork (10 * 12 * 12)
12 * 12 * 10

In [None]:
model = SimpleConvolutionalNetwork()

# Kept in the same cell as the model: re-running the training cell below
# continues training and appends to these lists.
train_losses, test_losses = [], []

In [None]:
# regression on log10(energy): mean squared error loss
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Each scheduler.step() multiplies the learning rate by 0.95.
# NOTE(review): no visible cell calls scheduler.step(), so the learning rate
# stays at 1e-3 — confirm whether the decay is meant to be applied.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

In [None]:
epochs = 1000

# Training is pinned to the CPU here, independent of DEVICE.
# NOTE(review): `scheduler` from the previous cell is not stepped in this loop;
# with gamma=0.95, stepping it every epoch for 1000 epochs would shrink the
# learning rate to practically zero — confirm the intended schedule.
for t in tqdm(range(epochs)):
    epoch_loss_train, epoch_loss_test = fit_one_epoch(
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device='cpu',
    )
    train_losses.append(epoch_loss_train)
    test_losses.append(epoch_loss_test)

    # mean over the batches of the epoch that was just appended
    mean_test_loss = np.mean(epoch_loss_test)
    mean_train_loss = np.mean(epoch_loss_train)
    print(f'Epoch {t:3d}: test loss = {mean_test_loss:.3f}, train loss = {mean_train_loss:.3f}')


print("Done!")

In [None]:
plot_losses(train_losses=train_losses, test_losses=test_losses)
# plot_losses leaves its figure current, so gca() is the axes it just drew on
plt.gca().set_yscale('log')

In [None]:
# Feed the batches to the device the model weights currently live on. The
# training cell runs with device='cpu', which moves the model to the CPU, so
# sending the inputs to DEVICE would fail with a device mismatch whenever
# DEVICE is "cuda".
model_device = next(model.parameters()).device
model.eval()

# NOTE(review): predictions are collected for the *training* loader; confirm
# whether test_dataloader was intended for this comparison.
predictions = []
labels = []
with torch.no_grad():
    for X, y in train_dataloader:
        labels.append(y)
        predictions.append(model(X.to(model_device)).cpu())

# join the batches and drop the trailing length-1 axis: one value per event
predictions = np.concatenate(predictions)[:, 0]
labels = np.concatenate(labels)[:, 0]

In [None]:
# sanity check: both arrays must have the same length, one entry per event
labels.shape, predictions.shape

In [None]:
# Separate names on purpose: the interactive event viewer still uses the
# global `fig` and `ax`.
hist_fig, hist_ax = plt.subplots()
hist_ax.hist2d(labels, predictions, bins=100, range=[[2.5, 4], [2.5, 4]])
# diagonal through (1, 1) and (1.1, 1.1): prediction equal to label
hist_ax.axline([1, 1], [1.1, 1.1], color='lightgray')
hist_ax.set_aspect(1)