# Advanced torch.nn

Sometimes Sequential is not enough

In [1]:
import torch
import torch.nn as nn

In [2]:
# Logistic regression: an affine map from 10 features to one value,
# followed by a sigmoid that squashes it into (0, 1).
logistic_layers = [nn.Linear(10, 1), nn.Sigmoid()]
myLogisticRegression = nn.Sequential(*logistic_layers)

Sequential is basically a for loop over the modules

But sometimes we want to perform other operations and have control over the order of execution

In [3]:
import torch.nn.functional as F

class Model(nn.Module):
    """Residual linear block followed by a softmax.

    A custom ``forward`` is needed because the input is added back to the
    output of the linear layer, which a plain ``nn.Sequential`` cannot express.
    """

    def __init__(self):
        # The parent constructor must run first, otherwise nn.Module cannot
        # register the sub-modules assigned below.
        super().__init__()
        self.ln = nn.Linear(5, 5)

    def forward(self, x):
        """Return softmax(x + ln(x)) computed over dim 1."""
        projected = self.ln(x)
        # Any extra operation can go here; this one is a skip connection.
        residual = x + projected
        # dim 0 indexes the batch, dim 1 indexes the input features.
        return F.softmax(residual, dim=1)

In [4]:
# Random batch: 32 samples of 5 features each, matching the nn.Linear(5, 5) inside Model.
batch = torch.rand(32, 5)
my_model = Model()
# Calling the module (not .forward directly) is what runs the forward pass.
my_model(batch)

tensor([[0.2618, 0.1978, 0.1257, 0.2389, 0.1758],
        [0.1616, 0.1834, 0.2019, 0.2840, 0.1691],
        [0.1175, 0.2497, 0.1860, 0.3224, 0.1244],
        [0.2282, 0.1938, 0.1163, 0.2827, 0.1790],
        [0.1818, 0.1809, 0.1543, 0.3685, 0.1146],
        [0.1777, 0.2839, 0.1170, 0.2292, 0.1922],
        [0.2214, 0.2467, 0.1339, 0.2728, 0.1253],
        [0.1611, 0.1796, 0.1727, 0.2707, 0.2160],
        [0.1769, 0.1617, 0.1354, 0.3470, 0.1789],
        [0.1230, 0.1565, 0.1267, 0.3385, 0.2553],
        [0.1623, 0.2321, 0.2125, 0.2832, 0.1098],
        [0.1266, 0.1913, 0.1647, 0.3520, 0.1653],
        [0.1319, 0.1559, 0.1844, 0.4048, 0.1229],
        [0.1183, 0.2131, 0.1768, 0.4055, 0.0864],
        [0.1958, 0.1828, 0.1387, 0.2577, 0.2250],
        [0.2229, 0.1732, 0.1915, 0.3410, 0.0714],
        [0.2356, 0.1670, 0.1682, 0.2518, 0.1774],
        [0.1836, 0.2537, 0.1075, 0.2534, 0.2018],
        [0.1873, 0.3138, 0.1585, 0.2523, 0.0881],
        [0.1270, 0.1392, 0.1795, 0.4282, 0.1262],


Functions such as Softmax can be used either through `torch.nn.functional` or as modules from `torch.nn`

In [5]:
import random

class ModuleModel(nn.Module):
    """Same idea as a functional softmax model, but with ``nn.Softmax`` held as a sub-module.

    ``forward`` flips a coin on every call to show that arbitrary Python
    control flow is allowed inside a module.
    """

    def __init__(self):
        super().__init__()
        self.ln = nn.Linear(5, 5)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the softmax over dim 1 of either ``x + ln(x)`` or ``2 * ln(x)``.

        The branch is chosen by one ``random.randint(0, 1)`` draw per call,
        so two calls on the same input may return different results.
        """
        projected = self.ln(x)
        # Arbitrary Python logic is fine here: a draw of 0 takes the residual branch.
        take_residual = random.randint(0, 1) == 0
        if take_residual:
            mixed = x + projected
        else:
            mixed = projected * 2
        return self.softmax(mixed)

In [6]:
# Fresh random batch (32 samples x 5 features) for the module-based model.
batch = torch.rand(32, 5)
my_module_model = ModuleModel()
# The output depends on the random branch taken inside forward().
my_module_model(batch)

tensor([[0.1708, 0.0426, 0.2399, 0.0541, 0.4927],
        [0.1737, 0.0598, 0.1755, 0.0685, 0.5226],
        [0.2524, 0.0643, 0.3401, 0.1343, 0.2090],
        [0.3348, 0.0664, 0.1403, 0.1950, 0.2635],
        [0.3251, 0.0637, 0.1101, 0.1901, 0.3109],
        [0.2724, 0.0610, 0.1158, 0.1387, 0.4122],
        [0.1968, 0.1066, 0.1936, 0.1086, 0.3945],
        [0.2672, 0.0763, 0.1744, 0.1163, 0.3658],
        [0.2339, 0.0610, 0.2806, 0.1026, 0.3220],
        [0.2262, 0.0678, 0.0810, 0.1377, 0.4873],
        [0.4135, 0.0328, 0.1640, 0.1812, 0.2086],
        [0.2891, 0.0511, 0.2637, 0.1158, 0.2804],
        [0.3054, 0.0494, 0.1679, 0.1380, 0.3393],
        [0.2943, 0.0869, 0.1336, 0.1912, 0.2940],
        [0.2649, 0.0759, 0.2906, 0.1502, 0.2183],
        [0.1958, 0.1027, 0.1811, 0.1056, 0.4147],
        [0.3074, 0.0701, 0.1679, 0.1588, 0.2958],
        [0.2796, 0.0564, 0.2682, 0.1449, 0.2510],
        [0.2515, 0.0699, 0.1303, 0.1188, 0.4295],
        [0.2914, 0.0430, 0.1344, 0.1170, 0.4143],


We can leverage the parallelism power of GPUs

In [7]:
# check if a GPU with Cuda capacities is available
# Only touch the model when a CUDA-capable GPU is actually present.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    # Move the model's parameters onto the GPU.
    my_module_model = my_module_model.to('cuda')
    print('Training on GPU!')

In [8]:
# Forward pass with the model wherever the previous cell left it; `batch` has not been moved.
my_module_model(batch)

tensor([[0.1708, 0.0426, 0.2399, 0.0541, 0.4927],
        [0.1737, 0.0598, 0.1755, 0.0685, 0.5226],
        [0.2524, 0.0643, 0.3401, 0.1343, 0.2090],
        [0.3348, 0.0664, 0.1403, 0.1950, 0.2635],
        [0.3251, 0.0637, 0.1101, 0.1901, 0.3109],
        [0.2724, 0.0610, 0.1158, 0.1387, 0.4122],
        [0.1968, 0.1066, 0.1936, 0.1086, 0.3945],
        [0.2672, 0.0763, 0.1744, 0.1163, 0.3658],
        [0.2339, 0.0610, 0.2806, 0.1026, 0.3220],
        [0.2262, 0.0678, 0.0810, 0.1377, 0.4873],
        [0.4135, 0.0328, 0.1640, 0.1812, 0.2086],
        [0.2891, 0.0511, 0.2637, 0.1158, 0.2804],
        [0.3054, 0.0494, 0.1679, 0.1380, 0.3393],
        [0.2943, 0.0869, 0.1336, 0.1912, 0.2940],
        [0.2649, 0.0759, 0.2906, 0.1502, 0.2183],
        [0.1958, 0.1027, 0.1811, 0.1056, 0.4147],
        [0.3074, 0.0701, 0.1679, 0.1588, 0.2958],
        [0.2796, 0.0564, 0.2682, 0.1449, 0.2510],
        [0.2515, 0.0699, 0.1303, 0.1188, 0.4295],
        [0.2914, 0.0430, 0.1344, 0.1170, 0.4143],


This will crash (**if you have a CUDA GPU**). Why?

We've put the model on the GPU but the batch is on the CPU

In [9]:
# Show where the model's first parameter lives, then where the batch lives (one per line).
print(next(my_module_model.parameters()).device, batch.device, sep="\n")

cpu
cpu


The solution is simply to move the batch to the GPU before passing the data to the model

**Clean way to do it:**

In [10]:
# Use the first CUDA GPU ("cuda:0") when one is available, otherwise stay on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# nn.Module.to moves the parameters in place and returns the module itself.
my_module_model = my_module_model.to(device)
# Tensor.to is NOT in place: it returns a tensor on `device`, so the result
# must be assigned back or `batch` stays where it was.
batch = batch.to(device)
batch

tensor([[4.1526e-01, 8.7074e-01, 7.6456e-01, 8.0506e-01, 1.8693e-01],
        [2.7996e-01, 5.5058e-01, 6.3251e-01, 4.7868e-01, 8.7997e-02],
        [2.0897e-01, 7.1913e-01, 3.4556e-02, 7.7303e-01, 9.7798e-01],
        [7.1657e-01, 6.4428e-01, 4.6273e-01, 1.1385e-01, 9.6753e-01],
        [4.3807e-01, 7.1147e-02, 3.8300e-01, 2.0679e-01, 6.7220e-01],
        [6.1677e-01, 7.4780e-01, 8.2767e-01, 1.1967e-01, 7.4780e-01],
        [1.8971e-01, 3.4026e-01, 1.4341e-01, 1.4436e-01, 1.7825e-01],
        [9.3474e-01, 5.3825e-01, 3.7738e-01, 2.2789e-02, 1.4459e-01],
        [3.9991e-01, 7.9361e-01, 3.6654e-01, 6.4011e-01, 6.2002e-01],
        [1.4891e-01, 4.8677e-01, 8.7762e-01, 7.5507e-02, 7.5744e-01],
        [6.8368e-01, 2.1135e-02, 2.8277e-01, 8.2078e-01, 8.2123e-01],
        [8.1512e-01, 8.5075e-01, 4.2459e-01, 5.6651e-01, 6.5530e-01],
        [6.5198e-01, 7.3676e-01, 6.8148e-01, 4.6406e-01, 8.2393e-01],
        [3.5115e-01, 1.7240e-01, 1.9470e-01, 8.2418e-02, 6.5621e-01],
        [4.0990e-01,

Performance trick:

In [11]:
for i in range(5):
    # non_blocking=True lets the host carry on without waiting for the copy when
    # an asynchronous transfer is possible (e.g. pinned CPU memory -> CUDA).
    # Tensor.to returns the transferred tensor, so keep it instead of discarding it.
    batch_on_device = batch.to(device, non_blocking=True)
    print(f'iteration {i}')

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4


## But Pierre, how do I init the layers? You said it's important....

## Glad you asked!

Writing a module makes it easier

In [12]:
def init_my_layer(m):
    """Initialise a layer in place and return it.

    The weight gets Xavier (Glorot) normal initialisation; the bias, when
    the layer has one, is set to zero.

    Args:
        m: a module exposing a ``weight`` tensor and a ``bias`` attribute
            (for example ``nn.Linear``). ``bias`` may be ``None``, as it is
            for layers built with ``bias=False``.

    Returns:
        The same module ``m``, so the call can wrap a layer constructor.
    """
    torch.nn.init.xavier_normal_(m.weight)
    # Layers created with bias=False store None here; there is nothing to fill.
    if m.bias is not None:
        torch.nn.init.constant_(m.bias, 0)
    return m

class MyInitLayer(nn.Module):
    """Linear layer (5 -> 5) initialised through ``init_my_layer``, followed by a softmax over dim 1."""

    def __init__(self):
        # Zero-argument super() always resolves to this class. The previous
        # super(ModuleModel, self) named an unrelated class, which raises
        # TypeError because MyInitLayer instances are not ModuleModel instances.
        super().__init__()
        # Initialise the layer at construction time, before it is ever used.
        self.ln = init_my_layer(nn.Linear(5, 5))
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Apply the linear layer, then the softmax over dim 1."""
        x = self.ln(x)
        return self.softmax(x)