# `torch.nn` & `torch.nn.functional`


## Dependencies

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

## Convolution Layers

### Convnd

In [2]:
# Shape cheat-sheet for nn.Conv1d / nn.Conv2d / nn.Conv3d.
# Nothing is constructed or executed in this cell: every entry is a bare
# string literal that only documents a constructor and its shape mapping.
#
# How to read the strings:
#   * `param=Sym=default` appears to bind the symbol used in the formulas
#     (K, S, P, D) together with the parameter's default value.
#   * `): (...)` is the input shape and `-> (...)` the output shape.
#   * `//` is floor division.
#   * `N?` presumably marks an optional batch dimension -- TODO confirm.
#   * The formulas use one scalar K, S, P, D for every spatial axis.

# 1-D convolution: a single length axis L.
"""
nn.Conv1d(
    in_channels=Cin,
    out_channels=Cout,
    kernel_size=K,
    stride=S=1,
    padding=P=0,
    dilation=D=1
):  (N?, Cin, Lin)
->  (N?, Cout, Lout),
    Lout = (Lin + 2 * P - D * (K - 1) - 1) // S + 1
"""

# 2-D convolution: the 1-D formula applied to the H and W axes.
"""
nn.Conv2d(
    in_channels=Cin,
    out_channels=Cout,
    kernel_size=K,
    stride=S=1,
    padding=P=0,
    dilation=D=1
):  (N?, Cin, Hin, Win)
->  (N?, Cout, Hout, Wout),
    Hout = (Hin + 2 * P - D * (K - 1) - 1) // S + 1
    Wout = (Win + 2 * P - D * (K - 1) - 1) // S + 1
"""

# 3-D convolution: the same formula applied to the D, H and W axes.
# Note that `D` in the formula is the dilation, while `Din`/`Dout` are depths.
# As the last expression of the cell, this string is what the notebook echoes
# as the cell output.
"""
nn.Conv3d(
    in_channels=Cin,
    out_channels=Cout,
    kernel_size=K,
    stride=S=1,
    padding=P=0,
    dilation=D=1
):  (N?, Cin, Din, Hin, Win)
->  (N?, Cout, Dout, Hout, Wout),
    Dout = (Din + 2 * P - D * (K - 1) - 1) // S + 1
    Hout = (Hin + 2 * P - D * (K - 1) - 1) // S + 1
    Wout = (Win + 2 * P - D * (K - 1) - 1) // S + 1
"""

'\nnn.Conv3d(\n    in_channels=Cin,\n    out_channels=Cout,\n    kernel_size=K,\n    stride=S=1,\n    padding=P=0,\n    dilation=D=1\n):  (N?, Cin, Din, Hin, Win)\n->  (N?, Cout, Dout, Hout, Wout),\n    Dout = (Din + 2 * P - D * (K - 1) - 1) // S + 1\n    Hout = (Hin + 2 * P - D * (K - 1) - 1) // S + 1\n    Wout = (Win + 2 * P - D * (K - 1) - 1) // S + 1\n'

### LazyConvnd

In [3]:
# Shape cheat-sheet for nn.LazyConv1d / nn.LazyConv2d / nn.LazyConv3d.
# Nothing is constructed or executed in this cell: every entry is a bare
# string literal that only documents a constructor and its shape mapping.
#
# The listed constructors take no `in_channels` argument; `Cin` only shows up
# in the input shape.
#
# How to read the strings:
#   * `param=Sym=default` appears to bind the symbol used in the formulas
#     (K, S, P, D) together with the parameter's default value.
#   * `): (...)` is the input shape and `-> (...)` the output shape.
#   * `//` is floor division.
#   * `N?` presumably marks an optional batch dimension -- TODO confirm.

# 1-D lazy convolution: a single length axis L.
"""
nn.LazyConv1d(
    out_channels=Cout,
    kernel_size=K,
    stride=S=1,
    padding=P=0,
    dilation=D=1
):  (N?, Cin, Lin)
->  (N?, Cout, Lout),
    Lout = (Lin + 2 * P - D * (K - 1) - 1) // S + 1
"""

# 2-D lazy convolution: the 1-D formula applied to the H and W axes.
"""
nn.LazyConv2d(
    out_channels=Cout,
    kernel_size=K,
    stride=S=1,
    padding=P=0,
    dilation=D=1
):  (N?, Cin, Hin, Win)
->  (N?, Cout, Hout, Wout),
    Hout = (Hin + 2 * P - D * (K - 1) - 1) // S + 1
    Wout = (Win + 2 * P - D * (K - 1) - 1) // S + 1
"""

# 3-D lazy convolution: the same formula applied to the D, H and W axes.
# Note that `D` in the formula is the dilation, while `Din`/`Dout` are depths.
# As the last expression of the cell, this string is what the notebook echoes
# as the cell output.
"""
nn.LazyConv3d(
    out_channels=Cout,
    kernel_size=K,
    stride=S=1,
    padding=P=0,
    dilation=D=1
):  (N?, Cin, Din, Hin, Win)
->  (N?, Cout, Dout, Hout, Wout),
    Dout = (Din + 2 * P - D * (K - 1) - 1) // S + 1
    Hout = (Hin + 2 * P - D * (K - 1) - 1) // S + 1
    Wout = (Win + 2 * P - D * (K - 1) - 1) // S + 1
"""

'\nnn.LazyConv3d(\n    out_channels=Cout,\n    kernel_size=K,\n    stride=S=1,\n    padding=P=0,\n    dilation=D=1\n):  (N?, Cin, Din, Hin, Win)\n->  (N?, Cout, Dout, Hout, Wout),\n    Dout = (Din + 2 * P - D * (K - 1) - 1) // S + 1\n    Hout = (Hin + 2 * P - D * (K - 1) - 1) // S + 1\n    Wout = (Win + 2 * P - D * (K - 1) - 1) // S + 1\n'

### ConvTransposend

### LazyConvTransposend

## Recurrent Layers


### RNN

In [4]:
"""
nn.RNN(
    input_size=Hin,
    hidden_size=Hout,
    num_layers=C=1,
    batch_first={(L, N?): False, (N?, L): True}[B],
    bidirectional={1: False, 2: True}[D]
):  (*B, Hin),    (D*C, N?, Hout)
->  (*B, D*Hout), (D*C, N?, Hout)
"""

# https://docs.pytorch.org/docs/stable/generated/torch.nn.RNN.html
# https://karpathy.github.io/2015/05/21/rnn-effectiveness/
"""Define Layer"""

"""1. Position Arguments"""
input_size, hidden_size, num_layers, nonlinearity, bias = 2, 3, 4, "tanh", True

"""2. Keyword Arguments"""
batch_first, bidirectional = False, False

rnn = nn.RNN(
    input_size,  # Hin, Required
    hidden_size,  # Hout, Required
    num_layers,  # C, default=1
    nonlinearity,  # "tanh" or "relu", default="tanh"
    bias,  # default=True
    batch_first=batch_first,  # default=False
    dropout=0.0,
    bidirectional=bidirectional,  # default=False
    device=None,
    dtype=None,
)

D = 2 if bidirectional else 1

"""Forward Pass"""

"""1. Inputs"""
batch_size, seq_len = 5, 6

if batch_first:
    x = torch.randn(batch_size, seq_len, input_size)  # (N, L, Hin)
else:
    x = torch.randn(seq_len, batch_size, input_size)  # (L, N, Hin)

h0 = torch.randn(D * num_layers, batch_size, hidden_size)  # (D * C, N, Hout)


"""2. Outputs"""
y, h = rnn(x, None if h0 is None else h0)

if batch_first:
    assert y.shape == (batch_size, seq_len, D * hidden_size)  # (N, L, D * Hout)
else:
    assert y.shape == (seq_len, batch_size, D * hidden_size)  # (L, N, D * Hout)

assert h.shape == (D * num_layers, batch_size, hidden_size)  # (D * C, N, Hout)

In [5]:
"""
nn.RNNCell(
    input_size=Hin,
    hidden_size=Hout,
):  (N?, Hin),  (N?, Hout)
->  (N?, Hout), (N?, Hout)
"""
# Reference:
#   https://docs.pytorch.org/docs/stable/generated/torch.nn.RNNCell.html
"""Define Layer"""

"""1. Position Arguments"""
input_size = 2  # Hin
hidden_size = 3  # Hout
bias = True
nonlinearity = "tanh"


# All four leading arguments are deliberately passed by position.
rnn_cell = nn.RNNCell(
    input_size,  # required
    hidden_size,  # required
    bias,  # defaults to True
    nonlinearity,  # either "tanh" (default) or "relu"
    device=None,
    dtype=None,
)

# Functional counterpart of the configured nonlinearity, used for the check.
activations = {"tanh": F.tanh, "relu": F.relu}
f = activations[nonlinearity]

"""Forward Pass"""

"""1. Inputs"""
batch_size = 4  # N

x = torch.randn(batch_size, input_size)  # (N, Hin)
h0 = torch.randn(batch_size, hidden_size)  # (N, Hout)


"""2. Outputs"""
h = rnn_cell(x, h0)

assert h.shape == (batch_size, hidden_size)  # (N, Hout)

# Recompute the step by hand: f(x @ W_ih^T + b_ih + h0 @ W_hh^T + b_hh).
input_term = F.linear(x, rnn_cell.weight_ih, rnn_cell.bias_ih)
hidden_term = F.linear(h0, rnn_cell.weight_hh, rnn_cell.bias_hh)
assert torch.allclose(h, f(input_term + hidden_term), atol=1e-6)

### LSTM

In [6]:
"""
nn.LSTM(
    input_size=Hin,
    hidden_size=Hout,
    num_layers=C=1,
    batch_first={(L, N?): False, (N?, L): True}[B],
    bidirectional={1: False, 2: True}[D],
    proj_size=P=P if P > 0 else Hout
):  (*B, Hin), ((D*C, N?, P), (D*C, N?, Hout))
->  (*B, D*P), ((D*C, N?, P), (D*C, N?, Hout))
"""
# References:
#   https://docs.pytorch.org/docs/stable/generated/torch.nn.LSTM.html
#   https://colah.github.io/posts/2015-08-Understanding-LSTMs/
"""Define Layer"""

"""1. Position Arguments"""
input_size = 2  # Hin
hidden_size = 3  # Hout
num_layers = 1  # C
bias = True

"""2. Keyword Arguments"""
batch_first = False
bidirectional = True
proj_size = 1

assert proj_size < hidden_size, "proj_size has to be smaller than hidden_size"

lstm = nn.LSTM(
    input_size,
    hidden_size,
    num_layers,
    bias=bias,
    batch_first=batch_first,
    dropout=0.0,
    bidirectional=bidirectional,
    proj_size=proj_size,  # defaults to 0
    device=None,
    dtype=None,
)

# Number of directions, as in the `bidirectional` mapping of the header string.
D = {False: 1, True: 2}[bidirectional]
# Width of the hidden state: hidden_size unless a positive proj_size is set.
P = hidden_size if proj_size <= 0 else proj_size

"""Forward Pass"""

"""1. Inputs"""
batch_size = 4  # N
seq_len = 5  # L

# Leading axes of the sequence tensors: (N, L) if batch_first, else (L, N).
lead_shape = (batch_size, seq_len) if batch_first else (seq_len, batch_size)

x = torch.randn(*lead_shape, input_size)  # (*B, Hin)
h = torch.randn(D * num_layers, batch_size, P)  # (D * C, N, P)
c = torch.randn(D * num_layers, batch_size, hidden_size)  # (D * C, N, Hout)


"""2. Outputs"""
# Pass no initial state at all unless both halves of it are available.
initial_state = (h, c) if h is not None and c is not None else None
y, (h, c) = lstm(x, initial_state)

assert y.shape == (*lead_shape, D * P)  # (*B, D * P)
assert h.shape == (D * num_layers, batch_size, P)  # (D * C, N, P)
assert c.shape == (D * num_layers, batch_size, hidden_size)  # (D * C, N, Hout)

In [7]:
"""
nn.LSTMCell(
    input_size=Hin,
    hidden_size=Hout,
):  (N?, Hin), ((N?, Hout), (N?, Hout))
->  ((N?, Hout), (N?, Hout))
"""
# References:
#   https://docs.pytorch.org/docs/stable/generated/torch.nn.LSTMCell.html
#   https://yb.tencent.com/s/4OGnkvsVzqDH
# Unlike nn.LSTM, the cell is constructed here without any projection size,
# so both state tensors are Hout wide and the cell returns the pair (h, c).
"""Define Layer"""

"""1. Position Arguments"""
input_size, hidden_size, bias = 2, 3, True

lstm_cell = nn.LSTMCell(
    input_size,  # Hin
    hidden_size,  # Hout
    bias,
    device=None,
    dtype=None,
)

"""Forward Pass"""

"""1. Inputs"""
batch_size = 4

x = torch.randn(batch_size, input_size)  # (N, Hin)

h0 = torch.randn(batch_size, hidden_size)  # (N, Hout)
c0 = torch.randn(batch_size, hidden_size)  # (N, Hout)


"""2. Outputs"""
h, c = lstm_cell(x, None if h0 is None or c0 is None else (h0, c0))

assert h.shape == (batch_size, hidden_size)  # (N, Hout)
assert c.shape == (batch_size, hidden_size)  # (N, Hout)

# Recompute the step by hand from the cell's own parameters.
gates = F.linear(x, lstm_cell.weight_ih, lstm_cell.bias_ih) + F.linear(
    h0, lstm_cell.weight_hh, lstm_cell.bias_hh
)
i, f, g, o = gates.chunk(4, dim=1)  # Split into input, forget, gate and output
i = F.sigmoid(i)
f = F.sigmoid(f)
g = F.tanh(g)
o = F.sigmoid(o)

c1 = f * c0 + i * g  # Update cell state
# Use the hand-computed c1 (not the module output c) so that the reference
# value of h is derived independently of the layer under test.
h1 = o * F.tanh(c1)  # Update hidden state

assert torch.allclose(h, h1, atol=1e-6)
assert torch.allclose(c, c1, atol=1e-6)

### GRU

In [8]:
"""
nn.GRU(
    input_size=Hin,
    hidden_size=Hout,
    num_layers=C=1,
    batch_first={(L, N?): False, (N?, L): True}[B],
    bidirectional={1: False, 2: True}[D]
):  (*B, Hin),    (D*C, N?, Hout)
->  (*B, D*Hout), (D*C, N?, Hout)
"""
# Reference:
#   https://docs.pytorch.org/docs/stable/generated/torch.nn.GRU.html
"""Define Layer"""

"""1. Position Arguments"""
input_size = 2  # Hin
hidden_size = 3  # Hout
num_layers = 1  # C
bias = True

"""2. Keyword Arguments"""
batch_first = False
bidirectional = True

# The first four arguments are deliberately passed by position.
gru = nn.GRU(
    input_size,
    hidden_size,
    num_layers,
    bias,
    batch_first=batch_first,
    dropout=0.0,
    bidirectional=bidirectional,
    device=None,
    dtype=None,
)

# Number of directions, as in the `bidirectional` mapping of the header string.
D = {False: 1, True: 2}[bidirectional]

"""Forward Pass"""

"""1. Inputs"""
batch_size = 4  # N
seq_len = 5  # L

# Leading axes of the sequence tensors: (N, L) if batch_first, else (L, N).
lead_shape = (batch_size, seq_len) if batch_first else (seq_len, batch_size)

x = torch.randn(*lead_shape, input_size)  # (*B, Hin)
h = torch.randn(D * num_layers, batch_size, hidden_size)  # (D * C, N, Hout)


"""2. Outputs"""
# `h` is rebound: initial hidden state going in, final hidden state coming out.
y, h = gru(x, h)

assert y.shape == (*lead_shape, D * hidden_size)  # (*B, D * Hout)
assert h.shape == (D * num_layers, batch_size, hidden_size)  # (D * C, N, Hout)

In [9]:
"""
nn.GRUCell(
    input_size=Hin,
    hidden_size=Hout,
):  (N?, Hin),  (N?, Hout)
->  (N?, Hout), (N?, Hout)
"""
# Reference:
#   https://docs.pytorch.org/docs/stable/generated/torch.nn.GRUCell.html
"""Define Layer"""

"""1. Position Arguments"""
input_size = 2  # Hin
hidden_size = 3  # Hout
bias = True

gru_cell = nn.GRUCell(
    input_size,
    hidden_size,
    bias=bias,
    device=None,
    dtype=None,
)

"""Forward Pass"""

"""1. Inputs"""
batch_size = 4  # N

x0 = torch.randn(batch_size, input_size)  # (N, Hin)
h0 = torch.randn(batch_size, hidden_size)  # (N, Hout)


"""2. Outputs"""
h = gru_cell(x0, h0)

assert h.shape == (batch_size, hidden_size)  # (N, Hout)

# Recompute the step by hand. Each projection is split into three equal
# chunks along dim 1, consumed below as (reset, update, new).
x_gates = F.linear(x0, gru_cell.weight_ih, gru_cell.bias_ih).chunk(3, dim=1)
h_gates = F.linear(h0, gru_cell.weight_hh, gru_cell.bias_hh).chunk(3, dim=1)
x_reset, x_update, x_new = x_gates
h_reset, h_update, h_new = h_gates

r = F.sigmoid(x_reset + h_reset)  # Reset gate
z = F.sigmoid(x_update + h_update)  # Update gate
n = F.tanh(x_new + r * h_new)  # New gate: reset scales only the hidden part
h1 = (1 - z) * n + z * h0  # Blend candidate and previous hidden state

assert torch.allclose(h, h1, atol=1e-6)

## Linear Layers

### Identity

In [10]:
"""
nn.Identity(): (*) -> (*)
"""
# Reference:
#   https://docs.pytorch.org/docs/stable/generated/torch.nn.Identity.html
"""Define Layer"""
identity = nn.Identity()

"""Forward Pass"""

x = torch.randn(2, 3)  # any shape works for this demo; (2, 3) is arbitrary
y = identity(x)

# The output must match the input in both shape and values.
assert x.shape == y.shape
assert torch.equal(x, y)

### Linear

In [11]:
"""
nn.Linear(
    in_features=Hin,
    out_features=Hout,
):  (*, Hin)
->  (*, Hout)
"""
# Reference:
#   https://docs.pytorch.org/docs/stable/generated/torch.nn.Linear.html
"""Define Layer"""
"""1. Position Arguments"""
in_features, out_features, bias = 2, 3, True

linear = nn.Linear(
    in_features,  # Hin, Required
    out_features,  # Hout, Required
    bias,  # default=True
    device=None,
    dtype=None,
)

"""Forward Pass"""

# Several leading dimensions, to show that only the last axis is transformed.
batch_sizes = (2, 3, 4)

x = torch.randn(*batch_sizes, in_features)  # (*, Hin)
y = linear(x)

assert y.shape == (*batch_sizes, out_features)  # (*, Hout)

# Both checks use atol=1e-6, like the other manual-formula checks in this file.
# The hand-written `x @ W^T + b` takes a different floating-point path from
# the fused op, so the default atol=1e-8 can fail on near-zero outputs.
assert torch.allclose(y, F.linear(x, linear.weight, linear.bias), atol=1e-6)
assert torch.allclose(
    F.linear(x, linear.weight, linear.bias),
    x @ linear.weight.t() + (linear.bias if bias else 0),
    atol=1e-6,
)

### Bilinear

In [12]:
"""
nn.Bilinear(
    in1_features=Hin1,
    in2_features=Hin2,
    out_features=Hout,
):  (*, Hin1), (*, Hin2)
->  (*, Hout)
"""
# Reference:
#   https://docs.pytorch.org/docs/stable/generated/torch.nn.Bilinear.html
"""Define Layer"""
"""1. Position Arguments"""
in1_features = 2  # Hin1
in2_features = 3  # Hin2
out_features = 4  # Hout
bias = True

# All four leading arguments are deliberately passed by position.
bilinear = nn.Bilinear(
    in1_features,  # required
    in2_features,  # required
    out_features,  # required
    bias,
    device=None,
    dtype=None,
)

"""Forward Pass"""
# Several leading dimensions, shared by both inputs.
batch_sizes = (2, 3, 4)

x1 = torch.randn(*batch_sizes, in1_features)  # (*, Hin1)
x2 = torch.randn(*batch_sizes, in2_features)  # (*, Hin2)
y = bilinear(x1, x2)

assert y.shape == (*batch_sizes, out_features)  # (*, Hout)

# Check 1: the module agrees with its functional counterpart.
functional_out = F.bilinear(x1, x2, bilinear.weight, bilinear.bias)
assert torch.allclose(y, functional_out)

# Check 2: the module agrees with the explicit contraction
# y[..., k] = sum_ij x1[..., i] * W[k, i, j] * x2[..., j] (+ b[k]).
contraction = torch.einsum("...i,kij,...j->...k", x1, bilinear.weight, x2)
offset = bilinear.bias if bias else 0
assert torch.allclose(y, contraction + offset, atol=1e-6)

### LazyLinear

In [13]:
"""
nn.LazyLinear(
    out_features=Hout
):  (*, Hin)
->  (*, Hout)
"""
# Reference:
#   https://docs.pytorch.org/docs/stable/generated/torch.nn.LazyLinear.html

"""Define Layer"""
"""1. Position Arguments"""
out_features = 2  # Hout
bias = True

# No input width is given at construction time.
lazy_linear = nn.LazyLinear(
    out_features,
    bias,
    device=None,
    dtype=None,
)

"""Forward Pass"""

batch_size = (3, 4)  # leading dimensions (a tuple, despite the name)
in_features = 5  # Hin, first seen by the layer in the call below

x = torch.randn(*batch_size, in_features)  # (*, Hin)
y = lazy_linear(x)

assert y.shape == (*batch_size, out_features)  # (*, Hout)

# After the first call the layer exposes weight and bias like a regular Linear.
reference = F.linear(x, lazy_linear.weight, lazy_linear.bias)
assert torch.allclose(y, reference, atol=1e-6)