In [1]:
# Dropout 正则化
import torch
from torch.distributions import Bernoulli
activations = torch.rand((5, 5))
m = Bernoulli(0.5)
m, activations

(Bernoulli(probs: 0.5),
 tensor([[0.6196, 0.8533, 0.7224, 0.7670, 0.1143],
         [0.4029, 0.4340, 0.5146, 0.9677, 0.3558],
         [0.6987, 0.0196, 0.3622, 0.7990, 0.3895],
         [0.3811, 0.5607, 0.5803, 0.4775, 0.4352],
         [0.0953, 0.7569, 0.9279, 0.6191, 0.2136]]))

In [2]:
mask = m.sample(activations.shape)
mask

tensor([[0., 1., 1., 1., 0.],
        [0., 1., 1., 0., 1.],
        [0., 1., 0., 0., 0.],
        [1., 0., 1., 0., 1.],
        [0., 1., 0., 1., 0.]])

In [3]:
activations *= mask
activations

tensor([[0.0000, 0.8533, 0.7224, 0.7670, 0.0000],
        [0.0000, 0.4340, 0.5146, 0.0000, 0.3558],
        [0.0000, 0.0196, 0.0000, 0.0000, 0.0000],
        [0.3811, 0.0000, 0.5803, 0.0000, 0.4352],
        [0.0000, 0.7569, 0.0000, 0.6191, 0.0000]])

In [4]:
from torch import nn
p, count, iters, shape = 0.5, 0, 50, (5, 5)
dropout = nn.Dropout(p)
dropout.train()

Dropout(p=0.5, inplace=False)

In [5]:
for _ in range(iters):
    activations = torch.rand(shape) + 1e-5
    output = dropout(activations)
    count += torch.sum(output == activations * (1 / (1 - p)))
print(f"Train 模式影响了{100 * (1 - float(count) / (activations.nelement() * iters)):.1f}%的神经元")

Train 模式影响了50.8%的神经元


In [6]:
output == activations * (1 / (1 - p))

tensor([[False, False, False,  True, False],
        [ True, False,  True, False, False],
        [False, False,  True,  True,  True],
        [False, False, False, False, False],
        [False, False, False,  True, False]])

In [7]:
(output == activations * (1 / (1 - p))).float()

tensor([[0., 0., 0., 1., 0.],
        [1., 0., 1., 0., 0.],
        [0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0.]])

In [8]:
count = 0
dropout.eval()
for _ in range(iters):
    activations = torch.rand(shape) + 1e-5
    output = dropout(activations)
    count += torch.sum(output == activations)  # 测试阶段不需要乘以 1 / (1 - p)
print(f"Eval 模式影响了{100 * (1 - float(count) / (activations.nelement() * iters)):.1f}%的神经元")

Eval 模式影响了0.0%的神经元


In [9]:
# 批标准化
import torch
from torch import nn

In [10]:
m = nn.BatchNorm1d(num_features = 5, affine = False)
print("BEFORE")
print("running_mean:", m.running_mean)
print("running_var:", m.running_var)

BEFORE
running_mean: tensor([0., 0., 0., 0., 0.])
running_var: tensor([1., 1., 1., 1., 1.])


In [11]:
for _ in range(100):
    input = torch.randn(20, 5)
    output = m(input)

In [12]:
input.size(), input[:5], output.size(), output[:5]

(torch.Size([20, 5]),
 tensor([[-0.7554,  1.2216,  0.8998,  0.3112, -0.4706],
         [-1.0602,  2.0212,  0.0225,  0.8421,  0.7273],
         [ 0.5163,  0.3491,  0.9872,  0.9916,  1.2226],
         [ 0.5108,  0.7029, -1.0604,  0.3484,  1.1812],
         [-0.6991, -1.0747,  0.5229,  0.5034,  0.1215]]),
 torch.Size([20, 5]),
 tensor([[-0.6919,  0.9283,  1.3450,  0.1044, -0.6471],
         [-1.0343,  1.9080,  0.3792,  0.5088,  0.7554],
         [ 0.7366, -0.1409,  1.4411,  0.6226,  1.3353],
         [ 0.7305,  0.2927, -0.8127,  0.1328,  1.2868],
         [-0.6287, -1.8855,  0.9300,  0.2508,  0.0461]]))

In [13]:
print('AFTER')
print("running_mean:", m.running_mean)
print("running_var:", m.running_var)

AFTER
running_mean: tensor([-0.0229,  0.0272, -0.0490,  0.0192,  0.0112])
running_var: tensor([1.0332, 0.9424, 0.9128, 1.0989, 0.9065])


In [14]:
m.eval()
for _ in range(100):
    input = torch.randn(20, 5)
    output = m(input)

In [15]:
print("EVAL")  # eval 模式批量归一化参数也不进行更新
print("running_mean:", m.running_mean)
print("running_var:", m.running_var)

EVAL
running_mean: tensor([-0.0229,  0.0272, -0.0490,  0.0192,  0.0112])
running_var: tensor([1.0332, 0.9424, 0.9128, 1.0989, 0.9065])


In [16]:
import torch
from torch import nn

In [17]:
print("No Affine, Gamma:", m.weight)
print("No Affine, beta:", m.bias)

No Affine, Gamma: None
No Affine, beta: None


In [19]:
m_affine = nn.BatchNorm1d(num_features = 5, affine = True)
print("with affine, gamma:", m_affine.weight)
print("with affine, beta:", m_affine.bias)

with affine, gamma: Parameter containing:
tensor([1., 1., 1., 1., 1.], requires_grad=True)
with affine, beta: Parameter containing:
tensor([0., 0., 0., 0., 0.], requires_grad=True)
