In [19]:
%config ZMQInteractiveShell.ast_node_interactivity = "all"
%pprint

Pretty printing has been turned OFF


不像机器学习一样，`weight_decay`对深度学习的效果收效甚微，深度学习常常用dropout(丢弃法)来应对过拟合
- 但是在批量归一化(batch normalization)出现后，也可以用来替代dropout
- dropout 有点像集成学习，通过随机丢弃神经元来拟合出不同的神经网络模型，从而降低最后的方差。
- pytorch的是以概率p来丢弃神经元，而tensorflow是以概率p来保留神经元
- 通常dropout是对全连接层（fc层）使用的，很少或者不对卷积层使用，因为卷积的参数本来就少，再dropout往往欠拟合
- 在做dropout的时候，有p概率的神经元会被清零，有1-p概率的神经元会除以1-p来做拉伸（dropout保持输入输出的期望值不变）
    - 设随机变量$\xi$为0和1的概率为p和1-p，计算新的神经元$h^{'}_i$
    - $h^{'}_i = \frac{\xi_i}{1-p}h_i$
    - 因为$E(\xi_i) = 1- p$， 所以丢弃后的期望值和丢弃前的期望值是一样的
    - $E(h^{'}_i) = \frac{E(\xi_i)}{1-p} h_i = h_i$
- 测试模型的时候，为了拿到更加确定性的结果，一般不使用dropout

In [3]:
import torch
import torch.nn as nn

In [49]:
def dropout(x, drop_prob):
    # drop_prob是丢弃的概率
    x = x.float()
    # 如果丢弃概率大于1时，判错
    assert 0 <= drop_prob <= 1
    if drop_prob == 1:
        return torch.zeros_like(x)
    mask = (torch.rand(x.shape) > drop_prob).float()
    # 以1-p扩大剩下的神经元，保持dropout前后的期望值一致
    return mask / (1 - drop_prob)

In [50]:
x = torch.arange(16).reshape(2, 8)
dropout(x, 1)
dropout(x, 0.5)
dropout(x, 0)

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

tensor([[0., 0., 2., 0., 0., 0., 0., 0.],
        [0., 2., 0., 2., 0., 0., 0., 2.]])

tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [59]:
# 使用伯努利分布来实现
def dropout2(x, drop_prob):
    # 伯努利分布中的drop_prob是保留概率,比如0.3,就是有30%的概率的神经元为1
    assert 0 <= drop_prob <= 1
    if drop_prob == 0:
        return torch.zeros_like(x)
    m = torch.distributions.Bernoulli(torch.tensor([float(drop_prob)]))
    mask = m.sample(x.shape).view(x.shape)
    return mask / drop_prob

In [60]:
x = torch.arange(16).reshape(2, 8)
dropout2(x, 1)
dropout2(x, 0.5)
dropout2(x, 0)

tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

tensor([[2., 2., 0., 2., 2., 2., 2., 0.],
        [2., 0., 0., 2., 0., 2., 2., 2.]])

tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]])

In [21]:
m = torch.distributions.Bernoulli(torch.tensor([0.9]))
m.sample((10, 10)).shape

torch.Size([10, 10, 1])

In [8]:
help(torch.distributions.Bernoulli)

Help on class Bernoulli in module torch.distributions.bernoulli:

class Bernoulli(torch.distributions.exp_family.ExponentialFamily)
 |  Bernoulli(probs=None, logits=None, validate_args=None)
 |  
 |  Creates a Bernoulli distribution parameterized by :attr:`probs`
 |  or :attr:`logits` (but not both).
 |  
 |  Samples are binary (0 or 1). They take the value `1` with probability `p`
 |  and `0` with probability `1 - p`.
 |  
 |  Example::
 |  
 |      >>> m = Bernoulli(torch.tensor([0.3]))
 |      >>> m.sample()  # 30% chance 1; 70% chance 0
 |      tensor([ 0.])
 |  
 |  Args:
 |      probs (Number, Tensor): the probability of sampling `1`
 |      logits (Number, Tensor): the log-odds of sampling `1`
 |  
 |  Method resolution order:
 |      Bernoulli
 |      torch.distributions.exp_family.ExponentialFamily
 |      torch.distributions.distribution.Distribution
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, probs=None, logits=None, validate_args=None)
 | 