# PyTorch实现L1，L2正则化以及Dropout

## 1,了解知道Dropout原理 

###  在机器学习的模型中，如果模型的参数太多，而训练样本又太少，训练出来的模型很容易产生过拟合的现象. 具体表现在：模型在训练数据上损失函数较小，预测准确率较高；但是在测试数据上损失函数比较大，预测准确率较低

### 2012年，Hinton提出Dropout,用于防止过拟合. Dropout可以作为训练深度神经网络的一种trick. 在每个训练批次中，通过忽略一半的特征检测器（让一半的隐层节点值为0），可以明显地减少过拟合现象。 

## 2,  用代码实现正则化(L1、L2、Dropout）

### https://www.jianshu.com/p/e53a608d3d75 

In [20]:
import numpy as np
import matplotlib.pyplot as plt

In [21]:
sigmoid = lambda x: 1 / (1 + np.exp(-x))
layer_size = lambda x, y: (x.shape[0], 4, y.shape[0])

In [22]:
initialize_parameters = lambda n_x, n_h, n_y: {
    "w1": np.random.rand(n_h, n_x) * 0.01,
    "b1": np.zeros((n_h, 1)),
    "w2": np.random.rand(n_y, n_h) * 0.01,
    "b2": np.zeros((n_y, 1))
}

In [23]:
parameters = initialize_parameters(2,4,1)

In [24]:
parameters

{'b1': array([[ 0.],
        [ 0.],
        [ 0.],
        [ 0.]]), 'b2': array([[ 0.]]), 'w1': array([[ 0.00835272,  0.0019225 ],
        [ 0.00395097,  0.00300081],
        [ 0.00080104,  0.00904631],
        [ 0.00370154,  0.00530697]]), 'w2': array([[ 0.00494116,  0.00132161,  0.00206454,  0.00076189]])}

In [25]:
def forward_propagetion_with_dropout(x, paramters, keep_prod=0.7):
    # 取出参数
    w1 = parameters['w1']
    b1 = parameters['b1']
    w2 = parameters['w2']
    b2 = parameters['b2']
    # 向前传播
    z1 = np.matmul(w1, x) + b1
    a1 = np.tanh(z1)
    # 这里加入mask，使得其中一些神经元的活动变为0，在反向传播中，不再更新这些节点
    mask1 = (np.random.rand(a1.shape[0], 1) < keep_prod)
    
    a1 = a1 * mask1
    a1 = a1 / keep_prod
    z2 = np.matmul(w2, a1) + b2
    a2 = sigmoid(z2)
    a2 = sigmoid(z2)
    cache = {
        "z1": z1,
        "a1": a1,
        "mask1": mask1,
        "z2": z2,
        "a2": a2
    }
    return a2, cache


In [26]:
def compute_cost(a2, y, parameters, lambd=0):
    m = y.shape[1]
    logprobs = (np.log(a2) * y) + np.log(1 - a2) * (1 - y)
    cost = -np.sum(logprobs) / m
    l2_loss = np.sum(np.square(parameters['w1'])) + np.sum(np.square(parameters['w2']))
    cost = np.squeeze(cost)
    return cost + lambd * l2_loss / (2 * m)

In [27]:
def backward_propagetion_with_dropout(parameters, cache, x, y, lambd=0, keep_prob=0.7):
    m = x.shape[1]
    w1 = parameters['w1']
    w2 = parameters['w2']
    a1 = cache['a1']
    a2 = cache['a2']
    mask1 = cache['mask1']
    
    dz2 = a2 - y
    dw2 = np.dot(dz2, a1.T) / m + lambd / m * w2
    db2 = np.sum(dz2, axis=1, keepdims=True) / m
 
    # Dropout的关键操作
    da1 = np.dot(w2.T, dz2)
 
    da1 = da1 * mask1
    da1 = da1 / keep_prob
 
    dz1 = np.multiply(np.dot(w2.T, dz2), (1 - np.power(a1, 2)))
    dw1 = np.dot(dz1, x.T) / m + lambd / m * w1
    db1 = np.sum(dz1, axis=1, keepdims=True) / m
 
    grads = {
        "dw1": dw1,
        "db1": db1,
        "dw2": dw2,
        "db2": db2,
    }
    return grads


In [28]:
def update_parameters(parameters, grads, learning_rate=0.01):
    w1 = parameters['w1']
    b1 = parameters['b1']
    w2 = parameters['w2']
    b2 = parameters['b2']
 
    dw1 = learning_rate * grads["dw1"]
    db1 = learning_rate * grads["db1"]
    dw2 = learning_rate * grads["dw2"]
    db2 = learning_rate * grads["db2"]
 
    w1 = w1 - dw1
    b1 = b1 - db1
    w2 = w2 - dw2
    b2 = b2 - db2
 
    parameters = {
        "w1": w1,
        "b1": b1,
        "w2": w2,
        "b2": b2
    }
    return parameters

In [29]:
np.random.seed(1)
m = 200
x = np.random.randn(2, m)
y = (1 + (2 * (x[0, :] > 0) - 1) * (2 * (x[1, :] > 0) - 1)) / 2
y = y.reshape(1, x.shape[1])

In [30]:
x

array([[  1.62434536e+00,  -6.11756414e-01,  -5.28171752e-01,
         -1.07296862e+00,   8.65407629e-01,  -2.30153870e+00,
          1.74481176e+00,  -7.61206901e-01,   3.19039096e-01,
         -2.49370375e-01,   1.46210794e+00,  -2.06014071e+00,
         -3.22417204e-01,  -3.84054355e-01,   1.13376944e+00,
         -1.09989127e+00,  -1.72428208e-01,  -8.77858418e-01,
          4.22137467e-02,   5.82815214e-01,  -1.10061918e+00,
          1.14472371e+00,   9.01590721e-01,   5.02494339e-01,
          9.00855949e-01,  -6.83727859e-01,  -1.22890226e-01,
         -9.35769434e-01,  -2.67888080e-01,   5.30355467e-01,
         -6.91660752e-01,  -3.96753527e-01,  -6.87172700e-01,
         -8.45205641e-01,  -6.71246131e-01,  -1.26645989e-02,
         -1.11731035e+00,   2.34415698e-01,   1.65980218e+00,
          7.42044161e-01,  -1.91835552e-01,  -8.87628964e-01,
         -7.47158294e-01,   1.69245460e+00,   5.08077548e-02,
         -6.36995647e-01,   1.90915485e-01,   2.10025514e+00,
        

In [31]:
y

array([[ 0.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,
         1.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  0.,  0.,
         1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,
         1.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,
         1.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,
         0.,  1.,  1.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,
         0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,
         0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,
         1.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,  0.,
         1.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  1.,  0.,  1.,  1.,  1.,
         0.,  0.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,
         1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,
         0.,  1.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,
         0.,  0.,  0.,  1.,  1.,  1.,  0.,  1.,  1.

In [32]:
num_iterations = 10
learning_rate = 0.1
n_x, n_h, n_y = 2, 30, 1
costs = []
keep_prob = 0.7

In [33]:
parameters = initialize_parameters(n_x, n_h, n_y)
w1 = parameters['w1']
b1 = parameters['b1']
w2 = parameters['w2']
b2 = parameters['b2']

In [34]:
for i in range(num_iterations):
    a2, cache = forward_propagetion_with_dropout(x, parameters)
    cost = compute_cost(a2, y, parameters)
    grads = backward_propagetion_with_dropout(parameters, cache, x, y)
    # 关键步骤，参数更新
    parameters = update_parameters(parameters, grads, learning_rate)
 
    if i % 1 == 0:
        print("Cost after iteration %i:%f" % (i, cost))
        costs.append(cost)


Cost after iteration 0:0.693123
Cost after iteration 1:0.693079
Cost after iteration 2:0.693048
Cost after iteration 3:0.693010
Cost after iteration 4:0.692978
Cost after iteration 5:0.692937
Cost after iteration 6:0.692903
Cost after iteration 7:0.692873
Cost after iteration 8:0.692840
Cost after iteration 9:0.692827


## 3,  pytorch实现dropout

In [35]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.l1 = nn.Linear(8, 60)
        self.l2 = nn.Linear(60, 4)
        self.l3 = nn.Linear(4, 1)
        self.sigmoid = nn.Sigmoid()
        # 直接调用torch.nn.Dropout
        self.dropout = nn.Dropout(p=0.4)
 
    def foward(self, x):
        out1 = self.sigmoid(self.l1(x))
        out2 = self.dropout(out1)
        out3 = self.sigmoid(self.l2(out2))
        y_pred = self.sigmoid(self.l3(out3))
        return y_pred

NameError: name 'nn' is not defined