In [1]:
%config ZMQInteractiveShell.ast_node_interactivity = "all"
%pprint

Pretty printing has been turned OFF


## numpy实现

In [69]:
import sys
sys.path.append("../d2l_func/")
import numpy as np
from linreg_numpy import LinearModel
from utils import *
from sqdm import sqdm

In [92]:
# generate synthetic data: y = x @ true_w + true_b + noise
input_num = 10000
true_w = np.array([2, -3.4])
true_b = np.array([4.2])

# NOTE(review): no random seed is set in this cell, so x and error differ on every run
x = np.random.normal(0, 1, size=(input_num, len(true_w)))
error = np.random.normal(0, 0.01, size=input_num)
y = x@true_w + true_b + error

In [103]:
class LinearBridge(LinearModel):
    """Linear regression with an L2 (ridge) penalty, trained one mini-batch at a time.

    Extends ``LinearModel`` and overrides ``fit`` so that every call performs a
    single gradient-descent step whose gradient includes the L2 penalty term.

    Parameters
    ----------
    weight_decay : float
        L2 regularisation strength (lambda).
    alpha : float, default 0.01
        Learning rate used by ``fit``.
    """

    def __init__(self, weight_decay, alpha=0.01):
        super(LinearBridge, self).__init__()
        self.weight_decay = weight_decay
        # The base constructor is called without arguments, so the requested
        # learning rate must be stored here or it would be silently ignored.
        self.alpha = alpha

    def fit(self, X, y):
        """Run one regularised gradient-descent step on the mini-batch ``(X, y)``.

        Parameters
        ----------
        X : numpy.ndarray
            Batch features; reshaped to ``(len(y), fea_num)``.
        y : numpy.ndarray
            Batch targets; flattened to one dimension.

        Updates ``self.w``, ``self.b`` and ``self.count`` in place; returns None.
        """
        # number of features, inferred from the total size of X and the batch length
        fea_num = int(X.size / len(y))
        # create w on the first step only
        # (self.count is presumably initialised to 0 by LinearModel - TODO confirm)
        if self.count == 0:
            self.w = np.zeros(fea_num)

        # change X and y shape
        X = X.reshape(len(y), fea_num)
        y = y.reshape(-1)

        # calculate y_pred
        y_pred = self.predict(X)
        residual = y_pred - y

        # Gradient of the penalised loss is (data gradient + lambda * theta).
        # The penalty has to be ADDED: subtracting it would push the parameters
        # away from zero instead of shrinking them. Both terms are divided by
        # the batch size.
        self.w = self.w - self.alpha * (X.T @ residual + self.weight_decay*self.w) / len(y)
        self.b = self.b - self.alpha * (residual.sum() + self.weight_decay*self.b) / len(y)
        self.count += 1

In [104]:
import time

# hyper-parameters; unpacked into train() as keyword arguments below
params = {
    "epoch_num": 10,
    "batch_size": 128,
    "weight_decay": 0.05,
    "alpha": 0.01,
    "model": LinearBridge,
}

# progress bar shared by the training loop (sqdm is a project-local helper)
process_bar = sqdm()

def train(epoch_num, model, batch_size, alpha, weight_decay):
    """Train a model with mini-batch gradient descent on the global ``x``/``y``.

    Parameters
    ----------
    epoch_num : int
        Number of passes over the data.
    model : callable
        Model class (or factory) accepting ``weight_decay`` and ``alpha``
        keyword arguments and returning an object with ``fit`` and ``score``.
    batch_size : int
        Mini-batch size handed to ``data_iter``.
    alpha : float
        Learning rate.
    weight_decay : float
        L2 regularisation strength.

    Returns
    -------
    The fitted model instance.
    """
    # instantiate whatever class the caller passed instead of a hard-coded one
    model = model(weight_decay=weight_decay, alpha=alpha)
    for epoch in range(epoch_num):
        print(f"Epoch [{epoch}/{epoch_num}]")
        for xdata, ydata in data_iter(batch_size, x, y):
            model.fit(xdata, ydata)
            # loss on the batch that was just used for the update
            mse = model.score(xdata, ydata)
            process_bar.show_process(len(y), batch_size, round(mse, 4))
            # short pause after every batch (presumably to keep the bar readable)
            time.sleep(0.01)
        print("\n")
    return model

# train, then print the ground-truth parameters (true_w / true_b) next to the learned ones
model = train(**params)
print(f"w before update is {true_w}, w after update is {model.w}")
print(f"b before update is {true_b}, b after update is {model.b}")

Epoch [0/10]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 86.56593

Epoch [1/10]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 17.8176

Epoch [2/10]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 3.67499

Epoch [3/10]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.7618

Epoch [4/10]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.1601

Epoch [5/10]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.0351

Epoch [6/10]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.0087

Epoch [7/10]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.0031

Epoch [8/10]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

## pytorch实现

### pytorch实现1

使用pytorch框架来实现，并对比是使用哪一种方式来进行正则化（权重衰减）
- 权重衰减：$\theta_t = (1 - \beta)\theta_{t-1} - \alpha g_t$
- L2正则化：$\theta_t = \theta_{t-1} - \alpha (g_t + \lambda \theta_{t-1}) = (1 - \alpha \lambda)\theta_{t-1} - \alpha g_t$
    - 其中$g_t + \lambda \theta_{t-1}$整体实际上除以了batch_size
- pytorch实现中，对于$g_t + \lambda \theta_{t-1}$，前一部分$g_t$除以了batch_size，后一部分$\lambda \theta_{t-1}$没有除以batch_size

学习率0.01，weight_decay为0.05，优化器为SGD
- 使用pytorch的backward梯度回传+自动梯度更新的方式

In [227]:
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.utils.data as Data
import warnings
warnings.filterwarnings("ignore")

# hyper-parameters for the PyTorch experiment
params = {
    "input_num": 10000,
    "fea_num": 2,
    "epoch_num": 20,
    "batch_size": 128,
    "alpha": 0.01,
    "weight_decay": 0.05,
}

# ground-truth parameters used to synthesise the targets
true_w = torch.tensor([2, -3.4])
true_b = torch.tensor([4.2])

# seed before sampling so the synthetic data set is reproducible
torch.manual_seed(1000)
x = torch.normal(0, 1, size=(params["input_num"], params["fea_num"]))
error = torch.normal(0, 0.01, size=(params["input_num"], ))
y = torch.mv(x, true_w) + true_b + error

# build the mini-batch iterator
# NOTE(review): this rebinds the name `data_iter` to a DataLoader, while other
# cells call data_iter(batch_size, x, y) as a function (presumably the one from
# utils) - confirm which binding is active when cells are re-run out of order
dataset = Data.TensorDataset(x, y)
data_iter = Data.DataLoader(dataset, params["batch_size"], shuffle=True)

<torch._C.Generator object at 0x7fa0de7a8970>

In [228]:
class PLinearBridge(nn.Module):
    """Linear regressor with one output unit, wrapped in an ``nn.Sequential``.

    Parameters
    ----------
    fea_num : int
        Number of input features.
    """

    def __init__(self, fea_num):
        super().__init__()
        # keep the Sequential container so callers can reach the layer as self.layer[0]
        linear = nn.Linear(fea_num, 1)
        self.layer = nn.Sequential(linear)

    def forward(self, x):
        """Return the prediction for ``x``, one value per input row."""
        return self.layer(x)


# Build the network and give it a reproducible initial state.
net = PLinearBridge(params["fea_num"])
# bind the return values so they are not echoed as cell output
_ = torch.manual_seed(100)
_ = init.normal_(net.layer[0].weight, 0, 0.01)
_ = init.constant_(net.layer[0].bias, 0)
loss = nn.MSELoss()
# plain SGD without regularisation, kept for comparison:
# optimizer = torch.optim.SGD(net.parameters(), lr=params["alpha"])
optimizer = torch.optim.SGD(net.parameters(), lr=params["alpha"], weight_decay=params["weight_decay"])

for epoch in range(params["epoch_num"]):
    print(f"Epoch [{epoch}/{params['epoch_num']}]")
    for xdata, ydata in data_iter:
        # run the forward pass once and reuse it for both the loss and the
        # target reshape, instead of evaluating the network twice per batch
        y_pred = net(xdata)
        l = loss(y_pred, ydata.reshape(y_pred.shape))

        # clear gradients left over from the previous step before backprop
        optimizer.zero_grad()

        l.backward()
        optimizer.step()
        process_bar.show_process(params["input_num"], params["batch_size"], round(l.item(), 4))
    print("\n")

# ground-truth parameters next to the learned ones
print(f"w before update is {true_w}, w after update is {net.layer[0].weight}")
print(f"b before update is {true_b}, b after update is {net.layer[0].bias}")

<torch._C.Generator object at 0x7fa0de7a8970>

Epoch [0/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 2.7557

Epoch [1/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.1162

Epoch [2/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.0284

Epoch [3/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.0154

Epoch [4/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.023

Epoch [5/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.0156

Epoch [6/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.022

Epoch [7/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.018

Epoch [8/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

### pytorch实现2

学习率0.01，weight_decay为0.05，优化器为SGD
- 使用pytorch的backward梯度回传+手写梯度更新的方式
- 结果和pytorch实现1类似，说明pytorch在实现L2时，并没有对$\theta$除以batch_size

In [229]:
#! /usr/bin/env python
# -*-coding: utf-8 -*-

import sys
sys.path.append("../d2l_func/")
import torch
from sqdm import sqdm
from utils import *


def linreg(X, w, b):
    """Linear model: matrix-vector product of X and w, plus the bias b."""
    projection = X.mv(w)
    return projection + b


def square_loss(y_pred, y):
    """
    Sum of squared errors between y_pred and y.

    The result is a plain sum: nothing is divided by the batch size here.
    """
    residual = y_pred - y
    return (residual**2).sum()


def sgd(params, lr, weight_decay, batch_size):
    """One SGD step with weight decay, applied to every tensor in ``params``.

    The gradient term is divided by ``batch_size``; the decay term
    ``lr * weight_decay * param`` is not.
    """
    for param in params:
        grad_step = lr * param.grad/batch_size
        decay_step = lr * weight_decay*param.data
        # rebind .data to the new values, bypassing autograd tracking
        param.data = param.data - grad_step - decay_step


def train(epoch_num, net, loss, batch_size, lr, weight_decay):
    """Mini-batch training loop for the global parameters ``w`` and ``b``.

    ``net`` maps (features, w, b) to predictions and ``loss`` maps
    (predictions, targets) to a scalar. Batches are drawn from the global
    ``x``/``y`` via ``data_iter``; progress is reported through ``process_bar``.
    """
    for epoch in range(epoch_num):
        print(f"Epoch [{epoch}/{epoch_num}]")
        for xdata, ydata in data_iter(batch_size, x, y):
            # forward and backward pass on the current mini-batch
            batch_loss = loss(net(xdata, w, b), ydata)
            batch_loss.backward()
            sgd([w, b], lr, weight_decay, len(ydata))

            # reset gradients so they do not accumulate across batches
            for param in (w, b):
                param.grad.data.zero_()

            # loss on the same batch after the update, for the progress bar
            updated_loss = loss(net(xdata, w, b), ydata).item()
            process_bar.show_process(len(y), batch_size, np.round(updated_loss, 5))
        print("\n")


"""generate data by pytorch"""
# Seed before sampling so the synthetic data set is reproducible; the return
# value is bound to `_` so the Generator object is not echoed as cell output.
_ = torch.manual_seed(1000)
input_num = 10000
true_w = torch.tensor([2, -3.4])
true_b = torch.tensor([4.2])
x = torch.normal(mean=0, std=1, size=(input_num, len(true_w)))
error = torch.normal(mean=0, std=0.01, size=(input_num, ))
y = torch.mv(x, true_w) + true_b + error

# training
# (a real comment instead of a bare string literal, which the notebook would
# display as an expression result)
# set parameter
params = {
    "net": linreg,
    "loss": square_loss,
    "epoch_num": 20,
    "batch_size": 128,
    "lr": 0.01,
    "weight_decay":0.05,
}

# weight and bias initialize
_ = torch.manual_seed(100)
w = torch.normal(mean=0, std=0.01, size=(2, ), requires_grad=True)
b = torch.zeros(1, requires_grad=True)
process_bar = sqdm()
train(**params)
# ground-truth parameters next to the learned ones
print(f"w before update is {true_w}, w after update is {w}")
print(f"b before update is {true_b}, b after update is {b}")


'generate data by pytorch'

<torch._C.Generator object at 0x7fa0de7a8970>

'training'

<torch._C.Generator object at 0x7fa0de7a8970>

Epoch [0/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 21.012457

Epoch [1/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 1.745145

Epoch [2/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.46305

Epoch [3/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.30788

Epoch [4/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.28113

Epoch [5/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.27603

Epoch [6/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.27504

Epoch [7/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 0.27484

Epoch [8/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

### pytorch实现3

学习率0.01，weight_decay为0.05，优化器为SGD
- 手写计算梯度+手写梯度更新的方式

In [219]:
class PLinearBridge(LinearModel):
    """Linear model with an L2 penalty whose gradients are written out by hand.

    ``w`` and ``b`` are not created here; they must be assigned on the instance
    before the first call to ``fit``.
    """

    def __init__(self, alpha=0.01, weight_decay=0.05):
        super().__init__()
        self.alpha = alpha
        self.weight_decay = weight_decay

    def fit(self, X, y):
        """Apply one gradient step computed from the mini-batch ``(X, y)``."""
        # bring X to (batch, features) and y to a flat vector
        n_rows = len(y)
        fea_num = int(X.numel() / n_rows)
        X = X.reshape(n_rows, fea_num)
        y = y.reshape(-1)

        # residual of the current prediction
        residual = self.predict(X) - y
        n_samples = len(y)

        # data gradient averaged over the batch; the penalty term is not averaged
        grad_w = X.T @ residual/n_samples + self.weight_decay*self.w
        grad_b = residual.sum()/n_samples + self.weight_decay*self.b

        self.w = self.w - self.alpha * grad_w
        self.b = self.b - self.alpha * grad_b

In [220]:
import time

"""generate data by pytorch"""
# seed before sampling so the synthetic data set is reproducible
torch.manual_seed(1000)
input_num = 10000
# ground-truth parameters used to synthesise the targets
true_w = torch.tensor([2, -3.4])
true_b = torch.tensor([4.2])
x = torch.normal(mean=0, std=1, size=(input_num, len(true_w)))
error = torch.normal(mean=0, std=0.01, size=(input_num, ))
y = torch.mv(x, true_w) + true_b + error

# hyper-parameters; unpacked into train() as keyword arguments
params = {
    "epoch_num": 20,
    "batch_size": 128,
    "weight_decay": 0.05,
    "alpha": 0.01,
    "model": PLinearBridge,
}

# fresh progress bar for this run
process_bar = sqdm()

def train(epoch_num, model, batch_size, alpha, weight_decay):
    """Train a hand-written-gradient model on the global ``x``/``y``.

    Parameters
    ----------
    epoch_num : int
        Number of passes over the data.
    model : callable
        Model class accepting ``weight_decay`` and ``alpha`` keyword arguments;
        the instance must provide ``fit`` and ``score``.
    batch_size : int
        Mini-batch size handed to ``data_iter``.
    alpha : float
        Learning rate.
    weight_decay : float
        L2 regularisation strength.

    Returns
    -------
    The fitted model instance.
    """
    model = model(weight_decay=weight_decay, alpha=alpha)
    torch.manual_seed(100)
    # fit() computes its gradients by hand, so the parameters are plain tensors.
    # With requires_grad=True every update would be recorded by autograd and
    # chained onto the previous one, building a graph that grows with each
    # batch and is never used.
    model.w = torch.normal(mean=0, std=0.01, size=(2, ))
    print(model.w)
    model.b = torch.zeros(1)
    print(model.b)
    for epoch in range(epoch_num):
        print(f"Epoch [{epoch}/{epoch_num}]")
        for xdata, ydata in data_iter(batch_size, x, y):
            model.fit(xdata, ydata)
            # loss on the batch that was just used for the update
            mse = model.score(xdata, ydata)
            process_bar.show_process(len(y), batch_size, round(mse.item(), 5))
        print("\n")
    return model

# train, then print the ground-truth parameters (true_w / true_b) next to the learned ones
model = train(**params)
print(f"w before update is {true_w}, w after update is {model.w}")
print(f"b before update is {true_b}, b after update is {model.b}")

'generate data by pytorch'

<torch._C.Generator object at 0x7fa0de7a8970>

tensor([ 0.0036, -0.0029], requires_grad=True)
tensor([0.], requires_grad=True)
Epoch [0/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 96.507268

Epoch [1/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 23.61678

Epoch [2/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 7.260124

Epoch [3/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 3.06195

Epoch [4/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 1.78958

Epoch [5/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 1.34098

Epoch [6/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 1.16553

Epoch [7/20]
10000/10000 [>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>] - loss: 1.092