In [92]:
%matplotlib inline
import os

from mxnet import autograd, nd
from mxnet import gluon
from mxnet import init
from mxnet.gluon import nn

import common as comm

#### 4.1 模型构建
Block类是nn模块里提供的一个模型构造类，我们可以继承它来定义我们想要的模型。下面继承Block类构造本节开头提到的多层感知机。

In [37]:
class MLP(nn.Block):
    """Multilayer perceptron with one hidden layer, built by subclassing Block.

    The hidden layer is a 256-unit dense layer with ReLU activation and the
    output layer is a 10-unit dense layer.
    """

    def __init__(self, **kwargs):
        # Let Block do its own setup first; any extra keyword arguments
        # (for example ``params``) are forwarded to it unchanged.
        super().__init__(**kwargs)
        # The two fully connected layers that own this model's parameters.
        self.hidden = nn.Dense(256, activation='relu')
        self.output = nn.Dense(10)

    def forward(self, x):
        """Compute the model output for input ``x``: hidden layer, then output layer."""
        hidden_out = self.hidden(x)
        return self.output(hidden_out)

In [38]:
# Random input batch of shape (2, 20).
X = nd.random.uniform(shape=(2, 20))
net = MLP()
# Initialize the parameters before running the first forward pass.
net.initialize()
# Calling the block runs forward(); the displayed result is a 2x10 NDArray.
net(X)


[[-0.02481476 -0.03370603 -0.01484309 -0.00795806 -0.01920623  0.01482263
   0.00418185  0.004043    0.0025303   0.07560658]
 [ 0.00473669  0.0187421   0.03095323 -0.02015721 -0.02749954 -0.00538642
   0.00796124 -0.00819326  0.01774705  0.07477219]]
<NDArray 2x10 @cpu(0)>

In [4]:
X


[[0.5488135  0.5928446  0.71518934 0.84426576 0.60276335 0.8579456
  0.5448832  0.8472517  0.4236548  0.6235637  0.6458941  0.3843817
  0.4375872  0.2975346  0.891773   0.05671298 0.96366274 0.2726563
  0.3834415  0.47766513]
 [0.79172504 0.8121687  0.5288949  0.47997716 0.56804454 0.3927848
  0.92559665 0.83607876 0.07103606 0.33739617 0.08712929 0.6481719
  0.0202184  0.36824155 0.83261985 0.95715517 0.77815676 0.14035077
  0.87001216 0.87008727]]
<NDArray 2x20 @cpu(0)>

#### 自定义Sequential

In [22]:
class MySequential(nn.Block):
    """Minimal sequential container: child blocks are applied in the order added."""

    # Fixed: the constructor was spelled ``__init`` and therefore never ran as
    # the constructor; construction only worked because Block.__init__ was
    # inherited directly.
    def __init__(self, **kwargs):
        super(MySequential, self).__init__(**kwargs)

    def add(self, *blocks):
        """Register each of ``blocks`` as a child of this container."""
        for block in blocks:
            self.register_child(block)

    def forward(self, x):
        """Feed ``x`` through every registered child in turn and return the result."""
        # Iterates self._children, the mapping that register_child fills.
        for block in self._children.values():
            x = block(x)
        return x

In [24]:
# Build the same two-layer perceptron with the hand-written container.
net = MySequential()
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dense(10))
net.initialize()
# Forward pass on the X defined in an earlier cell.
net(X)


[[-0.0228641  -0.04095767 -0.00851615 -0.01662018  0.01368363  0.09987733
   0.03003268 -0.03530201 -0.10374807 -0.06101432]
 [ 0.00688874 -0.01857537 -0.0357835  -0.05064729  0.0012001   0.0806037
   0.00940578 -0.07143035 -0.1180478  -0.07556092]]
<NDArray 2x10 @cpu(0)>

In [51]:
class FancyMLP(nn.Block):
    """Block whose forward pass mixes a constant parameter, layer reuse and control flow."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Random weights created through get_constant are constant parameters:
        # they are not updated during training.
        constant_init = nd.random.uniform(shape=(20, 20))
        self.rand_weight = self.params.get_constant('rand_weight', constant_init)
        self.dense = nn.Dense(20, activation='relu')

    def forward(self, x):
        """Return the scalar sum of the rescaled network output for input ``x``."""
        first_pass = self.dense(x)
        # Use the constant parameter together with NDArray's dot and relu functions.
        projected = nd.dot(first_pass, self.rand_weight.data()) + 1
        activated = nd.relu(projected)
        # Reuse the same dense layer; equivalent to two dense layers sharing parameters.
        out = self.dense(activated)
        # Control flow: asscalar() turns the norm into a Python scalar for comparison.
        # Halve (in place) until the norm is at most 1.
        while out.norm().asscalar() > 1:
            out /= 2
        # Then scale up (in place) by 10 if the norm ended up below 0.8.
        if out.norm().asscalar() < 0.8:
            out *= 10
        return out.sum()

In [52]:
# Instantiate, initialize and run FancyMLP on X; the result is a one-element NDArray.
net = FancyMLP()
net.initialize()
net(X)


[3.2373087]
<NDArray 1 @cpu(0)>

In [57]:
class NestMLP(nn.Block):
    """Block that nests a Sequential sub-network followed by one more dense layer."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Inner sub-network: 64-unit then 32-unit dense layers, both with ReLU.
        self.net = nn.Sequential()
        self.net.add(nn.Dense(64, activation='relu'))
        self.net.add(nn.Dense(32, activation='relu'))
        # Final 16-unit dense layer applied to the sub-network's output.
        self.dense = nn.Dense(16, activation='relu')

    def forward(self, x):
        """Run ``x`` through the nested Sequential, then through the dense layer."""
        inner_out = self.net(x)
        return self.dense(inner_out)

In [58]:
# Chain a nested block, a plain Dense layer and FancyMLP inside one Sequential.
net = nn.Sequential()
net.add(NestMLP(), nn.Dense(20), FancyMLP())

net.initialize()
# The last child is FancyMLP, so the displayed result is its summed output.
net(X)


[3.5323653]
<NDArray 1 @cpu(0)>

#### 4.2 模型参数
模型参数的访问、初始化和共享

In [59]:
# Two-layer perceptron used for the parameter-access examples that follow.
net = nn.Sequential()
net.add(nn.Dense(256, activation='relu'))
net.add(nn.Dense(10))
net.initialize()  # use the default initialization method

X = nd.random.uniform(shape=(2, 20))
Y = net(X)  # forward computation

In [60]:
# For a network built with Sequential, any layer can be accessed with square brackets [].
# Show the first layer's parameter container together with its type.
net[0].params, type(net[0].params)

(dense51_ (
   Parameter dense51_weight (shape=(256, 20), dtype=float32)
   Parameter dense51_bias (shape=(256,), dtype=float32)
 ), mxnet.gluon.parameter.ParameterDict)

In [62]:
# Use the layer's ``weight`` attribute instead of a hard-coded auto-generated
# name such as 'dense51_weight': the numeric suffix depends on how many Dense
# layers the kernel has created so far, so the literal key fails on a fresh run.
print(net[0].weight)

Parameter dense51_weight (shape=(256, 20), dtype=float32)


In [63]:
# Values of the first layer's weight; accessed through the ``weight`` attribute
# because the auto-generated parameter name changes between kernel sessions.
net[0].weight.data()


[[ 0.00675588  0.00335923  0.00606937 ... -0.00599905  0.06216455
   0.04231021]
 [-0.0648632   0.03409437 -0.04936399 ... -0.00977961  0.05226472
  -0.00962454]
 [-0.00777294  0.00436731  0.00033506 ...  0.04357625 -0.02547599
  -0.02989651]
 ...
 [-0.03717463  0.0048242  -0.05455612 ...  0.01619268 -0.02881011
  -0.02803967]
 [-0.00116915  0.01388479 -0.0595667  ...  0.00585768 -0.06936526
   0.02626163]
 [-0.06674968  0.0334196  -0.06857213 ... -0.00617229 -0.06087813
  -0.00311278]]
<NDArray 256x20 @cpu(0)>

In [64]:
# Gradient of the first layer's weight (same shape as the weight); accessed
# through the ``weight`` attribute rather than a session-dependent name.
net[0].weight.grad()


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<NDArray 256x20 @cpu(0)>

In [67]:
# Values of the first layer's bias; accessed through the ``bias`` attribute
# because the auto-generated parameter name changes between kernel sessions.
net[0].bias.data()


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 256 @cpu(0)>

In [66]:
# Gradient of the first layer's bias; accessed through the ``bias`` attribute
# rather than a session-dependent parameter name.
net[0].bias.grad()


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
<NDArray 256 @cpu(0)>

In [68]:
# collect_params gathers the parameters of every layer nested inside net
# (the output lists the weights and biases of both Dense layers).
net.collect_params()

sequential5_ (
  Parameter dense51_weight (shape=(256, 20), dtype=float32)
  Parameter dense51_bias (shape=(256,), dtype=float32)
  Parameter dense52_weight (shape=(10, 256), dtype=float32)
  Parameter dense52_bias (shape=(10,), dtype=float32)
)

In [70]:
# The argument is a regular expression matched against parameter names;
# '.*bias' selects only the bias parameters.
net.collect_params(".*bias")

sequential5_ (
  Parameter dense51_bias (shape=(256,), dtype=float32)
  Parameter dense52_bias (shape=(10,), dtype=float32)
)

#### 初始化模型参数

默认初始化 权重参数元素为[-0.07, 0.07]之间均匀分布的随机数，偏差参数则全为0。

In [73]:
# Re-initialize all parameters from a normal distribution with sigma=0.01;
# force_reinit=True is passed because the network was already initialized.
net.initialize(init=init.Normal(sigma=0.01), force_reinit=True)
# Inspect the first row of the first layer's weight.
net[0].weight.data()[0]


[ 0.01817497  0.00101184 -0.01082114  0.00268631  0.01740005  0.00260317
 -0.03099813  0.00170382  0.00884803 -0.00333901  0.00408539 -0.02333756
  0.00353178 -0.01652882 -0.00973991 -0.00987439  0.00016979 -0.00126417
  0.01020485 -0.00687493]
<NDArray 20 @cpu(0)>

In [74]:
# Re-initialize with the constant 3; the displayed weight row is all 3s.
net.initialize(init=init.Constant(3), force_reinit=True)
net[0].weight.data()[0]


[3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3. 3.]
<NDArray 20 @cpu(0)>

In [75]:
# Re-initialize a single parameter (the first layer's weight) with Xavier,
# by calling initialize on the Parameter instead of on the whole network.
net[0].weight.initialize(init=init.Xavier(), force_reinit=True)
net[0].weight.data()[0]


[ 0.0946622  -0.10525544 -0.03743418 -0.04616296 -0.08939345  0.00864312
 -0.11831925  0.0361426   0.07330991 -0.06794105 -0.01396172 -0.07042709
  0.06302193 -0.13376406  0.12249702  0.09516214 -0.10421677 -0.13920105
  0.12360677 -0.04109657]
<NDArray 20 @cpu(0)>

In [80]:
# 自定义初始化
class MyInit(init.Initializer):
    """Custom initializer: uniform samples between -10 and 10, small magnitudes zeroed."""

    def _init_weight(self, name, data):
        """Fill the weight array ``data`` in place and log its name and shape."""
        print('Init', name, data.shape)
        # Overwrite the contents in place with uniform random values.
        data[:] = nd.random.uniform(low=-10, high=10, shape=data.shape)
        # 0/1 mask of entries whose absolute value is at least 5; multiplying
        # by it zeroes every other entry.
        keep_mask = data.abs() >= 5
        data *= keep_mask

In [81]:
# Re-initialize with the custom initializer; it prints one line per weight it fills.
net.initialize(MyInit(), force_reinit=True)
net[0].weight.data()[0]

Init dense51_weight (256, 20)
Init dense52_weight (10, 256)



[ 0.         5.065213   0.        -5.4838233  7.802231   5.0276194
  9.781769   6.7210617  0.         0.         6.9072895 -9.451985
  5.5607777 -0.        -0.         0.         7.5138454  8.761868
 -9.144737  -5.649917 ]
<NDArray 20 @cpu(0)>

#### 共享模型参数
在构造第三隐藏层时通过params来指定它使用第二隐藏层的参数。因为模型参数里包含了梯度，所以在反向传播计算时，第二隐藏层和第三隐藏层的梯度都会被累加在共享参数的梯度里，例如shared.weight.grad()和shared.bias.grad()。

In [85]:
# Sharing model parameters between layers

net = nn.Sequential()
# This layer's parameters are reused by the layer that follows it.
shared = nn.Dense(8, activation='relu')
net.add(nn.Dense(8, activation='relu'),
        shared,
        # Passing params=shared.params makes this layer use shared's parameters.
        nn.Dense(8, activation='relu', params=shared.params),
        nn.Dense(10))
net.initialize()

X = nd.random.uniform(shape=(2, 20))
net(X)

# Element-wise comparison of the two layers' weights; all ones means they are equal.
print(net[1].weight.data() == net[2].weight.data())


[[1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]]
<NDArray 8x8 @cpu(0)>


In [84]:
net[1].weight.data()


[[ 0.05065522  0.05895775  0.04820579  0.00081181 -0.02583488  0.02213752
   0.01394139 -0.06139019]
 [-0.00977468 -0.02220326  0.05727299 -0.01545183 -0.04376948  0.00076623
   0.02768198  0.04881077]
 [ 0.06585255  0.04333929 -0.04546143 -0.06863115 -0.0417247  -0.04824803
   0.02712127 -0.01719746]
 [ 0.03908155  0.00168244 -0.00132313  0.01497848  0.01535611 -0.0497843
  -0.04022446  0.02748072]
 [-0.00327401  0.05470908 -0.05430993 -0.03867951 -0.02500093  0.00558397
  -0.03013084  0.06856707]
 [-0.00775245  0.02017801  0.06021769 -0.05694444 -0.04462253  0.0382697
  -0.01380564  0.00110231]
 [ 0.01618361 -0.05431794  0.06251799  0.00212141 -0.05135925  0.03573355
   0.05850273 -0.05360014]
 [-0.05865247 -0.02012416 -0.0026962  -0.05846451 -0.00635742 -0.04879041
  -0.04065562 -0.0626504 ]]
<NDArray 8x8 @cpu(0)>

#### 4.4 自定义层

In [86]:
#不带参数的自定义层
class CenteredLayer(nn.Block):
    """Parameter-free layer that subtracts the input's mean from the input."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, x):
        """Return ``x`` shifted by the mean taken over all of its elements."""
        overall_mean = x.mean()
        return x - overall_mean

In [87]:
# The mean of [1, 2, 3, 4, 5] is 3, so the displayed output is [-2, -1, 0, 1, 2].
layer = CenteredLayer()
layer(nd.array([1, 2, 3, 4, 5]))


[-2. -1.  0.  1.  2.]
<NDArray 5 @cpu(0)>

In [88]:
# The custom layer can be combined with built-in layers inside a Sequential.
net = nn.Sequential()
net.add(nn.Dense(128),
        CenteredLayer())

In [89]:
net.initialize()
y = net(nd.random.uniform(shape=(4, 8)))
# After centering, the output mean is zero up to floating-point error
# (the displayed value is on the order of 1e-10).
y.mean().asscalar()

-2.1100277e-10

In [90]:
#含模型参数的自定义层

In [94]:
# Custom parameters: create a ParameterDict and add a parameter through get().
params = gluon.ParameterDict()
# get() creates the parameter 'param2' with shape (2, 3) inside the dict.
params.get('param2', shape=(2, 3))
params

(
  Parameter param2 (shape=(2, 3), dtype=<class 'numpy.float32'>)
)

In [95]:

class MyDense(nn.Block):
    """Fully connected layer with ReLU whose weight and bias are declared explicitly.

    ``units`` is the number of outputs of the layer and ``in_units`` is the
    number of inputs.
    """

    def __init__(self, units, in_units, **kwargs):
        super().__init__(**kwargs)
        # Declare both parameters in this block's own parameter dictionary.
        layer_params = self.params
        self.weight = layer_params.get('weight', shape=(in_units, units))
        self.bias = layer_params.get('bias', shape=(units,))

    def forward(self, x):
        """Return relu(x . weight + bias)."""
        weight = self.weight.data()
        bias = self.bias.data()
        return nd.relu(nd.dot(x, weight) + bias)

In [96]:
# Instantiate the custom layer and list its parameters (a (5, 3) weight and a (3,) bias).
dense = MyDense(units=3, in_units=5)
dense.params

mydense0_ (
  Parameter mydense0_weight (shape=(5, 3), dtype=<class 'numpy.float32'>)
  Parameter mydense0_bias (shape=(3,), dtype=<class 'numpy.float32'>)
)

In [97]:
dense.initialize()
dense(nd.random.uniform(shape=(2, 5))) # forward computation


[[0.06308134 0.02173875 0.        ]
 [0.07082637 0.02660567 0.        ]]
<NDArray 2x3 @cpu(0)>

In [98]:

# Custom layers can be stacked in a Sequential: 64 inputs -> 8 units -> 1 unit.
net = nn.Sequential()
net.add(MyDense(8, in_units=64),
        MyDense(1, in_units=8))
net.initialize()
net(nd.random.uniform(shape=(2, 64)))


[[0.06597041]
 [0.06828965]]
<NDArray 2x1 @cpu(0)>

In [99]:
#4.5模型的读写与存储

In [104]:
# Save a single NDArray to disk.
x = nd.ones(3)
# Create the output directory first so this cell works on a fresh checkout;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs('result', exist_ok=True)
nd.save('result/x', x)

In [107]:
x


[1. 1. 1.]
<NDArray 3 @cpu(0)>

In [103]:
!mkdir /data10t/mgf/ml/handsonmxnet/result

In [105]:
# Read the saved array back; as the next cell's output shows, the loaded value
# is a one-element list wrapping the NDArray.
a = nd.load('result/x')

In [106]:
a

[
 [1. 1. 1.]
 <NDArray 3 @cpu(0)>]

In [109]:
# A list of NDArrays can be saved and loaded in one call.
y = nd.ones(4)
nd.save('result/xy', [x, y])
# Loading returns the arrays in the order they were saved.
x2, y2 = nd.load('result/xy')
(x2, y2)

(
 [1. 1. 1.]
 <NDArray 3 @cpu(0)>, 
 [1. 1. 1. 1.]
 <NDArray 4 @cpu(0)>)

In [110]:
# A dictionary mapping strings to NDArrays can be saved and loaded as well.
mydict = {'x': x, 'y': y}
nd.save('result/mydict', mydict)
mydict2 = nd.load('result/mydict')
mydict2

{'x': 
 [1. 1. 1.]
 <NDArray 3 @cpu(0)>, 'y': 
 [1. 1. 1. 1.]
 <NDArray 4 @cpu(0)>}

In [111]:
# 读写模型参数
class MLP(nn.Block):
    """One-hidden-layer perceptron used to demonstrate saving and loading parameters.

    Note: this redefines the MLP class declared near the top of the notebook
    with the same layers.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # 256-unit ReLU hidden layer followed by a 10-unit output layer.
        self.hidden = nn.Dense(256, activation='relu')
        self.output = nn.Dense(10)

    def forward(self, x):
        """Apply the hidden layer and then the output layer to ``x``."""
        hidden_out = self.hidden(x)
        return self.output(hidden_out)

# Build and initialize a model, then keep its output Y as the reference
# for the comparison after the parameters are reloaded.
net = MLP()
net.initialize()
X = nd.random.uniform(shape=(2, 20))
Y = net(X)

In [112]:
# Write the model's parameters to a file under result/.
filename = 'result/mlp.params'
net.save_parameters(filename)

In [113]:
# Create a second model and read its parameters from the file instead of
# calling initialize().
net2 = MLP()
net2.load_parameters(filename)

In [114]:
# Run the reloaded model (net2), not the original net: otherwise the
# comparison with Y is trivially true and does not verify the saved parameters.
Y2 = net2(X)

In [115]:
Y2 == Y


[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
<NDArray 2x10 @cpu(0)>

#### GPU计算

In [121]:
!nvidia-smi

Sun Nov  3 01:36:08 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.87.00    Driver Version: 418.87.00    CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 108...  Off  | 00000000:65:00.0 Off |                  N/A |
|  0%   43C    P2    58W / 260W |    401MiB / 11178MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|    0  

In [118]:
import mxnet as mx

In [119]:
# Context objects for the CPU and the GPU; both display with index 0.
mx.cpu(), mx.gpu()

(cpu(0), gpu(0))

In [120]:
# Without ctx the array is created on the default context.
x = nd.array([1, 2, 3])
# ctx=mx.gpu() creates the array on the GPU instead.
a = nd.array([1, 2, 3], ctx=mx.gpu())

In [122]:
# Copy x to the GPU; the next cell shows y living on gpu(0).
y = x.copyto(mx.gpu())

In [123]:
y


[1. 2. 3.]
<NDArray 3 @gpu(0)>

In [124]:
# Get a version of y on the CPU; the following cells show z on cpu(0)
# while y itself stays on gpu(0).
z = y.as_in_context(mx.cpu())

In [125]:
z


[1. 2. 3.]
<NDArray 3 @cpu(0)>

In [126]:
y


[1. 2. 3.]
<NDArray 3 @gpu(0)>

如果源变量和目标变量的context一致，as_in_context函数使目标变量和源变量共享源变量的内存或显存；而copyto函数总是为目标变量开辟新的内存或显存。
MXNet可以指定用来存储和计算的设备，如使用内存的CPU或者使用显存的GPU。在默认情况下，MXNet会将数据创建在内存，然后利用CPU来计算。
MXNet要求计算的所有输入数据都在内存或同一块显卡的显存上。

In [127]:
!git init

/data10t/mgf/ml/handsonmxnet
