In [19]:
from __future__ import print_function
import mxnet as mx
import numpy as np
from mxnet import nd, autograd, gluon
mx.random.seed(1)
ctx = mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()

def transform(data, label):
    return data.astype(np.float32)/255, label.astype(np.float32)

def relu(X):
    return nd.maximum(X, 0)

def dropout(X, drop_probability):
    keep_probability = 1 - drop_probability
    mask = nd.random_uniform(0, 1.0, X.shape, ctx=X.context) < keep_probability
    #############################
    #  Avoid division by 0 when scaling
    #############################
    if keep_probability > 0.0:
        scale = (1/keep_probability)
    else:
        scale = 0.0
    return mask * X * scale

def softmax(y_linear):
    exp = nd.exp(y_linear-nd.max(y_linear))
    partition = nd.nansum(exp, axis=0, exclude=True).reshape((-1,1))
    return exp / partition

def softmax_cross_entropy(yhat_linear, y):
    return - nd.nansum(y * nd.log_softmax(yhat_linear), axis=0, exclude=True)

def net(X, drop_prob=0.0):
    #######################
    #  Compute the first hidden layer
    #######################
    h1_linear = nd.dot(X, W1) + b1
    h1 = relu(h1_linear)
    h1 = dropout(h1, drop_prob)

    #######################
    #  Compute the second hidden layer
    #######################
    h2_linear = nd.dot(h1, W2) + b2
    h2 = relu(h2_linear)
    h2 = dropout(h2, drop_prob)

    #######################
    #  Compute the output layer.
    #  We will omit the softmax function here
    #  because it will be applied
    #  in the softmax_cross_entropy loss
    #######################
    yhat_linear = nd.dot(h2, W3) + b3
    return yhat_linear

def SGD(params, lr):
    for param in params:
        param[:] = param - lr * param.grad
        

def evaluate_accuracy(data_iterator, net):
    numerator = 0.
    denominator = 0.
    for i, (data, label) in enumerate(data_iterator):
        data = data.as_in_context(ctx).reshape((-1,784))
        label = label.as_in_context(ctx)
        output = net(data)
        predictions = nd.argmax(output, axis=1)
        numerator += nd.sum(predictions == label)
        denominator += data.shape[0]
    return (numerator / denominator).asscalar()



In [24]:

mnist = mx.test_utils.get_mnist()
batch_size = 64

train_data = gluon.data.DataLoader(gluon.data.vision.MNIST(train=True, transform=transform),
                                      batch_size, shuffle=True)
test_data = gluon.data.DataLoader(gluon.data.vision.MNIST(train=False, transform=transform),
                                     batch_size, shuffle=False)

In [21]:
W1 = nd.random_normal(shape=(784,256), ctx=ctx) *.01
b1 = nd.random_normal(shape=256, ctx=ctx) * .01

W2 = nd.random_normal(shape=(256,128), ctx=ctx) *.01
b2 = nd.random_normal(shape=128, ctx=ctx) * .01

W3 = nd.random_normal(shape=(128,10), ctx=ctx) *.01
b3 = nd.random_normal(shape=10, ctx=ctx) *.01

params = [W1, b1, W2, b2, W3, b3]

for param in params:
    param.attach_grad()
    

In [25]:
def run_model(epochs, moving_loss, learning_rate, drop_prob=0.0):
    for e in range(epochs):
        for i, (data, label) in enumerate(train_data):
            data = data.as_in_context(ctx).reshape((-1,784))
            label = label.as_in_context(ctx)
            label_one_hot = nd.one_hot(label, 10)
            with autograd.record():
                ################################
                #   Drop out 50% of hidden activations on the forward pass
                ################################
                output = net(data, drop_prob=drop_prob)
                loss = softmax_cross_entropy(output, label_one_hot)
            loss.backward()
            SGD(params, learning_rate)

            ##########################
            #  Keep a moving average of the losses
            ##########################
            if i == 0:
                moving_loss = nd.mean(loss).asscalar()
            else:
                moving_loss = .99 * moving_loss + .01 * nd.mean(loss).asscalar()

        test_accuracy = evaluate_accuracy(test_data, net)
        train_accuracy = evaluate_accuracy(train_data, net)
        print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" % (e, moving_loss, train_accuracy, test_accuracy))


In [27]:
epochs = 10
moving_loss = 0.
learning_rate = .001
drop_prob_1 = .5
drop_prob_0 = .2
drop_prob_2 = .8

run_model(epochs, moving_loss, learning_rate, drop_prob_0)
print("########")
run_model(epochs, moving_loss, learning_rate, drop_prob_1)
print("########")
run_model(epochs, moving_loss, learning_rate, drop_prob_2)

Epoch 0. Loss: 0.07306404552387685, Train_acc 0.98628336, Test_acc 0.9771
Epoch 1. Loss: 0.06342638775173977, Train_acc 0.9884833, Test_acc 0.9776
Epoch 2. Loss: 0.05340114151046477, Train_acc 0.9892, Test_acc 0.979
Epoch 3. Loss: 0.049753681222384195, Train_acc 0.9913333, Test_acc 0.9793
Epoch 4. Loss: 0.04727674141073009, Train_acc 0.99205, Test_acc 0.9799
Epoch 5. Loss: 0.044759081296268315, Train_acc 0.9926, Test_acc 0.9806
Epoch 6. Loss: 0.04101160041203503, Train_acc 0.99413335, Test_acc 0.9807
Epoch 7. Loss: 0.04085139174864844, Train_acc 0.9949833, Test_acc 0.981
Epoch 8. Loss: 0.037974419814203124, Train_acc 0.99521667, Test_acc 0.9808
Epoch 9. Loss: 0.03294160778695116, Train_acc 0.9952833, Test_acc 0.9808
########
Epoch 0. Loss: 0.10964125070984783, Train_acc 0.9920167, Test_acc 0.9795
Epoch 1. Loss: 0.09674480023945269, Train_acc 0.9920833, Test_acc 0.9803
Epoch 2. Loss: 0.09693814251609556, Train_acc 0.99161667, Test_acc 0.9786
Epoch 3. Loss: 0.09040206071322242, Train_acc

In [35]:
#Build with gluon

num_hidden = 256
num_output = 10
num_input = 784
drop_prob = .5
epochs = 10
smoothing_constant = .01
learning_rate = .1

net = gluon.nn.Sequential()

def run_model_gluon(num_hidden, drop_prob, learning_rate=.1, epochs = 10, smoothing_constants = .1):
    with net.name_scope():
        net.add(gluon.nn.Dense(num_hidden, activation="relu"))
        net.add(gluon.nn.Dropout(drop_prob))
        net.add(gluon.nn.Dense(num_hidden, activation="relu"))
        net.add(gluon.nn.Dropout(drop_prob))
        net.add(gluon.nn.Dense(num_output))
    
    net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)

    softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

    trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': learning_rate})
    
    for e in range(epochs):
        for i, (data, label) in enumerate(train_data):
            data = data.as_in_context(ctx).reshape((-1, 784))
            label = label.as_in_context(ctx)
            with autograd.record():
                output = net(data)
                loss = softmax_cross_entropy(output, label)
                loss.backward()
            trainer.step(data.shape[0])

            ##########################
            #  Keep a moving average of the losses
            ##########################
            curr_loss = nd.mean(loss).asscalar()
            moving_loss = (curr_loss if ((i == 0) and (e == 0))
                           else (1 - smoothing_constant) * moving_loss + (smoothing_constant) * curr_loss)

        test_accuracy = evaluate_accuracy(test_data, net)
        train_accuracy = evaluate_accuracy(train_data, net)
        print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" %
              (e, moving_loss, train_accuracy, test_accuracy))
    
    return net
        
def save_model(net, output_path):
    net.save_parameters(output_path)
    
net1 = run_model_gluon(num_hidden, drop_prob=.2)
save_model(net1, os.path.join("checkpoints", "net1.params"))
# print("#########")
# run_model_gluon(num_hidden, drop_prob=.5)
# print("#########")
# run_model_gluon(num_hidden, drop_prob=.8)
    

Epoch 0. Loss: 0.23804695222742336, Train_acc 0.9492, Test_acc 0.9473
Epoch 1. Loss: 0.15733224429679663, Train_acc 0.96648335, Test_acc 0.9638
Epoch 2. Loss: 0.1288373542143002, Train_acc 0.97361666, Test_acc 0.9682
Epoch 3. Loss: 0.1081733464996873, Train_acc 0.97978336, Test_acc 0.9729
Epoch 4. Loss: 0.0872821198329437, Train_acc 0.9798167, Test_acc 0.9722
Epoch 5. Loss: 0.07667797150629169, Train_acc 0.98606664, Test_acc 0.977
Epoch 6. Loss: 0.07144925333083696, Train_acc 0.98861665, Test_acc 0.9778
Epoch 7. Loss: 0.06285207139919195, Train_acc 0.98866665, Test_acc 0.9778
Epoch 8. Loss: 0.05466447627110476, Train_acc 0.99111664, Test_acc 0.9784
Epoch 9. Loss: 0.05116152321979755, Train_acc 0.99275, Test_acc 0.9799


NameError: name 'os' is not defined