In [2]:
"""
Network Structure
-----------------

First, let's import the necessary libraries into python.

"""
from __future__ import division

import argparse, time, logging, random, math

import numpy as np
import mxnet as mx

from mxnet import gluon, nd, init
from mxnet import autograd as ag
from mxnet.gluon import nn
from mxnet.gluon.data.vision import transforms

from gluoncv.model_zoo import get_model
from gluoncv.utils import makedirs, TrainingHistory

################################################################
#
# There are numerous structures for convolutional neural networks.
# Here we pick a simple yet well-performing structure, ``cifar_resnet20_v1``, for the
# tutorial.

# number of GPUs or CPU to use if you have
num_gpus = 1
ctx = [mx.gpu(i) for i in range(num_gpus)]

# ctx = mx.cpu(0)


############################################################
# your code here to define your net according to problem 2 #
net = nn.Sequential()
net.add(
    nn.Conv2D(6,kernel_size=5,strides=1,activation='relu'),
    nn.MaxPool2D(pool_size=2,strides=2), #pool_size???
    nn.Conv2D(16,kernel_size=5,strides=1,activation='relu'),
    nn.MaxPool2D(pool_size=2,strides=2), #pool_size???
    nn.Dense(128,activation='relu'), #nn.Dropout(0.5)
    nn.Dense(84,activation='relu'), #nn.Dropout(0.5)
    nn.Dense(10))



############################################################
# your code here to do initialization using existing API #
# net.initialize(init=init.MSRAPrelu(),ctx=ctx)
net.initialize(init=init.Normal(0.1),ctx=ctx)



################################################################
# Data Augmentation and Data Loader
# ---------------------------------
#
# Data augmentation is a common technique used for training. It is
# base on the assumption that, for the same object, photos under different
# composition, lighting condition, or color should all yield the same prediction.
#
# Here are photos of the Golden Bridge, taken by many people,
# at different time from different angles.
# We can easily tell that they are photos of the same thing.
#
# |image-golden-bridge|
#
# We want to teach this invariance to our model, by playing "augmenting"
# input image. Our augmentation transforms the image with
# resizing, cropping, flipping and other techniques.
#
# With ``Gluon``, we can create our transform function as following:

transform_train = transforms.Compose([
    # Randomly crop an area, and then resize it to be 32x32
    transforms.RandomResizedCrop(32),
    # Randomly flip the image horizontally
    transforms.RandomFlipLeftRight(),
    # Randomly jitter the brightness, contrast and saturation of the image
    transforms.RandomColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    # Randomly adding noise to the image
    transforms.RandomLighting(0.1),
    # Transpose the image from height*width*num_channels to num_channels*height*width
    # and map values from [0, 255] to [0,1]
    transforms.ToTensor(),
    # Normalize the image with mean and standard deviation calculated across all images
    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
])

################################################################
# You may have noticed that most of the operations are randomized. This in effect
# increases the number of different images the model sees during training.
# The more data we have, the better our model generalizes over
# unseen images.
#
# On the other hand, when making prediction, we would like to remove all
# random operations in order to get a deterministic result. The transform
# function for prediction is:

transform_test = transforms.Compose([
    transforms.Resize(32),
    transforms.ToTensor(),
    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
])

################################################################
# Note that it is important to keep the normalization step, since the
# model only works well on inputs from the same distribution.
#
# With the transform functions, we can define data loaders for our
# training and validation datasets.

# Batch Size for Each GPU
per_device_batch_size = 128
# Number of data loader workers
num_workers = 8
# Calculate effective total batch size
batch_size = per_device_batch_size * num_gpus

# Set train=True for training data
# Set shuffle=True to shuffle the training data
train_data = gluon.data.DataLoader(
    gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
    batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

# Set train=False for validation data
val_data = gluon.data.DataLoader(
    gluon.data.vision.CIFAR10(train=False).transform_first(transform_test),
    batch_size=batch_size, shuffle=False, num_workers=num_workers)

################################################################
# Optimizer, Loss and Metric
# --------------------------
#
# Optimizer improves the model during training. Here we use the popular
# Nesterov accelerated gradient descent algorithm.

# Learning rate decay factor
lr_decay = 0.1
# Epochs where learning rate decays
lr_decay_epoch = [80, 160, np.inf]

# standard SGD gradient descent
optimizer = 'sgd'
# Set parameters
optimizer_params = {'learning_rate': 0.01, 'wd': 0.0005, 'momentum': 0.9}

# Define our trainer for net
trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

################################################################
# In the above code, ``lr_decay`` and ``lr_decay_epoch`` are not directly
# used in ``trainer``. One important idea in model training is to
# gradually decrease learning rate. This means the optimizer takes large
# steps at the beginning, but step size becomes smaller and smaller in time.
#
#
# In order to optimize our model, we need a loss function.
# In essence, loss functions compute the difference between predictions and the
# ground-truth as a measure of model performance.
# We can then take the gradients of the loss w.r.t. the weights.
# Gradients points the optimizer to the direction weights should move to
# improve model performance.
#
# For classification tasks, we usually use softmax cross entropy as the
# loss function.

loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

################################################################
# Metrics are similar to loss functions, but they are different in the
# following aspects:
#
# -  Metric is how we evaluate model performance. Each metric is related to a
#    specific task, but independent from the model training process.
# -  For classification, we usually only use one loss function to train
#    our model, but we can have several metrics for evaluating
#    performance.
# -  Loss function can be used as a metric, but sometimes its values are hard
#    to interpretate. For instance, the concept "accuracy" is
#    easier to understand than "softmax cross entropy"
#
# For simplicity, we use accuracy as the metric to monitor our training
# process. Besides, we record metric values, and will print them at the
# end of training.

train_metric = mx.metric.Accuracy()
train_history = TrainingHistory(['training-error', 'validation-error'])

################################################################
# Validation
# ----------
#
# Validation dataset provides us a way of monitoring the training process.
# We have labels for validation data, but they are held out during training.
# Instead, we use them to evaluate the models performance on unseen data
# and prevent overfitting.

def test(ctx, val_data):
    metric = mx.metric.Accuracy()
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        outputs = [net(X) for X in data]
        metric.update(label, outputs)
    return metric.get()

################################################################
# In order to evaluate performance, we need a metric. Then, we loop
# through the validation data and predict with our model.
# We'll run this function at the end of every epoch to show improvement.
# over the last epoch.
#
# Training
# --------
#
# After all the preparations, we can finally start training!
# Following is the script.
#
# .. note::
#   In order to finish the tutorial quickly, we only train for 3 epochs.
#   In your experiments, we recommend setting ``epochs=240``.

epochs = 240
lr_decay_count = 0

for epoch in range(epochs):
    tic = time.time()
    train_metric.reset()
    train_loss = 0

    # Learning rate decay
    if epoch == lr_decay_epoch[lr_decay_count]:
        trainer.set_learning_rate(trainer.learning_rate*lr_decay)
        lr_decay_count += 1

    # Loop through each batch of training data
    for i, batch in enumerate(train_data):
        # Extract data and label
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

        # AutoGrad
        with ag.record():
            output = [net(X) for X in data]
            loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

        # Backpropagation
        for l in loss:
            l.backward()

        # Optimize
        trainer.step(batch_size)

        # Update metrics
        train_loss += sum([l.sum().asscalar() for l in loss])
        train_metric.update(label, output)

    name, acc = train_metric.get()
    # Evaluate on Validation data
    name, val_acc = test(ctx, val_data)

    # Update history and print metrics
    train_history.update([1-acc, 1-val_acc])
    print('[Epoch %d] train=%f val=%f loss=%f time: %f' %
        (epoch, acc, val_acc, train_loss, time.time()-tic))




# We can plot the metric scores with:

###############################################################
# your code here to plot the training curve and test accuracy #
train_history.plot(save_path='out.png')




###############################################################
# your code here to save parameters and visualize the ﬁlters  #
net.save_parameters('net.params')
net[0].weight.data()



  from ._conv import register_converters as _register_converters


[Epoch 0] train=0.268510 val=0.404400 loss=97869.595718 time: 4.304236
[Epoch 1] train=0.334455 val=0.434100 loss=90065.010742 time: 4.189026
[Epoch 2] train=0.358974 val=0.454000 loss=87052.667435 time: 6.111366
[Epoch 3] train=0.376803 val=0.472000 loss=85202.494583 time: 4.672479
[Epoch 4] train=0.387640 val=0.484400 loss=83685.420227 time: 3.883602
[Epoch 5] train=0.400300 val=0.480200 loss=82241.524948 time: 3.971216
[Epoch 6] train=0.410837 val=0.463500 loss=80964.010788 time: 3.985952
[Epoch 7] train=0.411959 val=0.494300 loss=80467.413132 time: 4.252895
[Epoch 8] train=0.418369 val=0.505200 loss=79686.102448 time: 3.603512
[Epoch 9] train=0.425982 val=0.504100 loss=78805.956879 time: 4.268838
[Epoch 10] train=0.430489 val=0.517400 loss=78328.335373 time: 4.536991
[Epoch 11] train=0.434295 val=0.529700 loss=78021.712494 time: 4.626204
[Epoch 12] train=0.438061 val=0.527000 loss=77530.543457 time: 3.556561
[Epoch 13] train=0.441927 val=0.524800 loss=77034.296951 time: 3.884007
[E

[Epoch 114] train=0.569471 val=0.672800 loss=60654.689285 time: 5.283784
[Epoch 115] train=0.569050 val=0.671900 loss=60172.771500 time: 4.071172
[Epoch 116] train=0.572236 val=0.675000 loss=60200.461655 time: 4.919038
[Epoch 117] train=0.569932 val=0.669400 loss=60661.151894 time: 3.356211
[Epoch 118] train=0.566727 val=0.671100 loss=60657.912231 time: 4.245722
[Epoch 119] train=0.572336 val=0.669800 loss=60282.429520 time: 4.995581
[Epoch 120] train=0.572817 val=0.672100 loss=60345.879547 time: 5.373185
[Epoch 121] train=0.568329 val=0.671700 loss=60651.235840 time: 3.487617
[Epoch 122] train=0.568970 val=0.674600 loss=60565.491096 time: 4.477429
[Epoch 123] train=0.572736 val=0.674000 loss=60062.547882 time: 3.598536
[Epoch 124] train=0.573458 val=0.672100 loss=60442.444725 time: 3.635899
[Epoch 125] train=0.571374 val=0.671400 loss=60386.256989 time: 4.337191
[Epoch 126] train=0.570252 val=0.671200 loss=60441.113403 time: 3.434189
[Epoch 127] train=0.571875 val=0.670700 loss=60148.

[Epoch 227] train=0.587500 val=0.680400 loss=58192.125618 time: 4.762910
[Epoch 228] train=0.586218 val=0.685000 loss=58413.458969 time: 4.653869
[Epoch 229] train=0.583253 val=0.680700 loss=58738.419518 time: 4.462377
[Epoch 230] train=0.586078 val=0.683100 loss=58239.435425 time: 6.059239
[Epoch 231] train=0.586899 val=0.682000 loss=58270.564407 time: 5.252048
[Epoch 232] train=0.586799 val=0.686100 loss=58449.507309 time: 4.499361
[Epoch 233] train=0.584996 val=0.683300 loss=58373.706093 time: 4.445098
[Epoch 234] train=0.585056 val=0.682300 loss=58288.696999 time: 6.262074
[Epoch 235] train=0.583854 val=0.682400 loss=58596.321091 time: 6.307427
[Epoch 236] train=0.586619 val=0.683300 loss=58430.249565 time: 5.569807
[Epoch 237] train=0.581811 val=0.685500 loss=58535.783394 time: 4.576275
[Epoch 238] train=0.584415 val=0.683500 loss=58936.603111 time: 5.578128
[Epoch 239] train=0.584315 val=0.682900 loss=58455.247375 time: 4.602535


Parameter conv0_weight (shape=(6, 3, 5, 5), dtype=<class 'numpy.float32'>)

In [2]:
"""
Network Structure
-----------------

First, let's import the necessary libraries into python.

"""
from __future__ import division

import argparse, time, logging, random, math

import numpy as np
import mxnet as mx

from mxnet import gluon, nd, init
from mxnet import autograd as ag
from mxnet.gluon import nn
from mxnet.gluon.data.vision import transforms

from gluoncv.model_zoo import get_model
from gluoncv.utils import makedirs, TrainingHistory
import sys

sys.setdefaultencoding()

################################################################
#
# There are numerous structures for convolutional neural networks.
# Here we pick a simple yet well-performing structure, ``cifar_resnet20_v1``, for the
# tutorial.

# number of GPUs or CPU to use if you have
num_gpus = 1
ctx = [mx.gpu(i) for i in range(num_gpus)]

# ctx = mx.cpu(0)


############################################################
# your code here to define your net according to problem 2 #
net = nn.Sequential()
net.add(
    nn.Conv2D(6,kernel_size=5,strides=1,activation='relu'),
    nn.MaxPool2D(pool_size=2,strides=2), #pool_size???
    nn.Conv2D(16,kernel_size=5,strides=1,activation='relu'),
    nn.MaxPool2D(pool_size=2,strides=2), #pool_size???
    nn.Dense(128,activation='relu'), nn.Dropout(0.5),
    nn.Dense(84,activation='relu'), nn.Dropout(0.5),
    nn.Dense(10))



############################################################
# your code here to do initialization using existing API #
# net.initialize(init=init.MSRAPrelu(),ctx=ctx)
net.initialize(init=init.Normal(0.1),ctx=ctx)



################################################################
# Data Augmentation and Data Loader
# ---------------------------------
#
# Data augmentation is a common technique used for training. It is
# base on the assumption that, for the same object, photos under different
# composition, lighting condition, or color should all yield the same prediction.
#
# Here are photos of the Golden Bridge, taken by many people,
# at different time from different angles.
# We can easily tell that they are photos of the same thing.
#
# |image-golden-bridge|
#
# We want to teach this invariance to our model, by playing "augmenting"
# input image. Our augmentation transforms the image with
# resizing, cropping, flipping and other techniques.
#
# With ``Gluon``, we can create our transform function as following:

transform_train = transforms.Compose([
    # Randomly crop an area, and then resize it to be 32x32
    transforms.RandomResizedCrop(32),
    # Randomly flip the image horizontally
    transforms.RandomFlipLeftRight(),
    # Randomly jitter the brightness, contrast and saturation of the image
    transforms.RandomColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    # Randomly adding noise to the image
    transforms.RandomLighting(0.1),
    # Transpose the image from height*width*num_channels to num_channels*height*width
    # and map values from [0, 255] to [0,1]
    transforms.ToTensor(),
    # Normalize the image with mean and standard deviation calculated across all images
    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
])

################################################################
# You may have noticed that most of the operations are randomized. This in effect
# increases the number of different images the model sees during training.
# The more data we have, the better our model generalizes over
# unseen images.
#
# On the other hand, when making prediction, we would like to remove all
# random operations in order to get a deterministic result. The transform
# function for prediction is:

transform_test = transforms.Compose([
    transforms.Resize(32),
    transforms.ToTensor(),
    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
])

################################################################
# Note that it is important to keep the normalization step, since the
# model only works well on inputs from the same distribution.
#
# With the transform functions, we can define data loaders for our
# training and validation datasets.

# Batch Size for Each GPU
per_device_batch_size = 128
# Number of data loader workers
num_workers = 8
# Calculate effective total batch size
batch_size = per_device_batch_size * num_gpus

# Set train=True for training data
# Set shuffle=True to shuffle the training data
train_data = gluon.data.DataLoader(
    gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
    batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)

# Set train=False for validation data
val_data = gluon.data.DataLoader(
    gluon.data.vision.CIFAR10(train=False).transform_first(transform_test),
    batch_size=batch_size, shuffle=False, num_workers=num_workers)

################################################################
# Optimizer, Loss and Metric
# --------------------------
#
# Optimizer improves the model during training. Here we use the popular
# Nesterov accelerated gradient descent algorithm.

# Learning rate decay factor
lr_decay = 0.1
# Epochs where learning rate decays
lr_decay_epoch = [80, 160, np.inf]

# standard SGD gradient descent
optimizer = 'sgd'
# Set parameters
optimizer_params = {'learning_rate': 0.01, 'wd': 0.0005, 'momentum': 0.9}

# Define our trainer for net
trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)

################################################################
# In the above code, ``lr_decay`` and ``lr_decay_epoch`` are not directly
# used in ``trainer``. One important idea in model training is to
# gradually decrease learning rate. This means the optimizer takes large
# steps at the beginning, but step size becomes smaller and smaller in time.
#
#
# In order to optimize our model, we need a loss function.
# In essence, loss functions compute the difference between predictions and the
# ground-truth as a measure of model performance.
# We can then take the gradients of the loss w.r.t. the weights.
# Gradients points the optimizer to the direction weights should move to
# improve model performance.
#
# For classification tasks, we usually use softmax cross entropy as the
# loss function.

loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()

################################################################
# Metrics are similar to loss functions, but they are different in the
# following aspects:
#
# -  Metric is how we evaluate model performance. Each metric is related to a
#    specific task, but independent from the model training process.
# -  For classification, we usually only use one loss function to train
#    our model, but we can have several metrics for evaluating
#    performance.
# -  Loss function can be used as a metric, but sometimes its values are hard
#    to interpretate. For instance, the concept "accuracy" is
#    easier to understand than "softmax cross entropy"
#
# For simplicity, we use accuracy as the metric to monitor our training
# process. Besides, we record metric values, and will print them at the
# end of training.

train_metric = mx.metric.Accuracy()
train_history = TrainingHistory(['training-error', 'validation-error'])

################################################################
# Validation
# ----------
#
# Validation dataset provides us a way of monitoring the training process.
# We have labels for validation data, but they are held out during training.
# Instead, we use them to evaluate the models performance on unseen data
# and prevent overfitting.

def test(ctx, val_data):
    metric = mx.metric.Accuracy()
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        outputs = [net(X) for X in data]
        metric.update(label, outputs)
    return metric.get()

################################################################
# In order to evaluate performance, we need a metric. Then, we loop
# through the validation data and predict with our model.
# We'll run this function at the end of every epoch to show improvement.
# over the last epoch.
#
# Training
# --------
#
# After all the preparations, we can finally start training!
# Following is the script.
#
# .. note::
#   In order to finish the tutorial quickly, we only train for 3 epochs.
#   In your experiments, we recommend setting ``epochs=240``.

epochs = 240
lr_decay_count = 0

for epoch in range(epochs):
    tic = time.time()
    train_metric.reset()
    train_loss = 0

    # Learning rate decay
    if epoch == lr_decay_epoch[lr_decay_count]:
        trainer.set_learning_rate(trainer.learning_rate*lr_decay)
        lr_decay_count += 1

    # Loop through each batch of training data
    for i, batch in enumerate(train_data):
        # Extract data and label
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)

        # AutoGrad
        with ag.record():
            output = [net(X) for X in data]
            loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]

        # Backpropagation
        for l in loss:
            l.backward()

        # Optimize
        trainer.step(batch_size)

        # Update metrics
        train_loss += sum([l.sum().asscalar() for l in loss])
        train_metric.update(label, output)

    name, acc = train_metric.get()
    # Evaluate on Validation data
    name, val_acc = test(ctx, val_data)

    # Update history and print metrics
    train_history.update([1-acc, 1-val_acc])
    print('[Epoch %d] train=%f val=%f loss=%f time: %f' %
        (epoch, acc, val_acc, train_loss, time.time()-tic))




# We can plot the metric scores with:

###############################################################
# your code here to plot the training curve and test accuracy #
train_history.plot(save_path='out1.png')




###############################################################
# your code here to save parameters and visualize the ﬁlters  #
net.save_parameters('net1.params')
net[0].weight.data()


AttributeError: module 'sys' has no attribute 'setdefaultencoding'