#### Automatic Differentiation

`torch.Tensor` is the central class of the package. If you set its attribute `.requires_grad` to `True`, it starts to track all operations on it. When you finish your computation you can call `.backward()` and have all the gradients computed automatically. The gradient for this tensor will be accumulated into its `.grad` attribute.

To stop a tensor from tracking history, you can call `.detach()` to detach it from the computation history, and to prevent future computation from being tracked.

To prevent tracking history, you can also wrap the code block in `with torch.no_grad():`. This can be particularly helpful when evaluating a model, because the model may have trainable parameters with `requires_grad=True` for which we don’t need the gradients.

If you want to compute the derivatives, you can call `.backward()` on a Tensor. If the Tensor is a scalar (i.e. it holds a single element), you don’t need to specify any arguments to `backward()`; however, if it has more elements, you need to specify a `gradient` argument that is a tensor of matching shape.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Custom convolutional network for single-channel images.

    Two 3x3 convolutions, each followed by ReLU and 2x2 max pooling, feed
    three fully connected layers ending in 10 outputs.
    """

    def __init__(self):
        super().__init__()
        # 1 input image channel, 6 output channels, 3x3 square convolution kernel
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3)
        # Affine operations: y = Wx + b.
        # 16 channels * 6 * 6 spatial positions (e.g. what a 32x32 input
        # yields after the two conv/pool stages in forward()).
        self.fc1 = nn.Linear(in_features=16 * 6 * 6, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the 10 output scores for each sample in the batch ``x``."""
        # First stage: convolution, ReLU, max pooling over a (2, 2) window.
        activated = F.relu(self.conv1(x))
        x = F.max_pool2d(activated, (2, 2))
        # Second stage: a square window may be given as a single number.
        activated = F.relu(self.conv2(x))
        x = F.max_pool2d(activated, 2)
        # Flatten everything except the batch dimension.
        flat = x.view(-1, self.num_flat_features(x))
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first."""
        count = 1
        for extent in x.size()[1:]:  # skip the batch dimension
            count *= extent
        return count
# The learnable parameters of a model are returned by net.parameters()
import torch.optim as optim
# create your optimizer
# NOTE(review): `net`, `input`, `criterion` and `target` are not defined above
# this point in the visible source -- presumably they come from another cell;
# confirm before running this snippet on its own.
optimizer = optim.SGD(net.parameters(), lr=0.01)

# One optimization step: clear old gradients, run the forward pass, compute
# the loss, backpropagate, then let the optimizer update the parameters.
optimizer.zero_grad()
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()  # Does the update

##### Loading and normalizing CIFAR10

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
# Preprocessing applied to every image: convert it to a tensor, then
# normalize each of the three channels with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Training split, stored under ./data (download=True requests a download).
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
# Training loader: shuffled batches of 4 images, loaded with 2 workers.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                          shuffle=True, num_workers=2)

# Test split, using the same preprocessing as the training split.
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
# Test loader: same batch size, but not shuffled.
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
                                         shuffle=False, num_workers=2)

# Class names -- presumably indexed by the integer label; verify against
# the dataset's own class ordering.
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

##### Define Convolutional Neural Network

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Convolutional classifier mapping 3-channel images to 10 scores."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        # The same pooling module is reused after both convolutions.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Apply conv -> ReLU -> pool twice, flatten, then three linear layers."""
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        # Flatten to (batch, 16 * 5 * 5) to match fc1's input width.
        x = x.view(-1, 16 * 5 * 5)
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        return self.fc3(x)


# Module-level instance used by the following cells.
net = Net()

##### Define Loss Function and optimizer

In [None]:
import torch.optim as optim

# Cross-entropy loss between the network's output scores and the labels.
criterion = nn.CrossEntropyLoss()
# SGD with momentum over every parameter of `net`.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

##### Training the network

In [None]:
for epoch in range(2):  # loop over the dataset multiple times
    # Sum of the batch losses seen so far in this epoch (reset every epoch).
    # NOTE(review): accumulated below but never read in the visible code.
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # Each item yielded by the loader is unpacked as (inputs, labels).
        inputs, labels = data
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        # Forward pass, loss, backward pass, parameter update.
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

##### Test the network on the test data

In [None]:
# Look at the predictions for a single batch of test images.
dataiter = iter(testloader)
# Use the built-in next(): DataLoader iterators no longer provide a
# `.next()` method in recent PyTorch releases.
images, labels = next(dataiter)
outputs = net(images)
# torch.max over dim 1 returns (values, indices); the index of the largest
# score is the predicted class.
_, predicted = torch.max(outputs, 1)

# Count correct predictions over the whole test set.
correct = 0
total = 0
with torch.no_grad():  # inference only, so no autograd tracking is needed
    for data in testloader:
        images, labels = data
        outputs = net(images)
        # `.data` is unnecessary here because tracking is already disabled.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

In [None]:
# Per-class tallies: class_correct[k] counts correct predictions for class k,
# class_total[k] counts how many test samples carried label k.
class_correct = [0.0] * 10
class_total = [0.0] * 10
with torch.no_grad():  # inference only, so no autograd tracking is needed
    for data in testloader:
        images, labels = data
        outputs = net(images)
        _, predicted = torch.max(outputs, 1)
        # Elementwise hit/miss flags for this batch.
        c = predicted == labels
        # Walk the actual batch rather than assuming exactly 4 samples, so a
        # shorter final batch or a different batch_size works. Not squeezing
        # `c` also keeps a batch of one sample iterable.
        for label, hit in zip(labels.tolist(), c.tolist()):
            class_correct[label] += hit
            class_total[label] += 1

##### Training on GPU

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Move the network to the selected device.
net.to(device)
# Inputs and labels must be sent to the same device as the model.
# NOTE(review): `data` is not assigned in this cell -- presumably it is the
# last batch left over from an earlier loop; in a real training loop this
# line would sit inside the batch loop. Confirm.
inputs, labels = data[0].to(device), data[1].to(device)

#### Data Parallelism
It’s natural to execute your forward and backward propagations on multiple GPUs. However, PyTorch will only use one GPU by default. You can easily run your operations on multiple GPUs by making your model run in parallel using `DataParallel`.

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Sizes used below: feature width of each sample, model output width,
# DataLoader batch size, and number of samples in the random dataset.
input_size, output_size, batch_size, data_size = 5, 2, 30, 100
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
class RandomDataset(Dataset):
    """Dataset of ``length`` random vectors, each with ``size`` features."""

    def __init__(self, size, length):
        # One row per sample, drawn from a standard normal distribution.
        self.data = torch.randn(length, size)
        self.len = length

    def __getitem__(self, index):
        """Return the sample stored at ``index``."""
        sample = self.data[index]
        return sample

    def __len__(self):
        """Return the number of samples given at construction time."""
        return self.len

# Loader over `data_size` random samples of width `input_size`, shuffled and
# served in batches of `batch_size`.
rand_loader = DataLoader(dataset=RandomDataset(input_size, data_size),
                         batch_size=batch_size, shuffle=True)

class Model(nn.Module):
    """Single linear layer mapping ``input_size`` features to ``output_size``."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc = nn.Linear(in_features=input_size, out_features=output_size)

    def forward(self, input):
        """Return the linear layer's output for the batch ``input``."""
        return self.fc(input)

In [None]:
# First, we need to make a model instance and check if we have multiple GPUs.
# If we have multiple GPUs, we can wrap our model using nn.DataParallel.
# Then we can put our model on the selected device with model.to(device).
model = Model(input_size, output_size)
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
model.to(device)

# Run every batch from the loader through the model on that device.
for data in rand_loader:
    input = data.to(device)
    output = model(input)

#### Learning pytorch with examples
A PyTorch Tensor is conceptually identical to a numpy array: a Tensor is an n-dimensional array, and PyTorch provides many functions for operating on these Tensors. Behind the scenes, Tensors can keep track of a computational graph and gradients, but they’re also useful as a generic tool for scientific computing.

Unlike numpy, PyTorch Tensors can also utilize GPUs to accelerate their numeric computations. To run a PyTorch Tensor on GPU, you simply need to specify the correct device when creating it (the `device` argument in the code below).

In [None]:
# Two-layer network trained with hand-written forward and backward passes.
dtype = torch.float
device = torch.device("cpu")
# N: number of samples; D_in: input width; H: hidden width; D_out: output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)
# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)  # ReLU: negative entries are clamped to 0
    y_pred = h_relu.mm(w2)
    # Compute the loss (sum of squared errors); it is not printed here.
    # .item() extracts the Python number from a single-element Tensor.
    loss = (y_pred - y).pow(2).sum().item()
    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # No gradient flows through the clamp where the pre-activation was
    # negative, so zero those entries on a copy.
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)
    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

##### Autograd
When using autograd, the forward pass of your network will define a computational graph; nodes in the graph will be Tensors, and edges will be functions that produce output Tensors from input Tensors. Backpropagating through this graph then allows you to easily compute gradients.

This sounds complicated, but it’s pretty simple to use in practice. Each Tensor represents a node in a computational graph. If `x` is a Tensor that has `x.requires_grad=True` then `x.grad` is another Tensor holding the gradient of `x` with respect to some scalar value.

In [None]:
# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients
# with respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors. The
    # intermediate values need not be kept because the backward pass is not
    # written by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute the loss using operations on Tensors. loss is a 0-dim
    # (scalar) Tensor; loss.item() gets the Python number held in it.
    loss = (y_pred - y).pow(2).sum()

    # Use autograd to compute the backward pass. This call computes the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad are Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in
    # torch.no_grad() because the weights have requires_grad=True, but the
    # update itself must not be tracked by autograd.
    #
    # Summary of one step: the trainable parameters are created with
    # requires_grad=True, the forward pass produces the loss,
    # loss.backward() runs backpropagation and fills in each parameter's
    # gradient, and the parameters are then updated under torch.no_grad()
    # so that the update is not recorded. Finally the gradients are reset
    # to zero; otherwise they would accumulate across iterations and
    # corrupt later updates.
    with torch.no_grad():  # context manager that disables gradient tracking
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

##### Defining new autograd function
Under the hood, each primitive autograd operator is really two functions that operate on Tensors. The forward function computes output Tensors from input Tensors. The backward function receives the gradient of the output Tensors with respect to some scalar value, and computes the gradient of the input Tensors with respect to that same scalar value.

In PyTorch we can easily define our own autograd operator by defining a subclass of `torch.autograd.Function` and implementing the `forward` and `backward` functions as static methods. We can then use our new autograd operator by calling its `apply` method like a function, passing Tensors containing input data.

$\frac{\partial{f}}{\partial{x}} = \frac{\partial{f}}{\partial{y}} \times \frac{\partial{y}}{\partial{x}}$

In [None]:
class MyReLU(torch.autograd.Function):
    """ReLU written as a custom autograd Function; call it via MyReLU.apply."""

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` with negative entries clamped to zero.

        ``ctx`` is a context object used to stash information for the
        backward computation; the input is cached on it with
        ``ctx.save_for_backward``.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the input.

        ``grad_output`` is the gradient of the loss with respect to the
        output of ``forward``. It is passed through unchanged except where
        the saved input was negative, where the gradient is zero.
        """
        (saved_input,) = ctx.saved_tensors
        # Out-of-place fill: grad_output itself is left untouched.
        return grad_output.masked_fill(saved_input < 0, 0)

learning_rate = 1e-6
for t in range(500):
    # To apply our Function, we use Function.apply method.
    relu = MyReLU.apply
    # Forward pass: the hidden activation goes through the custom ReLU.
    # NOTE(review): the loop body shown ends after the forward pass -- no
    # loss, backward pass or weight update is visible here, and
    # learning_rate is unused; presumably the rest was trimmed. Confirm.
    y_pred = relu(x.mm(w1)).mm(w2)

##### Static graph
In TensorFlow, we define the computational graph once and then execute the same graph over and over again, possibly feeding different input data to the graph. In PyTorch, each forward pass defines a new computational graph.

##### pytorch.nn module
Computational graphs and autograd are a very powerful paradigm for defining complex operators and automatically taking derivatives; however for large neural networks raw autograd can be a bit too low-level.

When building neural networks we frequently think of arranging the computation into layers, some of which have learnable parameters which will be optimized during learning.

In PyTorch, the `nn` package provides these higher-level abstractions. The `nn` package defines a set of Modules, which are roughly equivalent to neural network layers. A Module receives input Tensors and computes output Tensors, but may also hold internal state such as Tensors containing learnable parameters. The `nn` package also defines a set of useful loss functions that are commonly used when training neural networks.

In [None]:
# N: number of samples; D_in: input width; H: hidden width; D_out: output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Linear -> ReLU -> Linear, applied in sequence.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions;
# in this case we will use Mean Squared Error as our loss function.
# reduction='sum' sums the squared errors instead of averaging them.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module
    # objects override the __call__ operator so you can call them like
    # functions. When doing so you pass a Tensor of input data to the Module
    # and it produces a Tensor of output data.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the
    # learnable parameters of the model. Internally, the parameters of each
    # Module are stored in Tensors with requires_grad=True, so this call
    # will compute gradients for all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a
    # Tensor, so we can access its gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

##### optim


In [None]:
learning_rate = 1e-4
# The optimizer is told which Tensors it should update: here, all of the
# model's parameters, using the Adam algorithm.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y and the loss.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update. This is because by
    # default, gradients are accumulated in buffers whenever .backward() is
    # called.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to the
    # model parameters.
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters.
    optimizer.step()

##### Custom nn module
Sometimes you will want to specify models that are more complex than a sequence of existing Modules; for these cases you can define your own Modules by subclassing `nn.Module` and defining a `forward` method which receives input Tensors and produces output Tensors using other modules or other autograd operations on Tensors.

In [None]:
class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a ReLU-style clamp between them."""

    def __init__(self, D_in, H, D_out):
        """Create the two nn.Linear modules and store them as attributes."""
        super().__init__()
        self.linear1 = torch.nn.Linear(in_features=D_in, out_features=H)
        self.linear2 = torch.nn.Linear(in_features=H, out_features=D_out)

    def forward(self, x):
        """Map a Tensor of input data to a Tensor of output data.

        Uses the Modules defined in the constructor together with an
        ordinary Tensor operation (clamping at zero) between them.
        """
        hidden = self.linear1(x)
        activated = torch.clamp(hidden, min=0)
        return self.linear2(activated)

model = TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to
# model.parameters() in the SGD constructor will contain the learnable
# parameters of the two nn.Linear modules which are members of the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: compute predicted y and the loss.
    y_pred = model(x)
    loss = criterion(y_pred, y)

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

##### Control Flow + Weight Sharing
As an example of dynamic graphs and weight sharing, we implement a very strange model: a fully-connected ReLU network that on each forward pass chooses a random number between 1 and 4 and uses that many hidden layers, reusing the same weights multiple times to compute the innermost hidden layers.

For this model we can use normal Python flow control to implement the loop, and we can implement weight __sharing among the innermost layers by simply reusing the same Module multiple times__ when defining the forward pass.

In [None]:
class DynamicNet(torch.nn.Module):
    """Fully connected ReLU network whose depth is picked at random per call.

    The middle layer is applied between 0 and 3 times on each forward pass,
    reusing the same weights every time.
    """

    def __init__(self, D_in, H, D_out):
        """Construct the three nn.Linear instances used in the forward pass.

        Args:
            D_in: width of the input features.
            H: width of the hidden representation.
            D_out: width of the output.
        """
        super(DynamicNet, self).__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Compute the output for ``x`` with a randomly chosen depth.

        Since each forward pass builds a dynamic computation graph, normal
        Python control-flow operators like loops or conditional statements
        can be used when defining the forward pass. It is also safe to
        reuse the same Module many times when defining a computational
        graph.
        """
        # Imported locally because this code never imports `random` at the
        # top level; without it the randint call below raises NameError.
        import random

        h_relu = self.input_linear(x).clamp(min=0)
        # randint is inclusive on both ends: 0, 1, 2 or 3 extra applications
        # of the shared middle layer.
        for _ in range(random.randint(0, 3)):
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)
        return y_pred