# Pytorch 

- Better compatibility with Python libraries such as Scikit-learn, numpy
- More Pythonian
- Used by Facebook, Tweeter and NVIDIA
- Computational graphs are at the core of DL packages and allow efficient parallel processing

Ref:http://adventuresinmachinelearning.com/pytorch-tutorial-deep-learning/

# Basics: Tensor, Variable, Autograd

In [81]:
import torch

In [48]:
# TENSORS: are basically the Arrays in numpy
x = torch.ones(2,3)
y = torch.ones(2,3) * 2
z = x+y
z

tensor([[ 3.,  3.,  3.],
        [ 3.,  3.,  3.]])

In [66]:
# AUTOGRAD --> refer to the gradient of the backpropagation
from torch.autograd import Variable

# Create a variable that has gradient (i.e. updated during backprop)
x = Variable(torch.ones(2, 2), requires_grad=True)
# Variable containing:
# 1  1
# 1  1
# [torch.FloatTensor of size 2x2]
print(x.grad)

None


In [67]:
# Suppose that y = f(x), say
y = 2*x*x + 3*x # dy/dx=4x+3
print(y)

tensor([[ 5.,  5.],
        [ 5.,  5.]])


In [68]:
# We can backprop with:
y.backward(torch.ones(2, 2))
print(x.grad)

tensor([[ 7.,  7.],
        [ 7.,  7.]])


In [69]:
# The backprop can not be repeated... because hte buffer is freed after calculation!
y.backward(torch.ones(2, 2))
print(x.grad)

RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

In [71]:
# To do backprop several time, use retain_graph
y = 2*x*x + 3*x # dy/dx=4x+3
print(y)
y.backward(torch.ones(2, 2), retain_graph=True)
# the retain_variables flag will prevent the internal buffers from being freed
print(x.grad) # 7+7

tensor([[ 5.,  5.],
        [ 5.,  5.]])
tensor([[ 14.,  14.],
        [ 14.,  14.]])


In [77]:
# Backprop done in a chain
x = Variable(torch.ones(2,2),requires_grad=True)
y = 2*x # 
print(y)
z = y * y * 3 # z=12x**2 --> dz/dx=24x 
#out = z.mean()
#print(z, out)

#out.backward()
z.backward(torch.ones(2,2))
print(x.grad)

tensor([[ 2.,  2.],
        [ 2.,  2.]])
tensor([[ 12.,  12.],
        [ 12.,  12.]])
tensor([[ 24.,  24.],
        [ 24.,  24.]])


In [79]:
# Backprop done in a chain. But here, the gradient is split through the graph!!
x = Variable(torch.ones(2,2),requires_grad=True)
y = 2*x # 
print(y)
z = y * y * 3 # z=12x**2 --> dz/dx=24x 

out = z.mean()
print(z, out)
print('Notice that out is the mean of z. When 24 is propagated back, it goes to 4 leaves')
print('x --> y --> z')
print('x --> y --> z |-->out')
print('x --> y --> z')
print('x --> y --> z')

out.backward()
print(x.grad)


tensor([[ 2.,  2.],
        [ 2.,  2.]])
tensor([[ 12.,  12.],
        [ 12.,  12.]]) tensor(12.)
Notice that out is the mean of z. When 24 is propagated back, it goes to 4 leaves
x --> y --> z
x --> y --> z |-->out
x --> y --> z
x --> y --> z
tensor([[ 6.,  6.],
        [ 6.,  6.]])


# DNN: fully connected neural net

Example with the MNIST dataset (28x28=724). Neural net with 4 layers as follows:

728 --> 200 --> 200 --> 10

In [93]:
import torch.nn as nn
import torch.nn.functional as F # activation functions
import torch.optim as optim

# 1) Define the Neural Net

class Net(nn.Module):  # Inherited from class nn.Module
    # Architecture
    def __init__(self):
        super(Net, self).__init__() # we need this to init the inherited class nn
        self.fc1 = nn.Linear(28*28, 200)
        self.fc2 = nn.Linear(200, 200)
        self.fc3 = nn.Linear(200, 10)
    
    # Define the activation functions and set the flow
    def forward(self,x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        out = F.log_softmax(self.fc3(x))
        return out
        

In [94]:
# 2) Create an instance
net = Net()
print(net)

Net(
  (fc1): Linear(in_features=784, out_features=200, bias=True)
  (fc2): Linear(in_features=200, out_features=200, bias=True)
  (fc3): Linear(in_features=200, out_features=10, bias=True)
)


In [97]:
# 3) Define Optimizer
learning_rate = 0.001
mom = 0.9 # weight given to previous weight (concept: inertia)
optimizer = optim.SGD(net.parameters(),lr=learning_rate, momentum=mom)
#optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)
#optimizer = optim.RMSprop(params=net.parameters(), lr=learning_rate)

In [108]:
# 4) Define Loss
criterion = nn.NLLLoss() # Cross-entropy = Negative LogLik + Log_softmax

In [111]:
# 5) Define the datasets
from torchvision import datasets, transforms

spath = 'D:\\Dropbox (LCN)\\jisoft_LARGE\\0_data_MNIST'

def get_train_loader(batch_size):
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(spath, train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, shuffle=True)
    return(train_loader)

test_loader = torch.utils.data.DataLoader(
    datasets.MNIST(spath, train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])),
    batch_size=4, shuffle=True)
    

In [116]:
# 5) Train
epochs = 20
batch_size = 32

train_loader = get_train_loader(batch_size)
log_interval=40

# run the main training loop
for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = Variable(data), Variable(target)
        # resize data from (batch_size, 1, 28, 28) to (batch_size, 28*28)
        data = data.view(-1, 28*28)
        optimizer.zero_grad()
        net_out = net(data)
        loss = criterion(net_out, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                           100. * batch_idx / len(train_loader), loss.data[0]))
            





In [117]:
# 6) Testing

# run a test loop
test_loss = 0
correct = 0
for data, target in test_loader:
    data, target = Variable(data, volatile=True), Variable(target)
    data = data.view(-1, 28 * 28)
    net_out = net(data)
    # sum up batch loss
    test_loss += criterion(net_out, target).data[0]
    pred = net_out.data.max(1)[1]  # get the index of the max log-probability
    correct += pred.eq(target.data).sum()

test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


  import sys
  # This is added back by InteractiveShellApp.init_path()



Test set: Average loss: 0.0178, Accuracy: 9805/10000 (98%)

