In [10]:
# !pip3 install -U torch torchvision
import torch
torch.manual_seed(7)
torch.__version__, torch.cuda.is_available()

('1.1.0', False)

`.xxx_()` means inplace function

https://www.udacity.com/course/deep-learning-pytorch--ud188  ,  https://github.com/udacity/deep-learning-v2-pytorch/tree/master/intro-to-pytorch

# Tensors

## Numbers

In [195]:
x = torch.tensor(5)
x, x.item()            # convert to python scalar

(tensor(5), 5)

In [207]:
x.type(torch.ByteTensor) # typecasting

tensor([1, 1, 0, 0, 0], dtype=torch.uint8)

## Linear Alegra

In [29]:
features = torch.randn((1,5)) #tensor of size (1,5)
weights = torch.randn_like(features)
bias = torch.randn((1,1))
print(features,weights,bias)

tensor([[0.1328, 0.1373, 0.2405, 1.3955, 1.3470]]) tensor([[2.4382, 0.2028, 2.4505, 2.0256, 1.7792]]) tensor([[-0.9179]])


In [23]:
# matrix multiplication must have (ncols from 1st = nrows from 2nd)
print(torch.mm(features,weights.transpose(0,1)))  #args are dimenions to swap
print(torch.mm(features,weights.reshape(5,1))) #args are new nrow,ncol
print(torch.mm(features,weights.resize_(5,1))) #args are new nrow,ncol; if different numel, remove or add (uninitialized)
print(torch.mm(features,weights.view(5,1))) #.view returns new tensor; can use -1

tensor([[-1.9796]])
tensor([[-1.9796]])
tensor([[-1.9796]])
tensor([[-1.9796]])


## Functions

In [24]:
1/(1 + torch.exp(torch.sum(features * weights) + bias))

tensor([[0.9552]])

## To/from Numpy

memory is shared

In [27]:
import numpy as np
a = np.random.rand(2,2)
b = torch.from_numpy(a)
print(b)
print(b.numpy())

tensor([[0.2744, 0.6924],
        [0.9054, 0.0106]], dtype=torch.float64)
[[0.2743871  0.69241128]
 [0.90539865 0.01060206]]


In [28]:
b.mul_(2) # inplace multiplicatino
a # shows that memory is shared

array([[0.54877421, 1.38482256],
       [1.8107973 , 0.02120412]])

## Autograd

In [108]:
x = torch.randn(2,2, requires_grad=True)
y = x*x
z = y.mean()
print(z)

tensor(1.3915, grad_fn=<MeanBackward0>)


In [109]:
z.backward()
print(x.grad)
print(x/2)

tensor([[ 0.9434, -0.2670],
        [ 0.6160, -0.2253]])
tensor([[ 0.9434, -0.2670],
        [ 0.6160, -0.2253]], grad_fn=<DivBackward0>)


proving $
\frac{\partial z}{\partial x} = \frac{\partial}{\partial x}\left[\frac{1}{n}\sum_i^n x_i^2\right] = \frac{x}{2}
$

# Datasets

## Images

http://pytorch.org/docs/master/torchvision/datasets.html#imagefolder

In [None]:
from torchvision import datasets, transforms
dataset = datasets.ImageFolder('path/to/datafolder', transform=transform)

labels are automatically set to subfolder names inside

image transformations: http://pytorch.org/docs/master/torchvision/transforms.html
* `transforms.Resize(255)`: 
* `transforms.CenterCrop(224)`: 
* `transforms.ToTensor()`: to PyTorch tensors
* `transforms.RandomRotation(30)`: add randomness to improve network's robustness (different for each iteration); for test data, don't add randomness
* `transforms.RandomResizedCrop(224)`
* `transforms.RandomHorizontalFlip()`
* `transforms.Normalize((0.5,), (0.5,))`: mean and std for each channel:  `(input[channel] - mean[channel]) / std[channel]`

create generator that go through batches:

`dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)`

## MNIST

* `datasets.MNIST`
* `datasets.FashionMNIST`

In [179]:
from torchvision import datasets, transforms

# image transform pipeline
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.5,), (0.5,)), 
                               ])
trainset = datasets.MNIST('~/.pytorch/MNIST_data/', download=True, train=True, transform=transform)   # control train/test on train kw
testset = datasets.MNIST('~/.pytorch/MNIST_data/',  download=True, train=False, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)  #generator
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=True)  #generator

images, labels = next(iter(trainloader))  #first batch of 64
images.shape

torch.Size([64, 1, 28, 28])

## Cats/Dogs

In [None]:
train_transforms = transforms.Compose([transforms.RandomRotation(30),
                                       transforms.RandomResizedCrop(224),
                                       transforms.RandomHorizontalFlip(),
                                       transforms.ToTensor(),
                                       transforms.Normalize([0.485, 0.456, 0.406],
                                                            [0.229, 0.224, 0.225])])

test_transforms = transforms.Compose([transforms.Resize(255),
                                      transforms.CenterCrop(224),
                                      transforms.ToTensor(),
                                      transforms.Normalize([0.485, 0.456, 0.406],
                                                           [0.229, 0.224, 0.225])])

train_data = datasets.ImageFolder('~/.pytorch/catdog', transform=train_transforms)
test_data = datasets.ImageFolder('~/.pytorch/catdog', transform=test_transforms)

trainloader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
testloader = torch.utils.data.DataLoader(test_data, batch_size=64)

# Constructing Network

In [157]:
from torch import nn
import torch.nn.functional as F

## Instantiate

Purely functional operations (without parameters/weights/bias) can just use torch.nn.functional in place of nn elements.

In [158]:
class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden  = nn.Linear(784, 256)  # can access weights on .hidden.weight, .hidden.bias
        self.output  = nn.Linear(256, 10)
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)    # dim=1 calculates softmax across the columns
    def forward(self, x):    # x is 2D: batchsize*inputdim
#         x = x.reshape(x.shape[0],784)  #make sure dimensions agree
        x = self.hidden(x)
        x = self.sigmoid(x)  # equiv. x = F.sigmoid(x); can also use like F.relu(x), etc...
        x = self.output(x)
        x = self.softmax(x)  # equiv. x = F.softmax(x, dim=1)
        return x

In [159]:
model = Network()
print(model)
print(model.hidden.weight)

Network(
  (hidden): Linear(in_features=784, out_features=256, bias=True)
  (output): Linear(in_features=256, out_features=10, bias=True)
  (sigmoid): Sigmoid()
  (softmax): Softmax()
)
Parameter containing:
tensor([[ 0.0156,  0.0062,  0.0037,  ..., -0.0175,  0.0002, -0.0028],
        [-0.0309, -0.0274, -0.0252,  ...,  0.0282, -0.0348, -0.0207],
        [-0.0301, -0.0090, -0.0113,  ..., -0.0034, -0.0336, -0.0045],
        ...,
        [ 0.0103,  0.0287,  0.0193,  ...,  0.0154,  0.0196, -0.0301],
        [ 0.0153, -0.0250, -0.0262,  ...,  0.0334,  0.0218,  0.0190],
        [ 0.0265,  0.0232, -0.0021,  ..., -0.0223, -0.0150,  0.0172]],
       requires_grad=True)


Examples of `forward` member functions:

In [None]:
def forward(self, x):
    x = self.dropout(F.relu(self.fc1(x)))    # self.dropout = nn.Dropout(p=0.2)
    x = self.dropout(F.relu(self.fc2(x)))
    x = F.log_softmax(self.fc2(x), dim=1)
    return x

## Initialization

Access member variable to fill them (`_` indicates in-place):

In [160]:
model.hidden.bias.data.fill_(0)
model.output.weight.data.normal_(std=0.01)

tensor([[-0.0154, -0.0056, -0.0049,  ..., -0.0126, -0.0023,  0.0071],
        [ 0.0025,  0.0191, -0.0067,  ...,  0.0007, -0.0099,  0.0121],
        [-0.0073, -0.0180, -0.0143,  ...,  0.0043,  0.0129,  0.0060],
        ...,
        [ 0.0062, -0.0167, -0.0162,  ...,  0.0028, -0.0019, -0.0234],
        [-0.0151, -0.0146, -0.0093,  ...,  0.0034, -0.0014,  0.0079],
        [ 0.0068, -0.0141, -0.0177,  ...,  0.0089,  0.0093, -0.0067]])

## Forward

In [None]:
images=images.view(64, -1)  #originally 64x1x28x28
model.forward(images)  #forward all images
model(images) #equivalent

## Using nn. without class

In [164]:
model = nn.Sequential(nn.Linear(784, 256),
                      nn.ReLU(),
                      nn.Linear(256, 10),
                      nn.Softmax(dim=1))
print(model[0])  #accessing element
# model.forward(images)     # equiv model(images)

Linear(in_features=784, out_features=256, bias=True)


In [None]:
from collections import OrderedDict

model = nn.Sequential(OrderedDict([
                      ('fc1', nn.Linear(784, 256)),
                      ('relu1', nn.ReLU()),
                      ('output', nn.Linear(256, 10)),
                      ('softmax', nn.Softmax(dim=1))]))
print(model[0].bias)    # accessing elements
print(model.fc1.bias)   # accessing elements
# model.forward(images)    # equiv model(images)

# Training Network

## Defining loss function

* Linear output with CrossEntropyLoss, or
* LogSoftmax with NLLLoss

In [166]:
model = nn.Sequential(nn.Linear(784, 256),  #could also use class or OrderedDict
                      nn.ReLU(),
                      nn.Linear(256, 10),
                      nn.LogSoftmax(dim=1))
criterion = nn.NLLLoss()

images = images.view(64, -1)              #originally 64x1x28x28
loss = criterion( model(images), labels)  #averaged over the batch
print(loss)

tensor(2.3173, grad_fn=<NllLossBackward>)


## backprop

In [167]:
print('first layer grad Before backward pass: \n', model[0].weight.grad)
loss.backward()
print('first layer grad After backward pass: \n', model[0].weight.grad)

first layer grad Before backward pass: 
 None
first layer grad After backward pass: 
 tensor([[-0.0003, -0.0003, -0.0003,  ..., -0.0003, -0.0003, -0.0003],
        [-0.0058, -0.0058, -0.0058,  ..., -0.0058, -0.0058, -0.0058],
        [-0.0059, -0.0059, -0.0059,  ..., -0.0059, -0.0059, -0.0059],
        ...,
        [-0.0012, -0.0012, -0.0012,  ..., -0.0012, -0.0012, -0.0012],
        [-0.0015, -0.0015, -0.0015,  ..., -0.0015, -0.0015, -0.0015],
        [ 0.0038,  0.0038,  0.0038,  ...,  0.0038,  0.0038,  0.0038]])


## Optimizer

http://pytorch.org/docs/master/optim.html
* `optim.SGD`
* `optim.Adam`

In [171]:
from torch import optim
optimizer = optim.SGD(model.parameters(), lr=0.005)

In [173]:
# One step
optimizer.zero_grad()                   # gradients are accumulated -- clear first
loss = criterion(model(images), labels) # forward a batch
loss.backward()
# print(model[0].weight.grad)           # get the grad of parameters
print('Initial weights - ', model[0].weight)
optimizer.step()                        # update params with accumulated grad
print('Gradient -', model[0].weight.grad)

Initial weights -  Parameter containing:
tensor([[ 0.0225,  0.0054, -0.0342,  ..., -0.0061,  0.0087,  0.0269],
        [-0.0331,  0.0253,  0.0078,  ...,  0.0262,  0.0045, -0.0281],
        [-0.0069,  0.0357, -0.0105,  ...,  0.0260,  0.0166,  0.0045],
        ...,
        [ 0.0043, -0.0352,  0.0160,  ...,  0.0255, -0.0023, -0.0298],
        [ 0.0159, -0.0356, -0.0056,  ...,  0.0277,  0.0312,  0.0060],
        [-0.0139,  0.0074,  0.0274,  ..., -0.0272, -0.0031, -0.0049]],
       requires_grad=True)
Gradient - tensor([[-7.0587e-05, -7.0587e-05, -7.0587e-05,  ..., -7.0587e-05,
         -7.0587e-05, -7.0587e-05],
        [-4.9133e-03, -4.9133e-03, -4.9133e-03,  ..., -4.9133e-03,
         -4.9133e-03, -4.9133e-03],
        [-6.5717e-03, -6.5717e-03, -6.5717e-03,  ..., -6.5717e-03,
         -6.5717e-03, -6.5717e-03],
        ...,
        [-5.8080e-04, -5.8080e-04, -5.8080e-04,  ..., -5.8080e-04,
         -5.8080e-04, -5.8080e-04],
        [-1.1184e-03, -1.1184e-03, -1.1184e-03,  ..., -1.1184e

In [174]:
# Full epochs
for epoch in range(5):
    running_loss = 0
    for images, labels in trainloader:        
        images = images.view(images.shape[0], -1)
        
        optimizer.zero_grad()                     # empty grad
        loss = criterion(model(images), labels)   # forward
        loss.backward()                           # backward        
        optimizer.step()                          # update params with accumulated grad
        
        running_loss += loss.item()               # loss value itself??
    else:
        print(f"Training loss: {running_loss/len(trainloader)}")

Training loss: 0.9859724194129139
Training loss: 0.44501842942827546
Training loss: 0.3749764274432461
Training loss: 0.34324646042163437
Training loss: 0.3232783184154456


## On GPU

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# model.cuda() #move to cuda
# model.cpu()  #move back to cpu
model.to(device)
images=images.to(device)  #all data

# Inference

* `with torch.no_grad()`:  no grad tracking
* `model.eval()`        :  zero dropout probability
* `model.train()`       :  back to nonzero original dropout

In [188]:
# inference
with torch.no_grad():             # tells torch does not need to keep track of grad
    model.eval()                  # .eval() mode sets dropout probability to zero
    prob=torch.exp(model(images)) # The last layer is LogSoftmax, not Softmax
model.train()                     # opposite of .eval()
    
print(prob[0])
top_p, top_class = prob[:3].topk(2,dim=1)     # top-k most likely probability and the predicted classes


tensor([1.5433e-01, 1.0443e-03, 3.7916e-01, 5.2210e-02, 3.2306e-04, 1.2050e-02,
        3.9326e-01, 2.8806e-04, 7.2790e-03, 5.5042e-05])
torch.return_types.topk(
values=tensor([[0.3933, 0.3792],
        [0.9888, 0.0055],
        [0.9653, 0.0344]]),
indices=tensor([[6, 2],
        [6, 2],
        [7, 9]]))


In [189]:
# Evaluate on test
running_loss = 0
running_acc = 0
for images, labels in testloader:
    images = images.view(images.shape[0], -1)
    with torch.no_grad():             # tells torch does not need to keep track of grad
        out = model(images)           # forward
        
    loss = criterion(out, labels)     
    prob = torch.exp(out)
    top_p, top_class = prob.topk(1, dim=1)
    equals = (top_class == labels.view(*top_class.shape))
    
    running_loss += loss.item()
    running_acc += torch.mean(equals.type(torch.FloatTensor))   #cast torch.ByteTensor to torch.FloatTensor for taking .mean()

print(f"Test LogLoss:  {running_loss/len(trainloader)}")
print(f"Test Accuracy: {running_acc/len(trainloader)}")

Test LogLoss:  0.31288541873285514
Test Accuracy: 0.9099313616752625


# Save / Load

In [None]:
model.state_dict() # to ordered dict
torch.save(model.state_dict(), 'filename')      # save
model.load_state_dict( torch.load('filename') ) # load; "model" variable must have the same architecture

Need to manually build a dictionary with all the information you need to compeletely rebuild the model.

## Pretrained Models

http://pytorch.org/docs/0.3.0/torchvision/models.html

In [None]:
from torchvision import models
model = models.densenet121(pretrained=True)
for param in model.parameters():      # Freeze parameters of the feature detector
    param.requires_grad = False
model.classifier = nn.Sequential(OrderedDict([     # custom classifier
                          ('fc1', nn.Linear(1024, 256)),
                          ('relu', nn.ReLU()),
                          ('fc2', nn.Linear(256, 2)),
                          ('output', nn.LogSoftmax(dim=1))
                          ]))  