Today, we will be intoducing PyTorch, "an open source deep learning platform that provides a seamless path from research prototyping to production deployment".

This notebook is by no means comprehensive. If you have any questions the documentation and Google are your friends.

Goal takeaways:

Automatic differentiation is a powerful tool
PyTorch implements common functions used in deep learning
Data Processing with PyTorch DataSet
Mixed Presision Training in PyTorch

### Tensors
Tensors and relation to numpy
By this point, we have worked with numpy quite a bit. PyTorch's basic building block, the tensor is similar to numpy's ndarray

In [2]:
import torch
import numpy as np
# we create tensors in a similar way to numpy nd arrays
x_numpy = np.array([0.1, 0.2, 0.3])
x_torch = torch.tensor([0.1, 0.2, 0.3])
print('x_numpy, x_torch')
print(x_numpy, x_torch)
print()

# to and from numpy, pytorch
print('to and from numpy and pytorch')
print(torch.from_numpy(x_numpy), x_torch.numpy())
print()

# we can do basic operations like +-*/
y_numpy = np.array([3,4,5.])
y_torch = torch.tensor([3,4,5.])
print("x+y")
print(x_numpy + y_numpy, x_torch + y_torch)
print()

# many functions that are in numpy are also in pytorch
print("norm")
print(np.linalg.norm(x_numpy), torch.norm(x_torch))
print()

# to apply an operation along a dimension,
# we use the dim keyword argument instead of axis
print("mean along the 0th dimension")
x_numpy = np.array([[1,2],[3,4.]])
x_torch = torch.tensor([[1,2],[3,4.]])
print(np.mean(x_numpy, axis=0), torch.mean(x_torch, dim=0))

x_numpy, x_torch
[0.1 0.2 0.3] tensor([0.1000, 0.2000, 0.3000])

to and from numpy and pytorch
tensor([0.1000, 0.2000, 0.3000], dtype=torch.float64) [0.1 0.2 0.3]

x+y
[3.1 4.2 5.3] tensor([3.1000, 4.2000, 5.3000])

norm
0.37416573867739417 tensor(0.3742)

mean along the 0th dimension
[2. 3.] tensor([2., 3.])


In [3]:
N, C, W, H = 10000, 3, 28, 28
X = torch.randn((N, C, W, H))

print(X.shape)
print(X.view(N, C, 784).shape)
print(X.view(-1, C, 784).shape) # automatically choose the 0th dimension

torch.Size([10000, 3, 28, 28])
torch.Size([10000, 3, 784])
torch.Size([10000, 3, 784])


In [4]:
a = torch.tensor(2.0, requires_grad=True) # we set requires_grad=True to let PyTorch know to keep the graph
b = torch.tensor(1.0, requires_grad=True)
c = a + b
d = b + 1
e = c * d
print('c', c)
print('d', d)
print('e', e)

c tensor(3., grad_fn=<AddBackward0>)
d tensor(2., grad_fn=<AddBackward0>)
e tensor(6., grad_fn=<MulBackward0>)


In [17]:
def f(x):
    return (x-2)**2

def fp(x):
    return 2*(x-2)

x = torch.tensor([1.0], requires_grad=True)

y = f(x)
y.backward()

print('Analytical f\'(x):', fp(x))
print('PyTorch\'s f\'(x):', x.grad)

Analytical f'(x): tensor([-2.], grad_fn=<MulBackward0>)
PyTorch's f'(x): tensor([-2.])


In [None]:
It can also find gradients of functions.

Let w=[w1,w2]T
Consider g(w)=2w1w2+w2cos(w1)
Q: Compute ∇wg(w) and verify ∇wg([π,1])=[2,π−1]T

In [13]:

def f(w):
    return 2*w[0]*w[1]+w[1]*torch.cos(w[0])
def fp(w):
    return torch.tensor([2*w[1]-w[1]*torch.sin(w[0]),2*w[0]+torch.cos(w[0])])
x = torch.tensor([np.pi,1],requires_grad=True)
y = f(x)
y.backward()
print(fp(x))
print(x.grad)

tensor([2.0000, 5.2832])
tensor([2.0000, 5.2832])


In [25]:
def f(x):
    return (x-2)**2

def fp(x):
    return 2*(x-2)
x = torch.tensor([5.],requires_grad=True)
step_size = 0.25
for i in range(15):
    y = f(x)
    y.backward()
    print('{},\t{:.3f},\t{:.3f},\t{:.3f},\t{:.3f}'.format(i, x.item(), f(x).item(), fp(x).item(), x.grad.item()))
#     x.data -= step_size*x.data
    x.data = x.data - step_size * x.grad
    x.grad.detach_()
    x.grad.zero_()


0,	5.000,	9.000,	6.000,	6.000
1,	3.500,	2.250,	3.000,	3.000
2,	2.750,	0.562,	1.500,	1.500
3,	2.375,	0.141,	0.750,	0.750
4,	2.188,	0.035,	0.375,	0.375
5,	2.094,	0.009,	0.188,	0.188
6,	2.047,	0.002,	0.094,	0.094
7,	2.023,	0.001,	0.047,	0.047
8,	2.012,	0.000,	0.023,	0.023
9,	2.006,	0.000,	0.012,	0.012
10,	2.003,	0.000,	0.006,	0.006
11,	2.001,	0.000,	0.003,	0.003
12,	2.001,	0.000,	0.001,	0.001
13,	2.000,	0.000,	0.001,	0.001
14,	2.000,	0.000,	0.000,	0.000


In [28]:
d = 2
n = 50
X = torch.randn(n,d)
true_w = torch.tensor([[-1.0], [2.0]])
y = X @ true_w + torch.randn(n,1) * 0.1
print('X shape', X.shape)
print('y shape', y.shape)
print('w shape', true_w.shape)

X shape torch.Size([50, 2])
y shape torch.Size([50, 1])
w shape torch.Size([2, 1])


In [None]:
∇wLRSS(w;X)=∇w1n||y−Xw||22=−2nXT(y−Xw)

In [None]:
d = 2
n = 50
X = torch.randn(n,d)
true_w = torch.tensor([[-1.0], [2.0]])
y = X @ true_w + torch.randn(n,1) * 0.1
def model(w,x):
    return x @ w
def RSS(y, y_hat):
    return torch.norm(y-y_hat)**2/n
def grad_rss(X, y ,y_hat):
    return -2*x.t() @ (y-x @ W)/n
w = torch.tensor([[1.],[0]],requires_grad=True)
y_hat = model(w,X)
loss = RSS(y, y_hat)
loss.backward()

In [32]:
def model(X, w):
    return X @ w

# the residual sum of squares loss function
def rss(y, y_hat):
    return torch.norm(y - y_hat)**2 / n

# analytical expression for the gradient
def grad_rss(X, y, w):
    return -2*X.t() @ (y - X @ w) / n

w = torch.tensor([[1.], [0]], requires_grad=True)
y_hat = model(X, w)

loss = rss(y, y_hat)
loss.backward()

print('Analytical gradient', grad_rss(X, y, w).detach().view(2).numpy())
print('PyTorch\'s gradient', w.grad.view(2).numpy())

Analytical gradient [ 5.6441026 -4.8636155]
PyTorch's gradient [ 5.6441035 -4.8636155]


In [37]:
step_size = 0.1

print('iter,\tloss,\tw')
for i in range(20):
    y_hat = model(X, w)
    loss = rss(y, y_hat)
    
    loss.backward() # compute the gradient of the loss
    
    w.data = w.data - step_size * w.grad # do a gradient descent step
#     w.data -= step_size * w.grad
    print('{},\t{:.2f},\t{}'.format(i, loss.item(), w.view(2).detach().numpy()))
    
    # We need to zero the grad variable since the backward()
    # call accumulates the gradients in .grad instead of overwriting.
    # The detach_() is for efficiency. You do not need to worry too much about it.
    w.grad.detach()
    w.grad.zero_()

print('\ntrue w\t\t', true_w.view(2).numpy())
print('estimated w\t', w.view(2).detach().numpy())

iter,	loss,	w
0,	0.01,	[-1.0158551  1.9810736]
1,	0.01,	[-1.0158551  1.9810736]
2,	0.01,	[-1.0158551  1.9810736]
3,	0.01,	[-1.0158551  1.9810736]
4,	0.01,	[-1.0158551  1.9810736]
5,	0.01,	[-1.0158551  1.9810736]
6,	0.01,	[-1.0158551  1.9810736]
7,	0.01,	[-1.0158551  1.9810736]
8,	0.01,	[-1.0158551  1.9810736]
9,	0.01,	[-1.0158551  1.9810736]
10,	0.01,	[-1.0158551  1.9810736]
11,	0.01,	[-1.0158551  1.9810736]
12,	0.01,	[-1.0158551  1.9810736]
13,	0.01,	[-1.0158551  1.9810736]
14,	0.01,	[-1.0158551  1.9810736]
15,	0.01,	[-1.0158551  1.9810736]
16,	0.01,	[-1.0158551  1.9810736]
17,	0.01,	[-1.0158551  1.9810736]
18,	0.01,	[-1.0158551  1.9810736]
19,	0.01,	[-1.0158551  1.9810736]

true w		 [-1.  2.]
estimated w	 [-1.0158551  1.9810736]


In [48]:
import torch.nn as nn
d_in = 3
d_out = 4
linear_module = nn.Linear(d_in, d_out)

example_tensor = torch.tensor([[1.,2,3], [4,5,6]])
# applys a linear transformation to the data
transformed = linear_module(example_tensor)
print('example_tensor', example_tensor.shape)
print('transormed', transformed.shape)
print()
print('We can see that the weights exist in the background\n')
print('W:', linear_module.weight)
print('b:', linear_module.bias)

example_tensor torch.Size([2, 3])
transormed torch.Size([2, 4])

We can see that the weights exist in the background

W: Parameter containing:
tensor([[-0.1580, -0.0981, -0.5399],
        [ 0.4638,  0.0480,  0.3148],
        [-0.1467, -0.1837,  0.5089],
        [-0.1196,  0.5382,  0.0448]], requires_grad=True)
b: Parameter containing:
tensor([ 0.4099,  0.5509, -0.3729,  0.2608], requires_grad=True)


In [49]:
activation_fn = nn.ReLU() # we instantiate an instance of the ReLU module
example_tensor = torch.tensor([-1.0, 1.0, 0.0])
activated = activation_fn(example_tensor)
print('example_tensor', example_tensor)
print('activated', activated)

example_tensor tensor([-1.,  1.,  0.])
activated tensor([0., 1., 0.])


In [51]:
d_in = 3
d_hidden = 4
d_out = 1
model = torch.nn.Sequential(
                            nn.Linear(d_in, d_hidden),
                            nn.Tanh(),
                            nn.Linear(d_hidden, d_out),
                            nn.Sigmoid()
                           )

example_tensor = torch.tensor([[1.,2,3],[4,5,6]])
transformed = model(example_tensor)
print('transformed', transformed)

transformed tensor([[0.3223],
        [0.3359]], grad_fn=<SigmoidBackward>)


In [52]:
params = model.parameters()

for param in params:
    print(param)

Parameter containing:
tensor([[ 0.4880, -0.1715,  0.3732],
        [ 0.3882,  0.0937, -0.4099],
        [ 0.3652, -0.3831,  0.0227],
        [ 0.4116, -0.0352,  0.2695]], requires_grad=True)
Parameter containing:
tensor([ 0.1442, -0.2094, -0.5451,  0.0690], requires_grad=True)
Parameter containing:
tensor([[-0.1441,  0.4343,  0.0841,  0.1366]], requires_grad=True)
Parameter containing:
tensor([-0.3677], requires_grad=True)


In [53]:
mse_loss_fn = nn.MSELoss()

input = torch.tensor([[0., 0, 0]])
target = torch.tensor([[1., 0, -1]])

loss = mse_loss_fn(input, target)

print(loss)

tensor(0.6667)


In [56]:
# create a simple model
model = nn.Linear(1, 1)

# create a simple dataset
X_simple = torch.tensor([[1.]])
y_simple = torch.tensor([[2.]])

# create our optimizer
optim = torch.optim.SGD(model.parameters(), lr=1e-2)
mse_loss_fn = nn.MSELoss()

y_hat = model(X_simple)
print('model params before:', model.weight)
loss = mse_loss_fn(y_hat, y_simple)
optim.zero_grad()
loss.backward()
optim.step()
print('model params after:', model.weight)

model params before: Parameter containing:
tensor([[-0.9543]], requires_grad=True)
model params after: Parameter containing:
tensor([[-0.8754]], requires_grad=True)


In [68]:
d = 2
linear_module = nn.Linear(d, 1, bias=False)
optim = torch.optim.SGD(linear_module.parameters(),lr=step_size)
step_size = 0.1

mse_loss = nn.MSELoss()
for i in range(20):
    y_hat = model(X) 
    loss = mse_loss(y, y_hat)
    optim.zero_grad()
    loss.backward()
    optim.step()
#     print('{},\t{:.2f},\t{}'.format(i, loss.item(), linear_module.weight.view(2).detach().numpy()))
    print("{},\t{:.2f},\t{}".format(i, loss.item(), model.weight.view(2).detach().numpy()))

0,	9.39,	[ 0.12639225 -0.50910246]
1,	9.39,	[ 0.12639225 -0.50910246]
2,	9.39,	[ 0.12639225 -0.50910246]
3,	9.39,	[ 0.12639225 -0.50910246]
4,	9.39,	[ 0.12639225 -0.50910246]
5,	9.39,	[ 0.12639225 -0.50910246]
6,	9.39,	[ 0.12639225 -0.50910246]
7,	9.39,	[ 0.12639225 -0.50910246]
8,	9.39,	[ 0.12639225 -0.50910246]
9,	9.39,	[ 0.12639225 -0.50910246]
10,	9.39,	[ 0.12639225 -0.50910246]
11,	9.39,	[ 0.12639225 -0.50910246]
12,	9.39,	[ 0.12639225 -0.50910246]
13,	9.39,	[ 0.12639225 -0.50910246]
14,	9.39,	[ 0.12639225 -0.50910246]
15,	9.39,	[ 0.12639225 -0.50910246]
16,	9.39,	[ 0.12639225 -0.50910246]
17,	9.39,	[ 0.12639225 -0.50910246]
18,	9.39,	[ 0.12639225 -0.50910246]
19,	9.39,	[ 0.12639225 -0.50910246]


In [64]:
step_size = 0.01

linear_module = nn.Linear(d, 1)
loss_func = nn.MSELoss()
optim = torch.optim.SGD(linear_module.parameters(), lr=step_size)
print('iter,\tloss,\tw')
for i in range(200):
    rand_idx = np.random.choice(n) # take a random point from the dataset
    x = X[rand_idx] 
    y_hat = linear_module(x)
    loss = loss_func(y_hat, y[rand_idx]) # only compute the loss on the single point
    optim.zero_grad()
    loss.backward()
    optim.step()
    
    if i % 20 == 0:
        print('{},\t{:.2f},\t{}'.format(i, loss.item(), linear_module.weight.view(2).detach().numpy()))

print('\ntrue w\t\t', true_w.view(2).numpy())
print('estimated w\t', linear_module.weight.view(2).detach().numpy())

iter,	loss,	w
0,	8.97,	[-0.31551167  0.08615857]
1,	5.06,	[-0.5181806   0.54466945]
2,	2.86,	[-0.66269034  0.89205366]
3,	1.62,	[-0.76561904  1.1552937 ]
4,	0.92,	[-0.83884495  1.354807  ]
5,	0.53,	[-0.8908728  1.5060471]
6,	0.30,	[-0.9277874  1.6207129]
7,	0.18,	[-0.9539389  1.7076629]
8,	0.10,	[-0.97243416  1.7736064 ]
9,	0.06,	[-0.98549044  1.8236257 ]
10,	0.04,	[-0.9946883  1.8615714]
11,	0.03,	[-1.0011531  1.8903618]
12,	0.02,	[-1.0056854  1.9122086]
13,	0.01,	[-1.0088539  1.9287884]
14,	0.01,	[-1.0110618  1.9413726]
15,	0.01,	[-1.0125947  1.9509252]
16,	0.01,	[-1.0136546  1.9581772]
17,	0.01,	[-1.0143838  1.9636834]
18,	0.01,	[-1.0148827  1.9678643]
19,	0.01,	[-1.0152218  1.9710393]

true w		 [-1.  2.]
estimated w	 [-1.0152218  1.9710393]


In [69]:
loss = nn.CrossEntropyLoss()

input = torch.tensor([[-1., 1],[-1, 1],[1, -1]]) # raw scores correspond to the correct class
# input = torch.tensor([[-3., 3],[-3, 3],[3, -3]]) # raw scores correspond to the correct class with higher confidence
# input = torch.tensor([[1., -1],[1, -1],[-1, 1]]) # raw scores correspond to the incorrect class
# input = torch.tensor([[3., -3],[3, -3],[-3, 3]]) # raw scores correspond to the incorrect class with incorrectly placed confidence

target = torch.tensor([1, 1, 0])
output = loss(input, target)
print(output)

tensor(0.1269)


In [73]:
im_channels = 3 # if we are working with RGB images, there are 3 input channels, with black and white, 1
out_channels = 16 # this is a hyperparameter we can tune
kernel_size = 3 # this is another hyperparameter we can tune
batch_size = 4
image_width = 32
image_height = 32

im = torch.randn(batch_size, im_channels, image_width, image_height)

m = nn.Conv2d(im_channels, out_channels, kernel_size)
convolved = m(im) # it is a module so we can call it

print('im shape', im.shape)
print('convolved im shape', convolved.shape)

im shape torch.Size([4, 3, 32, 32])
convolved im shape torch.Size([4, 16, 30, 30])


In [None]:
class FaceLandmarksDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.landmarks_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.root_dir,
                                self.landmarks_frame.iloc[idx, 0])
        image = io.imread(img_name)
        landmarks = self.landmarks_frame.iloc[idx, 1:]
        landmarks = np.array([landmarks])
        landmarks = landmarks.astype('float').reshape(-1, 2)
        sample = {'image': image, 'landmarks': landmarks}

        if self.transform:
            sample = self.transform(sample)

        return sample