In [7]:
import torch
import torch.nn as nn
import numpy as np

In [11]:
# Synthetic regression data: y = 1 + 2x + Gaussian noise (std 0.1)
# https://towardsdatascience.com/understanding-pytorch-with-an-example-a-step-by-step-tutorial-81fc5f8c4e8e
np.random.seed(42)

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

x = np.random.rand(100, 1)
y = 1 + 2 * x + 0.1 * np.random.randn(100, 1)

# Random permutation of the 100 row indices
idx = np.arange(100)
np.random.shuffle(idx)

# 80/20 split: the first 80 shuffled indices train, the remaining 20 validate
train_idx, val_idx = idx[:80], idx[80:]

# Materialise the train and validation subsets
x_train, x_val = x[train_idx], x[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

In [24]:
# The NumPy arrays are float64; .float() casts them to float32 (the dtype
# nn.Module parameters use by default) before they are moved to the device.
x_train_tensor, y_train_tensor = (
    torch.from_numpy(split).float().to(device) for split in (x_train, y_train)
)

In [30]:
# Confirm that the .float()'ed tensor has the same dtype as torch.float,
# i.e. float32. Why the cast is needed is explained in the next cells.
x_train_tensor.dtype, torch.float

(torch.float32, torch.float32)

What happens if we do not call float()?  
The tensor stays float64, which PyTorch calls 'Double'.  

PyTorch models that inherit from nn.Module 
create their parameters as float32 by default.  
Therefore, if we feed float64 data into such a model, it raises an error:  
**"RuntimeError: expected scalar type Float but found Double"**

In [31]:
# Same conversion as before but without .float(): the tensor keeps NumPy's
# float64 dtype, as the displayed dtype shows.
x_train_no_float_call = torch.from_numpy(x_train).to(device)
x_train_no_float_call.dtype

torch.float64

## Manual implementation of Linear model

In [9]:
class MyLinearRegression(nn.Module):
    """Hand-written linear model ``y = a + b * x`` with two scalar parameters.

    ``a`` (intercept) and ``b`` (slope) are wrapped in ``nn.Parameter`` so the
    module registers them and exposes them through ``parameters()`` and
    ``state_dict()``.
    """

    def __init__(self):
        super(MyLinearRegression, self).__init__()
        # Draw the initial values in a fixed order (a first, then b) so the
        # sequence of random numbers consumed stays the same.
        for name in ("a", "b"):
            initial = torch.randn(1, requires_grad=True, dtype=torch.float)
            setattr(self, name, nn.Parameter(initial))

    def forward(self, x):
        """Return ``a + b * x``, broadcasting the parameters over ``x``."""
        intercept, slope = self.a, self.b
        return intercept + slope * x

In [15]:
model = MyLinearRegression().to(device)
# Parameters before training (random initial values).
print(model.state_dict())

lr = 1e-1
epochs = 1000

mse_loss = nn.MSELoss(reduction='mean')
# Because MyLinearRegression wraps `a` and `b` in nn.Parameter(),
# model.parameters() yields them and can be handed straight to the optimizer.
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# Training mode only has to be set once: nothing in the loop switches it off.
model.train()
for epoch in range(epochs):
    y_hat = model(x_train_tensor)
    # nn.MSELoss is called as loss(input, target): prediction first,
    # ground truth second.
    loss = mse_loss(y_hat, y_train_tensor)
    loss.backward()
    optimizer.step()
    # Gradients accumulate across backward() calls, so clear them for the
    # next iteration.
    optimizer.zero_grad()

# Parameters after training; they should approach a=1 and b=2, the values
# used to generate the data.
print(model.state_dict())

OrderedDict([('a', tensor([0.2576])), ('b', tensor([1.0971]))])
OrderedDict([('a', tensor([1.0235])), ('b', tensor([1.9690]))])


## torch.nn.Linear()  --> Nested model

In [16]:
class MyLinearModel(nn.Module):
    """Linear regression model that wraps a single ``nn.Linear`` layer.

    Args:
        in_features: size of each input sample (default 1).
        out_features: size of each output sample (default 1).

    With the defaults this is the original one-input, one-output model.
    """

    def __init__(self, in_features=1, out_features=1):
        super().__init__()
        # Registered as a sub-module, so its parameters show up in
        # state_dict() as 'linear.weight' and 'linear.bias'.
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply the linear layer; the last dim of ``x`` must be ``in_features``."""
        return self.linear(x)

### Automating training steps

In [26]:
def return_train_step_function(model, loss_function, optimizer):
    """Build a closure that performs one optimisation step.

    Args:
        model: module to train; called as ``model(x)``.
        loss_function: callable invoked as ``loss_function(prediction, target)``.
        optimizer: optimizer that updates ``model``'s parameters.

    Returns:
        A function ``train_step(x, y)`` that runs the forward pass, the
        backward pass and one optimizer step, and returns the loss as a
        Python float.
    """
    def train_step(x, y):
        model.train()
        y_hat = model(x)
        # Prediction first, target second: the order matters for asymmetric
        # losses even though MSE happens to be symmetric.
        loss = loss_function(y_hat, y)
        loss.backward()
        optimizer.step()
        # Clear the gradients so the next call starts from a clean slate.
        optimizer.zero_grad()
        return loss.item()

    # Return the closure itself; the caller invokes it once per step.
    return train_step

In [27]:
# Fresh nn.Linear-based model, moved to the selected device.
model = MyLinearModel().to(device)

# One-step training closure: mean-reduced MSE loss optimised with SGD (lr = 0.1).
train_step_func = return_train_step_function(
    model,
    nn.MSELoss(reduction='mean'),
    torch.optim.SGD(model.parameters(), lr=1e-1),
)

In [28]:
# Run 300 training steps; each call returns that step's loss as a float.
losses = [train_step_func(x_train_tensor, y_train_tensor) for _ in range(300)]

print(model.state_dict())

OrderedDict([('linear.weight', tensor([[1.9518]])), ('linear.bias', tensor([1.0323]))])


## Neural Network
e.g. CNN

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [72]:
class NeuralNet(nn.Module):
    """Small LeNet-style CNN: two conv + max-pool stages, then three FC layers.

    Input is expected as (N, 1, 32, 32). The hard-coded ``16*6*6`` input size
    of ``fc1`` only matches that spatial size (32 -> 30 -> 15 -> 13 -> 6).
    Output has shape (N, 10).
    """

    def __init__(self):
        # Watch out what goes into the super method!
        super(NeuralNet, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 square convolution
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # Number of channels: 1 -> 6 -> 16

        # FC layers
        # After the conv/pool stages there are 16 channels of 6x6 features;
        # forward() flattens them and connects them to 120 nodes.
        self.fc1 = nn.Linear(16*6*6, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the network on a batch of single-channel 32x32 images."""
        # Max pooling over a (2,2) window on ReLU'ed conv1
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the window is square, a single number is enough
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten everything except the batch dimension. Unlike
        # view(-1, n), flatten also copes with an empty batch and with
        # non-contiguous activations.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the number of features per sample.

        This is the product of all dimensions of ``x`` except the first
        (batch) one; it is 1 when ``x`` has no further dimensions.
        """
        return x.size()[1:].numel()

In [73]:
# Instantiate the network; evaluating `net` as the last expression displays
# its layer summary.
net = NeuralNet()
net

NeuralNet(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

In [None]:
# Collect every learnable parameter tensor of the network into a list.
# NOTE: displaying `params` prints all of those tensors in full.
params = list(net.parameters())
params

In [75]:
# 5 layers (conv1, conv2, fc1, fc2, fc3), each with a weight and a bias tensor
len(params)

10

In [76]:
# Show the shape of every parameter tensor next to its position in the list.
for position, tensor in enumerate(params):
    print(position, tensor.size())

0 torch.Size([6, 1, 3, 3])
1 torch.Size([6])
2 torch.Size([16, 6, 3, 3])
3 torch.Size([16])
4 torch.Size([120, 576])
5 torch.Size([120])
6 torch.Size([84, 120])
7 torch.Size([84])
8 torch.Size([10, 84])
9 torch.Size([10])


In [77]:
params[0]  # conv1's weight: 6 filters, each of shape (1, 3, 3)

Parameter containing:
tensor([[[[ 0.1101, -0.0640,  0.0269],
          [-0.1618, -0.1212, -0.2035],
          [-0.1999,  0.1351, -0.1900]]],


        [[[ 0.0920,  0.0144, -0.0516],
          [-0.1780, -0.2131, -0.0819],
          [-0.2825, -0.1357, -0.2350]]],


        [[[ 0.1570, -0.1577, -0.2885],
          [ 0.3063,  0.2045,  0.1341],
          [ 0.0466, -0.2474,  0.2811]]],


        [[[ 0.3035, -0.0514,  0.0914],
          [-0.3250,  0.0667,  0.1270],
          [-0.2774, -0.1120,  0.1409]]],


        [[[-0.1817, -0.1134, -0.2313],
          [ 0.2806,  0.2203,  0.1618],
          [ 0.2273,  0.2513,  0.1589]]],


        [[[-0.2056,  0.3156,  0.0334],
          [ 0.2586,  0.3097, -0.2287],
          [ 0.2469, -0.2683, -0.3172]]]], requires_grad=True)

In [78]:
params[1]  # conv1's bias: one value per output channel (6 in total)

Parameter containing:
tensor([ 0.1241,  0.1750, -0.3206,  0.1212, -0.0399, -0.3041],
       requires_grad=True)

In [79]:
# Let's try random 32x32 input
input = torch.randn(1, 1, 32, 32)
out = net(input)
out

tensor([[ 0.0259,  0.0583, -0.1105,  0.0673,  0.0034,  0.0630, -0.1216,  0.0342,
         -0.0689, -0.1136]], grad_fn=<AddmmBackward>)

In [80]:
# Clear any existing gradients, then backpropagate from the non-scalar output
# by supplying a random upstream gradient with the output's shape (1, 10).
net.zero_grad()
out.backward(torch.randn(1, 10))

In [82]:
# Fresh forward pass so the loss is built on its own autograd graph.
output = net(input)
# Dummy target: 10 random values reshaped to (1, 10) to line up with `output`.
target = torch.randn(10).view(1, -1)
criterion = nn.MSELoss()
loss = criterion(output, target)
loss

tensor(1.0335, grad_fn=<MseLossBackward>)

In [83]:
loss.grad_fn  # backward node of MSELoss, the root of the autograd graph

<MseLossBackward at 0x7ff2fc2303a0>

In [85]:
loss.grad_fn.next_functions[0][0]  # AddmmBackward: the last Linear layer (fc3)

<AddmmBackward at 0x7ff2fc230d00>

In [86]:
# NOTE: as the displayed result shows, this is an AccumulateGrad node (a leaf
# parameter feeding the Linear layer), not the ReLU node.
# The ReLU is presumably reachable via ...next_functions[1][0] — TODO confirm.
loss.grad_fn.next_functions[0][0].next_functions[0][0]  # AccumulateGrad (not ReLU)

<AccumulateGrad at 0x7ff2fd365d30>

### Backprop

In [89]:
net.zero_grad()  # zeros the gradient buffers of all params
# Inspect conv1 and its bias before backprop: the gradient printed is all zeros.
print(net.conv1)
print(net.conv1.bias)
print(net.conv1.bias.grad)

Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
Parameter containing:
tensor([ 0.1241,  0.1750, -0.3206,  0.1212, -0.0399, -0.3041],
       requires_grad=True)
tensor([0., 0., 0., 0., 0., 0.])


In [90]:
# Backpropagate the MSE loss; this fills in .grad on the network's parameters.
loss.backward()

In [91]:
# Same inspection after loss.backward(): the bias values are unchanged,
# but conv1.bias.grad now holds non-zero gradients.
print(net.conv1)
print(net.conv1.bias)
print(net.conv1.bias.grad)

Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
Parameter containing:
tensor([ 0.1241,  0.1750, -0.3206,  0.1212, -0.0399, -0.3041],
       requires_grad=True)
tensor([-0.0077,  0.0041, -0.0051,  0.0052,  0.0027,  0.0074])


### Weight Update

In [92]:
# SGD Manual implementation
# weight = weight - lr * gradient
lr = 0.01
# Update in place under no_grad() so autograd does not record the change;
# this replaces direct manipulation of `.data`.
with torch.no_grad():
    for f in net.parameters():
        # A parameter that took no part in the backward pass has no gradient.
        if f.grad is not None:
            f.sub_(f.grad * lr)

In [93]:
import torch.optim as optim

In [97]:
# Let torch.optim's SGD perform the weight update instead of the manual rule.
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in training loop
for epoch in range(10):
    # Reset the gradients left over from the previous iteration
    optimizer.zero_grad()
    # Forward pass on the same random input
    output = net(input)
    # MSE between the prediction and the dummy target
    loss = criterion(output, target)
    # Compute gradients of the loss w.r.t. the parameters
    loss.backward()
    # Apply the SGD update
    optimizer.step()