* https://jovian.ai/aakashns/02-linear-regression
* In each part, we can Ctrl+O (refresh all records), and restart.

## Part 1: Simple Linear Regression with pytorch

In [1]:
import numpy as np
import torch

In [2]:
# Feature matrix: 5 samples (rows) x 3 input variables (columns).
inputs = np.array(
    [
        [73, 67, 43],
        [91, 88, 64],
        [87, 134, 58],
        [102, 43, 37],
        [69, 96, 70],
    ],
    dtype=np.float32,
)

# Target matrix: 5 samples (rows) x 2 output variables (columns).
targets = np.array(
    [
        [56, 70],
        [81, 101],
        [119, 133],
        [22, 37],
        [103, 119],
    ],
    dtype=np.float32,
)

* @ : matrix multiplication in torch
* In pytorch, matrix multiplication requires both operands to have the same dtype ('float64' @ 'float32' raises an error).
* In numpy, different dtype can still be computed.

In [3]:
# numpy : float32 * float64 is compatible.
# NumPy accepts operands of different dtypes: float32 @ float64 works and the
# result is float64.
a = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
b = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float64)
a @ b.T

array([[14., 32.],
       [32., 77.]])

In [4]:
# float32 @ float32: the dtypes agree, so the product is computed.
print(torch.from_numpy(a) @ torch.from_numpy(a).t())

# float32 @ float64: PyTorch refuses to mix dtypes. The failure is the point of
# this cell, so it is caught and displayed instead of being left to abort a
# "Restart Kernel -> Run All".
try:
    print(torch.from_numpy(a) @ torch.from_numpy(b).t())
except RuntimeError as err:
    print(f"{type(err).__name__}: {err}")

tensor([[14., 32.],
        [32., 77.]])


RuntimeError: expected scalar type Float but found Double

### normal way of getting $\hat{\beta}$

In [None]:
# Ordinary least squares through the normal equations: (X'X) betahat = X'Y.
# np.asarray lets the cell run whether `inputs`/`targets` are still NumPy
# arrays or have already been converted to (CPU, non-grad) tensors.
features = np.asarray(inputs)
responses = np.asarray(targets)

n_samples = features.shape[0]  # taken from the data instead of hard-coding 5
const = np.ones(n_samples)     # intercept column (float64)
X = np.hstack((const.reshape(n_samples, 1), features))

XTX = X.T @ X
XTY = X.T @ responses
# Solve the linear system directly; explicitly inverting X'X is slower and
# numerically less accurate than np.linalg.solve.
betahat = np.linalg.solve(XTX, XTY)
X @ betahat  # fitted values

### what we do in pytorch

In [5]:
# torch.as_tensor wraps a NumPy array without copying (as torch.from_numpy
# does) but also accepts an existing tensor, so re-running this cell is
# harmless instead of raising a TypeError on the already-converted `inputs`.
inputs = torch.as_tensor(inputs)
targets = torch.as_tensor(targets)
print(inputs)
print(targets)

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.]])
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


* For torch random numbers, we must use torch.manual_seed().
* For numpy random numbers, we must use np.random.seed().

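* tensor.numel() : total number of elements in the tensor (e.g. 6 for a 2-by-3 tensor), not just the length of its first dimension.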

In [6]:
torch.manual_seed(0)  # fix torch's RNG so the initial parameters are reproducible

# Weights: one row per output column (2), one column per input variable (3).
w = torch.randn((2, 3), requires_grad=True)
# Biases: one per output column.
b = torch.randn((2,), requires_grad=True)

print(w, b, sep="\n")
b.dtype

tensor([[ 1.5410, -0.2934, -2.1788],
        [ 0.5684, -1.0845, -1.3986]], requires_grad=True)
tensor([0.4033, 0.8380], requires_grad=True)


torch.float32

* mse() should return torch object since we are going to do loss.backward() later.

In [31]:
def model(x):
    """Apply the linear map defined by the notebook-level `w` and `b` to `x`.

    Each row of `x` is multiplied by the transpose of `w`, then `b` is added.
    """
    weighted = x @ w.t()
    return weighted + b
def mse(t1,t2):
    """Return the mean squared difference between `t1` and `t2`.

    Only torch operations are used, so the result is a tensor that keeps its
    autograd history and can be passed to `.backward()`.
    """
    residual = t1 - t2
    squared = residual * residual
    return squared.sum() / residual.numel()

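* torch.no_grad(): operations inside this block are not recorded by autograd, so the in-place updates of w and b are not treated as part of the computation being differentiated.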
* At the very beginning, w.grad is None, basically meaning 0.
* w.grad.zero_(): PyTorch accumulates gradients by default. Each call to .backward() adds the new gradient to the one already stored, which is why we need to reset the gradients to zero after every update.

In [35]:
learning_rate = 1e-5

for step in range(100):  # Re-run this cell until the loss stops getting smaller.
    # The forward pass must come first: backward() needs the graph built here.
    preds = model(inputs)
    loss = mse(preds, targets)
    loss.backward()

    # The parameter update itself must not be tracked by autograd.
    with torch.no_grad():
        for param in (w, b):
            param -= param.grad * learning_rate
            # Gradients accumulate across backward() calls, so clear them.
            param.grad.zero_()

print(loss)

tensor(561.0864, grad_fn=<DivBackward0>)


## Part 2: Understanding "with" block
* with ~ : run the command only in the given context.

In [2]:
file = open("data/temp.txt", "w")
try:
    file.write("hello world")
finally:
    # Without a "with" block the file has to be closed by hand. Doing it in
    # "finally" releases the handle even if write() raises; otherwise the
    # operating system would keep treating the file as open.
    file.close()

* The file is open inside the "with" block and is closed automatically once we leave the block, so we no longer need to call file.close() ourselves.

In [None]:
# Leaving the block closes the file, so no explicit close() call is needed.
with open("data/temp.txt", mode="w") as file:
    file.write("hello world")

* one more trial for gradient

In [None]:
import torch
torch.manual_seed(0)  # make the two random draws below reproducible

# Two one-element tensors whose gradients autograd will track.
x = torch.rand(1, requires_grad=True)
y = torch.rand(1, requires_grad=True)

print(x)
print(y)

* torch.no_grad() : tells pytorch not to track gradients done inside the context.
* "with" gives us that context. (from where to where to apply torch.no_grad())
* So it means: Do not track gradient inside the "with" block.

In [None]:
# Products formed outside no_grad are recorded by autograd.
t = x * y
w = x * y

# The same product formed inside no_grad is not recorded.
with torch.no_grad():
    z = x * y

In [None]:
# t and w were built with tracking on; z was built under no_grad, so no
# gradient is tracked for it.
print(t, w, z, sep="\n")

* Of course every value here is None, because 1) we have not called backward() on anything yet, and 2) .grad is only attached to the explanatory variables (x and y), not to results computed from them such as t, w and z.

In [None]:
# .grad of the three products, one per line.
print(t.grad, w.grad, z.grad, sep="\n")

In [None]:
# Backpropagate from w, then show the gradients stored on x and y.
w.backward()
print(x.grad, y.grad, sep="\n")

* Before running this one, we should either redefine x and y or reset their gradients to zero (as done below).
* If we do neither, it still runs, but the new gradients are added on top of the x.grad and y.grad left over from before.

In [None]:
# Clear the gradients left by the previous backward() so they do not add up.
for leaf in (x, y):
    leaf.grad.zero_()

t.backward()
print(x.grad, y.grad, sep="\n")

## Part 3: TensorDataset

In [1]:
import torch
from torch.utils.data import TensorDataset
import numpy as np

In [2]:
# Inputs: 15 samples (rows) x 3 variables (columns).
inputs = np.array(
    [
        [73, 67, 43],
        [91, 88, 64],
        [87, 134, 58],
        [102, 43, 37],
        [69, 96, 70],
        [74, 66, 43],
        [91, 87, 65],
        [88, 134, 59],
        [101, 44, 37],
        [68, 96, 71],
        [73, 66, 44],
        [92, 87, 64],
        [87, 135, 57],
        [103, 43, 36],
        [68, 97, 70],
    ],
    dtype=np.float32,
)

# Targets (apples, oranges)
targets = np.array(
    [
        [56, 70],
        [81, 101],
        [119, 133],
        [22, 37],
        [103, 119],
        [57, 69],
        [80, 102],
        [118, 132],
        [21, 38],
        [104, 118],
        [57, 69],
        [82, 100],
        [118, 134],
        [20, 38],
        [102, 120],
    ],
    dtype=np.float32,
)

# Convert both arrays to torch tensors under the same names.
inputs = torch.from_numpy(inputs)
targets = torch.from_numpy(targets)

### TensorDataset
* With TensorDataset(tensor, tensor), we can bind two tensors with the same length.
* When we say length, we refer to the number of first-step elements, which is row vectors for a matrix.
* Just so we know, TensorDataset was not used in logistic regression for MNIST data. This acts differently from MNIST dataset. For instance, we can do train_ds[0:3] here, whereas we cannot with MNIST data.

In [3]:
# Bind inputs and targets row by row; slicing the dataset returns the matching
# slice of each tensor as a tuple.
train_ds = TensorDataset(inputs, targets)
train_ds[:3]

(tensor([[ 73.,  67.,  43.],
         [ 91.,  88.,  64.],
         [ 87., 134.,  58.]]),
 tensor([[ 56.,  70.],
         [ 81., 101.],
         [119., 133.]]))

In [4]:
# Iterating the dataset yields one (input row, target row) pair at a time;
# stop after the first pair.
for xb, yb in train_ds:
    print(xb, yb, sep="\n")
    break

tensor([73., 67., 43.])
tensor([56., 70.])


### DataLoader
* DataLoader(train_ds, batch_size, shuffle = True) : shuffle (without replacement) the first-step elements, which in our case are rows of the matrix.
* The resulting object should be viewed as a random-object generating function, instead of a fixed object.
* Just so we know, a dataloader object is just a function that generates random batches. The object itself does not have randomness. So, the seed that comes before dataloader does not matter at all, but only the seed that comes before actual implementation of dataloader (xb, yb in our case) removes the randomness.

In [5]:
from torch.utils.data import DataLoader 
batch_size = 5

# With shuffle=True the loader behaves like a generator of random batches rather
# than a fixed object. With shuffle=False (the default) every pass over the
# loader returns the batches in the same order.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
print(train_ds)

<torch.utils.data.dataset.TensorDataset object at 0x000001C30E649E50>


* For every tensor, it has the first-step element. For instance in numpy, mat[0] returns the 1st row vector and arr[0] returns the 1st matrix.

In [6]:
# Indexing with [0] returns the first element along the first axis:
# a row vector for a 2-D array ...
mat = np.array([[1, 2], [4, 5]])
print(mat[0])

# ... and a matrix for a 3-D array.
arr = np.array(
    [
        [[1, 2, 0], [3, 4, 0]],
        [[6, 7, 0], [8, 9, 0]],
    ]
)
print(arr[0])

[1 2]
[[1 2 0]
 [3 4 0]]


In [7]:
# Iterating over a 2-D tensor yields its rows one at a time.
print(inputs)
for xb in inputs:
    print(xb)
    break # Stop after the first row so that only one element is printed.

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.],
        [ 74.,  66.,  43.],
        [ 91.,  87.,  65.],
        [ 88., 134.,  59.],
        [101.,  44.,  37.],
        [ 68.,  96.,  71.],
        [ 73.,  66.,  44.],
        [ 92.,  87.,  64.],
        [ 87., 135.,  57.],
        [103.,  43.,  36.],
        [ 68.,  97.,  70.]])
tensor([73., 67., 43.])


In [8]:
print(train_ds)
# Each element of train_ds is a pair: one row of inputs and one row of targets.
for xb, yb in train_ds:
    print(xb, yb, sep="\n")
    break  # Stop after the first pair so that only one element is printed.

<torch.utils.data.dataset.TensorDataset object at 0x000001C30E649E50>
tensor([73., 67., 43.])
tensor([56., 70.])


* The reason why only three are displayed is as follows.
* There are 15 rows (both for inputs and targets) and the batch size is 5. Each row is used only once, without replication.
* If there were 16 rows (with the batch size being 5), then we would have 4 batches, with size = 5,5,5,1.

In [9]:
# The batch composition changes on every pass because shuffle=True.
# Add a `break` at the end of the loop body to print only the first batch.
for xb, yb in train_dl:
    print(xb, yb, sep="\n")

tensor([[ 91.,  88.,  64.],
        [ 69.,  96.,  70.],
        [ 74.,  66.,  43.],
        [ 91.,  87.,  65.],
        [101.,  44.,  37.]])
tensor([[ 81., 101.],
        [103., 119.],
        [ 57.,  69.],
        [ 80., 102.],
        [ 21.,  38.]])
tensor([[ 87., 134.,  58.],
        [ 68.,  97.,  70.],
        [103.,  43.,  36.],
        [ 73.,  67.,  43.],
        [ 88., 134.,  59.]])
tensor([[119., 133.],
        [102., 120.],
        [ 20.,  38.],
        [ 56.,  70.],
        [118., 132.]])
tensor([[ 73.,  66.,  44.],
        [102.,  43.,  37.],
        [ 87., 135.,  57.],
        [ 68.,  96.,  71.],
        [ 92.,  87.,  64.]])
tensor([[ 57.,  69.],
        [ 22.,  37.],
        [118., 134.],
        [104., 118.],
        [ 82., 100.]])


## Part 4: nn.Linear

* preparation: import packages and data 

In [10]:
import torch 
import torch.nn as nn
import numpy as np

# Inputs: 15 samples (rows) x 3 variables (columns).
inputs = np.array(
    [
        [73, 67, 43],
        [91, 88, 64],
        [87, 134, 58],
        [102, 43, 37],
        [69, 96, 70],
        [74, 66, 43],
        [91, 87, 65],
        [88, 134, 59],
        [101, 44, 37],
        [68, 96, 71],
        [73, 66, 44],
        [92, 87, 64],
        [87, 135, 57],
        [103, 43, 36],
        [68, 97, 70],
    ],
    dtype=np.float32,
)

# Targets (apples, oranges)
targets = np.array(
    [
        [56, 70],
        [81, 101],
        [119, 133],
        [22, 37],
        [103, 119],
        [57, 69],
        [80, 102],
        [118, 132],
        [21, 38],
        [104, 118],
        [57, 69],
        [82, 100],
        [118, 134],
        [20, 38],
        [102, 120],
    ],
    dtype=np.float32,
)

# Convert both arrays to torch tensors under the same names.
inputs = torch.from_numpy(inputs)
targets = torch.from_numpy(targets)
print(inputs.shape)

torch.Size([15, 3])


* Initial values are randomly generated.
* nn.Linear(3,2) : 3 variables in the input & 2 output variables. -> 2 by 3 weight & 2 bias terms.
* Since it is SGD, it only uses a mini-batch of designated size.

In [11]:
from torch.utils.data import DataLoader, TensorDataset

train_ds = TensorDataset(inputs, targets)

# The batch size is fixed here: every SGD step uses only this many samples.
batch_size = 5
# The loader acts like a generator of random batches rather than a fixed object.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [12]:
# The batch composition changes on every pass because shuffle=True.
# The loop prints every batch the loader yields; add a `break` at the end of
# the loop body to stop after the first one.
for xb, yb in train_dl:
    print(xb, yb, sep="\n")

tensor([[ 87., 135.,  57.],
        [103.,  43.,  36.],
        [102.,  43.,  37.],
        [ 92.,  87.,  64.],
        [ 74.,  66.,  43.]])
tensor([[118., 134.],
        [ 20.,  38.],
        [ 22.,  37.],
        [ 82., 100.],
        [ 57.,  69.]])
tensor([[ 73.,  67.,  43.],
        [ 91.,  87.,  65.],
        [ 68.,  97.,  70.],
        [ 87., 134.,  58.],
        [ 69.,  96.,  70.]])
tensor([[ 56.,  70.],
        [ 80., 102.],
        [102., 120.],
        [119., 133.],
        [103., 119.]])
tensor([[ 73.,  66.,  44.],
        [ 88., 134.,  59.],
        [ 91.,  88.,  64.],
        [ 68.,  96.,  71.],
        [101.,  44.,  37.]])
tensor([[ 57.,  69.],
        [118., 132.],
        [ 81., 101.],
        [104., 118.],
        [ 21.,  38.]])


* When model = nn.Linear() is first created, PyTorch randomly picks initial values for the parameters.
* As we proceed, we will update these parameters, and thereby update this "model" object.

In [13]:
torch.manual_seed(1) # Seed torch's RNG so the randomly drawn initial parameters are reproducible.
model = nn.Linear(3,2) # 3 input variables -> 2 outputs; the initial weight and bias values are sampled randomly.

In [14]:
# Initial weight matrix and bias vector, printed one after the other.
print(model.weight, model.bias, sep="\n")
# parameters() yields both of them (weight first, then bias).
list(model.parameters())

Parameter containing:
tensor([[ 0.2975, -0.2548, -0.1119],
        [ 0.2710, -0.5435,  0.3462]], requires_grad=True)
Parameter containing:
tensor([-0.1188,  0.2937], requires_grad=True)


[Parameter containing:
 tensor([[ 0.2975, -0.2548, -0.1119],
         [ 0.2710, -0.5435,  0.3462]], requires_grad=True),
 Parameter containing:
 tensor([-0.1188,  0.2937], requires_grad=True)]

In [15]:
# Predictions from the freshly initialised (untrained) model for every input row.
preds = model(inputs)
preds

tensor([[ -0.2886,  -1.4525],
        [ -2.6357,  -0.7178],
        [-14.8763, -28.8820],
        [ 15.1260,  17.3737],
        [-11.8906,  -8.9504],
        [  0.2637,  -0.6379],
        [ -2.4928,   0.1720],
        [-14.6907, -28.2648],
        [ 14.5737,  16.5592],
        [-12.3000,  -8.8751],
        [ -0.1457,  -0.5627],
        [ -2.0834,   0.0968],
        [-15.0192, -29.7718],
        [ 15.5354,  17.2984],
        [-12.4430,  -9.7649]], grad_fn=<AddmmBackward0>)

### Loss Function

In [None]:
import torch.nn.functional as F

In [None]:
# Use the built-in mean-squared-error loss and evaluate it for the current
# model on the full data set.
loss_fn = F.mse_loss
loss = loss_fn(input=model(inputs), target=targets)
print(loss)

### Optimizer

* torch.optim.SGD : stochastic gradient descent. We only care about descent, since we can convert ascent into descent with a negative sign.
* It is stochastic, because it only uses a handful of observations, not all observations, to proceed one step. In other words, there is randomness in selecting batches.
* However, if batch_size=N, there is no longer any randomness in which observations form the batch, but there is still randomness in the initial values of the weight/bias parameters.

In [None]:
opt = torch.optim.SGD(model.parameters(), lr=1e-5) 
# Optimizer built from the model's parameters and the chosen learning rate.

In [None]:
# Utility function to train the model
def fit(num_epochs, model, loss_fn, opt, train_dl, log_every=10):
    """Train `model` in place with mini-batch gradient descent.

    Args:
        num_epochs: number of full passes over `train_dl`.
        model: callable mapping an input batch to predictions; `opt` must have
            been built from its parameters.
        loss_fn: callable `(predictions, targets) -> scalar tensor`.
        opt: optimizer exposing `step()` and `zero_grad()`.
        train_dl: iterable of `(xb, yb)` batches; iterated once per epoch.
        log_every: print a progress line every this many epochs. Defaults to
            10, the interval that used to be hard-coded; 0 or None disables
            printing.

    Returns:
        None. Each progress line reports the loss of the last batch of that
        epoch.
    """
    # Gradients accumulate across backward() calls, so discard anything left
    # over from computations done before this function was called.
    opt.zero_grad()

    # Repeat for given number of epochs
    for epoch in range(num_epochs):
        loss = None  # stays None if train_dl yields no batches this epoch

        # Train with batches of data
        for xb, yb in train_dl:
            # 1. Generate predictions
            pred = model(xb)

            # 2. Calculate loss
            loss = loss_fn(pred, yb)

            # 3. Compute gradients
            loss.backward()

            # 4. Update parameters using gradients (one step of SGD)
            opt.step()

            # 5. Reset the gradients so the next batch starts from zero
            opt.zero_grad()

        # Print the progress (skipped when there was no batch to report on)
        if log_every and loss is not None and (epoch + 1) % log_every == 0:
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

In [None]:
fit(100, model, loss_fn, opt, train_dl) # Re-run this cell to keep training until the loss stops improving.

In [None]:
# Mean absolute residual of the trained model over all entries.
preds = model(inputs)
resid = preds - targets
(resid.abs() / resid.numel()).sum()

In [None]:
# Predicting the first row on its own, minus the first row of the batch
# predictions.
print(model(inputs[0]) - preds[0])

# We can pass any inputs for which we would like a prediction.
new_sample = torch.tensor([[75, 63, 44.]])
print(model(new_sample))