# 1. Linear Regression Implementation from Scratch

In [6]:
import torch
import random

## 1.1 Generating the dataset

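The true parameters generating our dataset will be $w=[2,-3.4]^{T}$ and $b=4.2$, and our synthetic labels will be assigned according to the following linear model, where the noise term $\epsilon$ is drawn from a normal distribution with mean 0 and standard deviation 0.01:

$$y = Xw + b + \epsilon$$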

In [9]:
def synthetic_data(w, b, num_examples):
    """Generate a synthetic dataset following y = Xw + b + noise.

    Args:
        w: 1-D weight tensor; its length sets the number of feature columns.
        b: Bias added to every label.
        num_examples: Number of rows to generate.

    Returns:
        A tuple ``(X, y)`` where ``X`` has shape ``(num_examples, len(w))``
        with entries drawn from a standard normal distribution, and ``y`` is
        the noisy label column reshaped to ``(num_examples, 1)``.
    """
    num_features = len(w)
    # Features are sampled first, then the noise, both from torch's global RNG.
    X = torch.normal(0, 1, (num_examples, num_features))
    noiseless = torch.matmul(X, w) + b
    # Observation noise: zero-mean Gaussian with standard deviation 0.01.
    noise = torch.normal(0, 0.01, noiseless.shape)
    return X, (noiseless + noise).reshape((-1, 1))

# Ground-truth parameters of the linear model used to generate the data.
true_b = 4.2
true_w = torch.tensor([2, -3.4])

# Draw 1000 noisy (features, label) pairs from that model.
features, labels = synthetic_data(w=true_w, b=true_b, num_examples=1000)

# Show the first example as a quick sanity check.
print(f'Example\nFeatures: {features[0]}\nLabel: {labels[0]}')

Example
Features: tensor([-0.0711, -0.2810])
Label: tensor([5.0064])


## 1.2 Reading the dataset

In [12]:
def data_iter(batch_size, features, labels):
    """Yield shuffled minibatches of ``(features, labels)``.

    The example order is reshuffled with ``random.shuffle`` each time the
    generator is started, and every example is visited exactly once per pass.

    Args:
        batch_size: Maximum number of examples per minibatch.
        features: Indexable collection of examples; its length defines the
            dataset size.
        labels: Labels indexed in parallel with ``features``.

    Yields:
        ``(features[idx], labels[idx])`` for consecutive slices of the
        shuffled index order. The last minibatch is shorter when the dataset
        size is not a multiple of ``batch_size``.
    """
    num_examples = len(features)
    order = list(range(num_examples))
    random.shuffle(order)
    # Convert the permutation once; each minibatch is then a slice of it.
    permutation = torch.tensor(order)
    for start in range(0, num_examples, batch_size):
        # Slicing clamps at the end, so no explicit bound check is needed.
        picked = permutation[start:start + batch_size]
        yield features[picked], labels[picked]

In [15]:
batch_size = 10

# Peek at a single shuffled minibatch to check the iterator's output.
first_batch = next(data_iter(batch_size, features, labels), None)
if first_batch is not None:
    X, y = first_batch
    print('Features:\n', X, '\nLabels\n', y)

Features:
 tensor([[-0.0085,  0.3169],
        [-1.6964,  0.0908],
        [-0.2581, -0.4166],
        [-1.6626,  0.5857],
        [ 1.4238, -0.5132],
        [ 0.0392, -0.1529],
        [ 2.0490,  1.1481],
        [-0.3911,  0.2906],
        [ 0.4990,  0.3073],
        [-0.4907,  0.5500]]) 
Labels
 tensor([[ 3.0961],
        [ 0.5078],
        [ 5.0956],
        [-1.1262],
        [ 8.7920],
        [ 4.8160],
        [ 4.3897],
        [ 2.4474],
        [ 4.1545],
        [ 1.3687]])


## 1.3 Initializing model parameters

In [68]:
# Trainable parameters: the bias starts at zero and the (2, 1) weight column
# starts as small Gaussian noise (mean 0, std 0.01). Both track gradients.
b = torch.zeros(1, requires_grad=True)
w = torch.normal(mean=0.0, std=0.01, size=(2, 1), requires_grad=True)

## 1.4 Defining the model

In [17]:
def linreg(X, w, b):
    """Linear regression model: return ``X @ w + b``.

    Args:
        X: Input features.
        w: Weights multiplied with ``X`` via matrix multiplication.
        b: Bias added (with broadcasting) to the product.

    Returns:
        The model predictions for ``X``.
    """
    weighted = X @ w
    return weighted + b

## 1.5 Defining the Loss Function

In [18]:
def squared_loss(y_hat, y):
    """Element-wise halved squared error between predictions and targets.

    Args:
        y_hat: Predictions.
        y: Targets; reshaped to the shape of ``y_hat`` before subtracting.

    Returns:
        A tensor shaped like ``y_hat`` holding ``(y_hat - y) ** 2 / 2`` for
        each element (no reduction is applied).
    """
    residual = y_hat - y.reshape(y_hat.shape)
    return residual.pow(2) / 2

## 1.6 Defining the Optimization Algorithm 

Because our loss is calculated as a sum over the minibatch of examples, we normalize our step size by the batch size (batch_size), so that the magnitude of a typical step size does not depend heavily on our choice of the batch size.

    torch.no_grad(): context manager that disables gradient calculation (inside the block, the result of every computation has requires_grad=False, even when its inputs have requires_grad=True)
    
    torch.Tensor.grad: This attribute is None by default and becomes a Tensor the first time a call to backward() computes gradients for self. The attribute will then contain the gradients computed and future calls to backward() will accumulate (add) gradients into it.

In [66]:
def sgd(params, lr, batch_size):
    """Apply one minibatch stochastic gradient descent step in place.

    Each parameter is updated as ``param -= lr * param.grad / batch_size``
    and its gradient is then reset to zero, so the next ``backward()`` call
    starts accumulating from scratch.

    Args:
        params: Iterable of tensors to update.
        lr: Learning rate.
        batch_size: Divisor applied to the gradient before the step.
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for param in params:
            if param.grad is None:
                # No gradient has been computed for this parameter (e.g. it
                # did not take part in the last backward pass): there is
                # nothing to apply or reset, so leave it untouched.
                continue
            param -= lr * param.grad / batch_size
            param.grad.zero_()

## 1.7 Training

In [69]:
lr = 0.03
epochs = 3

for epoch in range(epochs):
    for X, y in data_iter(batch_size, features, labels):
        # Per-example losses for the current minibatch.
        l = squared_loss(linreg(X, w, b), y)
        # backward() needs a scalar, so the losses are summed; the resulting
        # gradients are therefore sums over the minibatch.
        l.sum().backward()
        # Normalize by the number of examples actually in this minibatch.
        # data_iter yields a shorter final batch when the dataset size is not
        # a multiple of batch_size, and dividing by the nominal batch_size
        # would under-scale that step.
        sgd([w, b], lr, len(X))
    # Report the mean loss over the full dataset without tracking gradients.
    with torch.no_grad():
        train_l = squared_loss(linreg(features, w, b), labels)
        print(f'epoch {epoch + 1}, loss {float(train_l.mean()):f}')

epoch 1, loss 0.034556
epoch 2, loss 0.000113
epoch 3, loss 0.000047
