In [9]:
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.utils.data.dataset import random_split
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')

#### In this chapter, we will:
- build a function to perform training steps
- implement our own dataset class
- use data loaders to generate mini-batches
- build a function to perform mini-batch gradient descent
- evaluate our model
- integrate TensorBoard to monitor model training
- save/checkpoint our model to disk
- load our model from disk to resume training or to deploy

### We finished the previous chapter with an important question:

**Would the code inside the training loop change if we were using a
different optimizer, or loss, or even model?**

The answer: **NO.**

----


**A function that returns another function?**

Sounds complicated, right? 

It is not as bad as it sounds, though… that’s called a
**higher-order function**, and it is very useful for reducing boilerplate.

In [3]:
def exponentiation_builder(exponent):
    """Build and return a function that raises its argument to ``exponent``.

    This is a higher-order function: the inner function keeps access to
    ``exponent`` through its closure, so ``exponentiation_builder(2)``
    returns a "square" function.
    """
    def skeleton_exponentiation(x):
        # ``exponent`` is looked up in the enclosing scope at call time
        return pow(x, exponent)

    return skeleton_exponentiation

In [4]:
returned_function = exponentiation_builder(2)
returned_function

<function __main__.exponentiation_builder.<locals>.skeleton_exponentiation(x)>

In [5]:
returned_function(5)

25

## Train (working with higher-order functions)

In [10]:
def make_train_step(model, loss_fn, optimizer):
    """Build the function that performs one step of the training loop.

    The returned closure keeps references to ``model``, ``loss_fn`` and
    ``optimizer``. It takes a batch of features ``x`` and labels ``y``,
    updates the model's parameters once and returns the loss of that batch
    as a plain Python number.
    """
    def perform_train_step(x, y):
        # The model must be in TRAIN mode before the forward pass
        model.train()

        # Steps 1 and 2 - forward pass, then the loss between the
        # predictions and the labels
        batch_loss = loss_fn(model(x), y)
        # Step 3 - backpropagation computes the gradients
        batch_loss.backward()
        # Step 4 - the optimizer updates the parameters; afterwards the
        # gradients are cleared so they do not accumulate into the next step
        optimizer.step()
        optimizer.zero_grad()

        # .item() turns the one-element loss tensor into a Python number
        return batch_loss.item()

    # This is the function that gets called inside the train loop
    return perform_train_step

In [11]:
%run -i ../chapter1/data_preparation/v0.py

In [12]:
%%writefile ../chapter1/model_configuration/v1.py

# Model Configuration V1: device, model, optimizer, loss function and
# the train_step function built from them.

# Uses the GPU when PyTorch can see one, otherwise falls back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Sets learning rate - this is "eta" ~ the "n"-like Greek letter
lr = 0.1

# The seed is set right before the model is created, so the randomly
# initialized parameters are the same on every run
torch.manual_seed(42)
# Now we can create a model and send it at once to the device
model = nn.Sequential(nn.Linear(1, 1)).to(device)

# Defines an SGD optimizer to update the parameters
optimizer = optim.SGD(model.parameters(), lr=lr)

# Defines an MSE loss function
loss_fn = nn.MSELoss(reduction='mean')

# Creates the train_step function for our model, loss function
# and optimizer (make_train_step must already exist in the namespace
# this file is run in)
train_step = make_train_step(model, loss_fn, optimizer) 

Overwriting ../chapter1/model_configuration/v1.py


In [14]:
%run -i ../chapter1/model_configuration/v1.py

In [15]:
train_step

<function __main__.make_train_step.<locals>.perform_train_step(x, y)>

In [16]:
%%writefile ../chapter1/model_training/v1.py

# Model Training V1: full-batch training, one parameter update per epoch.
# Relies on train_step, x_train_tensor and y_train_tensor already being
# defined in the namespace this file is run in.

# Defines number of epochs
n_epochs = 1000

# Collects the loss returned by every train step (one per epoch)
losses = []

# For each epoch...
for epoch in range(n_epochs):
    # Performs one train step and returns the corresponding loss
    loss = train_step(x_train_tensor, y_train_tensor)
    losses.append(loss)

Writing ../chapter1/model_training/v1.py


In [18]:
%run -i ../chapter1/model_training/v1.py

In [19]:
losses

[0.7778562307357788,
 0.4567660987377167,
 0.27874717116355896,
 0.1797574758529663,
 0.12442906945943832,
 0.09322968125343323,
 0.07537204027175903,
 0.06489849090576172,
 0.058518461883068085,
 0.054414354264736176,
 0.0515819787979126,
 0.04946642741560936,
 0.04776085168123245,
 0.04629545286297798,
 0.04497610032558441,
 0.04375046119093895,
 0.04258930683135986,
 0.04147614911198616,
 0.04040160030126572,
 0.03936013579368591,
 0.038348421454429626,
 0.03736431896686554,
 0.03640634939074516,
 0.035473428666591644,
 0.03456468880176544,
 0.03367937356233597,
 0.0328168123960495,
 0.03197639435529709,
 0.03115752898156643,
 0.030359644442796707,
 0.029582196846604347,
 0.028824670240283012,
 0.028086528182029724,
 0.027367297559976578,
 0.026666488498449326,
 0.025983627885580063,
 0.025318246334791183,
 0.024669911712408066,
 0.024038175120949745,
 0.023422613739967346,
 0.02282281592488289,
 0.022238384932279587,
 0.02166890725493431,
 0.02111402526497841,
 0.020573347806930542

**After updating two out of three fundamental parts, our current state of development is:**
- **Data Preparation V0**
- **Model Configuration V1**
- **Model Training V1**

In [20]:
# Checks model's parameters
print(model.state_dict())

OrderedDict([('0.weight', tensor([[2.0000]], device='cuda:0')), ('0.bias', tensor([1.0000], device='cuda:0'))])


### create custom dataset (sample)

In [21]:
class CustomDataset(Dataset):
    """Minimal map-style dataset that pairs features with labels.

    Args:
        x_tensor: features; item ``i`` is ``x_tensor[i]``.
        y_tensor: labels; item ``i`` is ``y_tensor[i]``.

    Raises:
        ValueError: if ``x_tensor`` and ``y_tensor`` do not contain the
            same number of data points.
    """
    def __init__(self, x_tensor, y_tensor):
        # Fail early: with mismatched lengths the dataset would either
        # raise in the middle of an epoch or silently ignore extra labels
        if len(x_tensor) != len(y_tensor):
            raise ValueError(
                'x_tensor and y_tensor must have the same length, got '
                f'{len(x_tensor)} and {len(y_tensor)}'
            )
        self.x = x_tensor
        self.y = y_tensor

    def __getitem__(self, index):
        # Returns one (features, label) tuple
        return (self.x[index], self.y[index])

    def __len__(self):
        # Number of data points in the dataset
        return len(self.x)
    
# Builds float tensors from the training arrays. There is deliberately
# no .to(device) call here, so the whole training set is not sent to
# the GPU.
x_train_tensor = torch.as_tensor(x_train).float()
y_train_tensor = torch.as_tensor(y_train).float()

# Wraps both tensors in our dataset and fetches the first
# (features, label) tuple
train_data = CustomDataset(x_train_tensor, y_train_tensor)
print(train_data[0])

(tensor([0.1209]), tensor([1.2417]))


Did you notice we built our training tensors out of Numpy arrays, but we did not send them to a device? So, they are CPU tensors now! Why?

We don’t want our whole training data to be loaded into GPU tensors, as we have been doing in our example so far, because it takes up space in our precious graphics card’s RAM.

### TensorDataset (using pytorch dataset)

In [24]:
# TensorDataset is PyTorch's built-in replacement for our CustomDataset:
# indexing it returns the same (features, label) tuple
train_data = TensorDataset(x_train_tensor, y_train_tensor)
print(train_data[0])

(tensor([0.1209]), tensor([1.2417]))


### DataLoader

We tell it which dataset to use (the one we have just built in the previous section), the desired mini-batch size, and if we’d like to shuffle it or not. That’s it!

#### IMPORTANT: 

In the vast majority of cases, you should set **shuffle=True** for your training set to improve the performance of gradient descent. There are a few exceptions, though, like time series problems, where shuffling actually leads to data leakage.

#### What about the validation and test sets?
There is **no need** to
shuffle them since **we are not computing gradients with them**.

Our **loader** will behave like an **iterator**, so we can **loop over it** and **fetch a different mini-batch** every time.

It is typical to **use powers of two** for mini-batch sizes, like **16, 32, 64, or 128**, and **32** seems to be the choice of most people.

In our example, we have only 80 training points, so I chose a mini-batch size of 16
to conveniently split the training set into five mini-batches.

In [28]:
# Serves the training set in shuffled mini-batches of 16 data points
train_loader = DataLoader(
    dataset=train_data,
    batch_size=16,
    shuffle=True,
)

To retrieve a mini-batch, one can simply run the command below — it will return a
list containing two tensors, one for the features, another one for the labels:

In [30]:
next(iter(train_loader))

[tensor([[0.3950],
         [0.0951],
         [0.5758],
         [0.7735],
         [0.3614],
         [0.2897],
         [0.9002],
         [0.2700],
         [0.1209],
         [0.5504],
         [0.9091],
         [0.9118],
         [0.2501],
         [0.6718],
         [0.1992],
         [0.3464]]),
 tensor([[1.7900],
         [1.1903],
         [2.1516],
         [2.5469],
         [1.7229],
         [1.5795],
         [2.8004],
         [1.5399],
         [1.2417],
         [2.1008],
         [2.8183],
         [2.8237],
         [1.5003],
         [2.3436],
         [1.3984],
         [1.6929]])]

If you call **list(train_loader)**, you’ll get, as a result, a list of 5 elements, that is, all five mini-batches. Then you could take the first element of that list to obtain a single mini-batch as in the example above. However, it would defeat the purpose of using the iterable provided by the DataLoader, that is, to iterate over the elements (mini-batches, in that case) one at a time.

### Update Data Preparation (V1) to use Dataset and DataLoader

In [32]:
%%writefile ../chapter1/data_preparation/v1.py

# Our data was in Numpy arrays, but we need to transform them
# into PyTorch's Tensors
x_train_tensor = torch.as_tensor(x_train).float()
y_train_tensor = torch.as_tensor(y_train).float()

# Builds Dataset
train_data = TensorDataset(x_train_tensor, y_train_tensor)

# Builds DataLoader
train_loader = DataLoader(
dataset=train_data,
batch_size=16,
shuffle=True,
)

Writing ../chapter1/data_preparation/v1.py


In [34]:
%run -i ../chapter1/data_preparation/v1.py

In [35]:
%run -i ../chapter1/model_configuration/v1.py

In [38]:
%%writefile ../chapter1/model_training/v2.py

# Model Training V2: mini-batch gradient descent with an explicit
# inner loop over the data loader.

# Defines number of epochs
n_epochs = 1000

# One entry per epoch: the average of that epoch's mini-batch losses
losses = []

# For each epoch...
for epoch in range(n_epochs):
    # inner loop
    mini_batch_losses = []
    for x_batch, y_batch in train_loader:
        # the dataset "lives" in the CPU, and so do our mini-batches;
        # therefore, we need to send those mini-batches to the
        # device where the model "lives"
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        
        # Performs one train step and returns the
        # corresponding loss for this mini-batch
        mini_batch_loss = train_step(x_batch, y_batch)
        mini_batch_losses.append(mini_batch_loss)
    
    # Computes average loss over all mini-batches
    # That's the epoch loss
    loss = np.mean(mini_batch_losses)

    losses.append(loss)

Overwriting ../chapter1/model_training/v2.py


In [39]:
%run -i ../chapter1/model_training/v2.py

After another two updates, our current state of development is:
- **Data Preparation V1**
- **Model Configuration V1**
- **Model Training V2**

In [41]:
# Checks model's parameters
print(model.state_dict())

OrderedDict([('0.weight', tensor([[2.0000]], device='cuda:0')), ('0.bias', tensor([1.0000], device='cuda:0'))])


### Did you notice it is taking longer to train now? Can you guess why?

**ANSWER:** 

the training time is longer now because the inner loop is executed five
times for each epoch (in our example, since we are using a mini-batch of size 16 and
we have 80 training data points in total, we execute the inner loop 80 / 16 = 5
times). 

So, in total, we are calling the train_step a total of 5,000 times now! No
wonder it’s taking longer!

In [43]:
def mini_batch(device, data_loader, step):
    """Run ``step`` on every mini-batch and return the average loss.

    Args:
        device: device each batch is sent to before calling ``step``.
        data_loader: iterable yielding ``(x_batch, y_batch)`` pairs.
        step: function taking ``(x_batch, y_batch)`` and returning the
            loss of that mini-batch (e.g. a train or validation step).

    Returns:
        The mean, computed with ``np.mean``, of all mini-batch losses.
    """
    # Each batch is moved to the target device right before the step runs
    batch_losses = [
        step(features.to(device), labels.to(device))
        for features, labels in data_loader
    ]
    return np.mean(batch_losses)

In [44]:
%run -i ../chapter1/data_preparation/v1.py
%run -i ../chapter1/model_configuration/v1.py

In [50]:
%%writefile ../chapter1/model_training/v3.py

# Model Training V3: same mini-batch training as before, with the inner
# loop moved into the mini_batch helper.
#
# In the last section, we realized that we were executing five times more
# updates (the train_step function) per epoch due to the mini-batch inner
# loop. Before, 1,000 epochs meant 1,000 updates. Now, we only need 200
# epochs to perform the same 1,000 updates.
# Defines number of epochs
n_epochs = 200 

# One entry per epoch: the average mini-batch loss of that epoch
losses = []

for epoch in range(n_epochs):
    # inner loop
    loss = mini_batch(device, train_loader, train_step)
    losses.append(loss)

Overwriting ../chapter1/model_training/v3.py


In [47]:
%run -i ../chapter1/model_training/v3.py

After updating the model training part, our current state of
development is:
- **Data Preparation V1**
- **Model Configuration V1**
- **Model Training V3**

In [49]:
# Checks model's parameters
print(model.state_dict())

OrderedDict([('0.weight', tensor([[2.0000]], device='cuda:0')), ('0.bias', tensor([1.0000], device='cuda:0'))])


### Random Split
PyTorch’s random_split() method is an easy and familiar way of performing a
training-validation split.
So far, we’ve been using x_train_tensor and y_train_tensor, built out of the
original split in Numpy, to build the training dataset. Now, we’re going to be using
the full data from Numpy (x and y), to build a PyTorch Dataset first and only then
split the data using random_split()

In [51]:
%%writefile ../chapter1/data_preparation/v2.py

torch.manual_seed(13)

# Builds tensors from numpy arrays BEFORE split
x_tensor = torch.as_tensor(x).float()
y_tensor = torch.as_tensor(y).float()

# Builds dataset containing ALL data points
dataset = TensorDataset(x_tensor, y_tensor)

# Performs the split
ratio = .8
n_total = len(dataset)
n_train = int(n_total * ratio)
n_val = n_total - n_train
train_data, val_data = random_split(dataset, [n_train, n_val])

# Builds a loader of each set
train_loader = DataLoader(
dataset=train_data,
batch_size=16,
shuffle=True,
)
val_loader = DataLoader(dataset=val_data, batch_size=16) 

Writing ../chapter1/data_preparation/v2.py


In [52]:
%run -i ../chapter1/data_preparation/v2.py

### Evaluating the model

In [53]:
def make_val_step(model, loss_fn):
    """Build the function that performs one step of the validation loop.

    The returned closure takes a batch of features ``x`` and labels ``y``,
    switches ``model`` to evaluation mode and returns the batch loss as a
    plain Python number. No backward pass and no parameter update happen
    here.
    """
    def perform_val_step(x, y):
        # The model must be in EVAL mode during validation
        model.eval()

        # Forward pass and loss only - the equivalents of Steps 3 and 4 of
        # the train step are skipped because evaluation updates nothing
        batch_loss = loss_fn(model(x), y)
        return batch_loss.item()

    return perform_val_step

In [54]:
 %%writefile ../chapter1/model_configuration/v2.py

# Model Configuration V2: same as V1, plus the val_step function used
# to evaluate the model.

# Uses the GPU when PyTorch can see one, otherwise falls back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Sets learning rate - this is "eta" ~ the "n"-like Greek letter
lr = 0.1

# The seed is set right before the model is created, so the randomly
# initialized parameters are the same on every run
torch.manual_seed(42)
# Now we can create a model and send it at once to the device
model = nn.Sequential(nn.Linear(1, 1)).to(device)

# Defines an SGD optimizer to update the parameters
optimizer = optim.SGD(model.parameters(), lr=lr)

# Defines an MSE loss function
loss_fn = nn.MSELoss(reduction='mean')

# Creates the train_step function for our model, loss function
# and optimizer
train_step = make_train_step(model, loss_fn, optimizer)

# Creates the val_step function for our model and loss function
val_step = make_val_step(model, loss_fn) 

Writing ../chapter1/model_configuration/v2.py


In [56]:
%run -i ../chapter1/model_configuration/v2.py

In [58]:
%%writefile ../chapter1/model_training/v4.py

# Defines number of epochs
n_epochs = 200

losses = []
val_losses = []

for epoch in range(n_epochs):
    # inner loop
    loss = mini_batch(device, train_loader, train_step)
    losses.append(loss)

# VALIDATION - no gradients in validation!
with torch.no_grad():
    val_loss = mini_batch(device, val_loader, val_step)
    val_losses.append(val_loss) 

Overwriting ../chapter1/model_training/v4.py


In [59]:
%run -i ../chapter1/model_training/v4.py

After updating all parts, in sequence, our current state of
development is:
- **Data Preparation V2**
- **Model Configuration V2**
- **Model Training V4**

In [61]:
# Checks model's parameters
print(model.state_dict())

OrderedDict([('0.weight', tensor([[2.0000]], device='cuda:0')), ('0.bias', tensor([1.0000], device='cuda:0'))])


In [62]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [63]:
%tensorboard --logdir runs #you can run it separately in the terminal

In [64]:
writer = SummaryWriter('runs/test')

If we do not specify any folder, TensorBoard will default to
runs/CURRENT_DATETIME_HOSTNAME, which is not such a great
name if you’d be looking for your experiment results in the
future.

So, it is recommended to try to name it in a more meaningful
way, like runs/test or runs/simple_linear_regression. It will
then create a subfolder inside runs (the folder we specified when
we started TensorBoard).

Even better, you should name it in a meaningful way and add
datetime or a sequential number as a suffix, like runs/test_001
or runs/test_20200502172130, to avoid writing data of multiple
runs into the same folder (we’ll see why this is bad in the
add_scalars section below).

The summary writer implements several methods to allow us sending information to TensorBoard:

- **add_graph**
- **add_scalars**
- **add_scalar**
- **add_histogram**
- **add_images**
- **add_image**
- **add_figure**
- **add_video**
- **add_audio**
- **add_text**
- **add_embedding**
- **add_pr_curve**
- **add_custom_scalars**
- **add_mesh**
- **add_hparams**

It also implements two other methods for effectively writing data to disk:
- **flush**
- **close**

In [66]:
# Fetching a tuple of feature (dummy_x) and label (dummy_y)
dummy_x, dummy_y = next(iter(train_loader))
# Since our model was sent to device, we need to do the same
# with the data.
# Even here, both model and data need to be on the same device!
writer.add_graph(model, dummy_x.to(device))

### add_scalars

What about sending the loss values to TensorBoard? I’m on it! We can use the *add_scalars* method to send multiple scalar values at once, and it needs three arguments:

- **main_tag:** the parent name of the tags or, the "group tag", if you will
- **tag_scalar_dict:** the dictionary containing the key: value pairs for the scalars you want to keep track of (in our case, training and validation losses)
- **global_step:** step value, that is, the index you’re associating with the values you’re sending in the dictionary - the epoch comes to mind in our case, as losses are computed for each epoch

In [67]:
# Sends the most recent training and validation losses to TensorBoard,
# grouped under the "loss" tag and indexed by the current epoch
writer.add_scalars(
    main_tag='loss',
    tag_scalar_dict={'training': loss, 'validation': val_loss},
    global_step=epoch,
)

In [68]:
%run -i ../chapter1/data_preparation/v2.py

In [71]:
%%writefile ../chapter1/model_configuration/v3.py

# Model Configuration V3: same as V2, plus a TensorBoard SummaryWriter
# with the model graph already logged.

# Uses the GPU when PyTorch can see one, otherwise falls back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Sets learning rate - this is "eta" ~ the "n"-like Greek letter
lr = 0.1

# The seed is set right before the model is created, so the randomly
# initialized parameters are the same on every run
torch.manual_seed(42)
# Now we can create a model and send it at once to the device
model = nn.Sequential(nn.Linear(1, 1)).to(device)

# Defines an SGD optimizer to update the parameters
optimizer = optim.SGD(model.parameters(), lr=lr)

# Defines an MSE loss function
loss_fn = nn.MSELoss(reduction='mean')

# Creates the train_step function for our model,
# loss function and optimizer
train_step = make_train_step(model, loss_fn, optimizer)

# Creates the val_step function for our model and loss function
val_step = make_val_step(model, loss_fn)

# Creates a Summary Writer to interface with TensorBoard
writer = SummaryWriter('runs/simple_linear_regression')
# Fetches a single mini-batch so we can use add_graph
# (train_loader must already exist, so data preparation has to run first)
x_dummy, y_dummy = next(iter(train_loader))
writer.add_graph(model, x_dummy.to(device))

Overwriting ../chapter1/model_configuration/v3.py


In [72]:
%run -i ../chapter1/model_configuration/v3.py

In [74]:
%%writefile ../chapter1/model_training/v5.py

# Defines number of epochs
n_epochs = 200

losses = []
val_losses = []

for epoch in range(n_epochs):
    # inner loop
    loss = mini_batch(device, train_loader, train_step)
    losses.append(loss)

    # VALIDATION - no gradients in validation!
    with torch.no_grad():
        val_loss = mini_batch(device, val_loader, val_step)
        val_losses.append(val_loss)
    
    # Records both losses for each epoch under tag "loss"
    writer.add_scalars(main_tag='loss',
                        tag_scalar_dict={
                        'training': loss,
                        'validation': val_loss},
                        global_step=epoch)

# Closes the writer
writer.close()

Writing ../chapter1/model_training/v5.py


In [75]:
%run -i ../chapter1/model_training/v5.py

After the last update of both model configuration and training
parts, our current state of development is:
- **Data Preparation V2**
- **Model Configuration V3**
- **Model Training V5**

In [77]:
# Checks model's parameters
print(model.state_dict())

OrderedDict([('0.weight', tensor([[2.0000]], device='cuda:0')), ('0.bias', tensor([1.0000], device='cuda:0'))])


**If our model**
**were big or complex enough to take at least a couple of minutes to train, we would**
**be able to see the evolution of our losses in TensorBoard during training.**

### **Saving and Loading Models**

Training a model successfully is great, no doubt about that, but not all models will
be that fast to be trained, and maybe training gets interrupted (computer crashing,
timeout after 12h of continuous GPU usage on Google Colab, etc.).

It would be a pity to have to start over.

So, it is important to be able to checkpoint or save our model, that is, saving it to disk, in case we’d like to restart training later or deploy it as an application to make predictions.

### Saving
Now, wrap everything into a Python dictionary and use **torch.save()** to dump it
all into a file. 

Easy peasy! We have just saved our model to a file named
**model_checkpoint.pth**.

In [78]:
# Everything needed to resume training goes into a single dictionary.
# The loss lists hold NumPy scalars (mini_batch returns np.mean(...)), so
# they are converted to plain Python floats: torch.load() in its restricted
# "weights_only" mode (the default in recent PyTorch releases) refuses to
# unpickle NumPy scalar objects, which would make the file unreadable there.
checkpoint = {'epoch': n_epochs,
              'model_state_dict': model.state_dict(),
              'optimizer_state_dict': optimizer.state_dict(),
              'loss': [float(value) for value in losses],
              'val_loss': [float(value) for value in val_losses]}
torch.save(checkpoint, 'model_checkpoint.pth')

#### **Resuming Training**
If we’re starting fresh (as if we had just turned on the computer and started
Jupyter), we have to set up the stage before actually loading the model. 

This means
we need to load the data and configure the model.
    
Luckily, we have code for that already: **Data Preparation V2** and **Model Configuration V3**:

In [79]:
%run -i ../chapter1/data_preparation/v2.py
%run -i ../chapter1/model_configuration/v3.py

#### let’s double-check that we do have an **untrained model**:

In [80]:
print(model.state_dict())

OrderedDict([('0.weight', tensor([[0.7645]], device='cuda:0')), ('0.bias', tensor([0.8300], device='cuda:0'))])


Now we are ready to load the model back, which is easy:
- **load the dictionary back using torch.load()**
- **load model and optimizer state dictionaries back using the load_state_dict()
method**
- **load everything else into their corresponding variables**

In [82]:
# map_location remaps the saved tensors to the device chosen in the model
# configuration, so a checkpoint written on a GPU machine can also be
# loaded where only the CPU is available
checkpoint = torch.load('model_checkpoint.pth', map_location=device)
# Restores the parameters of the model and the state of the optimizer
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# Everything else goes back into plain variables
saved_epoch = checkpoint['epoch']
saved_losses = checkpoint['loss']
saved_val_losses = checkpoint['val_loss']
model.train() # always use TRAIN for resuming training

Sequential(
  (0): Linear(in_features=1, out_features=1, bias=True)
)

In [83]:
print(model.state_dict())

OrderedDict([('0.weight', tensor([[2.0000]], device='cuda:0')), ('0.bias', tensor([1.0000], device='cuda:0'))])


#### **Next, we can run Model Training V5 to train it for another 200 epochs.**

In [84]:
%run -i ../chapter1/model_training/v5.py

In [85]:
print(model.state_dict())

OrderedDict([('0.weight', tensor([[2.0000]], device='cuda:0')), ('0.bias', tensor([1.0000], device='cuda:0'))])


Well, it didn’t change at all, which is no surprise: the original model had converged
already; that is, the loss was at a minimum. 

These extra epochs served an
educational purpose only; they did not improve the model. 

### **Deploying / Making Predictions**

Again, if we’re starting fresh (as if we had just turned on the computer and started
Jupyter), we have to set up the stage before actually loading the model.

But, this
time, we only need to configure the model:

In [86]:
%run -i ../chapter1/model_configuration/v3.py

In [87]:
# map_location remaps the saved tensors to the device chosen in the model
# configuration, so a checkpoint written on a GPU machine can also be
# loaded where only the CPU is available
checkpoint = torch.load('model_checkpoint.pth', map_location=device)
# For making predictions only the model's parameters are needed
model.load_state_dict(checkpoint['model_state_dict'])
print(model.state_dict())

OrderedDict([('0.weight', tensor([[2.0000]], device='cuda:0')), ('0.bias', tensor([1.0000], device='cuda:0'))])


In [89]:
#test the model
new_inputs = torch.tensor([[.20], [.34], [.57]])
model.eval() # always use EVAL for fully trained models!
model(new_inputs.to(device))

tensor([[1.4000],
        [1.6800],
        [2.1400]], device='cuda:0', grad_fn=<AddmmBackward0>)

**After loading a fully trained model for deployment / to make predictions, make sure you ALWAYS set it to evaluation mode:**


**model.eval()**

# **Putting It All Together**

your pipeline: **Data Preparation V2**, **Model Configuration V3**, and **Model Training V5**!

In [90]:
# %load data_preparation/v2.py

# Data Preparation V2: build one dataset with ALL points and let PyTorch
# perform the train-validation split.

# Seeded before random_split so the split is reproducible
torch.manual_seed(13)

# Builds tensors from numpy arrays BEFORE split
x_tensor = torch.as_tensor(x).float()
y_tensor = torch.as_tensor(y).float()

# Builds dataset containing ALL data points
dataset = TensorDataset(x_tensor, y_tensor)

# Performs the split: 80% of the points for training, the rest
# for validation
ratio = .8
n_total = len(dataset)
n_train = int(n_total * ratio)
n_val = n_total - n_train
train_data, val_data = random_split(dataset, [n_train, n_val])
# Builds a loader of each set - only the training loader is shuffled
train_loader = DataLoader(
                dataset=train_data,
                batch_size=16,
                shuffle=True,
                )
val_loader = DataLoader(dataset=val_data, batch_size=16)

In [91]:
# %load model_configuration/v3.py

# Model Configuration V3: device, model, optimizer, loss function, the
# train/validation step functions and a TensorBoard SummaryWriter.

# Uses the GPU when PyTorch can see one, otherwise falls back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Sets learning rate - this is "eta" ~ the "n"-like Greek letter
lr = 0.1

# The seed is set right before the model is created, so the randomly
# initialized parameters are the same on every run
torch.manual_seed(42)
# Now we can create a model and send it at once to the device
model = nn.Sequential(nn.Linear(1, 1)).to(device)

# Defines an SGD optimizer to update the parameters
optimizer = optim.SGD(model.parameters(), lr=lr)

# Defines an MSE loss function
loss_fn = nn.MSELoss(reduction='mean')

# Creates the train_step function for our model,
# loss function and optimizer
train_step = make_train_step(model, loss_fn, optimizer)

# Creates the val_step function for our model and loss function
val_step = make_val_step(model, loss_fn)

# Creates a Summary Writer to interface with TensorBoard
writer = SummaryWriter('runs/simple_linear_regression')
# Fetches a single mini-batch so we can use add_graph
# (the label half is named y_dummy, matching model_configuration/v3.py)
x_dummy, y_dummy = next(iter(train_loader))
writer.add_graph(model, x_dummy.to(device))

In [92]:
# %load model_training/v5.py

# Model Training V5: training + validation in every epoch, with both
# losses logged to TensorBoard.

# Defines number of epochs
n_epochs = 200

# One entry per epoch in each list
losses = []
val_losses = []

for epoch in range(n_epochs):
    # TRAINING - one pass over all training mini-batches
    loss = mini_batch(device, train_loader, train_step)
    losses.append(loss)

    # VALIDATION - no gradients in validation!
    with torch.no_grad():
        val_loss = mini_batch(device, val_loader, val_step)
        val_losses.append(val_loss)

    # Records both losses for each epoch under tag "loss"
    writer.add_scalars(
        main_tag='loss',
        tag_scalar_dict={'training': loss, 'validation': val_loss},
        global_step=epoch,
    )

# Closes the writer
writer.close()

In [93]:
print(model.state_dict())

OrderedDict([('0.weight', tensor([[2.0000]], device='cuda:0')), ('0.bias', tensor([1.0000], device='cuda:0'))])
