In [1]:
# Experiment tracking helps you figure out what works and what doesn't

In [2]:
# Different ways to track ML experiments:

# Simple print-out, say save experiment results in CSV

# TensorBoard

# MLFlow
# Full MLOps lifecycle management

In [3]:
# This notebook focuses on using TensorBoard to track experiments.

In [4]:
# For this notebook to run with updated APIs, we need torch 1.12+ and torchvision 0.13+
try:
    import torch
    import torchvision
    assert int(torch.__version__.split(".")[1]) >= 12, "torch version should be 1.12+"
    assert int(torchvision.__version__.split(".")[1]) >= 13, "torchvision version should be 0.13+"
    print(f"torch version: {torch.__version__}")
    print(f"torchvision version: {torchvision.__version__}")
except:
    print(f"[INFO] torch/torchvision versions not as required, installing nightly versions.")
    !pip3 install -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
    import torch
    import torchvision
    print(f"torch version: {torch.__version__}")
    print(f"torchvision version: {torchvision.__version__}")

[INFO] torch/torchvision versions not as required, installing nightly versions.
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
torch version: 2.0.0+cu117
torchvision version: 0.15.1+cu117


In [5]:
from src_05_modular import utils

In [6]:
# Pick the compute device through the project helper
# (NOTE(review): presumably prefers CUDA when available; confirm in src_05_modular.utils).
# The bare `device` expression echoes the chosen value as the cell output.
device = utils.get_device()
device

'cuda'

In [7]:
import pathlib

In [8]:
# Root folder of the pizza/steak/sushi images, relative to the working directory.
image_path = pathlib.Path("data", "pizza_steak_sushi")
image_path

WindowsPath('data/pizza_steak_sushi')

In [9]:
# Split-specific sub-folders of the dataset root.
train_dir, test_dir = (image_path / split_name for split_name in ("train", "test"))

train_dir, test_dir

(WindowsPath('data/pizza_steak_sushi/train'),
 WindowsPath('data/pizza_steak_sushi/test'))

In [10]:
# Select the pre-trained weight set for EfficientNet-B0; `DEFAULT` is torchvision's
# alias for the best weights currently available for this architecture.
weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT

In [11]:
# Preprocessing pipeline bundled with the chosen weights; the cell output shows its
# resize, crop and normalisation settings.
automatic_transforms = weights.transforms()
automatic_transforms

ImageClassification(
    crop_size=[224]
    resize_size=[256]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BICUBIC
)

In [12]:
from src_05_modular import data_setup, engine

In [13]:
# Build the train/test dataloaders and read back the class names.
# A single transform is passed in; presumably create_dataloaders applies it to both
# splits (confirm in src_05_modular.data_setup).
train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(
    train_dir=train_dir,
    test_dir=test_dir,
    transform=automatic_transforms,
    batch_size=64
)

In [14]:
# Set up pre-trained model (freeze base layers; change the classifier head)

In [15]:
# Load EfficientNet-B0 initialised with the pre-trained weights selected earlier.
model = torchvision.models.efficientnet_b0(weights=weights)

# Freeze the feature extractor so that only the new classifier head is trained.
for param in model.features.parameters():
    param.requires_grad = False

# Read the head's input width from the pre-trained classifier instead of
# hard-coding 1280, so this cell keeps working if the backbone is swapped
# (other EfficientNet variants use a different feature width).
classifier_in_features = model.classifier[1].in_features

# Seed right before building the new layer so its random initialisation is reproducible.
torch.manual_seed(42)

# Replace the original head with one sized for this dataset's classes.
model.classifier = torch.nn.Sequential(
    torch.nn.Dropout(p=0.2, inplace=True),
    torch.nn.Linear(in_features=classifier_in_features, out_features=len(class_names), bias=True)
)

In [16]:
import torchinfo

In [17]:
# Print a per-layer summary: input/output shapes, parameter counts and trainable flags.
# The batch dimension of 32 is only the example input used for this summary;
# the dataloaders in this notebook are created with batch_size=64.
torchinfo.summary(
    model, 
    input_size=(32, 3, 224, 224), # (batch_size, color_channels, height, width)
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"])

  action_fn=lambda data: sys.getsizeof(data.storage()),
  return super().__sizeof__() + self.nbytes()


Layer (type (var_name))                                      Input Shape          Output Shape         Param #              Trainable
EfficientNet (EfficientNet)                                  [32, 3, 224, 224]    [32, 3]              --                   Partial
├─Sequential (features)                                      [32, 3, 224, 224]    [32, 1280, 7, 7]     --                   False
│    └─Conv2dNormActivation (0)                              [32, 3, 224, 224]    [32, 32, 112, 112]   --                   False
│    │    └─Conv2d (0)                                       [32, 3, 224, 224]    [32, 32, 112, 112]   (864)                False
│    │    └─BatchNorm2d (1)                                  [32, 32, 112, 112]   [32, 32, 112, 112]   (64)                 False
│    │    └─SiLU (2)                                         [32, 32, 112, 112]   [32, 32, 112, 112]   --                   --
│    └─Sequential (1)                                        [32, 32, 112, 112]   [32, 

In [18]:
# Move the model to the selected device before training.
model = model.to(device)

In [19]:
next(model.parameters()).device

device(type='cuda', index=0)

In [20]:
# Define loss function and optimizer

In [21]:
# Cross-entropy loss for the classification task, optimised with Adam (lr=0.001).
# All parameters are handed to the optimizer, including those of `model.features`
# whose requires_grad was set to False; those never receive gradients, so the
# optimizer leaves them unchanged.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

## Track results using TensorBoard

In [22]:
# Use tensorboard.SummaryWriter() class to save various parts of model training progress

# By default, SummaryWriter() writes its event files to the directory given by the log_dir parameter;
# when log_dir is not set, that directory is runs/CURRENT_DATETIME_HOSTNAME.
# Note: the TensorBoard format originates from the TensorFlow project.

In [23]:
from torch.utils.tensorboard import SummaryWriter

In [24]:
# No log_dir is given, so the writer logs to its default run directory under runs/.
writer = SummaryWriter()

In [25]:
dir(writer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_caffe2_blob',
 '_encode',
 '_get_file_writer',
 'add_audio',
 'add_custom_scalars',
 'add_custom_scalars_marginchart',
 'add_custom_scalars_multilinechart',
 'add_embedding',
 'add_figure',
 'add_graph',
 'add_histogram',
 'add_histogram_raw',
 'add_hparams',
 'add_image',
 'add_image_with_boxes',
 'add_images',
 'add_mesh',
 'add_onnx_graph',
 'add_pr_curve',
 'add_pr_curve_raw',
 'add_scalar',
 'add_scalars',
 'add_text',
 'add_video',
 'all_writers',
 'close',
 'default_bins',
 'file_writer',
 'filename_suffix',
 'flush',
 'flush_secs',
 'get_logdir',
 'log_dir',
 'max_queue',
 'purge

In [26]:
# We need to modify the src_05_modular.engine.train() function to 
# log model's training and test accuracies and losses, using
# writer.add_scalars(
# main_tag: str="Loss", 
# tag_scalar_dict: dict={"train_loss":train_loss, "test_loss":test_loss})

In [27]:
# For changes, see src_07_exp_tracking.engine.train()

In [28]:
# def train(model: torch.nn.Module, 
#           train_dataloader: torch.utils.data.DataLoader, 
#           test_dataloader: torch.utils.data.DataLoader, 
#           optimizer: torch.optim.Optimizer,
#           loss_fn: torch.nn.Module,
#           epochs: int,
#           device: torch.device) -> Dict[str, List]:

#    Omit ...

#         # Update results dictionary
#         results["train_loss"].append(train_loss)
#         results["train_acc"].append(train_acc)
#         results["test_loss"].append(test_loss)
#         results["test_acc"].append(test_acc)

# NEW:


#         ### New: Experiment tracking ###
#         # Add loss results to SummaryWriter
#         writer.add_scalars(main_tag="Loss", 
#                            tag_scalar_dict={"train_loss": train_loss,
#                                             "test_loss": test_loss},
#                            global_step=epoch)

#         # Add accuracy results to SummaryWriter
#         writer.add_scalars(main_tag="Accuracy", 
#                            tag_scalar_dict={"train_acc": train_acc,
#                                             "test_acc": test_acc}, 
#                            global_step=epoch)
        
#         # Track the PyTorch model architecture
#         writer.add_graph(model=model, 
#                          # Pass in an example input
#                          input_to_model=torch.randn(32, 3, 224, 224).to(device))
    
#     # Close the writer
#     writer.close()
    
#     ### End new ###

#     # Return the filled results at the end of the epochs
#     return results

In [29]:
from src_07_exp_tracking import engine

In [30]:
# Train for 5 epochs with the experiment-tracking version of the engine.
# Passing `writer` is what lets train() log to TensorBoard; the returned `results`
# holds the per-epoch train/test loss and accuracy lists (see the next cell's output).
results = engine.train(
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=5,
    device=device,
    writer=writer)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1: Train loss: 1.0581, Train acc: 0.4635 |  Test loss: 0.9669, Test acc: 0.6165
Epoch 2: Train loss: 0.9331, Train acc: 0.6352 |  Test loss: 0.7968, Test acc: 0.7685
Epoch 3: Train loss: 0.7905, Train acc: 0.7753 |  Test loss: 0.7218, Test acc: 0.8764
Epoch 4: Train loss: 0.6901, Train acc: 0.8332 |  Test loss: 0.6576, Test acc: 0.9233
Epoch 5: Train loss: 0.6254, Train acc: 0.8606 |  Test loss: 0.5687, Test acc: 0.9531

>>>Tensorboard logs saved in runs\Mar28_22-30-30_LAPTOP-NU5KNMS8



In [31]:
results

{'train_loss': [1.0581388622522354,
  0.9331417083740234,
  0.7905052900314331,
  0.6900958269834518,
  0.6254110038280487],
 'train_acc': [0.46354166666666663,
  0.6351799242424243,
  0.7753314393939394,
  0.8332149621212122,
  0.8605587121212122],
 'test_loss': [0.966871589422226,
  0.7968323826789856,
  0.7218093574047089,
  0.6575829982757568,
  0.5686689615249634],
 'test_acc': [0.6164772727272727,
  0.7684659090909092,
  0.8764204545454546,
  0.9232954545454546,
  0.953125]}

In [32]:
# Load the TensorBoard notebook extension, then open the dashboard on the runs/ log directory.
%load_ext tensorboard
%tensorboard --logdir runs

Reusing TensorBoard on port 6006 (pid 7696), started 0:43:55 ago. (Use '!kill 7696' to kill it.)

In [33]:
# Create a helper function to define usage of SummaryWriter() 
# such that each experiment gets its own log directory with names like
# Experiment timestamp
# Experiment name
# Experiment model's name
# Anything extra
# Example:
# runs/YYYY-MM-DD/exp_name/model_name/extra

In [34]:
from datetime import datetime

In [35]:
datetime.now().strftime("%Y-%m-%d")

'2023-03-28'

In [36]:
# See its details in src_07_exp_tracking
from typing import Optional


def create_writer(
    experiment_name: str,
    model_name: str,
    extra: Optional[str] = None
) -> "torch.utils.tensorboard.writer.SummaryWriter":
    """Notebook placeholder for the SummaryWriter factory.

    This stub only records the intended signature and does nothing: it returns
    None. The working implementation lives in src_07_exp_tracking and is meant
    to give each experiment its own log directory, following the layout
    described in this notebook: runs/YYYY-MM-DD/experiment_name/model_name/extra.

    The return annotation is a string naming the SummaryWriter *class*. Writing
    `SummaryWriter()` there would construct a writer while the function is being
    defined, as a side effect of evaluating the annotation.

    Args:
        experiment_name (str): Name of the experiment.
        model_name (str): Name of the model being trained.
        extra (Optional[str]): Optional extra path component; None means none.

    Returns:
        None in this stub (the real implementation is expected to return a
        SummaryWriter).
    """
    return None

In [37]:
import importlib

In [52]:
from src_07_exp_tracking import engine

In [53]:
importlib.reload(engine)

<module 'src_07_exp_tracking.engine' from 'C:\\Users\\james\\repos\\pytorch-basics\\src_07_exp_tracking\\engine.py'>

In [54]:
# Try various experiments
# * Change num of epochs
# * Change num of layers/hidden units
# * Change amount of data
# * Change learning rate
# * Try diff kinds of data augmentation
# * Choose diff model architecture

In [55]:
# Next, we'll run every combination of the following settings:
# 1. Diff amount of data (10% of Pizza/Steak/Sushi vs 20%)
# 2. Diff models (efficientnet_b0 vs efficientnet_b2)
# 3. Diff training time (5 vs 10 epochs)
# So that's 2*2*2 = 8 experiments to track

In [56]:
# Download data: 10% vs 20%

In [59]:
from pathlib import Path
import os
import zipfile
import requests

def download_data(source: str, 
                  destination: str,
                  remove_source: bool = True) -> Path:
    """Downloads a zipped dataset from source and unzips to destination.

    The destination directory is only created once the archive has been
    downloaded successfully, and it is removed again if extraction fails.
    A failed attempt therefore never leaves behind an empty or partial
    directory that later calls would mistake for an already-downloaded dataset.

    Args:
        source (str): A link to a zipped file containing data.
        destination (str): A target directory (created under "data/") to unzip data to.
        remove_source (bool): Whether to remove the downloaded zip file after extracting.
    
    Returns:
        pathlib.Path to downloaded data.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    
    Example usage:
        download_data(source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
                      destination="pizza_steak_sushi")
    """
    import shutil  # local import: only needed to clean up after a failed extraction

    # Setup path to data folder
    data_path = Path("data/")
    image_path = data_path / destination

    # Nothing to do when the dataset directory is already present.
    if image_path.is_dir():
        print(f"[INFO] {image_path} directory exists, skipping download.")
        return image_path

    print(f"[INFO] Did not find {image_path} directory, creating one...")
    data_path.mkdir(parents=True, exist_ok=True)

    # Download the archive next to the dataset folders.
    target_file = Path(source).name
    archive_path = data_path / target_file

    print(f"[INFO] Downloading {target_file} from {source}...")
    # The timeout bounds how long we wait on an unresponsive server.
    response = requests.get(source, timeout=60)
    # Fail loudly on 4xx/5xx instead of saving an error page as a ".zip".
    response.raise_for_status()
    archive_path.write_bytes(response.content)

    try:
        # Create the destination only now that there is something to extract.
        image_path.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(archive_path, "r") as zip_ref:
            print(f"[INFO] Unzipping {target_file} data...") 
            zip_ref.extractall(image_path)
    except Exception:
        # Roll back the directory created above so the next call retries the download.
        shutil.rmtree(image_path, ignore_errors=True)
        raise
    finally:
        # Remove .zip file
        if remove_source and archive_path.exists():
            os.remove(archive_path)

    return image_path

In [61]:
# Download 10 percent and 20 percent training data (if necessary).
# download_data() skips the download when the destination directory under data/
# already exists, and returns the dataset path either way.
data_10_percent_path = download_data(source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
                                     destination="pizza_steak_sushi")

data_20_percent_path = download_data(source="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi_20_percent.zip",
                                     destination="pizza_steak_sushi_20_percent")

[INFO] data\pizza_steak_sushi directory exists, skipping download.
[INFO] Did not find data\pizza_steak_sushi_20_percent directory, creating one...
[INFO] Downloading pizza_steak_sushi_20_percent.zip from https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi_20_percent.zip...
[INFO] Unzipping pizza_steak_sushi_20_percent.zip data...
