In [1]:
import torch
from torch import nn

In [2]:
!git clone https://github.com/invis166/efficient-dl-systems.git

Cloning into 'efficient-dl-systems'...
remote: Enumerating objects: 1042, done.[K
remote: Counting objects: 100% (207/207), done.[K
remote: Compressing objects: 100% (138/138), done.[K
remote: Total 1042 (delta 86), reused 177 (delta 63), pack-reused 835[K
Receiving objects: 100% (1042/1042), 46.16 MiB | 9.09 MiB/s, done.
Resolving deltas: 100% (428/428), done.


In [3]:
import os
# Switch the working directory to the homework folder inside the cloned repo so
# the later `from unet import ...` / `from dataset import ...` and
# `./download_data.sh` resolve against it.
# NOTE: the path is relative to the current working directory, so re-running this
# cell after it has already succeeded resolves the path from inside task1.
os.chdir('efficient-dl-systems/week03_fast_pipelines/homework/task1')

In [4]:
!./download_data.sh

--2024-02-11 17:39:54--  https://www.dropbox.com/s/tc1qo73rrm3gt3m/CARVANA.zip
Resolving www.dropbox.com (www.dropbox.com)... 162.125.81.18, 2620:100:6031:18::a27d:5112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.81.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/tc1qo73rrm3gt3m/CARVANA.zip [following]
--2024-02-11 17:39:54--  https://www.dropbox.com/s/raw/tc1qo73rrm3gt3m/CARVANA.zip
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucb4c9ac85253e3df7b50ab91fb0.dl.dropboxusercontent.com/cd/0/inline/CNGB9Dq64KmBHDrIk-Qf0rHjmO1Z4rmJ1iG8zwYy2m9vmR7Db_njUQ7auN1yqTMH9Nx6vfDj76vK8Fv5KxEo8lDxMNHbzo8mm6ZOVz3g0y-F39hbd70dZiU3oe_XFMMRnkTrpJ-aCwfl6UCfxOGFZQW9/file# [following]
--2024-02-11 17:39:55--  https://ucb4c9ac85253e3df7b50ab91fb0.dl.dropboxusercontent.com/cd/0/inline/CNGB9Dq64KmBHDrIk-Qf0rHjmO1Z4rmJ1iG8zwYy2m9vmR7Db_njUQ7auN1yqTMH9Nx6vfDj76vK8Fv5KxEo8lDxMNHbzo8mm6ZOVz

In [5]:
import torch
from torch import nn
from tqdm.auto import tqdm

from unet import Unet

from dataset import get_train_data


torch.manual_seed(42)

<torch._C.Generator at 0x785059f89690>

# Training in full precision

In [10]:
def train_epoch(
    train_loader: torch.utils.data.DataLoader,
    model: torch.nn.Module,
    criterion: torch.nn.modules.loss._Loss,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
) -> None:
    """Run one full-precision optimisation pass over ``train_loader``.

    Each batch is moved to ``device``, pushed through ``model``, scored with
    ``criterion``, and used for a single optimizer step. The per-batch loss and
    accuracy (in percent) are shown in the tqdm progress-bar description.

    Args:
        train_loader: Iterable of ``(images, labels)`` batches; must support ``len``.
        model: Network being trained; switched to train mode here.
        criterion: Loss callable applied as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch; its grads are cleared after each step.
        device: Device the batch tensors are moved to.

    NOTE(review): accuracy thresholds the raw model outputs at 0.5. If the model
    emits logits (the ``train`` function in this notebook pairs it with
    ``BCEWithLogitsLoss``), 0.0 would correspond to probability 0.5 -- confirm.
    """
    model.train()

    progress = tqdm(enumerate(train_loader), total=len(train_loader))
    for _, (batch_images, batch_labels) in progress:
        batch_images, batch_labels = batch_images.to(device), batch_labels.to(device)

        predictions = model(batch_images)
        batch_loss = criterion(predictions, batch_labels)

        # Standard update: backprop, apply, then clear grads for the next batch.
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        matches = (predictions > 0.5) == batch_labels
        accuracy_pct = round(matches.float().mean().item() * 100, 4)
        loss_value = round(batch_loss.item(), 4)

        progress.set_description(f"Loss: {loss_value} Accuracy: {accuracy_pct}")


def train():
    """Train a freshly initialised ``Unet`` for five epochs in full precision.

    Seeds the PyTorch RNG with 42, builds the model on ``cuda:0`` with
    ``BCEWithLogitsLoss`` and Adam (lr=1e-4), obtains the batches from
    ``get_train_data()`` and calls ``train_epoch`` once per epoch.
    """
    torch.manual_seed(42)

    gpu = torch.device("cuda:0")
    net = Unet().to(gpu)
    loss_fn = nn.BCEWithLogitsLoss()
    adam = torch.optim.Adam(net.parameters(), lr=1e-4)

    loader = get_train_data()

    for _ in range(5):
        train_epoch(loader, net, loss_fn, adam, device=gpu)


In [13]:
train()



  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

# Training in mixed precision without torch loss scaler

In [15]:
def train_epoch(
    train_loader: torch.utils.data.DataLoader,
    model: torch.nn.Module,
    criterion: torch.nn.modules.loss._Loss,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
) -> None:
    """Run one training pass with CUDA autocast and *no* loss scaling.

    The forward pass and the loss are evaluated inside
    ``torch.cuda.amp.autocast()``; the backward pass and optimizer step use the
    unscaled loss directly. Per-batch loss and accuracy (in percent) are shown
    in the tqdm progress-bar description.

    Args:
        train_loader: Iterable of ``(images, labels)`` batches; must support ``len``.
        model: Network being trained; switched to train mode here.
        criterion: Loss callable applied as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch; its grads are cleared after each step.
        device: Device the batch tensors are moved to.

    NOTE(review): accuracy thresholds the raw model outputs at 0.5. If the model
    emits logits (the ``train`` function in this notebook pairs it with
    ``BCEWithLogitsLoss``), 0.0 would correspond to probability 0.5 -- confirm.
    """
    model.train()

    progress = tqdm(enumerate(train_loader), total=len(train_loader))
    for _, (batch_images, batch_labels) in progress:
        batch_images, batch_labels = batch_images.to(device), batch_labels.to(device)

        # Only the forward pass and loss run under autocast.
        with torch.cuda.amp.autocast():
            predictions = model(batch_images)
            batch_loss = criterion(predictions, batch_labels)

        # No loss scaling here: gradients come straight from the raw loss.
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        matches = (predictions > 0.5) == batch_labels
        accuracy_pct = round(matches.float().mean().item() * 100, 4)
        loss_value = round(batch_loss.item(), 4)

        progress.set_description(f"Loss: {loss_value} Accuracy: {accuracy_pct}")


def train():
    """Train a freshly initialised ``Unet`` for five epochs with autocast only.

    Seeds the PyTorch RNG with 42, builds the model on ``cuda:0`` with
    ``BCEWithLogitsLoss`` and Adam (lr=1e-4), obtains the batches from
    ``get_train_data()`` and calls ``train_epoch`` once per epoch. No gradient
    scaler is created or passed.
    """
    torch.manual_seed(42)

    gpu = torch.device("cuda:0")
    net = Unet().to(gpu)
    loss_fn = nn.BCEWithLogitsLoss()
    adam = torch.optim.Adam(net.parameters(), lr=1e-4)

    loader = get_train_data()

    for _ in range(5):
        train_epoch(loader, net, loss_fn, adam, device=gpu)


In [16]:
train()



  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

# Training in mixed precision with pytorch scaler

In [17]:
def train_epoch(
    train_loader: torch.utils.data.DataLoader,
    model: torch.nn.Module,
    criterion: torch.nn.modules.loss._Loss,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
    scaler,
) -> None:
    """Run one mixed-precision training pass using a gradient scaler.

    The forward pass and loss run under ``torch.cuda.amp.autocast()``. The loss
    is scaled by ``scaler`` before backprop, and the optimizer is stepped
    through ``scaler.step`` followed by ``scaler.update``. Per-batch loss and
    accuracy (in percent) are shown in the tqdm progress-bar description.

    Args:
        train_loader: Iterable of ``(images, labels)`` batches; must support ``len``.
        model: Network being trained; switched to train mode here.
        criterion: Loss callable applied as ``criterion(outputs, labels)``.
        optimizer: Optimizer driven through ``scaler.step``.
        device: Device the batch tensors are moved to.
        scaler: Object exposing ``scale(loss)``, ``step(optimizer)`` and
            ``update()`` (e.g. ``torch.cuda.amp.GradScaler``).

    NOTE(review): accuracy thresholds the raw model outputs at 0.5. If the model
    emits logits (the ``train`` function in this notebook pairs it with
    ``BCEWithLogitsLoss``), 0.0 would correspond to probability 0.5 -- confirm.
    """
    model.train()

    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    for _, (images, labels) in pbar:
        images = images.to(device)
        labels = labels.to(device)

        with torch.cuda.amp.autocast():
            outputs = model(images)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # The scaler does not clear gradients. Without this call every batch
        # would add its gradients on top of all previous ones.
        optimizer.zero_grad()

        accuracy = ((outputs > 0.5) == labels).float().mean()

        pbar.set_description(f"Loss: {round(loss.item(), 4)} " f"Accuracy: {round(accuracy.item() * 100, 4)}")


def train():
    """Train a freshly initialised ``Unet`` for five epochs with torch's GradScaler.

    Seeds the PyTorch RNG with 42, builds the model on ``cuda:0`` with
    ``BCEWithLogitsLoss`` and Adam (lr=1e-4), creates a default
    ``torch.cuda.amp.GradScaler``, obtains the batches from ``get_train_data()``
    and calls ``train_epoch`` once per epoch with the same scaler instance.
    """
    torch.manual_seed(42)

    gpu = torch.device("cuda:0")
    net = Unet().to(gpu)
    loss_fn = nn.BCEWithLogitsLoss()
    adam = torch.optim.Adam(net.parameters(), lr=1e-4)
    grad_scaler = torch.cuda.amp.GradScaler()

    loader = get_train_data()

    for _ in range(5):
        train_epoch(loader, net, loss_fn, adam, device=gpu, scaler=grad_scaler)


In [18]:
train()



  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

Conclusions:
1. Mixed precision training without gradient scaling performs poorly.
2. Mixed precision training with gradient scaling converges better than full precision training (in this specific case).

# Static loss scaling

In [12]:
def train_epoch(
    train_loader: torch.utils.data.DataLoader,
    model: torch.nn.Module,
    criterion: torch.nn.modules.loss._Loss,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
    scale_factor: float = 128,
) -> None:
    """Run one mixed-precision training pass with a static loss scale.

    The forward pass and loss run under ``torch.cuda.amp.autocast()``. The loss
    is multiplied by ``scale_factor`` for the backward pass, and every gradient
    is divided by the same factor before the optimizer step. Per-batch loss
    (unscaled) and accuracy (in percent) are shown in the tqdm progress-bar
    description.

    Args:
        train_loader: Iterable of ``(images, labels)`` batches; must support ``len``.
        model: Network being trained; switched to train mode here.
        criterion: Loss callable applied as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch; its grads are cleared after each step.
        device: Device the batch tensors are moved to.
        scale_factor: Constant multiplier applied to the loss before backprop
            and divided out of the gradients afterwards. Defaults to 128.

    NOTE(review): accuracy thresholds the raw model outputs at 0.5. If the model
    emits logits (the ``train`` function in this notebook pairs it with
    ``BCEWithLogitsLoss``), 0.0 would correspond to probability 0.5 -- confirm.
    """
    model.train()

    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    for _, (images, labels) in pbar:
        images = images.to(device)
        labels = labels.to(device)

        with torch.cuda.amp.autocast():
            outputs = model(images)
            loss = criterion(outputs, labels)

        # Backprop through a scaled copy so `loss` keeps its true value for logging.
        (loss * scale_factor).backward()

        # Undo the scaling on the gradients. Parameters that received no
        # gradient (grad is None) are skipped instead of raising.
        for parameter in model.parameters():
            if parameter.grad is not None:
                parameter.grad.div_(scale_factor)

        optimizer.step()
        optimizer.zero_grad()

        accuracy = ((outputs > 0.5) == labels).float().mean()

        pbar.set_description(f"Loss: {round(loss.item(), 4)} " f"Accuracy: {round(accuracy.item() * 100, 4)}")


def train():
    """Train a freshly initialised ``Unet`` for five epochs with static loss scaling.

    Seeds the PyTorch RNG with 42, builds the model on ``cuda:0`` with
    ``BCEWithLogitsLoss`` and Adam (lr=1e-4), obtains the batches from
    ``get_train_data()`` and calls ``train_epoch`` once per epoch.
    """
    torch.manual_seed(42)

    gpu = torch.device("cuda:0")
    net = Unet().to(gpu)
    loss_fn = nn.BCEWithLogitsLoss()
    adam = torch.optim.Adam(net.parameters(), lr=1e-4)

    loader = get_train_data()

    for _ in range(5):
        train_epoch(loader, net, loss_fn, adam, device=gpu)


train()

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

# Dynamic loss scaling

In [16]:
class CustomGradScaler:
    '''Simple dynamic gradient scaler.

    Implements an exponential-backoff strategy for the loss scale, modelled on
    PyTorch's ``GradScaler``: the scale is multiplied by ``backoff_factor``
    whenever a step is skipped because of non-finite gradients, and multiplied
    by ``growth_factor`` after ``growth_interval`` consecutive successful steps.

    Intended call order per iteration::

        scaler.scale(loss).backward()
        scaler.step(optimizer)   # also clears the optimizer's gradients
        scaler.update()

    Args:
        init_scale: Initial loss-scale value.
        growth_factor: Multiplier applied to the scale when it grows.
        backoff_factor: Multiplier applied to the scale after a skipped step.
        growth_interval: Number of consecutive successful steps required
            before the scale grows.
    '''
    def __init__(self, init_scale, growth_factor, backoff_factor, growth_interval):
        self.current_scale = init_scale
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval

        # Whether the most recent step() actually applied an optimizer update.
        self._performed_update_last_step = False
        # Consecutive successful steps since the last backoff or growth.
        self._steps_with_update = 0

    def scale(self, loss):
        '''Return ``loss`` multiplied by the current scale.'''
        return loss * self.current_scale

    def step(self, optimizer):
        '''Unscale gradients and step ``optimizer``, or skip on non-finite grads.

        Gradients are cleared with ``optimizer.zero_grad()`` in both cases.
        '''
        if not self._should_update_params(optimizer):
            optimizer.zero_grad()
            self._performed_update_last_step = False
            self._steps_with_update = 0
            return

        for group in optimizer.param_groups:
            for param in group['params']:
                # Parameters that did not take part in the backward pass have no grad.
                if param.grad is not None:
                    param.grad.div_(self.current_scale)

        optimizer.step()
        optimizer.zero_grad()

        self._steps_with_update += 1
        self._performed_update_last_step = True

    def _should_update_params(self, optimizer):
        '''Return True when every existing gradient is free of NaN/inf values.'''
        for group in optimizer.param_groups:
            for param in group['params']:
                if param.grad is None:
                    continue
                if not torch.isfinite(param.grad).all():
                    return False

        return True

    def update(self):
        '''Adjust the scale based on the outcome of the latest ``step``.'''
        if not self._performed_update_last_step:
            self.current_scale *= self.backoff_factor
        elif self._steps_with_update >= self.growth_interval:
            self.current_scale *= self.growth_factor
            # Restart the count; otherwise the scale would keep growing on
            # every subsequent step once the interval had been reached.
            self._steps_with_update = 0


def train_epoch(
    train_loader: torch.utils.data.DataLoader,
    model: torch.nn.Module,
    criterion: torch.nn.modules.loss._Loss,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
    scaler,
) -> None:
    """Run one mixed-precision training pass driven by ``scaler``.

    The forward pass and loss run under ``torch.cuda.amp.autocast()``. The loss
    is scaled via ``scaler.scale`` before backprop; the parameter update and the
    scale adjustment are delegated to ``scaler.step`` and ``scaler.update``.
    Per-batch loss and accuracy (in percent) are shown in the tqdm progress-bar
    description.

    This function never calls ``optimizer.zero_grad()`` itself; the
    ``CustomGradScaler`` defined alongside it clears gradients inside ``step``.

    Args:
        train_loader: Iterable of ``(images, labels)`` batches; must support ``len``.
        model: Network being trained; switched to train mode here.
        criterion: Loss callable applied as ``criterion(outputs, labels)``.
        optimizer: Optimizer handed to ``scaler.step``.
        device: Device the batch tensors are moved to.
        scaler: Object exposing ``scale(loss)``, ``step(optimizer)`` and ``update()``.

    NOTE(review): accuracy thresholds the raw model outputs at 0.5. If the model
    emits logits (the ``train`` function in this notebook pairs it with
    ``BCEWithLogitsLoss``), 0.0 would correspond to probability 0.5 -- confirm.
    """
    model.train()

    progress = tqdm(enumerate(train_loader), total=len(train_loader))
    for _, (batch_images, batch_labels) in progress:
        batch_images, batch_labels = batch_images.to(device), batch_labels.to(device)

        with torch.cuda.amp.autocast():
            predictions = model(batch_images)
            batch_loss = criterion(predictions, batch_labels)

        scaled_loss = scaler.scale(batch_loss)
        scaled_loss.backward()
        scaler.step(optimizer)
        scaler.update()

        matches = (predictions > 0.5) == batch_labels
        accuracy_pct = round(matches.float().mean().item() * 100, 4)
        loss_value = round(batch_loss.item(), 4)

        progress.set_description(f"Loss: {loss_value} Accuracy: {accuracy_pct}")


def train():
    """Train a freshly initialised ``Unet`` for five epochs with ``CustomGradScaler``.

    Seeds the PyTorch RNG with 42, builds the model on ``cuda:0`` with
    ``BCEWithLogitsLoss`` and Adam (lr=1e-4), obtains the batches from
    ``get_train_data()`` and creates one ``CustomGradScaler`` (initial scale 32,
    growth x2, backoff x0.5, growth interval 20) that is shared across all
    epochs.
    """
    torch.manual_seed(42)

    gpu = torch.device("cuda:0")
    net = Unet().to(gpu)
    loss_fn = nn.BCEWithLogitsLoss()
    adam = torch.optim.Adam(net.parameters(), lr=1e-4)

    loader = get_train_data()
    grad_scaler = CustomGradScaler(
        init_scale=32,
        growth_factor=2,
        backoff_factor=0.5,
        growth_interval=20,
    )

    for _ in range(5):
        train_epoch(loader, net, loss_fn, adam, device=gpu, scaler=grad_scaler)


train()



  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]