<p align="center" width="100%">
    <img width="66%" src="https://raw.githubusercontent.com/linukc/master_dlcourse/main/images/logo.png">
</p>

 # **[MIPT DL frameworks Spring 2024](https://wiki.cogmodel.mipt.ru/s/mtai/doc/2024-nejrosetevye-frejmvorki-glubokogo-obucheniya-ZBGd69bxLd). Class 2: Training on different platforms**

# PyTorch

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

In [None]:
# Download both MNIST splits from open datasets (cached under ./data).
# The two splits share every argument except `train`, so they are built
# together: the training split first, then the test split.
training_data, test_data = (
    datasets.MNIST(
        root="data",
        train=is_train_split,
        download=True,
        transform=ToTensor(),
    )
    for is_train_split in (True, False)
)

In [None]:
batch_size = 128

# Create data loaders.
# Only the training split is shuffled, so each epoch visits the samples in a
# new order (consistent with the Lightning section of this notebook).
# Evaluation does not depend on order, so the test split is left as-is.
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Peek at a single batch to confirm tensor shapes and dtypes.
X, y = next(iter(test_dataloader))
print(f"Shape of X [N, C, H, W]: {X.shape}, {X.dtype}")
print(f"Shape of y: {y.shape} {y.dtype}")

Shape of X [N, C, H, W]: torch.Size([128, 1, 28, 28]), torch.float32
Shape of y: torch.Size([128]) torch.int64


In [None]:
# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten -> 2 hidden ReLU layers -> logits.

    Args:
        in_features: Number of values per sample after flattening
            (28 * 28 for the MNIST images used in this notebook).
        hidden_features: Width of both hidden layers.
        num_classes: Number of output logits. The default of 64 is kept so
            existing calls and saved checkpoints keep working; MNIST has only
            10 digit classes, so ``num_classes=10`` is enough for it.
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=64):
        super().__init__()
        # Attribute names are part of the state_dict keys; do not rename them.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Return unnormalized class scores (logits) for the batch ``x``."""
        return self.linear_relu_stack(self.flatten(x))

## CPU / GPU

In [None]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
# Pick the training device: CUDA if present, otherwise Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
# Build the network on the selected device, then the loss and the optimizer
# that updates the network's parameters.
model = NeuralNetwork()
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

In [None]:
# Optional: uncomment to compile the model with torch.compile before training.
#model = torch.compile(model)

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Each batch is moved to the notebook-level ``device``. The loss is printed
    for every 100th batch together with the number of samples seen so far.
    """
    n_samples = len(dataloader.dataset)
    model.train()
    for step, (inputs, labels) in enumerate(dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass and loss
        batch_loss = loss_fn(model(inputs), labels)

        # Backward pass, parameter update, then clear gradients for the next batch
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 100 != 0:
            continue
        loss_value = batch_loss.item()
        seen = (step + 1) * len(inputs)
        print(f"loss: {loss_value:>7f}  [{seen:>5d}/{n_samples:>5d}]")

time: 837 µs (started: 2024-02-19 00:58:24 +00:00)


In [None]:
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean loss.

    The loss is averaged over batches; accuracy is the fraction of correctly
    classified samples in ``dataloader.dataset``. Batches are moved to the
    notebook-level ``device`` and gradients are disabled.
    """
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    model.eval()
    loss_sum = 0
    n_correct = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            logits = model(inputs)
            loss_sum += loss_fn(logits, labels).item()
            hits = (logits.argmax(1) == labels).type(torch.float)
            n_correct += hits.sum().item()
    test_loss = loss_sum / n_batches
    correct = n_correct / n_samples
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

time: 516 µs (started: 2024-02-19 00:58:24 +00:00)


In [None]:
# Run the training loop for the configured number of epochs (1-based in the log).
epochs = 1
for epoch_number in range(1, epochs + 1):
    print(f"Epoch {epoch_number}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
loss: 4.161592  [   64/60000]
loss: 4.118782  [ 6464/60000]
loss: 4.108646  [12864/60000]
loss: 4.032907  [19264/60000]
loss: 4.016744  [25664/60000]
loss: 3.971307  [32064/60000]
loss: 3.931408  [38464/60000]
loss: 3.884785  [44864/60000]
loss: 3.714263  [51264/60000]
loss: 3.658295  [57664/60000]
Done!
time: 11.3 s (started: 2024-02-19 00:58:24 +00:00)


In [None]:
# Persist only the learned parameters (the state_dict), not the module object.
weights = model.state_dict()
torch.save(weights, "model.pth")
print("Saved PyTorch Model State to model.pth")

Saved PyTorch Model State to model.pth
time: 8.12 ms (started: 2024-02-19 00:58:36 +00:00)


In [None]:
# Rebuild the architecture, then restore the saved parameters into it.
model = NeuralNetwork().to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime still loads on a CPU-only one.
model.load_state_dict(torch.load("model.pth", map_location=device))

<All keys matched successfully>

time: 14.9 ms (started: 2024-02-19 00:58:36 +00:00)


## MultiGPU (DDP)

In [None]:
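#%%writefile train.py
# NOTE: uncomment the magic on the first line so this cell writes train.py (run by the next cell) instead of executing inline.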
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import torch.multiprocessing as mp
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import os


def ddp_setup(rank, world_size):
    """Initialise the default process group for single-node multi-GPU training.

    Args:
        rank: Unique identifier of each process
        world_size: Total number of processes
    """
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    # Bind this process to its GPU *before* creating the NCCL group so that
    # NCCL does not fall back to the default device (GPU 0) for every rank.
    torch.cuda.set_device(rank)
    init_process_group(backend="nccl", rank=rank, world_size=world_size)

class Trainer:
    """Minimal DDP training loop bound to one GPU.

    Wraps the model in ``DistributedDataParallel`` on ``gpu_id`` and, on
    GPU 0 only, saves a checkpoint on every epoch divisible by ``save_every``.
    """

    def __init__(
        self,
        model: torch.nn.Module,
        train_data: DataLoader,
        optimizer: torch.optim.Optimizer,
        gpu_id: int,
        save_every: int,
    ) -> None:
        self.gpu_id = gpu_id
        self.train_data = train_data
        self.optimizer = optimizer
        self.save_every = save_every
        # Move the module to its GPU first, then wrap that same module in DDP.
        model.to(gpu_id)
        self.model = DDP(model, device_ids=[gpu_id])

    def _run_batch(self, source, targets):
        """One optimisation step on a batch that is already on the GPU."""
        self.optimizer.zero_grad()
        flat_source = source.view(-1, 28*28)  # flatten each sample to 784 values
        logits = self.model(flat_source)
        F.cross_entropy(logits, targets).backward()
        self.optimizer.step()

    def _run_epoch(self, epoch):
        """Log the epoch header, reseed the sampler and train on every batch."""
        first_batch = next(iter(self.train_data))
        b_sz = len(first_batch[0])
        print(f"[GPU{self.gpu_id}] Epoch {epoch} | Batchsize: {b_sz} | Steps: {len(self.train_data)}")
        self.train_data.sampler.set_epoch(epoch)
        for source, targets in self.train_data:
            self._run_batch(source.to(self.gpu_id), targets.to(self.gpu_id))

    def _save_checkpoint(self, epoch):
        """Write the unwrapped module's state_dict to ``checkpoint.pt``."""
        weights = self.model.module.state_dict()
        PATH = "checkpoint.pt"
        torch.save(weights, PATH)
        print(f"Epoch {epoch} | Training checkpoint saved at {PATH}")

    def train(self, max_epochs: int):
        """Run ``max_epochs`` epochs; only GPU 0 writes checkpoints."""
        for epoch in range(max_epochs):
            self._run_epoch(epoch)
            if self.gpu_id != 0:
                continue
            if epoch % self.save_every == 0:
                self._save_checkpoint(epoch)


def load_train_objs():
    """Build the objects each worker trains with.

    Returns:
        A ``(train_set, model, optimizer)`` tuple: the MNIST training split
        as tensors, a single linear layer over flattened 28x28 inputs, and an
        SGD optimizer over that layer's parameters.
    """
    mnist_train = datasets.MNIST(root="data", train=True, download=True, transform=ToTensor())
    linear_model = torch.nn.Linear(in_features=28 * 28, out_features=64)  # load your model
    sgd = torch.optim.SGD(linear_model.parameters(), lr=1e-3)
    return mnist_train, linear_model, sgd


def prepare_dataloader(dataset: Dataset, batch_size: int):
    """Wrap ``dataset`` in a DataLoader driven by a ``DistributedSampler``.

    The loader's own ``shuffle`` stays off because a sampler is supplied.
    Requires an initialised process group, since the sampler is created
    without explicit ``num_replicas``/``rank``.
    """
    sharded_sampler = DistributedSampler(dataset)
    loader_options = dict(
        batch_size=batch_size,
        pin_memory=True,
        shuffle=False,
        sampler=sharded_sampler,
    )
    return DataLoader(dataset, **loader_options)


def main(rank: int, world_size: int, save_every: int, total_epochs: int, batch_size: int):
    """Entry point executed in every spawned process.

    Args:
        rank: Index of this process; also used as its GPU id.
        world_size: Total number of processes.
        save_every: Checkpoint interval in epochs, passed to ``Trainer``.
        total_epochs: Number of epochs to train.
        batch_size: Batch size passed to ``prepare_dataloader``.
    """
    ddp_setup(rank, world_size)
    try:
        dataset, model, optimizer = load_train_objs()
        train_data = prepare_dataloader(dataset, batch_size)
        trainer = Trainer(model, train_data, optimizer, rank, save_every)
        trainer.train(total_epochs)
    finally:
        # Always tear the process group down, even when training raises,
        # so a failing rank does not leave the group initialised.
        destroy_process_group()


if __name__ == "__main__":
    world_size = torch.cuda.device_count()  # one process per visible GPU
    # Named instead of positional magic numbers; order matches main()'s signature.
    save_every, total_epochs, batch_size = 2, 10, 32
    if world_size == 0:
        # Spawning zero processes would do nothing without any message.
        print("No CUDA devices found; skipping DDP training (the NCCL backend requires GPUs).")
    else:
        mp.spawn(main, args=(world_size, save_every, total_epochs, batch_size), nprocs=world_size)

In [None]:
!python3 train.py

## TPU

A TPU (Tensor Processing Unit) is an accelerator built for tensor workloads. Each TPU has 8 cores, and each core is optimized for 128x128 matrix multiplications. As a rough rule of thumb, a single TPU is about as fast as 5 V100 GPUs!

In [None]:
import tensorflow as tf

# Detect if a TPU is available and initialize it.
# On success `strategy` is a TPUStrategy; otherwise it falls back to the
# default strategy returned by tf.distribute.get_strategy().
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  strategy = tf.distribute.TPUStrategy(tpu)
  print("Number of replicas:", strategy.num_replicas_in_sync)
except ValueError:
  # A ValueError from ANY statement in the try block ends up here, not only
  # from the resolver (presumably raised when no TPU is reachable — TODO confirm).
  print("Not connected to a TPU runtime. Using CPU/GPU strategy")
  strategy = tf.distribute.get_strategy()


Running on TPU  ['10.68.97.34:8470']
Number of replicas: 8


In [None]:
!pip list | grep torch

torch                            2.1.0+cu121
torchaudio                       2.1.0+cu121
torchdata                        0.7.0
torchsummary                     1.5.1
torchtext                        0.16.0
torchvision                      0.16.0+cu121


In [None]:
%%capture
!pip install torch_xla==2.1.0 https://storage.googleapis.com/pytorch-xla-releases/wheels/xrt/tpuvm/torch_xla-2.1.0%2Bxrt-cp310-cp310-manylinux_2_28_x86_64.whl

### Single XLA device

In [None]:
import torch_xla.core.xla_model as xm

In [None]:
# Acquire an XLA device. This rebinds the notebook-level `device` name that
# the model-creation and train cells below read.
device = xm.xla_device()
device

device(type='xla', index=1)

In [None]:
# Fresh network on the XLA device, with its loss and optimizer.
model = NeuralNetwork()
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader`` on a single XLA device.

    Batches are moved to the notebook-level ``device``. ``xm.mark_step()`` is
    called after every optimizer step. The loss is printed for every 100th
    batch and once more for the final batch.
    """
    size = len(dataloader.dataset)
    model.train()
    # Pre-bind so the summary below is well defined for an empty dataloader.
    batch, loss = None, None
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        xm.mark_step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            # Keep `loss` a tensor: rebinding it to a float here made the
            # final `loss.item()` fail whenever the last batch index was a
            # multiple of 100.
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")
    if loss is not None:
        print(f"batch {batch}: loss {loss.item()}")

In [None]:
# Run the XLA training loop for the configured number of epochs (1-based in the log).
epochs = 1
for epoch_number in range(1, epochs + 1):
    print(f"Epoch {epoch_number}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
loss: 4.158014  [  128/60000]
loss: 4.127364  [12928/60000]
loss: 4.096592  [25728/60000]
loss: 4.060153  [38528/60000]
loss: 4.009383  [51328/60000]
batch 468: loss 4.006476402282715
Done!


### Multiple XLA Devices with Multi-processing

In [None]:
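#%%writefile train.py
# NOTE: uncomment the magic on the first line so this cell writes train.py (run by the next cell) instead of executing inline.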
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.distributed.parallel_loader as pl


class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten -> 2 hidden ReLU layers -> logits.

    Args:
        in_features: Number of values per sample after flattening
            (28 * 28 for the MNIST images used in this script).
        hidden_features: Width of both hidden layers.
        num_classes: Number of output logits. The default of 64 is kept so
            existing calls keep working; MNIST has only 10 digit classes, so
            ``num_classes=10`` is enough for it.
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=64):
        super().__init__()
        # Attribute names are part of the state_dict keys; do not rename them.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Return unnormalized class scores (logits) for the batch ``x``."""
        return self.linear_relu_stack(self.flatten(x))

# MpModelWrapper: API to minimize the use of host memory when used with
# `spawn(..., start_method='fork')`. The model is built once here, at module
# level, and each worker later calls WRAPPED_MODEL.to(device).
WRAPPED_MODEL = xmp.MpModelWrapper(NeuralNetwork())
SERIAL_EXEC = xmp.MpSerialExecutor() # Avoid all cores downloading the same data

def train(dataloader, model, loss_fn, optimizer, device):
  """Train ``model`` for one pass over ``dataloader`` on one XLA core.

  Args:
    dataloader: Host-side DataLoader for this replica's shard.
    model: Module already placed on ``device``.
    loss_fn: Callable returning a scalar loss tensor.
    optimizer: Optimizer over ``model``'s parameters.
    device: XLA device that MpDeviceLoader feeds batches to.

  Prints the index and loss of the last processed batch.
  """
  # MpDeviceLoader delivers batches already on `device`, so no extra .to() is needed.
  mp_device_loader = pl.MpDeviceLoader(dataloader, device)
  model.train()
  # Pre-bind so the summary below is well defined for an empty dataloader.
  batch, loss = None, None
  for batch, (X, y) in enumerate(mp_device_loader):
    # Compute prediction error
    optimizer.zero_grad()
    pred = model(X)
    loss = loss_fn(pred, y)

    # Backpropagation
    loss.backward()
    # xm.optimizer_step() all-reduces the gradients across replicas and then
    # calls optimizer.step() itself. Calling optimizer.step() beforehand
    # applied a second update per batch using the un-reduced local gradients.
    xm.optimizer_step(optimizer)

  if loss is not None:
    loss = loss.item()
    print(f"batch {batch}, loss: {loss:>7f}")

def _mp_fn(index):
  """Per-process entry point launched by ``xmp.spawn``.

  Builds this replica's sharded MNIST loader, moves the shared wrapped model
  to the local XLA device and trains it.

  Args:
    index: Process index supplied by ``xmp.spawn`` (not used directly; the
      replica identity is read from ``xm.get_ordinal()``).
  """
  def get_data():
    return datasets.MNIST(
              root="data",
              train=True,
              download=True,
              transform=ToTensor())

  print(xm.xrt_world_size(), xm.get_ordinal())
  # Run the download through the serial executor so the cores do not all
  # fetch the same files at once.
  train_dataset = SERIAL_EXEC.run(get_data)
  train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset,
    num_replicas=xm.xrt_world_size(),
    rank=xm.get_ordinal(),
    shuffle=True)

  device = xm.xla_device()
  model = WRAPPED_MODEL.to(device)
  # NOTE(review): with a DistributedSampler this batch_size applies per
  # process, not globally — confirm 8*128 per core is intended.
  train_dataloader = DataLoader(train_dataset, batch_size=8*128, sampler=train_sampler)
  optimizer = torch.optim.SGD(model.parameters(), lr=8*1e-3)
  loss_fn = nn.CrossEntropyLoss()

  epochs = 1
  for t in range(epochs):
    # A shuffling DistributedSampler reuses the same permutation every epoch
    # unless it is told which epoch is starting.
    train_sampler.set_epoch(t)
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer, device)
  # Printed once after all epochs (identical output for epochs == 1).
  print("Done!")

if __name__ == '__main__':
  # Start 8 worker processes with the 'fork' start method; each one runs _mp_fn.
  xmp.spawn(
    _mp_fn,
    args=(),
    nprocs=8,
    start_method='fork',
  )

Overwriting train.py


In [None]:
!python3 train.py

8 0
Epoch 1
-------------------------------
8 3
8 1
Epoch 1
-------------------------------
Epoch 1
-------------------------------
8 7
Epoch 1
-------------------------------
8 5
Epoch 1
-------------------------------
8 6
Epoch 1
-------------------------------
8 4
Epoch 1
-------------------------------
8 2
Epoch 1
-------------------------------
batch 7, loss: 4.141041
Done!
batch 7, loss: 4.147089
batch 7, loss: 4.142928
batch 7, loss: 4.145471
Done!
Done!
batch 7, loss: 4.145340
batch 7, loss: 4.147279
Done!
Done!
batch 7, loss: 4.145200
Done!
Done!
batch 7, loss: 4.139385
Done!


Reference: [PyTorch/XLA multi-process MNIST training example](https://github.com/pytorch/xla/blob/master/test/test_train_mp_mnist.py)

# PyTorch Lightning

In [None]:
%%capture
!python -m pip install lightning==2.1

Collecting lightning==2.1
  Downloading lightning-2.1.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities<2.0,>=0.8.0 (from lightning==2.1)
  Downloading lightning_utilities-0.10.1-py3-none-any.whl (24 kB)
Collecting torchmetrics<3.0,>=0.7.0 (from lightning==2.1)
  Downloading torchmetrics-1.3.1-py3-none-any.whl (840 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.4/840.4 kB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning (from lightning==2.1)
  Downloading pytorch_lightning-2.2.0.post0-py3-none-any.whl (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.9/800.9 kB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lightning-utilities, torchmetrics, pytorch-lightning, lightning
Successfully installed lightning-2.1.0 lightning-utilities-0.10.1 pytorch-lightning-2.2.0.

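1. Computational code goes into a `LightningModule`
2. Define `forward` (prediction/inference)
3. Define `configure_optimizers`
4. Define `training_step`
5. Define `validation_step` (optional)
6. Remove any explicit device calls (`.to(device)`, `.cuda()`)
7. Override other hooks as needed
8. Instantiate the `LightningModule`
9. Instantiate the `Trainer`
10. Pass the `DataLoader`(s)
11. Call `trainer.fit()`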


In [None]:
import os
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import transforms
from torch.utils.data import DataLoader, random_split
import pytorch_lightning as pl
from torchvision import datasets

In [None]:
# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten -> 2 hidden ReLU layers -> logits.

    Args:
        in_features: Number of values per sample after flattening
            (28 * 28 for the MNIST images used in this notebook).
        hidden_features: Width of both hidden layers.
        num_classes: Number of output logits. The default of 64 is kept so
            existing calls keep working; MNIST has only 10 digit classes, so
            ``num_classes=10`` is enough for it.
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=64):
        super().__init__()
        # Attribute names are part of the state_dict keys; do not rename them.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Return unnormalized class scores (logits) for the batch ``x``."""
        return self.linear_relu_stack(self.flatten(x))

In [None]:
class MnistModule(pl.LightningModule):
    """LightningModule wrapping ``NeuralNetwork`` with a cross-entropy loss."""

    def __init__(self):
        super().__init__()
        self.model = NeuralNetwork()
        self.loss_func = nn.CrossEntropyLoss()

    def forward(self, x):
        # in lightning, forward defines the prediction/inference actions
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute and return the loss for one ``(inputs, labels)`` batch."""
        # training_step defines the train loop.
        # It is independent of forward
        x, y = batch
        out = self.model(x)
        loss = self.loss_func(out, y)
        return loss

    def configure_optimizers(self):
        """Return an SGD optimizer over this module's own parameters."""
        # Use self.parameters(): the previous `model.parameters()` read the
        # notebook-level global `model`, which only worked when that global
        # happened to be this very instance.
        optimizer = torch.optim.SGD(self.parameters(), lr=1e-3)
        return optimizer

In [None]:
# MNIST training split as tensors (downloaded into ./data when missing).
training_data = datasets.MNIST(root="data", train=True, download=True, transform=transforms.ToTensor())

batch_size = 128
# Create data loaders.
train_dataloader = DataLoader(dataset=training_data, batch_size=batch_size, shuffle=True)

In [None]:
# init model
model = MnistModule()

# most basic trainer, uses good defaults (auto-tensorboard, checkpoints, logs, and more)
# max_epochs is set explicitly: with neither max_epochs nor max_steps given,
# Lightning falls back to 1000 epochs, whereas the other sections of this
# notebook train for a single epoch.
trainer = pl.Trainer(max_epochs=1)
#trainer = pl.Trainer(accelerator="gpu", devices=1, max_epochs=1)
#trainer = pl.Trainer(accelerator="tpu", devices=[1], max_epochs=1)
#trainer = pl.Trainer(accelerator="tpu", devices=8, max_epochs=1)
trainer.fit(model, train_dataloader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | model     | NeuralNetwork    | 697 K 
1 | loss_func | CrossEntropyLoss | 0     
-----------------------------------------------
697 K     Trainable params
0         Non-trainable params
697 K     Total params
2.790     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]