# Parallel computing with PyTorch in deep learning

author: Jing Zhang    
e-mail: zhangjingnm@hotmail.com    
date: 2024-09    
reference: http://www.idris.fr/eng/jean-zay/gpu/jean-zay-gpu-torch-multi-eng.html    
https://keras.io/guides/distributed_training_with_torch/

### Single GPU

Implementation

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import timm
import os
import socket

# --- run configuration ---
num_epochs = 2
batch_size = 64
model_path = './best.pth'

# --- GPUs visible on this node ---
hostname = socket.gethostname()
num_gpus = torch.cuda.device_count()
for i in range(num_gpus):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
print(f'{num_gpus} GPU in total on the node [{hostname}]\n')

# --- preprocessing ---
# Random flip and padded random crop are applied to the training images only;
# tensor conversion and normalisation are shared by both pipelines.
_augment_steps = [
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
]
_tensor_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.243, 0.261]),
]
transform = transforms.Compose(_augment_steps + _tensor_steps)
transform_val = transforms.Compose(list(_tensor_steps))

# --- CIFAR-10 datasets (read from ./data, no download) and their loaders ---
train_dataset = datasets.CIFAR10(
    root='./data', train=True, download=False, transform=transform
)
valid_dataset = datasets.CIFAR10(
    root='./data', train=False, download=False, transform=transform_val
)

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

def train_model(model, train_loader, valid_loader, num_epochs, optimizer, criterion):
    """Train ``model`` for ``num_epochs`` epochs and validate after each one.

    Every batch is moved to the current CUDA device with ``.cuda()``. After
    each epoch the mean per-batch loss and the accuracy (in percent) of both
    loaders are printed, and the model's ``state_dict`` is written to the
    module-level ``model_path`` whenever the validation accuracy exceeds the
    best value seen so far.
    """

    def count_hits(logits, targets):
        # Number of rows whose highest-scoring class index equals the target.
        return logits.max(1)[1].eq(targets).sum().item()

    best_acc = 0.0
    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        loss_sum = 0.0
        batch_count = 0
        hits = 0
        seen = 0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.cuda(), batch_y.cuda()

            logits = model(batch_x)
            batch_loss = criterion(logits, batch_y)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            loss_sum += batch_loss.item()
            batch_count += 1
            seen += batch_y.size(0)
            hits += count_hits(logits, batch_y)

        epoch_loss = loss_sum / batch_count
        epoch_acc = 100. * hits / seen

        # ---- validation pass (no gradients) ----
        model.eval()
        val_loss_sum = 0.0
        val_batch_count = 0
        val_hits = 0
        val_seen = 0
        with torch.no_grad():
            for batch_x, batch_y in valid_loader:
                batch_x, batch_y = batch_x.cuda(), batch_y.cuda()
                logits = model(batch_x)

                val_loss_sum += criterion(logits, batch_y).item()
                val_batch_count += 1
                val_seen += batch_y.size(0)
                val_hits += count_hits(logits, batch_y)

        epoch_loss_val = val_loss_sum / val_batch_count
        epoch_acc_val = 100. * val_hits / val_seen
        print(f'Epoch {epoch+1}/{num_epochs}: Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.2f}, Valid Loss: {epoch_loss_val:.4f}, Valid Acc: {epoch_acc_val:.2f}')

        # ---- checkpoint only on improvement ----
        if epoch_acc_val <= best_acc:
            continue
        print(f'Saving best model with validation accuracy: {epoch_acc_val:.2f}%')
        torch.save(model.state_dict(), model_path)
        best_acc = epoch_acc_val

if __name__ == "__main__":
    # timm ResNet-18 with pretrained weights and 10 output classes, moved to the GPU.
    model = timm.create_model('resnet18', pretrained=True, num_classes=10).cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-3)

    train_model(
        model,
        train_loader,
        valid_loader,
        num_epochs,
        optimizer,
        criterion,
    )

output
```
GPU 0: NVIDIA RTX A5000
1 GPU in total on the node [cn39]

Epoch 1/2: Train Loss: 1.1341, Train Acc: 59.92, Valid Loss: 0.7287, Valid Acc: 75.31
Saving best model with validation accuracy: 75.31%
Epoch 2/2: Train Loss: 0.7425, Train Acc: 74.28, Valid Loss: 0.6481, Valid Acc: 77.84
Saving best model with validation accuracy: 77.84%
 
 
This program takes 0d:0h:1m:20s
```

## Multi-GPU on one node with SLURM

### Data parallelism

Implementation 0: Data Parallelism    
Easy to use, but not recommended. Reference: https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html    
https://torchhogehoge.hatenablog.com/entry/pytorch_DDP

In [None]:
# Multi GPU(DP)
import torch
import torchvision
import torch.nn as nn

class NN(nn.Module):
    """Small fully connected classifier: Flatten -> Linear(784, 128) -> ReLU -> Linear(128, 10)."""

    def __init__(self):
        super().__init__()
        hidden_units = 128
        stages = [
            nn.Flatten(),
            nn.Linear(28 * 28, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 10),
        ]
        # The attribute name ``n`` is the prefix of every state_dict key, so it is kept.
        self.n = nn.Sequential(*stages)

    def forward(self, x):
        """Pass ``x`` through the sequential stack and return the result."""
        logits = self.n(x)
        return logits
# MNIST training split, converted to tensors (download enabled).
data = torchvision.datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)
# The batch size is 64 multiplied by the number of visible GPUs.
data_loader = torch.utils.data.DataLoader(
    data,
    batch_size=64 * torch.cuda.device_count(),
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)

# Move the network to the GPU first, then wrap it in DataParallel.
model = torch.nn.DataParallel(NN().cuda())
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

for epoch in range(2):
    total_loss = 0  # sum of the per-batch losses of this epoch
    for imgs, labels in data_loader:
        optimizer.zero_grad()
        predict = model(imgs.cuda())
        loss = criterion(predict, labels.cuda())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"{epoch:3d}: {total_loss:.4f}")

# Save the weights of the wrapped network (``model.module``), not of the wrapper.
torch.save(model.module.state_dict(), 'model.pth')

Implementation 1: Distributed Data Parallelism    
reference: https://pytorch.org/docs/stable/notes/ddp.html#ddp    
https://www.youtube.com/watch?v=bwNtfxEDjGA    
![DDP](../figures/DDP.png)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch.utils.data.distributed as data_dist
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import hostlist
import timm
import os
import socket

# --- run configuration ---
num_epochs = 2
batch_size = 64
model_path = './best.pth'

# --- GPUs visible on this node ---
# This module-level code also runs in every spawned worker, which is why the
# listing appears several times in the output.
hostname = socket.gethostname()
num_gpus = torch.cuda.device_count()
for i in range(num_gpus):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
print(f'{num_gpus} GPU in total on the node [{hostname}]\n')

def setup_device(current_gpu_index, num_gpus):
    """Join the NCCL process group as rank ``current_gpu_index`` and select that GPU.

    The rendezvous address is the module-level ``hostname`` and the port is
    ``12345 + num_gpus``; both are exported through ``os.environ`` before the
    process group is initialised with ``world_size=num_gpus``.
    """
    rendezvous_port = 12345 + int(num_gpus)
    os.environ.update({
        'MASTER_ADDR': str(hostname),          # or localhost
        'MASTER_PORT': str(rendezvous_port),   # to avoid port conflict on the same node
    })

    target = torch.device(f"cuda:{current_gpu_index}")
    dist.init_process_group(
        backend="nccl",
        rank=current_gpu_index,
        world_size=num_gpus,
    )
    torch.cuda.set_device(target)

def cleanup():
    """Tear down the process group by calling ``dist.destroy_process_group()``."""
    dist.destroy_process_group()

def prepare_dataloader(dataset, current_gpu_index, num_gpus, batch_size, shuffle=False):
    """Build a DataLoader that serves one process's shard of ``dataset``.

    Args:
        dataset: Dataset to split across processes.
        current_gpu_index: Rank of the calling process, passed to the sampler.
        num_gpus: Number of processes the dataset is split across.
        batch_size: Batch size of the returned loader (per process).
        shuffle: Whether the sampler shuffles the indices. Defaults to
            ``False``, the previous fixed behaviour; pass ``True`` for
            training data.

    Returns:
        A ``DataLoader`` driven by a ``DistributedSampler``.
    """
    sampler = data_dist.DistributedSampler(
        dataset,
        num_replicas=num_gpus,
        rank=current_gpu_index,
        shuffle=shuffle,
    )
    # Ordering is delegated entirely to the sampler, so the DataLoader's own
    # ``shuffle`` flag stays off.
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size, shuffle=False)
    return dataloader

def per_device_launch_fn(current_gpu_index, num_gpu):
    """Worker entry point: run the full training job on one GPU.

    Args:
        current_gpu_index: Index of this worker; used as the process rank and
            as the CUDA device index.
        num_gpu: Total number of workers (world size).
    """
    # Setup the process groups
    setup_device(current_gpu_index, num_gpu)
    try:
        print(f'current_gpu_index: {current_gpu_index}')

        # data augmentation and preprocessing
        transform = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, padding=4),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.243, 0.261])
        ])
        transform_val = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.243, 0.261])
        ])
        # load CIFAR-10 dataset
        train_dataset = datasets.CIFAR10(root='./data', train=True, download=False, transform=transform)
        valid_dataset = datasets.CIFAR10(root='./data', train=False, download=False, transform=transform_val)

        # prepare the dataloaders: training data is shuffled (as in the
        # single-GPU version), validation data keeps its order
        train_loader = prepare_dataloader(train_dataset, current_gpu_index, num_gpu, batch_size, shuffle=True)
        valid_loader = prepare_dataloader(valid_dataset, current_gpu_index, num_gpu, batch_size)

        # Build model, optimizer and loss
        model = timm.create_model('resnet18', pretrained=True, num_classes=10, in_chans=3)
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
        loss_fn = nn.CrossEntropyLoss()

        # Put model on device and wrap it for distributed training
        model = model.to(current_gpu_index)
        ddp_model = DDP(model, device_ids=[current_gpu_index], output_device=current_gpu_index)

        train_model(ddp_model, train_loader, valid_loader, num_epochs, optimizer, loss_fn)
    finally:
        # Release the process group even when training raises.
        cleanup()

def train_model(model, train_loader, valid_loader, num_epochs, optimizer, criterion):
    """Train ``model`` for ``num_epochs`` epochs inside an initialised process group.

    Each process runs this function on its own loaders and prints its own
    metrics: mean per-batch loss and accuracy in percent. Rank 0 writes the
    weights to the module-level ``model_path`` whenever its validation
    accuracy exceeds the best value it has seen.

    Args:
        model: Network to train; may be wrapped in ``DistributedDataParallel``.
        train_loader: Loader of training batches ``(inputs, labels)``.
        valid_loader: Loader of validation batches ``(inputs, labels)``.
        num_epochs: Number of epochs to run.
        optimizer: Optimizer stepping the model parameters.
        criterion: Loss function called as ``criterion(outputs, labels)``.

    NOTE(review): the metrics are not reduced across ranks; if the loaders are
    sharded per rank, the checkpoint decision reflects rank 0's shard only.
    """
    best_acc = 0.0
    train_sampler = getattr(train_loader, "sampler", None)
    for epoch in range(num_epochs):
        # A shuffling DistributedSampler derives its permutation from the epoch
        # number, so it must be told the epoch to reshuffle between epochs.
        if hasattr(train_sampler, "set_epoch"):
            train_sampler.set_epoch(epoch)

        model.train()
        running_loss = 0.0
        running_loss_count = 0
        correct = 0
        total = 0
        # train
        for inputs, labels in train_loader:
            inputs, labels = inputs.cuda(non_blocking=True), labels.cuda(non_blocking=True)

            # forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # backward propagation and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # calculate loss and accuracy
            running_loss += loss.item()  # loss value returned by criterion for this batch
            running_loss_count += 1
            _, predicted = outputs.max(1)  # max over dim 1 returns (values, indices)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()  # .item() converts a tensor to a scalar

        epoch_loss = running_loss / running_loss_count
        epoch_acc = 100. * correct / total
        # valid
        model.eval()
        running_loss_val = 0.0
        running_loss_count_val = 0
        correct_val = 0
        total_val = 0
        with torch.no_grad():
            for inputs, labels in valid_loader:
                inputs, labels = inputs.cuda(non_blocking=True), labels.cuda(non_blocking=True)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                running_loss_val += loss.item()
                running_loss_count_val += 1
                _, predicted = outputs.max(1)
                total_val += labels.size(0)
                correct_val += predicted.eq(labels).sum().item()

        epoch_loss_val = running_loss_val / running_loss_count_val
        epoch_acc_val = 100. * correct_val / total_val
        print(f'Epoch {epoch+1}/{num_epochs}: Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.2f}, Valid Loss: {epoch_loss_val:.4f}, Valid Acc: {epoch_acc_val:.2f}')
        # save model at rank=0
        if dist.get_rank() == 0 and epoch_acc_val > best_acc:
            print(f'Saving best model with validation accuracy: {epoch_acc_val:.2f}%')
            # Save the wrapped network rather than the DDP wrapper, so the
            # state_dict keys carry no "module." prefix and load directly into
            # a plain model.
            unwrapped = model.module if isinstance(model, DDP) else model
            torch.save(unwrapped.state_dict(), model_path)
            best_acc = epoch_acc_val

if __name__ == "__main__":
    # Launch one "spawn"-started worker process per visible GPU and wait for
    # all of them to finish. ``per_device_launch_fn`` takes the worker index
    # first, then the values in ``worker_args``.
    worker_args = (num_gpus,)
    torch.multiprocessing.start_processes(
        per_device_launch_fn,
        worker_args,
        nprocs=num_gpus,
        start_method="spawn",
        join=True,
    )

output
```
GPU 0: NVIDIA RTX A5000
GPU 1: NVIDIA RTX A5000
GPU 2: NVIDIA RTX A5000
GPU 3: NVIDIA RTX A5000
4 GPU in total on the node [cn39]

GPU 0: NVIDIA RTX A5000
GPU 1: NVIDIA RTX A5000
GPU 2: NVIDIA RTX A5000
GPU 3: NVIDIA RTX A5000
4 GPU in total on the node [cn39]

current_gpu_index: 2
Epoch 1/2: Train Loss: 1.3159, Train Acc: 54.38, Valid Loss: 0.8592, Valid Acc: 69.44
Epoch 2/2: Train Loss: 0.8251, Train Acc: 71.74, Valid Loss: 0.6585, Valid Acc: 76.80
GPU 0: NVIDIA RTX A5000
GPU 1: NVIDIA RTX A5000
GPU 2: NVIDIA RTX A5000
GPU 3: NVIDIA RTX A5000
4 GPU in total on the node [cn39]

current_gpu_index: 1
Epoch 1/2: Train Loss: 1.3193, Train Acc: 54.21, Valid Loss: 0.8807, Valid Acc: 69.68
Epoch 2/2: Train Loss: 0.8282, Train Acc: 71.26, Valid Loss: 0.6728, Valid Acc: 76.80
GPU 0: NVIDIA RTX A5000
GPU 1: NVIDIA RTX A5000
GPU 2: NVIDIA RTX A5000
GPU 3: NVIDIA RTX A5000
4 GPU in total on the node [cn39]

current_gpu_index: 0
Epoch 1/2: Train Loss: 1.3200, Train Acc: 53.60, Valid Loss: 0.8733, Valid Acc: 69.96
Saving best model with validation accuracy: 69.96%
Epoch 2/2: Train Loss: 0.8254, Train Acc: 71.49, Valid Loss: 0.6761, Valid Acc: 77.56
Saving best model with validation accuracy: 77.56%
GPU 0: NVIDIA RTX A5000
GPU 1: NVIDIA RTX A5000
GPU 2: NVIDIA RTX A5000
GPU 3: NVIDIA RTX A5000
4 GPU in total on the node [cn39]

current_gpu_index: 3
Epoch 1/2: Train Loss: 1.3211, Train Acc: 54.14, Valid Loss: 0.8391, Valid Acc: 70.36
Epoch 2/2: Train Loss: 0.8263, Train Acc: 71.31, Valid Loss: 0.6591, Valid Acc: 77.00
 
 
This program takes 0d:0h:0m:51s
```

### Model parallelism

Implementation

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import timm
import os
import socket

# --- run configuration ---
num_epochs = 2
batch_size = 64
model_path = './best.pth'

# --- GPUs visible on this node ---
hostname = socket.gethostname()
num_gpus = torch.cuda.device_count()
for i in range(num_gpus):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
print(f'{num_gpus} GPU in total on the node [{hostname}]\n')

# Base network whose sub-modules are distributed over the GPUs further down.
model = timm.create_model(
    'resnet18',
    pretrained=True,
    num_classes=10,
)

# Split the model manually into consecutive stages, one per device (GPU).
class SplitResNet18(nn.Module):
    """ResNet-18 cut into four sequential stages, each placed on its own device.

    Args:
        base_model: Network providing the attributes ``conv1``, ``bn1``,
            ``act1``, ``maxpool``, ``layer1`` .. ``layer4``, ``global_pool``
            and ``fc``. When omitted, the module-level ``model`` is used,
            which is the original behaviour.
        devices: Four devices, one per stage in forward order. Defaults to
            ``cuda:0`` .. ``cuda:3``.

    Raises:
        ValueError: If ``devices`` does not contain exactly four entries.
    """

    def __init__(self, base_model=None, devices=('cuda:0', 'cuda:1', 'cuda:2', 'cuda:3')):
        super().__init__()
        if base_model is None:
            # Backward-compatible fallback: read the module-level network.
            base_model = model
        devices = tuple(devices)
        if len(devices) != 4:
            raise ValueError(f'expected 4 devices (one per stage), got {len(devices)}')
        self.devices = devices

        # Stage 0: stem
        self.conv1_to_maxpool = nn.Sequential(
            base_model.conv1, base_model.bn1, base_model.act1, base_model.maxpool
        ).to(devices[0])
        # Stage 1
        self.layer1_to_layer2 = nn.Sequential(base_model.layer1, base_model.layer2).to(devices[1])
        # Stage 2
        self.layer3_to_layer4 = nn.Sequential(base_model.layer3, base_model.layer4).to(devices[2])
        # Stage 3: pooling and classifier
        self.global_pool_to_fc = nn.Sequential(base_model.global_pool, base_model.fc).to(devices[3])

    def forward(self, x):
        """Run ``x`` through the four stages, moving it to each stage's device first.

        The result stays on the last stage's device.
        """
        stages = (
            self.conv1_to_maxpool,
            self.layer1_to_layer2,
            self.layer3_to_layer4,
            self.global_pool_to_fc,
        )
        for stage, device in zip(stages, self.devices):
            x = stage(x.to(device))
        return x

# --- preprocessing ---
# Random flip and padded random crop are applied to the training images only;
# tensor conversion and normalisation are shared by both pipelines.
_augment_steps = [
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
]
_tensor_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.247, 0.243, 0.261]),
]
transform = transforms.Compose(_augment_steps + _tensor_steps)
transform_val = transforms.Compose(list(_tensor_steps))

# --- CIFAR-10 datasets (read from ./data, no download) and their loaders ---
train_dataset = datasets.CIFAR10(
    root='./data', train=True, download=False, transform=transform
)
valid_dataset = datasets.CIFAR10(
    root='./data', train=False, download=False, transform=transform_val
)

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

def train_model(model, train_loader, valid_loader, num_epochs, optimizer, criterion):
    """Train and validate a model whose parameters may live on several devices.

    At the start of every epoch the device of the model's first parameter is
    looked up and printed. Batches are moved to that device, and the model's
    outputs are moved back to it before the loss is computed. Per epoch the
    mean per-batch loss and the accuracy in percent are printed, and the
    ``state_dict`` is saved to the module-level ``model_path`` when the
    validation accuracy improves.
    """
    best_acc = 0.0

    for epoch in range(num_epochs):
        model.train()

        device = next(model.parameters()).device
        print(f'current device: {device}')

        def on_device(tensor):
            # Asynchronous-capable copy to the device looked up above.
            return tensor.to(device, non_blocking=True)

        # ---- train ----
        loss_sum, n_batches, n_correct, n_samples = 0.0, 0, 0, 0
        for inputs, labels in train_loader:
            inputs = on_device(inputs)
            labels = on_device(labels)

            # forward pass; bring the outputs back next to the labels
            outputs = model(inputs).to(device)
            loss = criterion(outputs, labels)

            # backward propagation and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            n_batches += 1
            n_samples += labels.size(0)
            n_correct += outputs.max(1)[1].eq(labels).sum().item()

        epoch_loss = loss_sum / n_batches
        epoch_acc = 100. * n_correct / n_samples

        # ---- valid ----
        model.eval()
        val_loss_sum, val_batches, val_correct, val_samples = 0.0, 0, 0, 0
        with torch.no_grad():
            for inputs, labels in valid_loader:
                inputs = on_device(inputs)
                labels = on_device(labels)
                outputs = model(inputs).to(device)

                val_loss_sum += criterion(outputs, labels).item()
                val_batches += 1
                val_samples += labels.size(0)
                val_correct += outputs.max(1)[1].eq(labels).sum().item()

        epoch_loss_val = val_loss_sum / val_batches
        epoch_acc_val = 100. * val_correct / val_samples
        print(f'Epoch {epoch+1}/{num_epochs}: Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.2f}, Valid Loss: {epoch_loss_val:.4f}, Valid Acc: {epoch_acc_val:.2f}')

        # ---- save model on improvement ----
        if epoch_acc_val > best_acc:
            print(f'Saving best model with validation accuracy: {epoch_acc_val:.2f}%')
            torch.save(model.state_dict(), model_path)
            best_acc = epoch_acc_val

if __name__ == "__main__":
    # Rebinds the module-level ``model`` from the base network to its split wrapper.
    model = SplitResNet18()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-3)

    train_model(
        model,
        train_loader,
        valid_loader,
        num_epochs,
        optimizer,
        criterion,
    )

output
```
GPU 0: NVIDIA RTX A5000
GPU 1: NVIDIA RTX A5000
GPU 2: NVIDIA RTX A5000
GPU 3: NVIDIA RTX A5000
4 GPU in total on the node [cn39]

current device: cuda:0
Epoch 1/2: Train Loss: 1.1441, Train Acc: 60.03, Valid Loss: 0.7721, Valid Acc: 73.44
Saving best model with validation accuracy: 73.44%
current device: cuda:0
Epoch 2/2: Train Loss: 0.7518, Train Acc: 74.11, Valid Loss: 0.6648, Valid Acc: 76.91
Saving best model with validation accuracy: 76.91%
 
 
This program takes 0d:0h:1m:20s
```

### SLURM information

```
$ sinfo
PARTITION     AVAIL  TIMELIMIT  NODES  STATE NODELIST
GPU11Go          up 2-00:00:00     13    mix cn[01-07,09-11,15,17,41]
GPU11Go          up 2-00:00:00      1   idle cn16
GPU24Go          up 1-00:00:00      9   idle cn[18,22,24-25,35,38-40,53]
GPU48Go          up 1-00:00:00      1  down* cn60
GPU48Go          up 1-00:00:00      3    mix cn[37,48,57]
GPU48Go          up 1-00:00:00      3   idle cn[20,42,52]
GPU96Go          up 1-00:00:00      3    mix cn[51,58-59]
GPU96Go          up 1-00:00:00      1  alloc cn50
Tests-GPU24Go    up       5:00      1  down* cn54
WS-CPU1*         up 4-00:00:00      3  down* cn[32-33,46]
WS-CPU1*         up 4-00:00:00      7    mix cn[08,12-14,19,26,34]
WS-CPU1*         up 4-00:00:00      6  alloc cn[21,27-29,31,36]
WS-CPU2          up 4-00:00:00     12    mix cn[01-07,09-11,15,17]
Serveurs-CPU     up 10-00:00:0      8    mix cn[30,43-45,47,49,55-56]
```

```
#!/bin/bash 
#SBATCH --partition=GPU24Go   # partition name
##SBATCH --nodes=2            # total number of nodes
#SBATCH --nodelist=cn39       # Or specify a specific node check via sinfo
#SBATCH --ntasks-per-node=1   # number of tasks per node, if >1 will repeat task
#SBATCH --gres=gpu:4          # number of GPUs reserved per node
#SBATCH --cpus-per-task=4     # number of CPUs per task
##SBATCH --mem-per-cpu=1500MB # memory of per CPU
#SBATCH -J multigpu           # job name
#SBATCH --out=%J/result.txt   # log file
#SBATCH --error=%J/error.txt  # log file
##SBATCH --array=0-1          # array 2 jobs
#SBATCH --array=1             # array 1 job
```

SLURM in python

In [None]:
# Print the GPUs of this node and the SLURM variables describing the current task.
hostname = socket.gethostname()
num_gpus = torch.cuda.device_count()
print(f'{num_gpus} GPU in total on this node {hostname}')
num_epochs = 2
batch_size = 64

for i in range(num_gpus):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

# get SLURM variables
global_rank = int(os.environ['SLURM_PROCID'])                         # The global ranking of the current process among all processes
local_rank = int(os.environ['SLURM_LOCALID'])                         # The local rank of the current process on its node
number_tasks = int(os.environ['SLURM_NTASKS'])                        # The total number of tasks in the job
# SLURM_CPUS_PER_TASK is only exported when --cpus-per-task was requested;
# fall back to SLURM's default of one CPU per task.
cpus_per_task = int(os.environ.get('SLURM_CPUS_PER_TASK', 1))         # The number of CPUs allocated to each task
nodelist = hostlist.expand_hostlist(os.environ['SLURM_JOB_NODELIST']) # the list of nodes, expanded to one name per entry
# SLURM_STEP_GPUS is set inside job steps (srun) only; a plain batch script
# gets SLURM_JOB_GPUS instead. Use whichever is available.
_gpu_spec = os.environ.get('SLURM_STEP_GPUS') or os.environ.get('SLURM_JOB_GPUS', '')
gpu_ids = _gpu_spec.split(",") if _gpu_spec else []                   # The list of GPU IDs allocated to the job

print(f'  global_rank: {global_rank}')
print(f'   local_rank: {local_rank}')
print(f' number_tasks: {number_tasks}')
print(f'cpus_per_task: {cpus_per_task}')
print(f'     nodelist: {nodelist}')
print(f'      gpu_ids: {gpu_ids}')

output
```
4 GPU in total on this node cn39
GPU 0: NVIDIA RTX A5000
GPU 1: NVIDIA RTX A5000
GPU 2: NVIDIA RTX A5000
GPU 3: NVIDIA RTX A5000
  global_rank: 0
   local_rank: 0
 number_tasks: 1
cpus_per_task: 4
     nodelist: ['cn39']
      gpu_ids: ['0', '1', '2', '3']
```

### GPU information

In [16]:
# dell workstation
import socket

import torch

# GPU inventory
num_gpus = torch.cuda.device_count()
print(f'{num_gpus} GPU in total on this machine')
for i in range(num_gpus):
    gpu_name = torch.cuda.get_device_name(i)
    print(f"GPU {i}: {gpu_name}")

# Host name
hostname = socket.gethostname()
print(f"Node name (hostname): {hostname}")

# Library versions
print('PyTorch version:', torch.__version__)  # PyTorch version
print('CUDA version:', torch.version.cuda)    # CUDA version

1 GPU in total on this machine
GPU 0: Quadro RTX 5000
Node name (hostname): DESKTOP-7GS0DEP
PyTorch version: 2.4.1+cu124
CUDA version: 12.4


In [5]:
# thinkbook
import socket

import torch

# GPU inventory
num_gpus = torch.cuda.device_count()
print(f'{num_gpus} GPU in total on this machine')
for i in range(num_gpus):
    gpu_name = torch.cuda.get_device_name(i)
    print(f"GPU {i}: {gpu_name}")

# Host name
hostname = socket.gethostname()
print(f"Node name (hostname): {hostname}")

# Library versions
print('PyTorch version:', torch.__version__)  # PyTorch version
print('CUDA version:', torch.version.cuda)    # CUDA version

1 GPU in total on this machine
GPU 0: NVIDIA GeForce RTX 3050 4GB Laptop GPU
Node name (hostname): Lenovo
PyTorch version: 2.4.1+cu124
CUDA version: 12.4


## Multi-GPU on multi nodes with SLURM (todo)