In [1]:
# BOILER PLATE, MUST BE RUN ON SUBMIT NODE
%load_ext autoreload
%autoreload 2

import socket
import psutil
import torch 

hostname = socket.gethostname()
print("Host name:", hostname)
num_cpus = psutil.cpu_count()
print("Number of CPUs:", num_cpus)
total_memory = psutil.virtual_memory().total / (1024 ** 3)
print("Total memory (GB):", round(total_memory, 2))


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Host name: h100-st-p548xlarge-10
Number of CPUs: 192
Total memory (GB): 1999.96
Using device: cuda


In [26]:
import numpy as np
from datasets import Dataset
import pynvml

def print_gpu_utilization(devices=0):
    pynvml.nvmlInit()
    if not isinstance(devices, list):
        devices = [devices]
    
    for device in devices:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU-{device} memory occupied: {info.used//1024**2} MB.")
    
def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [29]:
print("Initial")
print_gpu_utilization([0, 1])

Initial
GPU-0 memory occupied: 3657 MB.
GPU-1 memory occupied: 567 MB.


In [30]:
print("After loading tiny tensor and the kernels")
torch.ones((1, 1)).to("cuda")
print_gpu_utilization([0, 1])

After loading tiny tensor and the kernels
GPU-0 memory occupied: 3657 MB.
GPU-1 memory occupied: 567 MB.


# Load model

In [34]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, logging
logging.set_verbosity_error()

In [31]:
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-large-uncased").to("cuda")
print_gpu_utilization([0, 1])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU-0 memory occupied: 3657 MB.
GPU-1 memory occupied: 567 MB.


# Train model

In [36]:
seq_len, dataset_size = 512, 512

dummy_data = {
    "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
    "labels": np.random.randint(0, 1, (dataset_size)),
}

dummy_data['input_ids'].shape, dummy_data['labels'].shape

ds = Dataset.from_dict(dummy_data)
ds.set_format("pt")

default_args = {
    "output_dir": "tmp",
    "eval_strategy": "steps",
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

In [38]:
training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)



{'train_runtime': 34.1721, 'train_samples_per_second': 14.983, 'train_steps_per_second': 0.468, 'train_loss': 0.20920303463935852, 'epoch': 1.0}
Time: 34.17
Samples/second: 14.98
GPU-0 memory occupied: 13483 MB.


In [39]:
training_args = TrainingArguments(per_device_train_batch_size=8, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)



{'train_runtime': 6.9119, 'train_samples_per_second': 74.076, 'train_steps_per_second': 1.157, 'train_loss': 0.04293365031480789, 'epoch': 1.0}
Time: 6.91
Samples/second: 74.08
GPU-0 memory occupied: 16491 MB.


In [40]:
training_args = TrainingArguments(per_device_train_batch_size=16, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)



{'train_runtime': 9.2333, 'train_samples_per_second': 55.452, 'train_steps_per_second': 0.433, 'train_loss': 0.005023417994379997, 'epoch': 1.0}
Time: 9.23
Samples/second: 55.45
GPU-0 memory occupied: 23461 MB.
