First, make sure PyTorch is installed with CUDA support:

In [None]:
pip install torch torchvision --index-url https://download.pytorch.org/whl/cu128

Afterwards, it is a good idea to verify your PyTorch installation:

In [None]:
pip show torch

Let's set up some common parameters:

In [None]:
# --- Training schedule -------------------------------------------------------
EPOCHS = 3                # number of passes of the outer training loop
STEPS_PER_EPOCH = 1_000   # optimizer steps performed in each epoch
BATCH_SIZE = 4_096        # rows in every synthetic batch

# --- Model / data dimensions -------------------------------------------------
FEATURES = 256            # input width of the first linear layer
CLASSES = 10              # output width of the last linear layer (label range)
HIDDEN = 512              # width of the hidden layer

# --- Optimizer ---------------------------------------------------------------
LR = 1e-2                 # learning rate passed to SGD

The code below runs a basic PyTorch training job without using the GPU. You select where the computation runs through **device**.

In [None]:
import time
import torch
import torch.nn as nn
import torch.nn.functional as F

# Replace the literal False with True to run this cell on the GPU; the
# availability check keeps the flag False on machines without CUDA.
USE_CUDA = False and torch.cuda.is_available()

if USE_CUDA and torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
torch.manual_seed(42)

# Two-layer perceptron: FEATURES -> HIDDEN -> CLASSES.
layers = [
    nn.Linear(FEATURES, HIDDEN),
    nn.ReLU(),
    nn.Linear(HIDDEN, CLASSES),
]
model = nn.Sequential(*layers).to(device)

opt = torch.optim.SGD(model.parameters(), lr=LR)


def _synthetic_batch():
    """Draw one random (inputs, labels) batch directly on ``device``.

    Data is generated on the fly so the benchmark involves no I/O.
    """
    inputs = torch.randn(BATCH_SIZE, FEATURES, device=device)
    labels = torch.randint(0, CLASSES, (BATCH_SIZE,), device=device)
    return inputs, labels


t0 = time.perf_counter()
for epoch in range(1, EPOCHS + 1):
    model.train()
    epoch_start = time.perf_counter()
    running_loss = 0.0

    for step in range(1, STEPS_PER_EPOCH + 1):
        inputs, labels = _synthetic_batch()

        # Standard SGD step: clear grads, forward, backward, update.
        opt.zero_grad(set_to_none=True)
        loss = F.cross_entropy(model(inputs), labels)
        loss.backward()
        opt.step()

        running_loss += loss.item()

        # Report only every 500 steps and on the final step of the epoch.
        is_last_step = step == STEPS_PER_EPOCH
        if not (is_last_step or step % 500 == 0):
            continue

        elapsed = time.perf_counter() - t0
        mean_loss = running_loss / step
        print(f"epoch {epoch}/{EPOCHS}  step {step}/{STEPS_PER_EPOCH}  "
              f"loss {mean_loss:.4f}  elapsed {elapsed:.1f}s", flush=True)

    epoch_seconds = time.perf_counter() - epoch_start
    print(f"epoch {epoch} done in {epoch_seconds:.1f}s")

total_seconds = round(time.perf_counter() - t0, 1)
print("total time:", total_seconds, "s")

Before trying to use the GPU, be sure that it is available.

In [None]:
# Run `nvidia-smi` in the system shell (IPython `!` escape) to check that a GPU is visible.
!nvidia-smi

Then, we must check that CUDA is installed. If it is not, the NVIDIA site provides installation instructions for each OS:

In [None]:
# Ask `nvcc` for its version through the system shell (IPython `!` escape).
# NOTE(review): a "command not found" error presumably means the CUDA toolkit
# is missing or not on PATH — confirm for your OS.
!nvcc --version

Now, it's time to use the GPU (and compare):

In [None]:
import time
import torch
import torch.nn as nn
import torch.nn.functional as F

# GPU is requested here; the availability check falls back to the CPU when no
# CUDA device can be used. Replace True with False to force the CPU.
USE_CUDA = True and torch.cuda.is_available()

device = torch.device("cuda" if USE_CUDA and torch.cuda.is_available() else "cpu")
torch.manual_seed(42)

# tiny MLP
model = nn.Sequential(
    nn.Linear(FEATURES, HIDDEN),
    nn.ReLU(),
    nn.Linear(HIDDEN, CLASSES),
).to(device)

opt = torch.optim.SGD(model.parameters(), lr=LR)


def _wait_for_device():
    """Block until all queued CUDA work has finished (no-op on the CPU).

    CUDA kernels are launched asynchronously, so a wall-clock timer is only
    meaningful once the device has actually drained its queue.
    """
    if device.type == "cuda":
        torch.cuda.synchronize()


_wait_for_device()  # do not charge the model transfer to the training time
t0 = time.perf_counter()
for epoch in range(1, EPOCHS + 1):
    model.train()
    epoch_start = time.perf_counter()
    # Keep the running loss on the device: calling .item() on every step would
    # force a host/device synchronization per iteration and stall the GPU.
    # float64 keeps the same accumulation precision as a Python float.
    loss_accum = torch.zeros((), dtype=torch.float64, device=device)

    for step in range(1, STEPS_PER_EPOCH + 1):
        # generate synthetic batch on the fly (no I/O, keeps code simple)
        xb = torch.randn(BATCH_SIZE, FEATURES, device=device)
        yb = torch.randint(0, CLASSES, (BATCH_SIZE,), device=device)

        opt.zero_grad(set_to_none=True)
        logits = model(xb)
        loss = F.cross_entropy(logits, yb)
        loss.backward()
        opt.step()

        loss_accum += loss.detach()

        # occasional heartbeat so you can see progress
        if step % 500 == 0 or step == STEPS_PER_EPOCH:
            # .item() copies the value to the host, which waits for the queued
            # work; read the clock afterwards so `elapsed` covers finished steps.
            mean_loss = loss_accum.item() / step
            elapsed = time.perf_counter() - t0
            print(f"epoch {epoch}/{EPOCHS}  step {step}/{STEPS_PER_EPOCH}  "
                  f"loss {mean_loss:.4f}  elapsed {elapsed:.1f}s", flush=True)

    _wait_for_device()
    print(f"epoch {epoch} done in {time.perf_counter() - epoch_start:.1f}s")

_wait_for_device()
print("total time:", round(time.perf_counter() - t0, 1), "s")