In [3]:
import torch
import torch_directml

# 1. Initialize the DirectML device
device = torch_directml.device()
print(f"DirectML Device: {device}")

# 2. Check if the device is actually available
# (DirectML usually defaults to CPU if no compatible GPU is found)
try:
  # Create a tensor directly on the DirectML device
  x = torch.tensor([1.0, 2.0, 3.0]).to(device)
  y = torch.tensor([4.0, 5.0, 6.0]).to(device)

  # Perform a simple calculation
  z = x + y

  print("Successfully connected to GPU!")
  print(f"Result of tensor addition: {z}")
  print(f"Tensor is on device: {z.device}")

except Exception as e:
  print("Could not access the GPU via DirectML.")
  print(f"Error: {e}")

DirectML Device: privateuseone:0
Successfully connected to GPU!
Result of tensor addition: tensor([5., 7., 9.], device='privateuseone:0')
Tensor is on device: privateuseone:0


In [4]:
import torch
import torch_directml
import time

# --- CONFIGURATION ---
# Size of the square matrix (N x N).
# 4096 is ~100MB per tensor. 8192 is ~400MB.
# Adjust 'MATRIX_SIZE' based on your RAM/VRAM.
MATRIX_SIZE = 8192


def get_device():
  if torch_directml.is_available():
    return torch_directml.device()
  return torch.device("cpu")


def stress_test(device, size):
  print(f"--- Preparing Data on {device} ---")
  # Generate random matrices
  # We use float32 (standard precision)
  x = torch.randn(size, size, device=device)
  y = torch.randn(size, size, device=device)

  print(f"--- Started Computation on {device} ---")
  start_time = time.perf_counter()

  # The heavy operation: Matrix Multiplication
  z = torch.mm(x, y)

  # FORCE SYNCHRONIZATION
  # We pull one value to CPU to ensure the GPU has actually finished
  # the calculation before we stop the clock.
  _ = z.min().item()

  end_time = time.perf_counter()
  return end_time - start_time


if __name__ == "__main__":
  dml_device = get_device()
  cpu_device = torch.device("cpu")

  print(f"Matrix Size: {MATRIX_SIZE}x{MATRIX_SIZE}")
  print("Warming up GPU...")
  # Warm up to remove initialization overhead
  warmup = torch.randn(1024, 1024, device=dml_device)
  torch.mm(warmup, warmup)

  # 1. Test GPU
  print(f"\nTesting GPU ({dml_device})...")
  try:
    gpu_time = stress_test(dml_device, MATRIX_SIZE)
    print(f"GPU Time: {gpu_time:.4f} seconds")
  except Exception as e:
    print(f"GPU Failed: {e}")
    gpu_time = None

  # 2. Test CPU
  print(f"\nTesting CPU...")
  try:
    cpu_time = stress_test(cpu_device, MATRIX_SIZE)
    print(f"CPU Time: {cpu_time:.4f} seconds")
  except KeyboardInterrupt:
    print("CPU test stopped manually (it was taking too long!)")
    cpu_time = None

  # 3. Results
  if gpu_time and cpu_time:
    speedup = cpu_time / gpu_time
    print("\n" + "="*30)
    print(f"RESULT: GPU is {speedup:.2f}x faster than CPU")
    print("="*30)

Matrix Size: 8192x8192
Warming up GPU...


tensor([[ 38.4022,  25.3507,  19.4983,  ...,  22.6373,  -8.0272, -38.5622],
        [ 31.2668,   7.4328,  40.5782,  ...,   7.9537,  18.3317, -25.9368],
        [ -9.6734,  16.4145,  -0.2959,  ...,  -4.6618,  69.1028,  -2.6722],
        ...,
        [-37.9909,  27.2076,  16.3519,  ...,  35.0770,   1.7056,   3.3922],
        [ -4.0383,  14.5772,  -3.2566,  ...,  18.1526, -34.8598,   1.3983],
        [ -1.3562,  54.9273,  34.9020,  ...,  15.0746,  21.2836,  17.9908]],
       device='privateuseone:0')


Testing GPU (privateuseone:0)...
--- Preparing Data on privateuseone:0 ---
--- Started Computation on privateuseone:0 ---
GPU Time: 0.4937 seconds

Testing CPU...
--- Preparing Data on cpu ---
--- Started Computation on cpu ---
CPU Time: 5.2707 seconds

RESULT: GPU is 10.68x faster than CPU


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch_directml
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import time

# --- CONFIGURATION ---
BATCH_SIZE = 128    # Higher batch size uses more VRAM
EPOCHS = 5          # Number of times to loop through the ENTIRE dataset
# (MNIST is 60,000 images, so 5 epochs = 300,000 image passes)

# --- THE NEURAL NETWORK ---


class StressCNN(nn.Module):
  def __init__(self):
    super(StressCNN, self).__init__()
    # A standard CNN architecture
    self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
    self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
    self.fc1 = nn.Linear(64 * 7 * 7, 128)
    self.fc2 = nn.Linear(128, 10)
    self.relu = nn.ReLU()

  def forward(self, x):
    x = self.pool(self.relu(self.conv1(x)))
    x = self.pool(self.relu(self.conv2(x)))
    x = x.view(-1, 64 * 7 * 7)  # Flatten
    x = self.relu(self.fc1(x))
    x = self.fc2(x)
    return x


def train_model(device_name, device_obj, dataloader):
  print(f"\n--- Starting Training on {device_name.upper()} ---")

  model = StressCNN().to(device_obj)
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters(), lr=0.001)

  model.train()  # Set to training mode

  start_time = time.perf_counter()

  for epoch in range(EPOCHS):
    running_loss = 0.0
    # Iterate over batches
    for i, (images, labels) in enumerate(dataloader):
      # 1. Move data to device (CRITICAL STEP FOR SPEED)
      images, labels = images.to(device_obj), labels.to(device_obj)

      # 2. Zero gradients
      optimizer.zero_grad()

      # 3. Forward pass
      outputs = model(images)
      loss = criterion(outputs, labels)

      # 4. Backward pass (Heavy computation)
      loss.backward()
      optimizer.step()

      running_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS} complete. Avg Loss: {running_loss/len(dataloader):.4f}")

  # Force synchronization for DirectML/GPU to ensure timing is accurate
  if "cpu" not in str(device_obj):
    # We perform a small read to force the GPU to finish all pending work
    _ = torch.tensor([1]).to(device_obj).cpu()

  end_time = time.perf_counter()
  duration = end_time - start_time
  print(f"--> {device_name} finished in {duration:.4f} seconds")
  return duration


if __name__ == "__main__":
  # 1. Setup Data
  print("Downloading/Loading MNIST Data...")
  transform = transforms.Compose([
      transforms.ToTensor(),
      transforms.Normalize((0.1307,), (0.3081,))
  ])
  dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
  dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
  print("Data Ready.\n")

  # 2. Define Devices
  dml_device = torch_directml.device() if torch_directml.is_available() else None
  cpu_device = torch.device("cpu")

  # 3. Run Benchmarks
  gpu_time = 0
  if dml_device:
    # We run GPU first
    gpu_time = train_model("GPU (DirectML)", dml_device, dataloader)
  else:
    print("DirectML not found. Skipping GPU test.")

  # Run CPU
  # WARNING: CPU training on CNNs is notoriously slow.
  cpu_time = train_model("CPU", cpu_device, dataloader)

  # 4. Comparison
  if gpu_time > 0:
    speedup = cpu_time / gpu_time
    print("\n" + "="*40)
    print(f"FINAL RESULT:")
    print(f"GPU Time: {gpu_time:.2f}s")
    print(f"CPU Time: {cpu_time:.2f}s")
    print(f"Speedup:  {speedup:.2f}x FASTER on GPU")
    print("="*40)

Downloading/Loading MNIST Data...
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data\MNIST\raw\train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:07<00:00, 1357198.75it/s]


Extracting ./data\MNIST\raw\train-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data\MNIST\raw\train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 125415.76it/s]


Extracting ./data\MNIST\raw\train-labels-idx1-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data\MNIST\raw\t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:01<00:00, 1073828.82it/s]


Extracting ./data\MNIST\raw\t10k-images-idx3-ubyte.gz to ./data\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 4507933.93it/s]


Extracting ./data\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./data\MNIST\raw

Data Ready.


--- Starting Training on GPU (DIRECTML) ---


  torch._foreach_lerp_(device_exp_avgs, device_grads, 1 - beta1)


Epoch 1/5 complete. Avg Loss: 0.1879
Epoch 2/5 complete. Avg Loss: 0.0492
Epoch 3/5 complete. Avg Loss: 0.0352
Epoch 4/5 complete. Avg Loss: 0.0265
Epoch 5/5 complete. Avg Loss: 0.0190
--> GPU (DirectML) finished in 129.7837 seconds

--- Starting Training on CPU ---
Epoch 1/5 complete. Avg Loss: 0.1510
Epoch 2/5 complete. Avg Loss: 0.0471
Epoch 3/5 complete. Avg Loss: 0.0318
Epoch 4/5 complete. Avg Loss: 0.0227
Epoch 5/5 complete. Avg Loss: 0.0165
--> CPU finished in 241.7296 seconds

FINAL RESULT:
GPU Time: 129.78s
CPU Time: 241.73s
Speedup:  1.86x FASTER on GPU


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch_directml
import time
import warnings

# Suppress the warning so we can see the timing results clearly
warnings.filterwarnings("ignore", message=".*aten::lerp.Scalar_out.*")

# Setup
dml = torch_directml.device()
print(f"--- Optimization Stress Test on {dml} ---\n")

# A dummy model and data
model = nn.Linear(4096, 4096).to(dml)
data = torch.randn(1024, 4096, device=dml)
target = torch.randn(1024, 4096, device=dml)


def test_optimizer(opt_class, name, **kwargs):
  # Reset model parameters to ensure fair test
  model.reset_parameters()
  optimizer = opt_class(model.parameters(), **kwargs)
  criterion = nn.MSELoss()

  print(f"Testing {name}...")
  start = time.perf_counter()

  # Run 50 optimization steps
  for _ in range(50):
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()  # <--- This is where Adam hits the CPU wall

  # Sync GPU
  _ = torch.tensor([1]).to(dml).cpu()

  duration = time.perf_counter() - start
  print(f"-> Time: {duration:.4f} seconds\n")
  return duration


# 1. Test Adam (Will likely trigger hidden CPU fallbacks)
adam_time = test_optimizer(optim.Adam, "Adam (uses lerp -> CPU Fallback)", lr=0.001)

# 2. Test SGD (Should stay 100% on GPU)
sgd_time = test_optimizer(optim.SGD, "SGD (Pure GPU)", lr=0.01)

print("="*40)
if adam_time > sgd_time:
  print(f"SGD is {adam_time / sgd_time:.2f}x faster than Adam on DirectML!")
  print("Recommendation: Use SGD with Momentum for now.")
else:
  print("Speeds are similar (Fallback might be negligible for this model size).")
print("="*40)

--- Optimization Stress Test on privateuseone:0 ---

Testing Adam (uses lerp -> CPU Fallback)...
-> Time: 8.1624 seconds

Testing SGD (Pure GPU)...
-> Time: 4.7863 seconds

SGD is 1.71x faster than Adam on DirectML!
Recommendation: Use SGD with Momentum for now.


In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch_directml
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import time
import warnings
import gc

# --- CONFIGURATION ---
# 64 is very safe for 4GB VRAM.
# It creates more "overhead" (more CPU-GPU chatter), but it won't crash.
BATCH_SIZE = 64
EPOCHS = 3       # Reduced to 3 to get quick results

warnings.filterwarnings("ignore")


class HeavyCNN(nn.Module):
  def __init__(self):
    super(HeavyCNN, self).__init__()
    # We keep the model heavy to force computation
    self.features = nn.Sequential(
        nn.Conv2d(1, 64, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(2),
        nn.Conv2d(64, 128, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.MaxPool2d(2),
        nn.Conv2d(128, 256, kernel_size=3, padding=1),
        nn.ReLU(),
    )
    self.classifier = nn.Sequential(
        nn.Flatten(),
        nn.Linear(256 * 7 * 7, 512),
        nn.ReLU(),
        nn.Linear(512, 10)
    )

  def forward(self, x):
    x = self.features(x)
    x = self.classifier(x)
    return x


def run_benchmark(device_name, device_obj, dataloader):
  print(f"\n--- Starting Benchmark on {device_name} ---")

  # CLEAR MEMORY FIRST
  gc.collect()
  torch.cuda.empty_cache() if torch.cuda.is_available() else None

  try:
    model = HeavyCNN().to(device_obj)
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    # Warmup
    print("Warming up...", end="\r")
    dummy = torch.randn(BATCH_SIZE, 1, 28, 28).to(device_obj)
    _ = model(dummy)

    start_time = time.perf_counter()
    model.train()

    for epoch in range(EPOCHS):
      running_loss = 0.0
      for i, (images, labels) in enumerate(dataloader):
        images = images.to(device_obj, non_blocking=True)
        labels = labels.to(device_obj, non_blocking=True)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

      # Print progress every epoch so you know it's not frozen
      print(f"Epoch {epoch+1}/{EPOCHS} | Avg Loss: {running_loss/len(dataloader):.4f}")

    # Sync
    if "cpu" not in str(device_obj):
      _ = torch.tensor([1]).to(device_obj).cpu()

    duration = time.perf_counter() - start_time
    print(f"--> {device_name} Time: {duration:.4f} seconds")
    return duration

  except RuntimeError as e:
    print(f"\nCRITICAL ERROR on {device_name}: {e}")
    return None


if __name__ == "__main__":
  print(f"Setup: Batch Size={BATCH_SIZE} | Epochs={EPOCHS}")

  # Data Setup
  transform = transforms.ToTensor()
  train_data = datasets.MNIST('./data', train=True, download=True, transform=transform)

  # GPU Loader
  gpu_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True,
                          pin_memory=True, num_workers=2)

  # CPU Loader
  cpu_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True,
                          pin_memory=False, num_workers=0)

  dml = torch_directml.device()
  cpu = torch.device("cpu")

  # 1. GPU Test
  gpu_time = run_benchmark("GPU (DirectML)", dml, gpu_loader)

  # 2. CPU Test
  if gpu_time:
    cpu_time = run_benchmark("CPU", cpu, cpu_loader)

    # 3. Final Report
    print("\n" + "="*40)
    print(f"FINAL RESULTS (Batch {BATCH_SIZE}):")
    print(f"GPU Time: {gpu_time:.2f}s")
    print(f"CPU Time: {cpu_time:.2f}s")
    speedup = cpu_time / gpu_time
    print(f"Speedup:  {speedup:.2f}x FASTER on GPU")
    print("="*40)

Setup: Batch Size=64 | Epochs=3

--- Starting Benchmark on GPU (DirectML) ---
Epoch 1/3 | Avg Loss: 0.3042
Epoch 2/3 | Avg Loss: 0.0595
Epoch 3/3 | Avg Loss: 0.0399
--> GPU (DirectML) Time: 51.5093 seconds

--- Starting Benchmark on CPU ---
Epoch 1/3 | Avg Loss: 0.3124
Epoch 2/3 | Avg Loss: 0.0583
Epoch 3/3 | Avg Loss: 0.0381
--> CPU Time: 496.0633 seconds

FINAL RESULTS (Batch 64):
GPU Time: 51.51s
CPU Time: 496.06s
Speedup:  9.63x FASTER on GPU
