# Apple MPS GPU Acceleration Demo

This notebook demonstrates how to use Apple's Metal Performance Shaders (MPS) for GPU acceleration on Apple Silicon Macs (M1/M2/M3).

MPS provides GPU acceleration for PyTorch operations on Apple Silicon, offering significant speedups for machine learning workloads.

In [None]:
import torch
import torch.nn as nn
import time
import matplotlib.pyplot as plt
import numpy as np

# Report the installed PyTorch version together with the two MPS backend
# status flags, one per output line.
print(
    f"PyTorch version: {torch.__version__}",
    f"MPS available: {torch.backends.mps.is_available()}",
    f"MPS built: {torch.backends.mps.is_built()}",
    sep="\n",
)

## Device Selection

A recommended pattern for selecting a device: prefer MPS on Apple Silicon, then fall back to CUDA, and finally to the CPU:

In [None]:
# Device selection for Apple Silicon.
# Accelerators are probed in priority order; each entry pairs a backend name
# with its availability check and the banner printed when it is chosen.
_accelerators = (
    (
        "mps",
        torch.backends.mps.is_available,
        "Using Apple MPS (Metal Performance Shaders) for GPU acceleration",
    ),
    (
        "cuda",
        torch.cuda.is_available,
        "Using NVIDIA CUDA for GPU acceleration",
    ),
)

for _backend, _is_available, _banner in _accelerators:
    if _is_available():
        device = torch.device(_backend)
        print(_banner)
        break
else:
    # No accelerator reported itself as available: run on the CPU.
    device = torch.device("cpu")
    print("Using CPU (no GPU acceleration available)")

print(f"Selected device: {device}")

## Basic Tensor Operations

In [None]:
# Allocate two 1000x1000 standard-normal matrices directly on the selected
# device (x is drawn first, then y).
x, y = (torch.randn(1000, 1000, device=device) for _ in range(2))

print(f"Tensor x device: {x.device}")
print(f"Tensor y device: {y.device}")

# Matrix product via the @ operator; the result stays on the same device.
z = x @ y
print(f"Result tensor device: {z.device}")
print(f"Result tensor shape: {z.shape}")

## Performance Comparison: CPU vs MPS

In [None]:
def _synchronize(device):
    """Wait for all work queued on ``device`` to finish (no-op on CPU)."""
    if device.type == 'mps':
        torch.mps.synchronize()
    elif device.type == 'cuda':
        torch.cuda.synchronize()


def benchmark_matmul(device, size=2000, iterations=10, warmup=1):
    """Benchmark matrix multiplication on specified device.

    Args:
        device: A ``torch.device`` or a device string such as ``"cpu"``.
        size: Side length of the square matrices that are multiplied.
        iterations: Number of timed multiplications.
        warmup: Number of untimed multiplications run first, so that one-time
            backend start-up work is kept out of the reported samples.

    Returns:
        A list with ``iterations`` elapsed times in seconds, one per timed
        multiplication. Empty when ``iterations`` is 0.
    """
    # Accept both torch.device objects and plain strings.
    device = torch.device(device)

    def run_once():
        # Fresh random operands for every measurement.
        x = torch.randn(size, size, device=device)
        y = torch.randn(size, size, device=device)

        # GPU backends queue work asynchronously: drain the queue first so
        # that operand creation is not billed to the matmul.
        _synchronize(device)
        start = time.perf_counter()  # monotonic, high-resolution timer
        torch.matmul(x, y)
        # Wait for the matmul itself to complete before stopping the clock.
        _synchronize(device)
        return time.perf_counter() - start

    for _ in range(warmup):
        run_once()

    return [run_once() for _ in range(iterations)]

# CPU baseline: five timed runs, averaged.
print("Benchmarking CPU...")
cpu_times = benchmark_matmul(torch.device("cpu"), iterations=5)
avg_cpu_time = np.asarray(cpu_times).mean()
print(f"Average CPU time: {avg_cpu_time:.4f} seconds")

# The MPS measurement and the speedup only make sense when MPS is available.
if not torch.backends.mps.is_available():
    print("MPS not available for benchmarking")
else:
    print("Benchmarking MPS...")
    mps_times = benchmark_matmul(torch.device("mps"), iterations=5)
    avg_mps_time = np.asarray(mps_times).mean()
    print(f"Average MPS time: {avg_mps_time:.4f} seconds")

    # Ratio of mean CPU time to mean MPS time (values above 1 favour MPS).
    speedup = avg_cpu_time / avg_mps_time
    print(f"MPS speedup: {speedup:.2f}x")

## Neural Network Training Example

In [None]:
# Simple neural network for demonstration
class SimpleNet(nn.Module):
    """Three linear layers with a ReLU after each of the first two.

    Layer widths are ``input_size -> hidden_size -> hidden_size ->
    num_classes``; ``forward`` returns the raw output of the last layer.
    """

    def __init__(self, input_size=784, hidden_size=512, num_classes=10):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        # One stateless activation module, applied after fc1 and after fc2.
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Run ``x`` through fc1 -> ReLU -> fc2 -> ReLU -> fc3."""
        first_hidden = self.relu(self.fc1(x))
        second_hidden = self.relu(self.fc2(first_hidden))
        return self.fc3(second_hidden)

# Build the network, then relocate its parameters to the selected device
model = SimpleNet()
model = model.to(device)
print(f"Model device: {next(model.parameters()).device}")

# Synthetic batch: random inputs and random integer class labels in [0, 10)
batch_size = 128
input_data = torch.randn((batch_size, 784), device=device)
target = torch.randint(low=0, high=10, size=(batch_size,), device=device)

# Cross-entropy objective optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

print("Training for a few steps...")
for step in range(10):
    # Clear gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass on the same fixed batch every step
    outputs = model(input_data)
    loss = criterion(outputs, target)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    # Log every second step
    if not step % 2:
        print(f"Step {step}, Loss: {loss.item():.4f}")

print("Training completed successfully!")

## Best Practices for MPS

1. **Always check availability**: Use `torch.backends.mps.is_available()` before using MPS
2. **Move tensors and models**: Ensure both your data and model are on the same device
3. **Synchronization**: MPS operations run asynchronously, so call `torch.mps.synchronize()` before reading the clock when timing operations
4. **Memory management**: Apple Silicon uses unified memory, so MPS tensors share the same RAM as the rest of the system; be mindful of memory usage
5. **Fallback to CPU**: Always have a CPU fallback for compatibility

In [None]:
# Example of proper device handling
def get_device():
    """Return the preferred ``torch.device``: MPS, then CUDA, otherwise CPU."""
    if torch.backends.mps.is_available():
        backend = "mps"
    elif torch.cuda.is_available():
        backend = "cuda"
    else:
        backend = "cpu"
    return torch.device(backend)

def move_to_device(tensor_or_model, device):
    """Move a tensor or model to ``device``, retrying on the CPU on failure.

    Any exception raised by the first ``.to(device)`` call is reported with
    ``print`` and the move is attempted again with the CPU as target. An
    exception raised by that second attempt propagates to the caller.
    """
    try:
        moved = tensor_or_model.to(device)
    except Exception as e:
        # Deliberately broad: this helper is a best-effort fallback.
        print(f"Failed to move to {device}, falling back to CPU: {e}")
        moved = tensor_or_model.to(torch.device("cpu"))
    return moved

# Example usage: pick the best available device and announce it
device = get_device()
print(f"Using device: {device}")

# Build a 100x100 random tensor and hand it to the helper in one step
tensor = move_to_device(torch.randn(100, 100), device)
print(f"Tensor is on: {tensor.device}")