In [7]:
import torch
import torch.nn as nn
import time

# --- Model & Data Parameters (Identical to C++ examples) ---

# Benchmark schedule: untimed warm-up passes followed by timed passes.
WARMUP_ITERATIONS = 20
NUM_ITERATIONS = 100

# Input tensor layout: (batch, channels, height, width).
BATCH_SIZE = 128
INPUT_C, INPUT_H, INPUT_W = 3, 64, 64

# Convolution layer: output channels, kernel window, padding and stride.
CONV_OUTC = 64
KERNEL_H, KERNEL_W = 5, 5
CONV_PAD = 2
CONV_STRIDE = 1

# Max-pooling layer: window and stride.
POOL_H, POOL_W = 2, 2
POOL_STRIDE = 2

# Width of the final fully connected layer.
NUM_CLASSES = 1000


# Define the CNN model in PyTorch
class SimpleCNN(nn.Module):
    """Single-block CNN: Conv2d -> ReLU -> MaxPool2d -> flatten -> Linear.

    All layer hyper-parameters are read from the module-level constants
    (INPUT_*, CONV_*, KERNEL_*, POOL_*, NUM_CLASSES).
    """

    @staticmethod
    def _out_size(size, kernel, stride, padding=0):
        """Return the output length of a conv/pool window along one axis.

        Uses the standard formula for dilation 1 with floor rounding:
        ``(size + 2 * padding - kernel) // stride + 1``.
        """
        return (size + 2 * padding - kernel) // stride + 1

    def __init__(self):
        super().__init__()
        # Layer 1: Conv -> ReLU -> Pool
        self.conv1 = nn.Conv2d(
            in_channels=INPUT_C,
            out_channels=CONV_OUTC,
            kernel_size=(KERNEL_H, KERNEL_W),
            stride=CONV_STRIDE,
            padding=CONV_PAD
        )
        self.relu = nn.ReLU()
        self.pool1 = nn.MaxPool2d(
            kernel_size=(POOL_H, POOL_W),
            stride=POOL_STRIDE
        )

        # Calculate the flattened size after convolution and pooling from the
        # actual layer parameters instead of assuming the convolution keeps
        # the spatial size and the pool exactly halves it. With the current
        # constants: 64x64 -> Conv -> 64x64 -> Pool -> 32x32, so
        # flattened features = 64 channels * 32 height * 32 width.
        conv_h = self._out_size(INPUT_H, KERNEL_H, CONV_STRIDE, CONV_PAD)
        conv_w = self._out_size(INPUT_W, KERNEL_W, CONV_STRIDE, CONV_PAD)
        pooled_h = self._out_size(conv_h, POOL_H, POOL_STRIDE)
        pooled_w = self._out_size(conv_w, POOL_W, POOL_STRIDE)
        flattened_features = CONV_OUTC * pooled_h * pooled_w

        # Layer 2: Fully Connected (Linear)
        self.fc1 = nn.Linear(
            in_features=flattened_features,
            out_features=NUM_CLASSES
        )

    def forward(self, x):
        """Run conv1 -> relu -> pool1 -> flatten -> fc1 on ``x`` and return the result."""
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool1(x)

        # Flatten the tensor for the linear layer, keeping the batch dimension
        x = torch.flatten(x, start_dim=1)

        x = self.fc1(x)
        return x

# --- Main execution block ---
# Times NUM_ITERATIONS forward passes of SimpleCNN on the GPU with CUDA events
# and prints the total and per-pass average in milliseconds.
if __name__ == "__main__":
    print("--- Performance Test: CNN with Python/PyTorch ---")

    # 1. Check for CUDA and select the device
    if not torch.cuda.is_available():
        print("CUDA is not available. Exiting.")
        # The bare `exit()` helper is injected by the `site` module for
        # interactive use and is not guaranteed to exist; SystemExit always
        # is. A non-zero status signals that the benchmark did not run.
        raise SystemExit(1)

    device = torch.device("cuda")
    print(f"Using device: {device}")

    # 2. Enable cuDNN benchmarking
    # This is CRITICAL for performance. It tells cuDNN to find the fastest
    # convolution algorithm for our specific input size.
    torch.backends.cudnn.benchmark = True
    print(f"cuDNN benchmark mode: {torch.backends.cudnn.benchmark}")

    # 3. Create the model and move it to the GPU
    model = SimpleCNN().to(device)
    # Set the model to evaluation mode (disables things like dropout)
    model.eval()

    # 4. Create fake input data directly on the GPU
    # This is equivalent to cudaMalloc + filling with data
    input_tensor = torch.randn(BATCH_SIZE, INPUT_C, INPUT_H, INPUT_W, device=device)
    print(f"Batch: {BATCH_SIZE}, Input: {INPUT_C}x{INPUT_H}x{INPUT_W}, Iterations: {NUM_ITERATIONS}")

    # 5. Warm-up phase
    # Run the model a few times to let cuDNN select the best algorithms
    # and warm up the GPU caches.
    print(f"\n--- Starting warm-up ({WARMUP_ITERATIONS} iterations) ---")
    with torch.no_grad():  # Disables gradient calculation for inference
        for _ in range(WARMUP_ITERATIONS):
            _ = model(input_tensor)

    # Wait for all GPU work to finish before starting the benchmark, so that
    # no warm-up work is still queued when the start event is recorded.
    torch.cuda.synchronize()

    # 6. Benchmark loop
    print(f"--- Starting benchmark ({NUM_ITERATIONS} iterations) ---")

    # Using CUDA events for the most accurate GPU-side timing
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    start_event.record()  # Start the master timer

    with torch.no_grad():
        for _ in range(NUM_ITERATIONS):
            _ = model(input_tensor)

    end_event.record()  # Stop the master timer

    # Wait for the events to complete; elapsed_time() needs both recorded
    # events to have finished before it can be read.
    torch.cuda.synchronize()

    # Calculate and print the results
    total_time_ms = start_event.elapsed_time(end_event)
    avg_time_ms = total_time_ms / NUM_ITERATIONS

    print("\n--- PyTorch Results ---")
    print(f"Total time for {NUM_ITERATIONS} iterations: {total_time_ms:.3f} ms")
    print(f"Average time per forward pass: {avg_time_ms:.3f} ms")

--- Performance Test: CNN with Python/PyTorch ---
Using device: cuda
cuDNN benchmark mode: True
Batch: 128, Input: 3x64x64, Iterations: 100

--- Starting warm-up (20 iterations) ---
--- Starting benchmark (100 iterations) ---

--- PyTorch Results ---
Total time for 100 iterations: 262.985 ms
Average time per forward pass: 2.630 ms
