### Testing with some operations    

In [1]:

# Quick GPU + package sanity check
# Everything is wrapped in try/except so the cell also completes on machines
# *without* a GPU (or without PyTorch installed).

print("--- PyTorch ---")
try:
    import torch

    print(f"torch version: {torch.__version__}")
    cuda_ok = torch.cuda.is_available()
    print(f"CUDA available: {cuda_ok}")

    if not cuda_ok:
        print("PyTorch will run on CPU (no CUDA device detected).")
    else:
        # Only the first CUDA device is reported.
        device_index = 0
        print(f"GPU name: {torch.cuda.get_device_name(device_index)}")
except Exception as err:
    # Best-effort check: report the problem instead of aborting the cell.
    print(f"PyTorch import/usage failed: {err!r}")
    print("If you expected PyTorch, install deps via: uv sync")

print("\n--- TensorFlow ---")
try:
    import tensorflow as tf

    print(f"tensorflow version: {tf.__version__}")
    print(f"Built with CUDA: {tf.test.is_built_with_cuda()}")

    # An empty list means TensorFlow sees no GPU and will fall back to CPU.
    gpus = tf.config.list_physical_devices('GPU')
    print(f"Visible GPUs: {gpus}")

    if len(gpus) == 0:
        print("TensorFlow will run on CPU (no GPU visible to TF).")
except Exception as err:
    # Best-effort check: report the problem instead of aborting the cell.
    print(f"TensorFlow import/usage failed: {err!r}")
    print("If you expected TensorFlow, install deps via: uv sync")


--- PyTorch ---
torch version: 2.9.1+cu128
CUDA available: True
GPU name: NVIDIA RTX A4000 Laptop GPU

--- TensorFlow ---


2025-12-15 18:59:57.343367: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-15 18:59:57.387995: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-15 18:59:58.833584: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


tensorflow version: 2.20.0
Built with CUDA: True
Visible GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


### Example: Accelerated Matrix Multiplication

In [2]:
# Matrix multiplication demo (GPU if available, otherwise CPU)
#
# The device is chosen at runtime rather than hard-coded to 'cuda', so the
# same cell runs on:
# - GPU machines (CUDA available)
# - CPU-only machines (no CUDA / no NVIDIA GPU)

import torch

# Default to CPU and upgrade to CUDA only when PyTorch reports it as usable.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print('Using device:', device)

# Two random 5000x5000 operands allocated directly on the chosen device
a = torch.randn(5000, 5000, device=device)
b = torch.randn(5000, 5000, device=device)

# Matrix product (the `@` operator dispatches to torch.matmul)
c = a @ b
print('Result shape:', tuple(c.shape))


Using device: cuda
Result shape: (5000, 5000)


### Training Examples (TensorFlow)

> There are **two** training examples below on purpose.

- **Example 1: Synthetic (learnable) classification** — no downloads, runs fast, and is fully controlled so you can focus on the *ML workflow* (data → model → loss/optimizer → training → evaluation).
- **Example 2: MNIST + mixed precision** — uses a real dataset and adds a practical GPU optimization technique (mixed precision) you’ll see in real projects.

If you’re new to ML, run them in this order:
1. Run the GPU + package sanity check (the first code cell)
2. Run Example 1 (synthetic classification)
3. Run Example 2 (MNIST + mixed precision)

As you run them, watch these two ideas:
- **Training accuracy vs validation accuracy**: training shows how well the model fits seen data; validation shows how it generalizes.
- **Loss vs accuracy**: loss is what the optimizer actually minimizes; accuracy is a human-friendly metric.

In [None]:

"""
Example 1 — Synthetic (learnable) classification on CPU or GPU (TensorFlow)

Why this example exists:
- It teaches the *mechanics* of training without external dependencies (no dataset downloads).
- The labels are generated from a hidden linear rule, so the task is genuinely learnable (accuracy should rise).
- It is small enough to iterate quickly while you experiment with batch size, epochs, and model depth.

What to look for when you run it:
- Accuracy should improve above random chance (~10% for 10 classes).
- If a GPU is available, TensorFlow should use it automatically.
"""

# Beginner-friendly GPU training example (TensorFlow)

import os
import time
import shutil
import numpy as np

# Step 0: Expose the CUDA Toolkit binaries to this kernel process
#
# On Linux/WSL the CUDA Toolkit usually lives under /usr/local/cuda, and some
# TF/XLA GPU code paths may shell out to `ptxas` (shipped with the toolkit).
# VS Code/Jupyter kernels sometimes start with a PATH that lacks
# /usr/local/cuda/bin, so it is prepended here when the directory exists and
# is not already listed.
cuda_bin = "/usr/local/cuda/bin"
current_path = os.environ.get("PATH", "")
if os.path.isdir(cuda_bin) and cuda_bin not in current_path.split(os.pathsep):
    os.environ["PATH"] = os.pathsep.join([cuda_bin, current_path])

# Report where (or whether) the toolkit executables resolve on the final PATH.
ptxas_path, nvcc_path = (shutil.which(tool) for tool in ("ptxas", "nvcc"))
for tool_name, tool_path in (("ptxas", ptxas_path), ("nvcc", nvcc_path)):
    print(f"{tool_name}:", tool_path or "NOT FOUND")

# Import TensorFlow (after environment setup)
import tensorflow as tf

# Step 1: Turn on GPU memory growth
#
# By default TensorFlow may try to reserve most/all VRAM up-front.
# With memory growth enabled it allocates GPU memory as training needs it.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPU detected. Training will run on CPU.\n")
else:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Reported rather than re-raised so the rest of the cell still runs.
        print(f"GPU configuration error: {err}")
    else:
        print("GPU memory growth enabled")
        print("GPU Available:", gpus)
        print("TensorFlow will automatically use GPU for ops\n")

# Step 2: Build a synthetic dataset that a model can *actually learn*
#
# Labels come from a hidden linear rule:
#   logits = X @ W + noise
#   y = argmax(logits)
#
# With purely random labels, accuracy would stay stuck near ~10%.
print("Generating learnable synthetic classification data...")
rng = np.random.default_rng(0)

num_train, num_test = 10_000, 2_000
num_features, num_classes = 100, 10
noise_scale = 0.25

# Inputs. The order of the rng draws below is part of the recipe: changing it
# changes the generated dataset.
X_train = rng.normal(size=(num_train, num_features)).astype('float32')
X_test = rng.normal(size=(num_test, num_features)).astype('float32')

# Hidden weight matrix: the pattern the network is supposed to recover
W = rng.normal(size=(num_features, num_classes)).astype('float32')

# Per-class noise added to the clean logits (train noise is drawn before test noise)
train_noise = rng.normal(scale=noise_scale, size=(num_train, num_classes)).astype('float32')
test_noise = rng.normal(scale=noise_scale, size=(num_test, num_classes)).astype('float32')

train_logits = X_train @ W + train_noise
test_logits = X_test @ W + test_noise

# Integer class ids in 0..num_classes-1
y_train_int = train_logits.argmax(axis=1)
y_test_int = test_logits.argmax(axis=1)

# categorical_crossentropy expects one-hot targets, so both integer label
# vectors are expanded to num_classes columns.
y_train, y_test = (
    tf.keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train_int, y_test_int)
)

print(
    f"Training data shape: {X_train.shape}",
    f"Training labels shape: {y_train.shape}\n",
    sep="\n",
)

# Step 3: Build a simple neural network
#
# Architecture: 100 → 64 (ReLU) → 10 (Softmax)
#
# - ReLU layer learns a non-linear representation of inputs.
# - Softmax outputs class probabilities that sum to 1.
print("Building model...")

# Hidden-state guard: Example 2 switches the *global* Keras dtype policy to
# mixed_float16. If this cell is re-run afterwards in the same kernel, the
# layers below would silently inherit it, so pin the default policy here.
# On a fresh kernel this is a no-op.
tf.keras.mixed_precision.set_global_policy('float32')

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat across runs (the dataset itself is already seeded via `rng`).
# NOTE: some GPU kernels can still be non-deterministic.
tf.keras.utils.set_random_seed(0)

# The input shape is declared with an explicit Input object. Passing
# `input_shape=` to the first layer is discouraged (and warns) in Keras 3.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(num_features,)),
    tf.keras.layers.Dense(64, activation='relu', name='hidden_layer'),
    tf.keras.layers.Dense(num_classes, activation='softmax', name='output_layer'),
])

# Step 4: Compile (choose optimizer + loss + metrics)
#
# - Optimizer (Adam): how we update weights from gradients
# - Loss (categorical_crossentropy): how wrong predictions are (what we minimize)
# - Metric (accuracy): human-friendly score
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()
print()

# Step 5: Fit the model and time the whole call
print("Training model (GPU if available, otherwise CPU)...")
fit_options = dict(
    epochs=5,
    batch_size=128,
    validation_split=0.2,  # fraction of the training arrays held out for validation
    verbose=1,
)
start_time = time.time()
history = model.fit(X_train, y_train, **fit_options)
training_time = time.time() - start_time
print(f"\nTraining completed in {training_time:.2f} seconds")

# Step 6: Score the model on the held-out test split
print("\nEvaluating on test data...")
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

# Step 7: Show predicted class, its probability, and the true class for the
# first five test rows
print("\nMaking predictions on 5 test samples...")
preds = model.predict(X_test[:5], verbose=0)
for sample_no, (pred, one_hot_truth) in enumerate(zip(preds, y_test[:5]), start=1):
    predicted_class = int(np.argmax(pred))
    confidence = float(pred[predicted_class])
    actual_class = int(np.argmax(one_hot_truth))
    print(f"Sample {sample_no}: Predicted={predicted_class} (confidence={confidence:.3f}), Actual={actual_class}")


### Training Example 2 (TensorFlow): MNIST + Mixed Precision

This second example complements Example 1 by using a **real dataset** (MNIST handwritten digits).

Why include this one:
- You’ll see the typical preprocessing steps (normalize inputs, one-hot labels).
- You’ll see a real benchmark-style dataset (so results are easier to interpret).
- It demonstrates **mixed precision** (`mixed_float16`), a common GPU performance technique on modern NVIDIA GPUs (Tensor Cores).

Note: The MNIST dataset may be downloaded the first time you run this example (internet required).

#### Key concepts in this example

- **Mixed precision**: computations use float16 where safe (faster on many GPUs), while keeping some values in float32 for stability.
- **Why the output layer uses `dtype='float32'`**: softmax + loss can be numerically sensitive; forcing float32 helps avoid instability when using float16.
- **Normalization**: scaling pixel values to `[0, 1]` makes optimization easier.
- **Train/validation split**: you’ll see whether the model generalizes to unseen data rather than just memorizing the training set.

If you’re just starting out, try changing one thing at a time:
- Increase/decrease `batch_size`
- Increase `epochs`
- Add/remove a Dense layer
- Compare CPU vs GPU runtime

In [None]:

"""
Example 2 — MNIST digits + mixed precision (TensorFlow)

Why this example exists:
- MNIST is a small *real* dataset, so training curves/accuracy are meaningful and comparable.
- It shows standard image preprocessing (normalization) and label encoding.
- It introduces mixed precision (`mixed_float16`), which can speed up training on many NVIDIA GPUs.

Notes:
- If you already ran Example 1, TensorFlow may already be imported; re-importing is fine.
- MNIST may download the first time you run this cell (internet required).
"""

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# Step 1: Turn on GPU memory growth so TensorFlow does not grab all VRAM up-front
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPU detected. Training will run on CPU.")
else:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPUs available: {gpus}")
        # Only the first GPU's details are reported.
        details = tf.config.experimental.get_device_details(gpus[0])
        gpu_label = details.get('device_name', 'Unknown GPU')
        print(f"Using GPU: {gpu_label}")
    except RuntimeError as err:
        # Reported rather than re-raised so the rest of the cell still runs.
        print(err)

# Step 2: Enable mixed precision (only when a GPU is visible)
#
# mixed_float16 uses float16 for many computations for speed, but keeps variables/critical ops safe.
# The speed-up comes from GPU hardware. On CPU, float16 math runs significantly
# slower than float32, so the default float32 policy is kept when no GPU was
# detected in Step 1. Setting the policy explicitly in both branches also makes
# the cell idempotent when re-run in the same kernel.
from tensorflow.keras.mixed_precision import set_global_policy

if gpus:
    set_global_policy('mixed_float16')
    print("Mixed precision enabled: mixed_float16")
else:
    set_global_policy('float32')
    print("Mixed precision skipped (no GPU detected): using float32")

# Step 3: Fetch the MNIST train/test splits
print("\nLoading MNIST dataset...")
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Step 4: Scale pixels
#
# Images are cast to float32 and divided by 255 so values land in [0, 1].
x_train, x_test = (
    images.astype('float32') / 255.0
    for images in (x_train, x_test)
)

# Step 5: Expand the digit labels to 10-column one-hot rows, as required by
# categorical_crossentropy
y_train, y_test = (
    to_categorical(labels, 10)
    for labels in (y_train, y_test)
)

print(f"Training samples: {len(x_train)}, Test samples: {len(x_test)}")

# Step 6: Define a simple neural network model
#
# Seed Python, NumPy and TensorFlow first so weight initialisation and batch
# shuffling repeat across runs. NOTE: some GPU kernels can still be
# non-deterministic.
tf.keras.utils.set_random_seed(0)

# The 28x28 input shape is declared with an explicit Input object. Passing
# `input_shape=` to the first layer is discouraged (and warns) in Keras 3.
model = Sequential([
    tf.keras.Input(shape=(28, 28)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    # Output layer forced to float32 for numerical stability with mixed precision
    Dense(10, activation='softmax', dtype='float32'),
])

# Step 7: Compile the model
#
# - Optimizer (Adam): how weights are updated from gradients
# - Loss (categorical_crossentropy): matches the one-hot labels
# - Metric (accuracy): human-friendly score
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

# Step 8: Fit the model
# Tip: batch size trades speed against memory. On OOM errors, lower it further.
print("\nStarting training...")
fit_options = dict(
    validation_split=0.1,  # fraction of the training arrays held out for validation
    epochs=5,
    batch_size=64,
    verbose=1,
)
history = model.fit(x_train, y_train, **fit_options)

# Step 9: Score the trained model on the test split
print("\nEvaluating model...")
test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=1)
print(f"\nTest Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
