### Testing with some operations    

In [None]:

# Environment smoke test: report framework versions and GPU visibility.
# Each framework probe is wrapped in its own try/except so the cell still
# runs to completion on a CPU-only machine or when a framework is missing.

print("--- PyTorch ---")
try:
    import torch

    print(f"torch version: {torch.__version__}")
    cuda_ok = torch.cuda.is_available()
    print(f"CUDA available: {cuda_ok}")

    if not cuda_ok:
        print("PyTorch will run on CPU (no CUDA device detected).")
    else:
        # Only the first CUDA device is reported.
        device_index = 0
        print(f"GPU name: {torch.cuda.get_device_name(device_index)}")
except Exception as exc:
    # Deliberately broad: any import or driver failure is reported, not raised.
    print(f"PyTorch import/usage failed: {exc!r}")
    print("If you expected PyTorch, install deps via: uv sync")

print("\n--- TensorFlow ---")
try:
    import tensorflow as tf

    print(f"tensorflow version: {tf.__version__}")
    print(f"Built with CUDA: {tf.test.is_built_with_cuda()}")

    gpus = tf.config.list_physical_devices('GPU')
    print(f"Visible GPUs: {gpus}")

    if len(gpus) == 0:
        print("TensorFlow will run on CPU (no GPU visible to TF).")
except Exception as exc:
    # Same best-effort policy as the PyTorch probe above.
    print(f"TensorFlow import/usage failed: {exc!r}")
    print("If you expected TensorFlow, install deps via: uv sync")


### Example: Accelerated Matrix Multiplication

In [None]:
# Matrix multiplication on the GPU when one is available, otherwise on the CPU.
# torch is imported here so this cell does not depend on an earlier cell
# having run (and succeeded) first.
import torch

# Choose the device at run time: hard-coding device='cuda' raises on
# machines without a CUDA-capable GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# 5000x5000 is a quick demo on a GPU; use a smaller problem on CPU so the
# fallback path stays responsive.
matrix_size = 5000 if device.type == 'cuda' else 1000

# Create both operands directly on the selected device (no host->device copy)
a = torch.randn(matrix_size, matrix_size, device=device)
b = torch.randn(matrix_size, matrix_size, device=device)

# Perform the matrix multiplication on that device
c = torch.matmul(a, b)
print("Result shape:", c.shape)


### Complete Training Example (TensorFlow)
**A beginner-friendly guide to training a neural network on GPU**

This example demonstrates:
- Preparing training data
- Building a neural network architecture
- Training the model with GPU acceleration
- Evaluating performance
- Making predictions

In [None]:

# Beginner-friendly GPU training example (TensorFlow)
#
# Goals:
# - Show the end-to-end ML workflow (data -> model -> train -> eval -> predict)
# - Use the GPU if present (but still run on CPU if not)
# - Keep the dataset synthetic *but learnable*, so accuracy improves (unlike random labels)
# - Be repeatable: NumPy *and* TensorFlow/Keras are seeded
#
# Notes about CUDA Toolkit on WSL/Linux:
# - Some TensorFlow/XLA GPU paths may call `ptxas` (CUDA Toolkit).
# - CUDA is often installed at /usr/local/cuda, but that may not be on PATH in Jupyter.
# - We add /usr/local/cuda/bin to PATH (if present) BEFORE importing TensorFlow.

import os
import time
import shutil
import numpy as np

# Step 0: Make sure CUDA Toolkit binaries are discoverable by this kernel process
cuda_bin = "/usr/local/cuda/bin"
if os.path.isdir(cuda_bin):
    path_parts = os.environ.get("PATH", "").split(os.pathsep)
    # Only prepend once so re-running the cell does not keep growing PATH
    if cuda_bin not in path_parts:
        os.environ["PATH"] = cuda_bin + os.pathsep + os.environ.get("PATH", "")

ptxas_path = shutil.which("ptxas")
nvcc_path = shutil.which("nvcc")
print("ptxas:", ptxas_path or "NOT FOUND")
print("nvcc:", nvcc_path or "NOT FOUND")

# Import TensorFlow (after environment setup)
import tensorflow as tf

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable between runs. (GPU kernels may still introduce
# small floating-point differences.)
seed = 0
tf.keras.utils.set_random_seed(seed)

# Step 1: Configure GPU memory growth (prevents TensorFlow from grabbing all VRAM up-front)
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled")
        print("GPU Available:", gpus)
        print("TensorFlow will automatically use GPU for ops\n")
    except RuntimeError as e:
        # Raised when the GPUs were already initialised (e.g. the cell is re-run)
        print(f"GPU configuration error: {e}")
else:
    print("No GPU detected. Training will run on CPU.\n")

# Step 2: Create a synthetic dataset that is *actually learnable*
#
# We generate labels from a hidden linear model:
#   logits = X @ W + noise
#   y = argmax(logits)
#
# This way, a small neural network can learn and accuracy will rise above ~10%.
print("Generating learnable synthetic classification data...")
rng = np.random.default_rng(seed)

num_train = 10_000
num_test = 2_000
num_features = 100
num_classes = 10

# Feature matrix
X_train = rng.normal(size=(num_train, num_features)).astype('float32')
X_test = rng.normal(size=(num_test, num_features)).astype('float32')

# Hidden linear weights used to generate labels
W = rng.normal(size=(num_features, num_classes)).astype('float32')
noise_scale = 0.25

train_logits = X_train @ W + rng.normal(scale=noise_scale, size=(num_train, num_classes)).astype('float32')
test_logits = X_test @ W + rng.normal(scale=noise_scale, size=(num_test, num_classes)).astype('float32')

# Integer labels (0..num_classes-1)
y_train_int = np.argmax(train_logits, axis=1)
y_test_int = np.argmax(test_logits, axis=1)

# One-hot labels for categorical_crossentropy
y_train = tf.keras.utils.to_categorical(y_train_int, num_classes)
y_test = tf.keras.utils.to_categorical(y_test_int, num_classes)

print(f"Training data shape: {X_train.shape}")
print(f"Training labels shape: {y_train.shape}\n")

# Step 3: Build a simple neural network
# Architecture: 100 -> 64 (ReLU) -> 10 (Softmax)
# An explicit Input object declares the feature shape; passing `input_shape=`
# to the first layer is the legacy form and triggers a warning in current Keras.
print("Building model...")
model = tf.keras.Sequential([
    tf.keras.Input(shape=(num_features,)),
    tf.keras.layers.Dense(64, activation='relu', name='hidden_layer'),
    tf.keras.layers.Dense(num_classes, activation='softmax', name='output_layer'),
])

# Step 4: Compile the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()
print()

# Step 5: Train
print("Training model (GPU if available, otherwise CPU)...")
# perf_counter is monotonic, so the elapsed time is unaffected by clock adjustments
start_time = time.perf_counter()

history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=128,
    validation_split=0.2,
    verbose=1,
)

training_time = time.perf_counter() - start_time
print(f"\nTraining completed in {training_time:.2f} seconds")

# Step 6: Evaluate
print("\nEvaluating on test data...")
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

# Step 7: Predict
print("\nMaking predictions on 5 test samples...")
preds = model.predict(X_test[:5], verbose=0)
for i, pred in enumerate(preds):
    predicted_class = int(np.argmax(pred))
    confidence = float(pred[predicted_class])
    actual_class = int(np.argmax(y_test[i]))
    print(f"Sample {i+1}: Predicted={predicted_class} (confidence={confidence:.3f}), Actual={actual_class}")


###  Python script for training a small neural network on an RTX A4000 using TensorFlow with GPU acceleration and mixed precision optimization:

#### Key Features

- Mixed Precision: Uses mixed_float16 for Tensor Core acceleration.
- GPU Check: Confirms RTX A4000 is detected.
- MNIST Dataset: Simple dataset for quick training.
- Neural Network: 2 hidden layers + softmax output.

In [None]:

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.mixed_precision import set_global_policy

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(0)

# Configure GPU memory growth to prevent crashes
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Enable memory growth to avoid allocating all GPU memory at once
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPUs available:", gpus)
        # The details dict is not guaranteed to contain 'device_name';
        # .get() avoids a KeyError that the RuntimeError handler would not catch.
        gpu_details = tf.config.experimental.get_device_details(gpus[0])
        print("Using GPU:", gpu_details.get('device_name', 'Unknown GPU'))
    except RuntimeError as e:
        # Raised when the GPUs were already initialised (e.g. the cell is re-run)
        print(e)
else:
    print("No GPU detected. Training will run on CPU.")

# Enable mixed precision only when a GPU is present: float16 compute targets
# GPU Tensor Cores and offers no benefit (and can be slower) on CPU.
# The policy is global to the kernel, so it is set explicitly in both cases to
# keep this cell's behaviour independent of what ran before it.
if gpus:
    set_global_policy('mixed_float16')
    print("Mixed precision enabled: mixed_float16")
else:
    set_global_policy('float32')
    print("Mixed precision disabled (no GPU): using float32")

# Load MNIST dataset
print("\nLoading MNIST dataset...")
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Normalize pixel values from [0, 255] to [0, 1]
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

# One-hot encode labels (10 digit classes) for categorical_crossentropy
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

print(f"Training samples: {len(x_train)}, Test samples: {len(x_test)}")

# Define a simple neural network model.
# An explicit Input object declares the 28x28 image shape; passing
# `input_shape=` to the first layer is the legacy form and triggers a warning
# in current Keras.
model = Sequential([
    tf.keras.Input(shape=(28, 28)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(10, activation='softmax', dtype='float32')  # Output layer with float32 for stability
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Print model summary
model.summary()

# Train the model with smaller batch size to prevent memory issues
print("\nStarting training...")
history = model.fit(x_train, y_train,
                    validation_split=0.1,
                    epochs=5,
                    batch_size=64,  # Reduced from 128 to prevent memory issues
                    verbose=1)

# Evaluate the model on test data
print("\nEvaluating model...")
test_loss, test_accuracy = model.evaluate(x_test, y_test)
print(f"\nTest Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
