# Gradients and Backpropagation: Autograd vs GradientTape

**Learning Objectives:**
- Master automatic differentiation in both PyTorch and TensorFlow
- Understand the differences between autograd and GradientTape
- Learn gradient computation patterns and best practices
- Explore advanced gradient techniques and debugging

**Prerequisites:** Computational graphs, tensor operations

**Estimated Time:** 45 minutes

---

Automatic differentiation is the backbone of modern deep learning. Understanding how PyTorch's autograd and TensorFlow's GradientTape work is essential for:
- **Training neural networks**: Computing gradients for optimization
- **Custom operations**: Implementing new layers and functions
- **Debugging**: Understanding gradient flow and vanishing/exploding gradients
- **Advanced techniques**: Gradient clipping, accumulation, and higher-order derivatives

In [None]:
import os
import sys

import numpy as np

# Add src to path for our utilities
sys.path.append(os.path.join('..', '..', 'src'))

from utils.comparison_tools import create_side_by_side_comparison

# Try to import frameworks.
# Each framework is optional: a failed import only flips the corresponding
# *_AVAILABLE flag to False, and the rest of the notebook guards its
# framework-specific code on these flags.
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    PYTORCH_AVAILABLE = True
    print(f"✅ PyTorch {torch.__version__} available")
except ImportError:
    PYTORCH_AVAILABLE = False
    print("❌ PyTorch not available")

try:
    import tensorflow as tf
    TENSORFLOW_AVAILABLE = True
    print(f"✅ TensorFlow {tf.__version__} available")
except ImportError:
    TENSORFLOW_AVAILABLE = False
    print("❌ TensorFlow not available")

# Set random seeds (NumPy always; each framework only if it was imported)
np.random.seed(42)
if PYTORCH_AVAILABLE:
    torch.manual_seed(42)
if TENSORFLOW_AVAILABLE:
    tf.random.set_seed(42)

## 1. Basic Gradient Computation

Let's start with simple examples to understand how each framework computes gradients.

In [None]:
print("=" * 60)
print("BASIC GRADIENT COMPUTATION")
print("=" * 60)

print("""
Automatic Differentiation:
• Computes derivatives automatically using chain rule
• Essential for training neural networks
• Two main approaches: forward-mode and reverse-mode (backpropagation)
• Both frameworks use reverse-mode AD
""")

# Simple function: f(x) = x^2 + 2x + 1
# Derivative: f'(x) = 2x + 2

if PYTORCH_AVAILABLE:
    print("\n🔥 PyTorch Autograd Example:")

    def pytorch_gradient_example():
        """Differentiate f(x) = x² + 2x + 1 at x = 3 using autograd.

        Prints every step of the computation and returns the gradient that
        ``backward()`` stored on the input, as a Python float.
        """
        # Leaf tensor whose operations autograd will record
        point = torch.tensor(3.0, requires_grad=True)
        print(f"Input x: {point.item()}")
        print(f"Requires grad: {point.requires_grad}")

        # Forward pass builds the graph
        value = point**2 + 2*point + 1
        print(f"y = x² + 2x + 1 = {value.item()}")
        print(f"Gradient function: {value.grad_fn}")

        # Backward pass writes dy/dx into point.grad
        value.backward()
        computed = point.grad.item()
        expected = 2*point.item() + 2

        print(f"Computed gradient dy/dx: {computed}")
        print(f"Expected gradient (2x + 2): {expected}")
        print(f"Gradients match: {abs(computed - expected) < 1e-6}")

        return computed

    pytorch_grad = pytorch_gradient_example()

if TENSORFLOW_AVAILABLE:
    print("\n🟠 TensorFlow GradientTape Example:")

    def tensorflow_gradient_example():
        """Differentiate f(x) = x² + 2x + 1 at x = 3 using a GradientTape.

        Prints every step of the computation and returns the gradient as
        the NumPy value obtained from ``tape.gradient(...).numpy()``.
        """
        # Variable to differentiate with respect to
        point = tf.Variable(3.0)
        print(f"Input x: {point.numpy()}")
        print(f"Trainable: {point.trainable}")

        # Operations executed inside the context are recorded on the tape
        with tf.GradientTape() as tape:
            value = point**2 + 2*point + 1
            print(f"y = x² + 2x + 1 = {value.numpy()}")

        # Ask the tape for dy/dx explicitly
        computed = tape.gradient(value, point).numpy()
        expected = 2*point.numpy() + 2

        print(f"Computed gradient dy/dx: {computed}")
        print(f"Expected gradient (2x + 2): {expected}")
        print(f"Gradients match: {abs(computed - expected) < 1e-6}")

        return computed

    tensorflow_grad = tensorflow_gradient_example()

# Compare approaches
# The two triple-quoted strings below are source text for display only:
# nothing in this cell executes them; they are handed as-is to
# create_side_by_side_comparison and the result is printed.
pytorch_code = """
import torch

# PyTorch: requires_grad=True
x = torch.tensor(3.0, requires_grad=True)
y = x**2 + 2*x + 1

# Automatic gradient computation
y.backward()
print(f"Gradient: {x.grad}")
"""

tensorflow_code = """
import tensorflow as tf

# TensorFlow: GradientTape context
x = tf.Variable(3.0)
with tf.GradientTape() as tape:
    y = x**2 + 2*x + 1

# Explicit gradient computation
dy_dx = tape.gradient(y, x)
print(f"Gradient: {dy_dx}")
"""

# Arguments: PyTorch snippet, TensorFlow snippet, then a title string
print(create_side_by_side_comparison(
    pytorch_code, tensorflow_code, "Basic Gradient Computation"
))

## 2. Multiple Variables and Partial Derivatives

Computing gradients with respect to multiple variables.

In [None]:
print("\n" + "=" * 60)
print("MULTIPLE VARIABLES AND PARTIAL DERIVATIVES")
print("=" * 60)

# Function: f(x, y) = x²y + xy² + x + y
# ∂f/∂x = 2xy + y² + 1
# ∂f/∂y = x² + 2xy + 1

if PYTORCH_AVAILABLE:
    print("\n🔥 PyTorch Multiple Variables:")

    def pytorch_multivariable_example():
        """Compute both partial derivatives of z = x²y + xy² + x + y.

        Evaluates at x = 2, y = 3, prints the autograd results next to the
        closed-form values, and returns ``(∂z/∂x, ∂z/∂y)`` as Python floats.
        """
        x = torch.tensor(2.0, requires_grad=True)
        y = torch.tensor(3.0, requires_grad=True)
        x_val, y_val = x.item(), y.item()

        print(f"x = {x_val}, y = {y_val}")

        # Forward pass
        z = x**2 * y + x * y**2 + x + y
        print(f"z = x²y + xy² + x + y = {z.item()}")

        # One backward call fills in the gradient of every leaf tensor
        z.backward()
        grad_x, grad_y = x.grad.item(), y.grad.item()

        # Closed-form partial derivatives for comparison
        expected_dx = 2*x_val*y_val + y_val**2 + 1
        expected_dy = x_val**2 + 2*x_val*y_val + 1

        report = (("x", grad_x, expected_dx), ("y", grad_y, expected_dy))
        for var, got, _ in report:
            print(f"∂z/∂{var} = {got}")
        for var, _, want in report:
            print(f"Expected ∂z/∂{var}: {want}")
        for var, got, want in report:
            print(f"∂z/∂{var} correct: {abs(got - want) < 1e-6}")

        return grad_x, grad_y

    pt_grad_x, pt_grad_y = pytorch_multivariable_example()

if TENSORFLOW_AVAILABLE:
    print("\n🟠 TensorFlow Multiple Variables:")

    def tensorflow_multivariable_example():
        """Compute both partial derivatives of z = x²y + xy² + x + y.

        Evaluates at x = 2, y = 3, prints the GradientTape results next to
        the closed-form values, and returns ``(∂z/∂x, ∂z/∂y)`` as the NumPy
        values of the gradient tensors.
        """
        x = tf.Variable(2.0)
        y = tf.Variable(3.0)
        x_val, y_val = x.numpy(), y.numpy()

        print(f"x = {x_val}, y = {y_val}")

        with tf.GradientTape() as tape:
            z = x**2 * y + x * y**2 + x + y
            print(f"z = x²y + xy² + x + y = {z.numpy()}")

        # A single gradient call returns one tensor per listed variable
        dz_dx, dz_dy = (grad.numpy() for grad in tape.gradient(z, [x, y]))

        print(f"∂z/∂x = {dz_dx}")
        print(f"∂z/∂y = {dz_dy}")

        # Closed-form partial derivatives for comparison
        expected_dx = 2*x_val*y_val + y_val**2 + 1
        expected_dy = x_val**2 + 2*x_val*y_val + 1

        print(f"Expected ∂z/∂x: {expected_dx}")
        print(f"Expected ∂z/∂y: {expected_dy}")

        print(f"∂z/∂x correct: {abs(dz_dx - expected_dx) < 1e-6}")
        print(f"∂z/∂y correct: {abs(dz_dy - expected_dy) < 1e-6}")

        return dz_dx, dz_dy

    tf_grad_x, tf_grad_y = tensorflow_multivariable_example()

# Vector and matrix gradients
# For y = sum(x²) the gradient with respect to the vector x is 2x; each
# framework section below prints the computed gradient next to that value.
print("\n📊 Vector and Matrix Gradients:")

if PYTORCH_AVAILABLE:
    print("\nPyTorch Vector Gradients:")

    # Vector input
    x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)

    # Scalar output (sum of squares)
    y = torch.sum(x**2)
    y.backward()

    # Read values through detach() rather than the legacy .data attribute:
    # both give a view without gradient tracking, but .data hides in-place
    # modifications from autograd while detach() lets autograd detect them.
    print(f"Input vector: {x.detach()}")
    print(f"Output scalar: {y.item()}")
    print(f"Gradient vector: {x.grad}")
    print(f"Expected (2x): {2 * x.detach()}")

if TENSORFLOW_AVAILABLE:
    print("\nTensorFlow Vector Gradients:")

    x = tf.Variable([1.0, 2.0, 3.0])

    with tf.GradientTape() as tape:
        y = tf.reduce_sum(x**2)

    dy_dx = tape.gradient(y, x)

    print(f"Input vector: {x.numpy()}")
    print(f"Output scalar: {y.numpy()}")
    print(f"Gradient vector: {dy_dx.numpy()}")
    print(f"Expected (2x): {2 * x.numpy()}")

## 3. Neural Network Gradients

Computing gradients for neural network parameters.

In [None]:
print("\n" + "=" * 60)
print("NEURAL NETWORK GRADIENTS")
print("=" * 60)

# Simple neural network: y = ReLU(Wx + b)
# Loss: MSE between prediction and target

if PYTORCH_AVAILABLE:
    print("\n🔥 PyTorch Neural Network Gradients:")

    def pytorch_nn_gradients():
        """Backpropagate an MSE loss through a one-layer ReLU network.

        Re-seeds PyTorch with 42, draws the inputs, targets, weights and bias
        (in that order), computes ``mean((relu(xW + b) - target)²)`` and calls
        ``backward()``.

        Returns:
            Tuple ``(loss, dL/dW, dL/db)``: the loss as a Python float and
            cloned copies of the weight and bias gradients.
        """
        torch.manual_seed(42)

        # Data: 5 samples, 3 features in, 2 outputs
        inputs = torch.randn(5, 3)
        targets = torch.randn(5, 2)

        # Trainable parameters
        weight = torch.randn(3, 2, requires_grad=True)
        bias = torch.randn(2, requires_grad=True)

        shapes = (
            ("Input", inputs),
            ("Weight", weight),
            ("Bias", bias),
            ("Target", targets),
        )
        for label, tensor in shapes:
            print(f"{label} shape: {tensor.shape}")

        # Forward pass: affine map, ReLU, mean squared error
        prediction = torch.relu(torch.matmul(inputs, weight) + bias)
        loss = (prediction - targets).pow(2).mean()

        print(f"\nLoss: {loss.item():.4f}")

        # Backward pass populates weight.grad and bias.grad
        loss.backward()
        weight_grad, bias_grad = weight.grad, bias.grad

        print("\nGradient shapes:")
        print(f"dL/dW shape: {weight_grad.shape}")
        print(f"dL/db shape: {bias_grad.shape}")

        print("\nGradient magnitudes:")
        print(f"||dL/dW||: {weight_grad.norm().item():.4f}")
        print(f"||dL/db||: {bias_grad.norm().item():.4f}")

        return loss.item(), weight_grad.clone(), bias_grad.clone()

    pt_loss, pt_W_grad, pt_b_grad = pytorch_nn_gradients()

if TENSORFLOW_AVAILABLE:
    print("\n🟠 TensorFlow Neural Network Gradients:")

    def tensorflow_nn_gradients():
        """Differentiate an MSE loss through a one-layer ReLU network.

        Re-seeds TensorFlow with 42, draws the inputs, targets, weights and
        bias (in that order), records ``mean((relu(xW + b) - target)²)`` on a
        GradientTape and requests the gradients for the weights and bias.

        Returns:
            Tuple ``(loss, dL/dW, dL/db)``: the loss as a NumPy value and the
            two gradient tensors.
        """
        tf.random.set_seed(42)

        # Data: 5 samples, 3 features in, 2 outputs
        inputs = tf.random.normal((5, 3))
        targets = tf.random.normal((5, 2))

        # Trainable parameters
        weight = tf.Variable(tf.random.normal((3, 2)))
        bias = tf.Variable(tf.random.normal((2,)))

        shapes = (
            ("Input", inputs),
            ("Weight", weight),
            ("Bias", bias),
            ("Target", targets),
        )
        for label, tensor in shapes:
            print(f"{label} shape: {tensor.shape}")

        # Forward pass recorded on the tape
        with tf.GradientTape() as tape:
            prediction = tf.nn.relu(inputs @ weight + bias)
            loss = tf.reduce_mean((prediction - targets)**2)

        print(f"\nLoss: {loss.numpy():.4f}")

        # One gradient tensor per requested variable, in the same order
        dL_dW, dL_db = tape.gradient(loss, [weight, bias])

        print("\nGradient shapes:")
        print(f"dL/dW shape: {dL_dW.shape}")
        print(f"dL/db shape: {dL_db.shape}")

        print("\nGradient magnitudes:")
        print(f"||dL/dW||: {tf.norm(dL_dW).numpy():.4f}")
        print(f"||dL/db||: {tf.norm(dL_db).numpy():.4f}")

        return loss.numpy(), dL_dW, dL_db

    tf_loss, tf_W_grad, tf_b_grad = tensorflow_nn_gradients()

# Using built-in layers
# The two triple-quoted strings below are source text for display only:
# nothing in this cell executes them; they are handed as-is to
# create_side_by_side_comparison and the result is printed.
print("\n🏗️ Using Built-in Layers:")

# The PyTorch snippet imports `torch` as well as `torch.nn` because it calls
# torch.randn; without that import the snippet fails when copied and run.
pytorch_layer_code = """
import torch
import torch.nn as nn

# PyTorch built-in layers
model = nn.Sequential(
    nn.Linear(3, 10),
    nn.ReLU(),
    nn.Linear(10, 2)
)

# Forward pass
x = torch.randn(5, 3)
target = torch.randn(5, 2)
prediction = model(x)
loss = nn.MSELoss()(prediction, target)

# Backward pass
loss.backward()

# Access gradients
for name, param in model.named_parameters():
    print(f"{name}: {param.grad.shape}")
"""

tensorflow_layer_code = """
import tensorflow as tf

# TensorFlow built-in layers
model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(2)
])

# Forward pass with GradientTape
x = tf.random.normal((5, 3))
target = tf.random.normal((5, 2))

with tf.GradientTape() as tape:
    prediction = model(x)
    loss = tf.keras.losses.MSE(target, prediction)
    loss = tf.reduce_mean(loss)

# Compute gradients
gradients = tape.gradient(loss, model.trainable_variables)

# Access gradients
for i, grad in enumerate(gradients):
    print(f"Layer {i}: {grad.shape}")
"""

print(create_side_by_side_comparison(
    pytorch_layer_code, tensorflow_layer_code, "Built-in Layers Gradients"
))

## 4. Advanced Gradient Techniques

Gradient clipping, accumulation, and higher-order derivatives.

In [None]:
print("\n" + "=" * 60)
print("ADVANCED GRADIENT TECHNIQUES")
print("=" * 60)

# 1. Gradient Clipping
print("\n1. Gradient Clipping:")

if PYTORCH_AVAILABLE:
    print("\n🔥 PyTorch Gradient Clipping:")

    def _total_grad_norm(parameters):
        """Return the global L2 norm over the gradients of ``parameters``.

        Parameters whose ``.grad`` is ``None`` are skipped. The result is the
        square root of the sum of squared per-parameter gradient norms.
        """
        squared_sum = sum(
            p.grad.detach().norm(2).item() ** 2
            for p in parameters
            if p.grad is not None
        )
        return squared_sum ** 0.5

    # Create model with large gradients
    model = nn.Linear(5, 1)
    x = torch.randn(10, 5)
    target = torch.randn(10, 1) * 100  # Large target values

    # Forward pass
    prediction = model(x)
    loss = nn.MSELoss()(prediction, target)

    # Backward pass
    loss.backward()

    # Check gradient norm before clipping
    total_norm_before = _total_grad_norm(model.parameters())
    print(f"Gradient norm before clipping: {total_norm_before:.4f}")

    # Clip gradients in place so that their global norm is at most max_norm
    max_norm = 1.0
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

    # Re-measure with the same helper so both numbers are directly comparable
    total_norm_after = _total_grad_norm(model.parameters())

    print(f"Gradient norm after clipping: {total_norm_after:.4f}")
    print(f"Clipping applied: {total_norm_before > max_norm}")

if TENSORFLOW_AVAILABLE:
    print("\n🟠 TensorFlow Gradient Clipping:")

    # Create model
    model = tf.keras.layers.Dense(1)
    x = tf.random.normal((10, 5))
    target = tf.random.normal((10, 1)) * 100  # Large target values

    with tf.GradientTape() as tape:
        prediction = model(x)
        loss = tf.reduce_mean(tf.square(prediction - target))

    # Compute gradients
    gradients = tape.gradient(loss, model.trainable_variables)

    # Check gradient norm before clipping
    total_norm_before = tf.linalg.global_norm(gradients)
    print(f"Gradient norm before clipping: {total_norm_before.numpy():.4f}")

    # Clip gradients. The second return value of tf.clip_by_global_norm is
    # the global norm of the *input* gradients (i.e. before clipping), so it
    # must not be reported as the post-clipping norm.
    max_norm = 1.0
    clipped_gradients, global_norm = tf.clip_by_global_norm(gradients, max_norm)

    # Measure the clipped gradients themselves for the "after" value
    total_norm_after = tf.linalg.global_norm(clipped_gradients)

    print(f"Gradient norm after clipping: {total_norm_after.numpy():.4f}")
    print(f"Clipping applied: {global_norm.numpy() > max_norm}")

# 2. Gradient Accumulation
# The two triple-quoted strings below are source text for display only:
# nothing in this cell executes them; they are handed as-is to
# create_side_by_side_comparison and the result is printed.
print("\n2. Gradient Accumulation:")

# PyTorch version: repeated backward() calls add into .grad, so the snippet
# zeroes the gradients once, scales each loss by 1/accumulation_steps, and
# steps the optimizer once after the loop.
pytorch_accumulation_code = """
# PyTorch gradient accumulation
model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.01)

accumulation_steps = 4
optimizer.zero_grad()

for i in range(accumulation_steps):
    # Mini-batch
    x = torch.randn(8, 10)
    target = torch.randn(8, 1)

    prediction = model(x)
    loss = nn.MSELoss()(prediction, target)

    # Scale loss by accumulation steps
    loss = loss / accumulation_steps
    loss.backward()  # Accumulate gradients

# Update parameters with accumulated gradients
optimizer.step()
"""

# TensorFlow version: each tape.gradient call returns fresh gradients, so
# the snippet sums them by hand across iterations before applying them once.
tensorflow_accumulation_code = """
# TensorFlow gradient accumulation
model = tf.keras.layers.Dense(1)
optimizer = tf.keras.optimizers.SGD(0.01)

accumulation_steps = 4
accumulated_gradients = []

for i in range(accumulation_steps):
    with tf.GradientTape() as tape:
        x = tf.random.normal((8, 10))
        target = tf.random.normal((8, 1))

        prediction = model(x)
        loss = tf.reduce_mean(tf.square(prediction - target))
        loss = loss / accumulation_steps

    # Compute gradients
    gradients = tape.gradient(loss, model.trainable_variables)

    # Accumulate gradients
    if i == 0:
        accumulated_gradients = gradients
    else:
        accumulated_gradients = [
            acc_grad + grad for acc_grad, grad
            in zip(accumulated_gradients, gradients)
        ]

# Apply accumulated gradients
optimizer.apply_gradients(zip(accumulated_gradients, model.trainable_variables))
"""

print(create_side_by_side_comparison(
    pytorch_accumulation_code, tensorflow_accumulation_code, "Gradient Accumulation"
))

# 3. Higher-order derivatives
# Both sections differentiate f(x) = x^3 twice at x = 2 and print the
# results next to the closed-form values 3x^2 and 6x.
print("\n3. Higher-order Derivatives:")

if PYTORCH_AVAILABLE:
    print("\n🔥 PyTorch Second Derivatives:")

    # Function: f(x) = x^3
    # First derivative: f'(x) = 3x^2
    # Second derivative: f''(x) = 6x

    x = torch.tensor(2.0, requires_grad=True)
    y = x**3

    # First derivative
    # create_graph=True makes autograd record the derivative computation
    # itself, so dy_dx can be differentiated again below.
    dy_dx = torch.autograd.grad(y, x, create_graph=True)[0]
    print(f"First derivative dy/dx: {dy_dx.item()}")
    print(f"Expected (3x²): {3 * x.item()**2}")

    # Second derivative
    d2y_dx2 = torch.autograd.grad(dy_dx, x)[0]
    print(f"Second derivative d²y/dx²: {d2y_dx2.item()}")
    print(f"Expected (6x): {6 * x.item()}")

if TENSORFLOW_AVAILABLE:
    print("\n🟠 TensorFlow Second Derivatives:")

    x = tf.Variable(2.0)

    # Nested tapes: tape1 records y = x**3, while tape2 (the outer context)
    # is still active when tape1.gradient runs, so the first-derivative
    # computation is recorded on tape2 and can be differentiated afterwards.
    with tf.GradientTape() as tape2:
        with tf.GradientTape() as tape1:
            y = x**3

        # First derivative
        # Must stay inside the tape2 context (see note above).
        dy_dx = tape1.gradient(y, x)

    # Second derivative
    d2y_dx2 = tape2.gradient(dy_dx, x)

    print(f"First derivative dy/dx: {dy_dx.numpy()}")
    print(f"Expected (3x²): {3 * x.numpy()**2}")
    print(f"Second derivative d²y/dx²: {d2y_dx2.numpy()}")
    print(f"Expected (6x): {6 * x.numpy()}")

## 5. Gradient Debugging and Validation

Techniques for debugging gradient computation issues.

In [None]:
print("\n" + "=" * 60)
print("GRADIENT DEBUGGING AND VALIDATION")
print("=" * 60)

print("""
Common Gradient Issues:
• Vanishing gradients: Gradients become very small
• Exploding gradients: Gradients become very large
• Dead neurons: Gradients are zero (e.g., ReLU saturation)
• Incorrect gradient computation: Implementation bugs
""")

# Gradient checking (numerical vs analytical)
print("\n1. Gradient Checking:")

if PYTORCH_AVAILABLE:
    print("\n🔥 PyTorch Gradient Checking:")

    def pytorch_gradient_check():
        """Validate autograd against finite differences on sum(x² + 2x).

        Runs ``torch.autograd.gradcheck`` on a random float64 vector of
        length 5, then repeats the comparison by hand with a central
        difference (step 1e-5), printing the outcome of both checks.

        Returns:
            The largest absolute difference between the analytical and the
            numerical gradient (a NumPy float).
        """
        from torch.autograd import gradcheck

        def objective(x):
            """Scalar test function sum(x² + 2x)."""
            return torch.sum(x**2 + 2*x)

        # float64 input: finite differences are too noisy in single precision
        point = torch.randn(5, requires_grad=True, dtype=torch.float64)

        # Built-in checker
        passed = gradcheck(objective, point, eps=1e-6, atol=1e-4)
        print(f"Gradient check passed: {passed}")

        # Analytical gradient via autograd
        objective(point).backward()
        analytical = point.grad.numpy()

        # Numerical gradient via central differences, one coordinate at a time
        step = 1e-5
        base = point.detach().numpy()

        def central_difference(index):
            """Estimate the partial derivative along coordinate ``index``."""
            shifted_up = base.copy()
            shifted_down = base.copy()
            shifted_up[index] += step
            shifted_down[index] -= step
            upper = objective(torch.tensor(shifted_up)).item()
            lower = objective(torch.tensor(shifted_down)).item()
            return (upper - lower) / (2 * step)

        numerical = np.array(
            [central_difference(index) for index in range(len(base))]
        )

        # Compare gradients
        max_diff = np.abs(analytical - numerical).max()

        print(f"Max difference: {max_diff:.2e}")
        print(f"Gradients match: {max_diff < 1e-4}")

        return max_diff

    pytorch_gradient_check()

# Gradient flow visualization
print("\n2. Gradient Flow Analysis:")

if PYTORCH_AVAILABLE:
    print("\n🔥 PyTorch Gradient Flow:")

    def analyze_gradient_flow():
        """Print per-layer weight-gradient norms for a 5-layer ReLU MLP.

        Builds five ``Linear(10, 10)`` + ``ReLU`` pairs, backpropagates an
        MSE loss on a random batch of 32 samples, and prints the gradient
        norm of every weight matrix, flagging norms below 1e-6 or above 10.
        Returns nothing.
        """
        width = 10
        depth = 5

        # Create a deep network: depth x (Linear -> ReLU)
        stack = []
        for _ in range(depth):
            stack.extend((nn.Linear(width, width), nn.ReLU()))
        model = nn.Sequential(*stack)

        # Forward pass on a random batch
        x = torch.randn(32, 10)
        target = torch.randn(32, 10)
        loss = nn.MSELoss()(model(x), target)

        # Backward pass
        loss.backward()

        # Analyze gradient magnitudes
        print("Gradient magnitudes by layer:")
        for position, (name, param) in enumerate(model.named_parameters()):
            # Only weight matrices that actually received a gradient
            if param.grad is None or 'weight' not in name:
                continue

            grad_norm = param.grad.norm().item()
            # Parameters alternate weight/bias, so position // 2 is the
            # index of the Linear layer
            print(f"Layer {position // 2}: {grad_norm:.6f}")

            # Check for vanishing/exploding gradients
            if grad_norm < 1e-6:
                print("  ⚠️  Potential vanishing gradient")
            elif grad_norm > 10:
                print("  ⚠️  Potential exploding gradient")

    analyze_gradient_flow()

# Common debugging techniques
# Prints a fixed checklist, then a three-column table mapping each debugging
# task to the corresponding PyTorch and TensorFlow API.
print("\n📋 Gradient Debugging Checklist:")
debugging_checklist = [
    "✓ Check if requires_grad=True (PyTorch) or GradientTape is used (TensorFlow)",
    "✓ Verify gradient shapes match parameter shapes",
    "✓ Look for NaN or Inf values in gradients",
    "✓ Check gradient magnitudes (too small = vanishing, too large = exploding)",
    "✓ Use gradient clipping for exploding gradients",
    "✓ Use proper initialization for vanishing gradients",
    "✓ Verify custom operations have correct backward passes",
    "✓ Use gradient checking for custom functions",
    "✓ Monitor gradient flow through the network",
    "✓ Check for dead neurons (zero gradients)"
]

for item in debugging_checklist:
    print(f"  {item}")

print("\n🛠️ Debugging Tools Comparison:")
debugging_tools = {
    "Gradient checking": {
        "PyTorch": "torch.autograd.gradcheck()",
        "TensorFlow": "tf.test.compute_gradient()"
    },
    "Gradient clipping": {
        "PyTorch": "torch.nn.utils.clip_grad_norm_()",
        "TensorFlow": "tf.clip_by_global_norm()"
    },
    "NaN detection": {
        "PyTorch": "torch.isnan(), torch.isinf()",
        "TensorFlow": "tf.debugging.check_numerics()"
    },
    "Gradient inspection": {
        "PyTorch": "param.grad for param in model.parameters()",
        "TensorFlow": "tape.gradient(loss, model.trainable_variables)"
    }
}

# Derive the column widths from the data: fixed widths are too narrow for
# the longest tool name and PyTorch entry, which pushes the "|" separators
# out of alignment on those rows.
tool_width = max(len(name) for name in debugging_tools)
pytorch_width = max(len(entry["PyTorch"]) for entry in debugging_tools.values())

tool_rows = []
for tool, frameworks in debugging_tools.items():
    row = (
        f"{tool:{tool_width}} | "
        f"PyTorch: {frameworks['PyTorch']:{pytorch_width}} | "
        f"TensorFlow: {frameworks['TensorFlow']}"
    )
    tool_rows.append(row)
    print(row)

## Summary and Key Takeaways

**What we've learned:**

1. **Automatic Differentiation**: Both frameworks use reverse-mode AD for efficient gradient computation
2. **Framework Differences**: PyTorch's autograd vs TensorFlow's GradientTape
3. **Neural Network Gradients**: Computing gradients for complex models
4. **Advanced Techniques**: Gradient clipping, accumulation, and higher-order derivatives
5. **Debugging**: Tools and techniques for gradient validation

**Key Differences:**

| Aspect | PyTorch Autograd | TensorFlow GradientTape |
|--------|------------------|-------------------------|
| **Activation** | `requires_grad=True` | `with tf.GradientTape():` |
| **Computation** | `loss.backward()` | `tape.gradient(loss, vars)` |
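| **Persistence** | Graph is freed after `backward()`; pass `retain_graph=True` to reuse it | Tape is released after one `gradient()` call; pass `persistent=True` to reuse it |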
| **Multiple Outputs** | Separate backward calls | Single gradient call |
| **Higher-order** | `create_graph=True` | Nested GradientTapes |
| **Memory** | Automatic cleanup | Automatic cleanup |

**Best Practices:**

**PyTorch:**
- Use `requires_grad=True` only for parameters that need gradients
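- Call `optimizer.zero_grad()` before each backward pass (or once per accumulation cycle when deliberately accumulating gradients), because `backward()` adds to existing `.grad` values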
- Use `torch.no_grad()` for inference to save memory
- Leverage `torch.autograd.gradcheck()` for custom functions

**TensorFlow:**
- Use `tf.GradientTape()` context for gradient computation
- Set `persistent=True` for multiple gradient computations
- Use `tf.stop_gradient()` to prevent gradient flow
- Leverage `tf.debugging.check_numerics()` for gradient validation

**Common Patterns:**

**Training Loop (PyTorch):**
```python
optimizer.zero_grad()
loss = criterion(model(x), target)
loss.backward()
optimizer.step()
```

**Training Loop (TensorFlow):**
```python
with tf.GradientTape() as tape:
    loss = criterion(model(x), target)
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
```

**Performance Considerations:**
- Both frameworks are highly optimized for gradient computation
- PyTorch's dynamic graphs have slight overhead but offer flexibility
- TensorFlow's eager mode behaves much like PyTorch; graph mode (`tf.function`) is typically faster
- Gradient accumulation helps with memory-limited training

**When to Use Advanced Techniques:**
- **Gradient Clipping**: When training RNNs or very deep networks
- **Gradient Accumulation**: When batch size is limited by memory
- **Higher-order Derivatives**: For meta-learning or optimization research
- **Gradient Checking**: When implementing custom operations

**Next Steps:**
- Learn about optimizers and how they use gradients
- Explore advanced architectures and their gradient flow
- Study regularization techniques that affect gradients
- Practice debugging gradient issues in real models

Understanding automatic differentiation is fundamental to deep learning success. Both PyTorch and TensorFlow provide powerful tools for gradient computation, each with their own strengths and use cases.