In [1]:
# ============================================================================
# Weights & Biases (wandb) Usage Guide
# ============================================================================
# 
# wandb is a tool for experiment tracking, visualization, and collaboration
# 
# Installation: pip install wandb
# Login: wandb login (or set WANDB_API_KEY environment variable)
# 
# Key Features:
# - Track metrics, hyperparameters, and system metrics
# - Visualize training curves in real-time
# - Log images, tables, and other artifacts
# - Compare multiple runs
# - Share results with team
# ============================================================================

import wandb
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

print("wandb version:", wandb.__version__)



wandb version: 0.23.1


# 1. Basic Setup and Initialization

## Steps to get started:
1. **Install**: `pip install wandb`
2. **Login**: Run `wandb login` in terminal (or set `WANDB_API_KEY` env var)
3. **Initialize**: Call `wandb.init()` with your project name
4. **Access UI**: Visit https://wandb.ai to view your runs

In [3]:
# ============================================================================
# Example 1: Basic Initialization
# ============================================================================

# Initialize a run
# This creates a new run in your project and returns a run object
run = wandb.init(
    project="de-LLM",                    # Project name (creates/uses existing project)
    name="basic-example",                # Run name (optional, auto-generated if not provided)
    notes="Learning wandb basics",       # Notes about this run
    tags=["tutorial", "basic"],          # Tags for filtering runs
    config={                              # Hyperparameters/config
        "learning_rate": 0.001,
        "batch_size": 32,
        "epochs": 10,
        "model": "simple-nn"
    }
)

print(f"Run ID: {run.id}")
print(f"Run URL: {run.url}")  # Click this URL to view in browser!

# Log a simple metric
wandb.log({"loss": 0.5, "accuracy": 0.85})

# Finish the run (saves all data)
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mjunius-zhou[0m ([33mjunius-zhou-junius[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Run ID: rs1phc58
Run URL: https://wandb.ai/junius-zhou-junius/de-LLM/runs/rs1phc58


0,1
accuracy,▁
loss,▁

0,1
accuracy,0.85
loss,0.5


In [None]:
# ============================================================================
# Example 2: Training Loop with Metrics Logging
# ============================================================================

# Simulate a training loop
from time import sleep


wandb.init(
    project="de-LLM",
    name="training-example",
    config={
        "learning_rate": 0.001,
        "batch_size": 64,
        "epochs": 5,
        "optimizer": "Adam"
    }
)

# Simulate training
for epoch in range(500):
    # Simulate training metrics
    train_loss = 1.0 / (epoch + 1) + np.random.normal(0, 0.1)
    train_acc = 0.5 + epoch * 0.1 + np.random.normal(0, 0.05)
    
    # Simulate validation metrics
    val_loss = train_loss + 0.1
    val_acc = train_acc - 0.05

    sleep(0.1)
    
    # Log metrics (creates time series plots)
    # group data according to the keys with "/"
    wandb.log({
        "epoch": epoch,
        "train/loss": train_loss,
        "train/accuracy": train_acc,
        "val/loss": val_loss,
        "val/accuracy": val_acc,
        "learning_rate": 0.001 * (0.9 ** epoch)  # Learning rate schedule
    })
    
    print(f"Epoch {epoch}: Train Loss={train_loss:.3f}, Val Acc={val_acc:.3f}")

wandb.finish()

Epoch 0: Train Loss=0.903, Val Acc=0.440
Epoch 1: Train Loss=0.577, Val Acc=0.539
Epoch 2: Train Loss=0.395, Val Acc=0.708
Epoch 3: Train Loss=0.228, Val Acc=0.675
Epoch 4: Train Loss=0.156, Val Acc=0.905
Epoch 5: Train Loss=0.320, Val Acc=0.896
Epoch 6: Train Loss=0.154, Val Acc=1.019
Epoch 7: Train Loss=0.130, Val Acc=1.157
Epoch 8: Train Loss=0.237, Val Acc=1.190
Epoch 9: Train Loss=0.107, Val Acc=1.301
Epoch 10: Train Loss=0.012, Val Acc=1.427
Epoch 11: Train Loss=0.051, Val Acc=1.545
Epoch 12: Train Loss=0.127, Val Acc=1.628
Epoch 13: Train Loss=0.141, Val Acc=1.795
Epoch 14: Train Loss=0.110, Val Acc=1.902
Epoch 15: Train Loss=-0.045, Val Acc=1.972
Epoch 16: Train Loss=0.126, Val Acc=2.069
Epoch 17: Train Loss=0.020, Val Acc=2.157
Epoch 18: Train Loss=0.108, Val Acc=2.171
Epoch 19: Train Loss=0.149, Val Acc=2.306
Epoch 20: Train Loss=0.032, Val Acc=2.472
Epoch 21: Train Loss=0.128, Val Acc=2.594
Epoch 22: Train Loss=-0.013, Val Acc=2.708
Epoch 23: Train Loss=-0.085, Val Acc=2.666

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Epoch 498: Train Loss=-0.153, Val Acc=50.312
Epoch 499: Train Loss=-0.033, Val Acc=50.414


0,1
epoch,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇██
learning_rate,█▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/accuracy,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▇▇▇▇▇▇▇▇█
train/loss,█▅▃▄▅▇▄▃▁▃▆▃▃▄▃▃▄▂▆▄▄▄▃▆▁▄▃▃▄▃▆▃▂▄▄▇▃▂▅▂
val/accuracy,▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇██
val/loss,█▄▆▆▆▃▂▇▂▂▄▃▂▇▃▅▁▄▃▄▇▆▅▅▂▄▅▄▅▅▂▆▇▆▇▃▆▆▄▂

0,1
epoch,499.0
learning_rate,0.0
train/accuracy,50.46394
train/loss,-0.03322
val/accuracy,50.41394
val/loss,0.06678


In [None]:
# ============================================================================
# Example 3: Logging Images and Plots
# ============================================================================

wandb.init(
    project="de-LLM",
    name="image-logging-example",
    config={"plot_type": "matplotlib"}
)

# Create a simple plot
fig, ax = plt.subplots(figsize=(8, 6))
x = np.linspace(0, 10, 100)
y = np.sin(x)
ax.plot(x, y, label="sin(x)")
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_title("Sample Plot")
ax.legend()
ax.grid(True)

# Log the plot
wandb.log({"plot": wandb.Image(fig)})

# Log multiple plots
for i in range(3):
    fig, ax = plt.subplots()
    x = np.linspace(0, 10, 100)
    y = np.sin(x + i)
    ax.plot(x, y, label=f"sin(x + {i})")
    ax.legend()
    wandb.log({f"plot_{i}": wandb.Image(fig)})
    plt.close(fig)

# Log numpy array as image
random_image = np.random.rand(64, 64, 3)
wandb.log({"random_image": wandb.Image(random_image)})

wandb.finish()

In [None]:
# ============================================================================
# Example 4: Logging Tables (for data analysis)
# ============================================================================

wandb.init(
    project="de-LLM",
    name="table-logging-example"
)

# Create a table with predictions
columns = ["image_id", "prediction", "ground_truth", "confidence"]
data = [
    [f"img_{i}", f"class_{np.random.randint(0, 10)}", f"class_{np.random.randint(0, 10)}", np.random.rand()]
    for i in range(10)
]

table = wandb.Table(columns=columns, data=data)
wandb.log({"predictions": table})

# Log confusion matrix as table
confusion_matrix = np.random.randint(0, 100, (5, 5))
cm_table = wandb.Table(
    columns=[f"class_{i}" for i in range(5)],
    data=confusion_matrix.tolist()
)
wandb.log({"confusion_matrix": cm_table})

wandb.finish()

In [None]:
# ============================================================================
# Example 5: Complete Training Example (PyTorch)
# ============================================================================

# Simple model
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Initialize wandb with hyperparameters
wandb.init(
    project="de-LLM",
    name="pytorch-training-example",
    config={
        "learning_rate": 0.001,
        "batch_size": 32,
        "epochs": 3,
        "optimizer": "Adam",
        "model": "SimpleMLP"
    }
)

# Create model and log architecture
model = SimpleModel()
wandb.watch(model, log="all", log_freq=10)  # Track gradients and parameters

# Create dummy data
dummy_input = torch.randn(32, 784)
dummy_target = torch.randint(0, 10, (32,))

# Training setup
optimizer = torch.optim.Adam(model.parameters(), lr=wandb.config.learning_rate)
criterion = nn.CrossEntropyLoss()

# Training loop
for epoch in range(wandb.config.epochs):
    model.train()
    
    # Simulate batch training
    for batch_idx in range(10):
        optimizer.zero_grad()
        output = model(dummy_input)
        loss = criterion(output, dummy_target)
        loss.backward()
        optimizer.step()
        
        # Log metrics every batch
        if batch_idx % 5 == 0:
            accuracy = (output.argmax(dim=1) == dummy_target).float().mean()
            wandb.log({
                "batch_loss": loss.item(),
                "batch_accuracy": accuracy.item(),
                "epoch": epoch,
                "batch": batch_idx
            })
    
    # Log epoch-level metrics
    wandb.log({
        "epoch_loss": loss.item(),
        "epoch": epoch
    })

wandb.finish()

# 2. How to View Data in wandb UI

## Accessing the UI:

1. **Web Interface**: 
   - Visit https://wandb.ai
   - Login with your account
   - Select your project (e.g., "de-LLM")

2. **From Code**:
   - Use `run.url` to get direct link to your run
   - Or use `wandb.run.url` after initialization

## What You Can See in the UI:

### Dashboard View:
- **Runs Table**: List of all runs with metrics, configs, and status
- **Parallel Coordinates**: Visualize hyperparameter relationships
- **Scatter Plots**: Compare runs across different metrics

### Run Details:
- **Metrics Tab**: Time series plots of all logged metrics
- **System Tab**: CPU, GPU, memory usage over time
- **Logs Tab**: Console output from your training
- **Files Tab**: Saved model checkpoints, configs, etc.
- **Media Tab**: Images, videos, audio you logged
- **Tables Tab**: Data tables you logged

### Key Features:
- **Compare Runs**: Select multiple runs to compare side-by-side
- **Filter Runs**: By tags, config values, or metrics
- **Group Runs**: Organize runs by hyperparameters
- **Sweep**: Hyperparameter optimization (see Example 6)

In [None]:
# ============================================================================
# Example 6: Hyperparameter Sweep (Automated Search)
# ============================================================================

# Define sweep configuration
sweep_config = {
    "method": "grid",  # or "random", "bayes"
    "metric": {
        "name": "val_accuracy",
        "goal": "maximize"
    },
    "parameters": {
        "learning_rate": {
            "values": [0.001, 0.01, 0.1]
        },
        "batch_size": {
            "values": [32, 64, 128]
        },
        "epochs": {
            "value": 5
        }
    }
}

# Initialize sweep (uncomment to create sweep)
# sweep_id = wandb.sweep(sweep_config, project="de-LLM")

# Define training function for sweep
def train_sweep():
    # Initialize wandb for this sweep run
    wandb.init(project="de-LLM")
    
    # Access hyperparameters from config
    lr = wandb.config.learning_rate
    batch_size = wandb.config.batch_size
    
    # Simulate training
    for epoch in range(wandb.config.epochs):
        train_loss = 1.0 / (epoch + 1) + np.random.normal(0, 0.1)
        val_acc = 0.5 + epoch * 0.1 + np.random.normal(0, 0.05)
        
        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
            "val_accuracy": val_acc
        })
    
    wandb.finish()

# Run sweep (uncomment to execute)
# wandb.agent(sweep_id, train_sweep, count=10)

print("Sweep configuration created. Uncomment code to run sweep.")

In [None]:
# ============================================================================
# Example 7: Logging Model Artifacts
# ============================================================================

wandb.init(
    project="de-LLM",
    name="artifact-example"
)

# Create a simple model
model = SimpleModel()

# Save model checkpoint
torch.save(model.state_dict(), "model_checkpoint.pth")

# Log as artifact
artifact = wandb.Artifact("model", type="model")
artifact.add_file("model_checkpoint.pth")
wandb.log_artifact(artifact)

# Log dataset
dataset_artifact = wandb.Artifact("dataset", type="dataset")
# Add files to artifact
# dataset_artifact.add_dir("data/")
wandb.log_artifact(dataset_artifact)

wandb.finish()

In [None]:
# ============================================================================
# Example 8: Best Practices and Tips
# ============================================================================

print("""
BEST PRACTICES:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. ✅ Use descriptive run names
   wandb.init(name="resnet50-lr0.001-bs64")

2. ✅ Log hyperparameters in config
   wandb.config.update({"lr": 0.001, "batch_size": 64})

3. ✅ Use namespaces for metrics (with slashes)
   wandb.log({"train/loss": loss, "val/loss": val_loss})

4. ✅ Log at appropriate frequency
   - Every epoch: wandb.log({"epoch": epoch, ...})
   - Every N batches: if batch_idx % N == 0: wandb.log(...)

5. ✅ Use wandb.watch() for PyTorch models
   wandb.watch(model, log="all", log_freq=100)

6. ✅ Finish runs properly
   wandb.finish()  # Always call this!

7. ✅ Use tags for organization
   wandb.init(tags=["experiment", "baseline"])

8. ✅ Log images/videos for debugging
   wandb.log({"predictions": wandb.Image(image)})

9. ✅ Use wandb.alert() for important events
   wandb.alert(title="Training Complete", text="Model converged!")

10. ✅ Compare runs using UI filters
    - Filter by tags, config values, or metrics
    - Use parallel coordinates plot for hyperparameter analysis

COMMON COMMANDS:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Terminal commands:
  wandb login                    # Login to wandb
  wandb offline                  # Run in offline mode
  wandb sync <run_dir>          # Sync offline runs
  wandb status                  # Check login status

Python API:
  wandb.init()                  # Initialize run
  wandb.log()                   # Log metrics
  wandb.config                  # Access hyperparameters
  wandb.watch()                 # Track model gradients
  wandb.finish()                # End run
  wandb.save()                  # Save files
  wandb.log_artifact()         # Log artifacts

VIEWING IN UI:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. Go to https://wandb.ai
2. Select your project (e.g., "de-LLM")
3. Click on a run to see:
   - Metrics: Time series plots
   - System: CPU/GPU usage
   - Logs: Console output
   - Files: Saved checkpoints
   - Media: Images/videos
   - Tables: Data tables

4. Compare runs:
   - Select multiple runs
   - Use filters (tags, config, metrics)
   - View parallel coordinates plot

5. Create reports:
   - Share findings with team
   - Document experiments
   - Track progress over time
""")

In [None]:
# ============================================================================
# Example 9: Quick Reference - Minimal Working Example
# ============================================================================

# Minimal example - copy this template for your projects
wandb.init(
    project="de-LLM",
    name="minimal-example",
    config={
        "learning_rate": 0.001,
        "batch_size": 32
    }
)

# Log metrics
for i in range(10):
    wandb.log({
        "loss": 1.0 / (i + 1),
        "accuracy": i * 0.1
    })

# Get run URL (open in browser to see results)
print(f"View your run at: {wandb.run.url}")

wandb.finish()

print("\n✅ Check the URL above to see your metrics in the wandb UI!")
print("   - Metrics tab: See loss and accuracy plots")
print("   - System tab: See resource usage")
print("   - Config tab: See hyperparameters")