## Model and Optimizer State Dicts

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

# 1. Define a simple dummy model
class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(10, 5)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(5, 1)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Initialize model and optimizer
model = SimpleModel()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# --- PRINTING STATE DICTS ---

print("--- Model State Dict ---")
# The keys are the layer names, values are the tensors
for param_tensor in model.state_dict():
    print(f"{param_tensor:<20} | {str(model.state_dict()[param_tensor].size()):<20}")

print("\nRAW VIEW")
print(model.state_dict())

print("\n--- Optimizer State Dict ---")
# Optimizer state dict contains parameter groups and internal state (momentum, etc.)
for var_name in optimizer.state_dict():
    # We print just the keys or summary to avoid dumping massive tensors
    print(f"{var_name:<20} | {optimizer.state_dict()[var_name]}")

print("\nRAW VIEW")
print(optimizer.state_dict())

--- Model State Dict ---
fc1.weight           | torch.Size([5, 10]) 
fc1.bias             | torch.Size([5])     
fc2.weight           | torch.Size([1, 5])  
fc2.bias             | torch.Size([1])     

RAW VIEW
OrderedDict({'fc1.weight': tensor([[-0.1367, -0.2585,  0.1300, -0.2752, -0.3076, -0.0153,  0.0634, -0.0610,
         -0.1233,  0.0261],
        [-0.2843, -0.2743, -0.1469,  0.0816,  0.2899,  0.1792,  0.2866,  0.2788,
         -0.0625, -0.1449],
        [-0.1778,  0.1306, -0.0481, -0.0356,  0.1365, -0.1782, -0.0668, -0.0612,
          0.1603,  0.2282],
        [-0.2512,  0.2352,  0.0576,  0.2973,  0.1342, -0.2308,  0.1454, -0.2697,
          0.3122,  0.0999],
        [ 0.1420, -0.3055,  0.0207,  0.2566,  0.1396,  0.1423,  0.1613,  0.1129,
          0.1024,  0.2644]]), 'fc1.bias': tensor([-0.1791, -0.0276,  0.2827, -0.1459,  0.1174]), 'fc2.weight': tensor([[-0.4019,  0.3250, -0.1255, -0.0877,  0.3886]]), 'fc2.bias': tensor([-0.1000])})

--- Optimizer State Dict ---
state          

## Scheduler State Dict

In [2]:
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import pprint

# 1. Setup Dummy Model & Optimizer
# (Schedulers need an optimizer to attach to)
model = nn.Linear(10, 1)
optimizer = optim.SGD(model.parameters(), lr=0.1)

print("=== 1. StepLR State Dict ===")
# Decays LR by 0.1 every 5 epochs
scheduler_step = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

# Simulate 7 epochs (so we pass the step_size of 5)
for i in range(7):
    optimizer.step()
    scheduler_step.step()

print(f"Current LR: {scheduler_step.get_last_lr()}")
pprint.pprint(scheduler_step.state_dict())

print("\n" + "="*40 + "\n")

# ---------------------------------------------------------

print("=== 2. CosineAnnealingLR State Dict ===")
# Reset optimizer for clarity
optimizer = optim.SGD(model.parameters(), lr=0.1)
# Follows a cosine curve over 50 epochs
scheduler_cos = lr_scheduler.CosineAnnealingLR(optimizer, T_max=50, eta_min=0.001)

# Simulate 25 epochs (halfway through the cosine wave)
for i in range(25):
    optimizer.step()
    scheduler_cos.step()

print(f"Current LR: {scheduler_cos.get_last_lr()}")
pprint.pprint(scheduler_cos.state_dict())

print("\n" + "="*40 + "\n")

# ---------------------------------------------------------

print("=== 3. ReduceLROnPlateau State Dict ===")
# Reset optimizer
optimizer = optim.SGD(model.parameters(), lr=0.1)
# Reduce LR if validation loss stops going down for 3 epochs (patience=3)
scheduler_plateau = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# Simulate training where loss does NOT improve
dummy_losses = [0.9, 0.9, 0.9, 0.9, 0.9, 0.9]

for loss in dummy_losses:
    optimizer.step()
    # Note: This scheduler requires a metric argument in .step()
    scheduler_plateau.step(loss)

pprint.pprint(scheduler_plateau.state_dict())

=== 1. StepLR State Dict ===
Current LR: [0.010000000000000002]
{'_get_lr_called_within_step': False,
 '_is_initial': False,
 '_last_lr': [0.010000000000000002],
 '_step_count': 8,
 'base_lrs': [0.1],
 'gamma': 0.1,
 'last_epoch': 7,
 'step_size': 5}


=== 2. CosineAnnealingLR State Dict ===
Current LR: [0.0505]
{'T_max': 50,
 '_get_lr_called_within_step': False,
 '_is_initial': False,
 '_last_lr': [0.0505],
 '_step_count': 26,
 'base_lrs': [0.1],
 'eta_min': 0.001,
 'last_epoch': 25}


=== 3. ReduceLROnPlateau State Dict ===
{'_last_lr': [0.05],
 'best': 0.9,
 'cooldown': 0,
 'cooldown_counter': 0,
 'default_min_lr': 0,
 'eps': 1e-08,
 'factor': 0.5,
 'last_epoch': 6,
 'min_lrs': [0],
 'mode': 'min',
 'mode_worse': inf,
 'num_bad_epochs': 1,
 'patience': 3,
 'threshold': 0.0001,
 'threshold_mode': 'rel'}


## RNG States

In [3]:
import torch
import numpy as np
import random

# 1. Setup: Define a function to generate "random" values from all libraries
def get_random_values():
    return {
        "torch": torch.randn(1).item(),
        "numpy": np.random.rand(),
        "python": random.random()
    }

# Seed everything initially for demonstration
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

print("--- 1. Initial Run ---")
print(f"Step 1: {get_random_values()}")
print(f"Step 2: {get_random_values()}")

# --- SAVE THE STATE HERE (Simulating a checkpoint at Step 2) ---
checkpoint = {
    'torch_rng': torch.get_rng_state(),
    'numpy_rng': np.random.get_state(),
    'python_rng': random.getstate(),
    # If using GPU, you must also save: torch.cuda.get_rng_state()
    # 'cuda_rng': torch.cuda.get_rng_state() 
}

# Continue generating (The "Ground Truth" timeline)
print("--- 2. Continuing without stopping (Target behavior) ---")
print(f"Step 3: {get_random_values()}")
print(f"Step 4: {get_random_values()}")


# --- SIMULATE RESTART ---
print("\n... Crashing and Restarting ...\n")

# Reset seeds to prove we aren't just getting lucky
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

# Restore States
torch.set_rng_state(checkpoint['torch_rng'])
np.random.set_state(checkpoint['numpy_rng'])
random.setstate(checkpoint['python_rng'])
# if torch.cuda.is_available():
#     torch.cuda.set_rng_state(loaded_state['cuda_rng'])

print("--- 3. Resumed Run (Should match Target behavior) ---")
print(f"Step 3: {get_random_values()}")
print(f"Step 4: {get_random_values()}")


--- 1. Initial Run ---
Step 1: {'torch': 0.33669036626815796, 'numpy': 0.3745401188473625, 'python': 0.6394267984578837}
Step 2: {'torch': 0.12880940735340118, 'numpy': 0.9507143064099162, 'python': 0.025010755222666936}
--- 2. Continuing without stopping (Target behavior) ---
Step 3: {'torch': 0.23446236550807953, 'numpy': 0.7319939418114051, 'python': 0.27502931836911926}
Step 4: {'torch': 0.23033303022384644, 'numpy': 0.5986584841970366, 'python': 0.22321073814882275}

... Crashing and Restarting ...

--- 3. Resumed Run (Should match Target behavior) ---
Step 3: {'torch': 0.23446236550807953, 'numpy': 0.7319939418114051, 'python': 0.27502931836911926}
Step 4: {'torch': 0.23033303022384644, 'numpy': 0.5986584841970366, 'python': 0.22321073814882275}


## Gradient Scaling

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.amp import GradScaler

# 1. SETUP
# Check for GPU (GradScaler needs a GPU to actually do work, though it runs no-ops on CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create a simple dummy model
model = nn.Linear(10, 1).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01)
scaler = GradScaler(enabled=torch.cuda.is_available())

# --- MAKE FAKE DATA ---
# Batch Size: 32, Features: 10
# We move it to the same device as the model
inputs = torch.randn(32, 10).to(device)
targets = torch.randn(32, 1).to(device)

print(f"Initial Scale: {scaler.get_scale()}")

# 2. TRAINING STEP
optimizer.zero_grad()

# A. Forward pass in autocast context
with torch.amp.autocast(str(device), enabled=torch.cuda.is_available()):
    outputs = model(inputs)
    loss = nn.MSELoss()(outputs, targets)

# B. Scale the loss
# This multiplies the loss by the scale factor (initially 65536)
scaler.scale(loss).backward()

# C. Step the optimizer
scaler.step(optimizer)

# D. Update the scale factor
scaler.update()

print(f"Scale after step: {scaler.get_scale()}")

Initial Scale: 65536.0
Scale after step: 65536.0


## Putting it all Together with MLFLOW

### Training

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.amp import GradScaler
import mlflow
import os
import pprint

RUN_ID = None
# --- 1. CONFIGURATION ---
config = {
    "experiment_name": "Gradient_Rage",
    "run_name": "Saving_Model_Checkpoint",
    "input_dim": 10,
    "hidden_dim": 32,
    "output_dim": 1,
    "lr": 0.01,
    "epochs": 5,
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}

# --- 2. MODEL DEFINITION ---
class SimpleModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SimpleModel, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        return self.net(x)

# --- 3. TRAINING FUNCTION ---
def train():
    global RUN_ID
    # A. Setup MLflow
    mlflow.set_tracking_uri("file:./mlruns")
    mlflow.set_experiment(config["experiment_name"])

    # B. Start the Run -> Capture 'run' object
    with mlflow.start_run(run_name=config["run_name"]) as run:
        
        # --- CAPTURE RUN ID ---
        RUN_ID = run.info.run_id
        print(f"Active Run ID: {RUN_ID}")
        
        # 1. Log the Model Config
        mlflow.log_params(config)
        
        # 2. Initialize Components
        model = SimpleModel(config['input_dim'], config['hidden_dim'], config['output_dim']).to(config['device'])
        optimizer = optim.Adam(model.parameters(), lr=config['lr'])
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
        scaler = GradScaler(enabled=(config['device'] == 'cuda'))

        print(f"Starting Training on {config['device']}...")
        
        # 3. Training Loop
        for epoch in range(config['epochs']):
            model.train()
            
            inputs = torch.randn(32, config['input_dim']).to(config['device'])
            targets = torch.randn(32, config['output_dim']).to(config['device'])

            optimizer.zero_grad()

            with torch.amp.autocast(config['device'], enabled=(config['device'] == 'cuda')):
                outputs = model(inputs)
                loss = nn.MSELoss()(outputs, targets)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            # 4. Log Metrics
            current_lr = scheduler.get_last_lr()[0]
            mlflow.log_metric("train_loss", loss.item(), step=epoch)
            mlflow.log_metric("learning_rate", current_lr, step=epoch)
            
            print(f"Epoch {epoch+1} | Loss: {loss.item():.4f}")

        # --- 4. THE SAVE ---
        print("\nSaving Checkpoint...")
        
        checkpoint = {
            'run_id': RUN_ID,  # <--- SAVE ID HERE for traceability
            'epoch': config['epochs'],
            'config': config, 
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'scaler_state_dict': scaler.state_dict(),
            'rng_state': torch.get_rng_state(),
        }
        pprint.pprint(checkpoint)

        local_path = "checkpoint.pth"
        torch.save(checkpoint, local_path)
        mlflow.log_artifact(local_path)
        os.remove(local_path)
        
        print(f"Run {RUN_ID} Complete. Checkpoint saved to MLflow.")

train()

  return FileStore(store_uri, store_uri)


Active Run ID: c7b613f1ca4b4798a26c17c35d953bed
Starting Training on cuda...
Epoch 1 | Loss: 0.8749
Epoch 2 | Loss: 1.1350
Epoch 3 | Loss: 0.9264
Epoch 4 | Loss: 1.0717
Epoch 5 | Loss: 0.7526

Saving Checkpoint...
{'config': {'device': 'cuda',
            'epochs': 5,
            'experiment_name': 'Gradient_Rage',
            'hidden_dim': 32,
            'input_dim': 10,
            'lr': 0.01,
            'output_dim': 1,
            'run_name': 'Saving_Model_Checkpoint'},
 'epoch': 5,
 'model_state_dict': OrderedDict([('net.0.weight',
                                   tensor([[ 0.1616, -0.2028, -0.0346, -0.2412, -0.1331,  0.0787,  0.1502,  0.2817,
         -0.3122,  0.1913],
        [-0.2026,  0.0965,  0.2875, -0.0150,  0.2968, -0.2882, -0.0999,  0.1871,
         -0.0868, -0.0436],
        [-0.1530,  0.2140,  0.1355,  0.0988, -0.1569, -0.2357,  0.0126,  0.0894,
         -0.1623, -0.1090],
        [ 0.1932,  0.0426,  0.0241, -0.0590, -0.2702, -0.2766, -0.1719, -0.1670,
          0.

## Resume Training

In [6]:
def resume_from_mlflow(run_id):
    print(f"\n--- [STEP B] Resuming from MLflow ID: {run_id} ---")
    
    # 1. Download
    mlflow.set_tracking_uri("file:./mlruns")
    local_path = mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path="checkpoint.pth")
    
    # 2. Load File
    checkpoint = torch.load(local_path)
    config = checkpoint['config']
    
    # 3. Re-Init Architecture
    model = SimpleModel(config['input_dim'], config['hidden_dim'], config['output_dim'])
    optimizer = optim.SGD(model.parameters(), lr=config['lr'])
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)
    scaler = GradScaler(enabled=(config['device'] == "cuda"))
    
    # 4. Load States
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    scaler.load_state_dict(checkpoint['scaler_state_dict'])
    torch.set_rng_state(checkpoint['rng_state'])
    
    return model, optimizer, scheduler, scaler, config

model, opt, sched, scaler, config = resume_from_mlflow(run_id=RUN_ID)
    
print("\n--- [INSPECTION] Verifying Loaded Components ---")

# 1. CONFIG
print(f"1. Config Loaded: {config}")

# 2. MODEL WEIGHTS (Check first layer weights)
print(f"2. Model Weights (First 3 of Layer 1): {model.net[0].weight.view(-1)[:3].tolist()}")

# 3. OPTIMIZER (Check Param Groups)
print(f"3. Optimizer LR (should be decayed): {opt.param_groups[0]['lr']}")

# 4. SCHEDULER (Check Epoch Counter)
print(f"4. Scheduler Last Epoch: {sched.last_epoch}")

# 5. SCALER (Check Scale Factor)
print(f"5. Scaler Scale Factor: {scaler.get_scale()}")

# 6. RNG (Check a random number generation)
print(f"6. RNG Test (Next Random Num): {torch.randn(1).item()}")


--- [STEP B] Resuming from MLflow ID: c7b613f1ca4b4798a26c17c35d953bed ---


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]


--- [INSPECTION] Verifying Loaded Components ---
1. Config Loaded: {'experiment_name': 'Gradient_Rage', 'run_name': 'Saving_Model_Checkpoint', 'input_dim': 10, 'hidden_dim': 32, 'output_dim': 1, 'lr': 0.01, 'epochs': 5, 'device': 'cuda'}
2. Model Weights (First 3 of Layer 1): [0.1615852415561676, -0.20281507074832916, -0.03463708609342575]
3. Optimizer LR (should be decayed): 0.0001
4. Scheduler Last Epoch: 5
5. Scaler Scale Factor: 65536.0
6. RNG Test (Next Random Num): -2.0489625930786133
