In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Example dataset: purely synthetic regression data.
# Seed torch's global RNG first so the tensors below (and anything drawn from
# the global RNG afterwards) are identical on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

X = torch.randn(7000, 12)  # Input features: 7000 samples x 12 features
y = torch.randn(7000, 1)   # Targets: one value per sample, drawn independently of X
dataset = TensorDataset(X, y)
# 7000 samples in batches of 64 -> 110 batches per epoch (the last one is partial).
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Define your DNN model
class DNN(nn.Module):
    """Small fully connected network: 12 inputs -> 64 -> 32 -> 1 output."""

    def __init__(self):
        super().__init__()
        # Layers in execution order: a ReLU follows each hidden Linear layer,
        # and the final Linear layer has no activation after it.
        layers = [
            nn.Linear(12, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the stacked layers and return the result."""
        prediction = self.fc(x)
        return prediction

model = DNN()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Scheduler to reduce learning rate on plateau: the LR is multiplied by
# `factor` once the monitored value has stopped improving for `patience` epochs.
# `verbose=True` is deliberately not passed: that argument is deprecated in
# recent PyTorch releases. The loop below reports LR reductions itself.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=20)

# Training loop
epochs = 100
for epoch in range(epochs):
    model.train()
    epoch_loss = 0.0
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Calculate average loss for this epoch (mean of the per-batch losses)
    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

    # Step the scheduler with the average *training* loss; there is no separate
    # validation set in this example. Remember the LR first so a reduction can
    # be reported.
    previous_lr = optimizer.param_groups[0]['lr']
    scheduler.step(avg_loss)

    # Check the current learning rate (first parameter group)
    current_lr = optimizer.param_groups[0]['lr']
    if current_lr < previous_lr:
        print(f"Reducing learning rate from {previous_lr:.6f} to {current_lr:.6f}")
    print(f"Current Learning Rate: {current_lr:.6f}")


Epoch 1/100, Loss: 1.0036
Current Learning Rate: 0.001000
Epoch 2/100, Loss: 0.9966
Current Learning Rate: 0.001000
Epoch 3/100, Loss: 0.9957
Current Learning Rate: 0.001000
Epoch 4/100, Loss: 0.9905
Current Learning Rate: 0.001000
Epoch 5/100, Loss: 0.9892
Current Learning Rate: 0.001000
Epoch 6/100, Loss: 0.9890
Current Learning Rate: 0.001000
Epoch 7/100, Loss: 0.9828
Current Learning Rate: 0.001000
Epoch 8/100, Loss: 0.9790
Current Learning Rate: 0.001000
Epoch 9/100, Loss: 0.9751
Current Learning Rate: 0.001000
Epoch 10/100, Loss: 0.9707
Current Learning Rate: 0.001000
Epoch 11/100, Loss: 0.9642
Current Learning Rate: 0.001000
Epoch 12/100, Loss: 0.9605
Current Learning Rate: 0.001000
Epoch 13/100, Loss: 0.9531
Current Learning Rate: 0.001000
Epoch 14/100, Loss: 0.9487
Current Learning Rate: 0.001000
Epoch 15/100, Loss: 0.9381
Current Learning Rate: 0.001000
Epoch 16/100, Loss: 0.9302
Current Learning Rate: 0.001000
Epoch 17/100, Loss: 0.9282
Current Learning Rate: 0.001000
Epoch 

In [4]:
class ReduceLROnPlateauCallback:
    """Hand-written plateau detector that shrinks an optimizer's learning rate.

    Every call to ``step`` compares the reported loss with the best loss seen
    so far. A strictly lower loss becomes the new best and resets the stall
    counter; anything else increments it. When the counter reaches
    ``patience``, each parameter group's learning rate is multiplied by
    ``factor`` (but never set below ``min_lr``) and the counter restarts.
    """

    def __init__(self, optimizer, factor=0.5, patience=20, min_lr=1e-6, verbose=True):
        # Caller-supplied configuration.
        self.optimizer = optimizer
        self.factor = factor
        self.patience = patience
        self.min_lr = min_lr
        self.verbose = verbose
        # Running state: best loss observed and epochs since it last improved.
        self.best_loss = float('inf')
        self.wait = 0

    def step(self, current_loss):
        """Record one epoch's loss and reduce the learning rate on a plateau."""
        if current_loss < self.best_loss:
            # Improvement: remember it and start counting from scratch.
            self.best_loss = current_loss
            self.wait = 0
            return

        # No improvement this epoch.
        self.wait += 1
        if self.wait >= self.patience:
            self._reduce_lr()
            self.wait = 0

    def _reduce_lr(self):
        """Scale every parameter group's learning rate, clamped at ``min_lr``."""
        for group in self.optimizer.param_groups:
            previous_lr = group['lr']
            reduced_lr = max(previous_lr * self.factor, self.min_lr)
            if not previous_lr > reduced_lr:
                # Already at (or below) the floor: nothing to change.
                continue
            if self.verbose:
                print(f"Reducing learning rate from {previous_lr:.6f} to {reduced_lr:.6f}")
            group['lr'] = reduced_lr

# Example usage in a training loop: the same loop shape as a scheduler-driven
# run, except the learning-rate decay is handled by the hand-written callback.
callback = ReduceLROnPlateauCallback(optimizer, factor=0.5, patience=20, verbose=True)

for epoch in range(epochs):
    model.train()

    # Accumulate the per-batch losses for this epoch.
    epoch_loss = 0.0
    for batch_inputs, batch_targets in dataloader:
        optimizer.zero_grad()
        batch_outputs = model(batch_inputs)
        loss = criterion(batch_outputs, batch_targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Mean of the batch losses is the figure reported and monitored.
    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

    # Let the callback decide whether the learning rate should shrink.
    callback.step(avg_loss)

    # Report the learning rate of the first parameter group.
    first_group = optimizer.param_groups[0]
    current_lr = first_group['lr']
    print(f"Current Learning Rate: {current_lr:.6f}")


Epoch 1/100, Loss: 0.6667
Current Learning Rate: 0.001000
Epoch 2/100, Loss: 0.6616
Current Learning Rate: 0.001000
Epoch 3/100, Loss: 0.6632
Current Learning Rate: 0.001000
Epoch 4/100, Loss: 0.6614
Current Learning Rate: 0.001000
Epoch 5/100, Loss: 0.6584
Current Learning Rate: 0.001000
Epoch 6/100, Loss: 0.6638
Current Learning Rate: 0.001000
Epoch 7/100, Loss: 0.6563
Current Learning Rate: 0.001000
Epoch 8/100, Loss: 0.6573
Current Learning Rate: 0.001000
Epoch 9/100, Loss: 0.6604
Current Learning Rate: 0.001000
Epoch 10/100, Loss: 0.6489
Current Learning Rate: 0.001000
Epoch 11/100, Loss: 0.6523
Current Learning Rate: 0.001000
Epoch 12/100, Loss: 0.6496
Current Learning Rate: 0.001000
Epoch 13/100, Loss: 0.6471
Current Learning Rate: 0.001000
Epoch 14/100, Loss: 0.6497
Current Learning Rate: 0.001000
Epoch 15/100, Loss: 0.6460
Current Learning Rate: 0.001000
Epoch 16/100, Loss: 0.6437
Current Learning Rate: 0.001000
Epoch 17/100, Loss: 0.6480
Current Learning Rate: 0.001000
Epoch 

Checkpointing: saving the best model

In [5]:
import torch

class SaveBestModelCallback:
    """Write a model's ``state_dict`` to disk whenever a monitored score improves."""

    # The only comparison directions the callback understands.
    _VALID_MODES = ("min", "max")

    def __init__(self, save_path="best_model.pth", mode="min", verbose=True):
        """
        Save the best model based on a monitored metric.

        Args:
        - save_path (str): File path to save the best model.
        - mode (str): 'min' to minimize the monitored metric or 'max' to maximize it.
        - verbose (bool): Print messages when a new best model is saved.

        Raises:
        - ValueError: If ``mode`` is neither 'min' nor 'max'. Without this
          check a typo such as 'Min' would silently behave like 'max'.
        """
        if mode not in self._VALID_MODES:
            raise ValueError(f"mode must be 'min' or 'max', got {mode!r}")
        self.save_path = save_path
        self.mode = mode
        self.verbose = verbose
        # Start from the worst possible score so the first finite score wins.
        self.best_score = float('inf') if mode == "min" else -float('inf')

    def step(self, model, current_score):
        """
        Check if the current score is the best and save the model if it is.

        Args:
        - model: The PyTorch model to save (its ``state_dict()`` is written).
        - current_score (float): The monitored metric score.
        """
        import os  # local import keeps this class self-contained

        if self.mode == "min":
            is_better = current_score < self.best_score
        else:
            is_better = current_score > self.best_score
        if not is_better:
            return

        # Make sure the target directory exists so torch.save does not fail
        # on a fresh checkout. Only path-like targets have a directory.
        if isinstance(self.save_path, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.fspath(self.save_path))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        # Save first and record the new best only afterwards: if saving
        # raises, best_score still describes the checkpoint that is on disk.
        torch.save(model.state_dict(), self.save_path)
        self.best_score = current_score
        if self.verbose:
            print(f"New best model saved with {self.mode} score: {current_score:.4f}")

#Example usage in a training loop
save_best_model = SaveBestModelCallback(save_path="best_model.pth", mode="min", verbose=True)

# This demo runs a short loop of its own. Keep the count in a dedicated variable
# so the progress line shows the real total rather than a value from another cell.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Mean of the per-batch training losses for this epoch
    avg_train_loss = train_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_train_loss:.4f}")

    # Simulate validation loss (replace with actual validation loop if available):
    # the training loss plus a small random perturbation.
    val_loss = avg_train_loss + torch.randn(1).item() * 0.01  # Example metric
    print(f"Validation Loss: {val_loss:.4f}")

    # Save the best model (a checkpoint is written only when val_loss improves)
    save_best_model.step(model, val_loss)


Epoch 1/100, Loss: 0.5628
Validation Loss: 0.5592
New best model saved with min score: 0.5592
Epoch 2/100, Loss: 0.5666
Validation Loss: 0.5704
Epoch 3/100, Loss: 0.5600
Validation Loss: 0.5620
Epoch 4/100, Loss: 0.5589
Validation Loss: 0.5591
New best model saved with min score: 0.5591
Epoch 5/100, Loss: 0.5629
Validation Loss: 0.5502
New best model saved with min score: 0.5502
Epoch 6/100, Loss: 0.5622
Validation Loss: 0.5624
Epoch 7/100, Loss: 0.5632
Validation Loss: 0.5622
Epoch 8/100, Loss: 0.5632
Validation Loss: 0.5751
Epoch 9/100, Loss: 0.5612
Validation Loss: 0.5731
Epoch 10/100, Loss: 0.5618
Validation Loss: 0.5607


CPU vs. GPU profiling with torch.profiler

In [None]:
import torch
import torch.profiler

def training_step(device=None):
    """Run a dummy workload (one 1000x1000 matrix product) for profiling.

    Args:
        device: Device to run on. When ``None`` (the default), CUDA is used if
            it is available and the CPU otherwise, so the function also works
            on machines without a GPU.

    Returns:
        The product tensor, on the chosen device.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    # Dummy training loop
    x = torch.randn(1000, 1000, device=device)
    y = torch.matmul(x, x)
    return y

# Only request CUDA activity when a GPU is actually present; the CPU is always
# profiled. The summary table is sorted by the matching time column.
cuda_available = torch.cuda.is_available()
profiled_activities = [torch.profiler.ProfilerActivity.CPU]
if cuda_available:
    profiled_activities.append(torch.profiler.ProfilerActivity.CUDA)

with torch.profiler.profile(
    activities=profiled_activities,
    # Traces are written under ./log for viewing in TensorBoard.
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log"),
    record_shapes=True,
    with_stack=True
) as prof:
    training_step()

sort_key = "cuda_time_total" if cuda_available else "cpu_time_total"
print(prof.key_averages().table(sort_by=sort_key, row_limit=10))
