# Weights & Biases with Gradient

# Preface

Weights and Biases is an MLOps platform with useful features for model tracking, hyperparameter tuning, and artifact saving during model training. Integrating with Weights and Biases gives Gradient users access to world-class model experimentation features while taking advantage of Gradient's easy-to-use development platform and access to accelerated hardware.

The goal of this tutorial is to highlight Weights and Biases features and how to use those within Gradient to scale up model training. During this tutorial you will learn to initiate W&B model runs, log metrics, save artifacts, tune hyperparameters, and determine the best performing model. The models trained during this tutorial can be saved in a Gradient Dataset and then be leveraged within Gradient Workflows and Deployments.


# Installation and Setup

In [None]:
!pip install wandb -q

In [None]:
import wandb
import os

# Publish this notebook's path through the WANDB_NOTEBOOK_NAME environment
# variable (presumably so W&B can attribute runs to the notebook — confirm
# against the W&B environment-variable documentation).
os.environ["WANDB_NOTEBOOK_NAME"] = "./train_model_wandb.ipynb"

# Login

In [None]:
# Authenticate this session with W&B; replace the placeholder with a real API key.
# NOTE(review): avoid committing a real key in the notebook — consider supplying
# it from outside the file (e.g. an environment variable) instead.
wandb.login(key='your-api-key')

# Initializing a Model Run and Logging

In [None]:
# Hyperparameters for the training runs below; the same mapping is handed to
# wandb.init() so the values are recorded with each run.
config = dict(
    epochs=5,
    batch_size=128,
    lr=1e-3,
    model='ResNet18',
)

In [None]:
import time
import torch.nn as nn
import torch.optim as optim
import torch

from resnet import resnet18, resnet34
from load_data import load_data

In [None]:
def validate_model(model, valid_dl, loss_func, device):
    """Evaluate ``model`` on a validation data loader.

    Args:
        model: Network to evaluate. It is switched to eval mode and left there.
        valid_dl: Iterable yielding ``(images, labels)`` batches and exposing a
            sized ``dataset`` attribute.
        loss_func: Callable ``loss_func(outputs, labels)`` returning the batch
            loss. It is assumed to be a per-batch mean, because each batch loss
            is re-weighted by the batch size before averaging.
        device: Device the batches are moved to before the forward pass.

    Returns:
        A ``(val_loss, accuracy)`` tuple of plain Python floats: the loss
        averaged per sample and the fraction of correct predictions, both
        taken over ``len(valid_dl.dataset)``.
    """
    model.eval()
    num_samples = len(valid_dl.dataset)
    total_loss = 0.0
    correct = 0

    with torch.inference_mode():
        for images, labels in valid_dl:
            # Move data to GPU if available
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model(images)

            # .item() keeps the accumulator a Python float, so callers get a
            # number they can log/format rather than a (possibly GPU) tensor.
            total_loss += loss_func(outputs, labels).item() * labels.size(0)

            # Predicted class = index of the largest score along dim 1
            predicted = outputs.argmax(dim=1)
            correct += (predicted == labels).sum().item()

    return total_loss / num_samples, correct / num_samples

### Saving a model as a Gradient artifact

In [None]:
!pip install gradient -q

In [None]:
from gradient import ModelsClient

# Gradient models client, used to upload saved model files as Gradient
# artifacts. Replace the placeholder with a real Gradient API key.
models_client = ModelsClient(api_key='your-gradient-api-key')

In [None]:
def upload_model(config, model_client, model_dir='models', model=None,
                 project_id='your-project-id'):
    """Save a model's weights to disk and upload the file as a Gradient artifact.

    The file/artifact name encodes the hyperparameters, e.g.
    ``ResNet18-epchs-5-bs-128-lr-0.001``.

    Args:
        config: Mapping providing ``model``, ``epochs``, ``batch_size`` and
            ``lr``.
        model_client: Object exposing
            ``upload(path=..., name=..., model_type=..., project_id=...)``.
        model_dir: Directory the ``.pth`` file is written to; created
            (including parents) when missing.
        model: Network whose ``state_dict()`` is saved. When omitted, the
            module-level ``model`` is used, which is what this function
            always did before the parameter existed.
        project_id: Gradient project the artifact is uploaded to.

    Returns:
        The hyperparameter-encoded model name (without the ``.pth`` suffix).

    Raises:
        NameError: If ``model`` is omitted and no module-level ``model``
            exists.
    """
    if model is None:
        # Backward-compatible fallback for callers that rely on the global.
        try:
            model = globals()['model']
        except KeyError:
            raise NameError(
                "name 'model' is not defined; pass model=... explicitly"
            ) from None

    # Create model directory (no-op when it already exists)
    os.makedirs(model_dir, exist_ok=True)

    # Save model file
    name_parts = [
        config['model'],
        'epchs', str(config['epochs']),
        'bs', str(config['batch_size']),
        'lr', str(round(config['lr'], 6)),
    ]
    full_model_name = '-'.join(name_parts)
    model_path = os.path.join(model_dir, full_model_name + '.pth')
    torch.save(model.state_dict(), model_path)

    # Upload model as a Gradient artifact
    model_client.upload(path=model_path, name=full_model_name, model_type='Custom', project_id=project_id)

    return full_model_name

### Train ResNet18

In [None]:
# Train ResNet18 inside a single W&B run: log the running training loss every
# `batch_checkpoint` mini-batches, validation metrics and wall-clock time once
# per epoch, then upload the trained weights to Gradient.
model_name = 'ResNet18'
# Keep the logged config (and the uploaded artifact name derived from it) in
# sync with the network trained here, even when cells are re-run out of order.
config['model'] = model_name

# Initialize W&B run
with wandb.init(project="test-project", config=config, name=model_name):

    # Create Data Loader objects
    trainloader, valloader, testloader = load_data(config)

    # Create ResNet18 Model with 3 channel inputs (colored image) and 10 output classes
    model = resnet18(3, 10)

    # Define loss and optimization functions
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=config['lr'], momentum=0.9)

    # Move the model to GPU if accessible
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    batch_checkpoint = 50  # mini-batches between two training-loss logs
    num_batches = len(trainloader)
    step = 0  # W&B step; advanced once per training-loss log
    epoch_durations = []
    for epoch in range(config['epochs']):

        epoch_start_time = time.time()
        running_loss = 0.0
        model.train()

        for i, data in enumerate(trainloader, 0):

            # Move data to GPU if available
            inputs, labels = data[0].to(device), data[1].to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + Backward + Optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Log every `batch_checkpoint` mini-batches
            if i % batch_checkpoint == batch_checkpoint - 1:
                step += 1
                # Fractional epoch, e.g. 1.25 = a quarter of the way through epoch 2
                epoch_progress = epoch + ((i + 1) / num_batches)
                avg_loss = running_loss / batch_checkpoint
                print(f'epoch: {epoch_progress:.2f}')
                wandb.log({"train_loss": avg_loss, "epoch": epoch_progress}, step=step)

                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, avg_loss))

                running_loss = 0.0

        # Log validation metrics (same step as the latest training-loss log)
        val_loss, accuracy = validate_model(model, valloader, criterion, device)
        wandb.log({"val_loss": val_loss, "val_accuracy": accuracy}, step=step)
        print(f"Valid Loss: {val_loss:.3f}, accuracy: {accuracy:.2f}")

        # Log epoch duration (training + validation wall-clock time)
        epoch_duration = time.time() - epoch_start_time
        wandb.log({"epoch_runtime (seconds)": epoch_duration}, step=step)

        epoch_durations.append(epoch_duration)

    # Log average epoch duration
    avg_epoch_runtime = sum(epoch_durations) / len(epoch_durations)
    wandb.log({"avg epoch runtime (seconds)": avg_epoch_runtime})

    # Upload model artifact to Gradient and log model name to W&B.
    # The Gradient client created earlier in this notebook is `models_client`.
    full_model_name = upload_model(config, models_client)
    wandb.log({"Notes": full_model_name})

print('Training Finished')

### Train ResNet34

In [None]:
# Train ResNet34 inside a single W&B run: log the running training loss every
# `batch_checkpoint` mini-batches, validation metrics and wall-clock time once
# per epoch, then upload the trained weights to Gradient.
model_name = 'ResNet34'
# The shared config is mutated so the W&B run and the uploaded artifact name
# both report the network trained in this cell.
config['model'] = model_name

# Initialize W&B run
with wandb.init(project="test-project", config=config, name=model_name):

    # Create Data Loader objects
    trainloader, valloader, testloader = load_data(config)

    # Create ResNet34 Model with 3 channel inputs (colored image) and 10 output classes
    model = resnet34(3, 10)

    # Define loss and optimization functions
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=config['lr'], momentum=0.9)

    # Move the model to GPU if accessible
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    batch_checkpoint = 50  # mini-batches between two training-loss logs
    num_batches = len(trainloader)
    step = 0  # W&B step; advanced once per training-loss log
    epoch_durations = []
    for epoch in range(config['epochs']):

        epoch_start_time = time.time()
        running_loss = 0.0
        model.train()

        for i, data in enumerate(trainloader, 0):

            # Move data to GPU if available
            inputs, labels = data[0].to(device), data[1].to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + Backward + Optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Log every `batch_checkpoint` mini-batches
            if i % batch_checkpoint == batch_checkpoint - 1:
                step += 1
                # Fractional epoch, e.g. 1.25 = a quarter of the way through epoch 2
                epoch_progress = epoch + ((i + 1) / num_batches)
                avg_loss = running_loss / batch_checkpoint
                print(f'epoch: {epoch_progress:.2f}')
                wandb.log({"train_loss": avg_loss, "epoch": epoch_progress}, step=step)

                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, avg_loss))

                running_loss = 0.0

        # Log validation metrics (same step as the latest training-loss log)
        val_loss, accuracy = validate_model(model, valloader, criterion, device)
        wandb.log({"val_loss": val_loss, "val_accuracy": accuracy}, step=step)
        print(f"Valid Loss: {val_loss:.3f}, accuracy: {accuracy:.2f}")

        # Log epoch duration (training + validation wall-clock time)
        epoch_duration = time.time() - epoch_start_time
        wandb.log({"epoch_runtime (seconds)": epoch_duration}, step=step)

        epoch_durations.append(epoch_duration)

    # Log average epoch duration
    avg_epoch_runtime = sum(epoch_durations) / len(epoch_durations)
    wandb.log({"avg epoch runtime (seconds)": avg_epoch_runtime})

    # Upload model artifact to Gradient and log model name to W&B.
    # The Gradient client created earlier in this notebook is `models_client`.
    full_model_name = upload_model(config, models_client)
    wandb.log({"Notes": full_model_name})

print('Training Finished')

# Artifacts

In [None]:
# Log a W&B table artifact holding the predictions for the first test batch.
# This cell relies on `model` and `device` left behind by the most recently
# executed training cell.

# Classes of images in CIFAR-10 dataset
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# Initialize W&B run
with wandb.init(project='test-project'):
    # Create W&B artifact
    artifact = wandb.Artifact('cifar10_image_predictions', type='predictions')

    # Create Data Loader objects (only the test loader is used below)
    trainloader, valloader, testloader = load_data(config)

    # Create columns for W&B table: three fixed columns plus one score per class
    columns = ['image', 'label', 'prediction'] + ["score_" + name for name in classes]

    # Create W&B table
    pred_table = wandb.Table(columns=columns)

    # Make inference mode explicit instead of depending on whatever mode the
    # last training cell left the network in.
    model.eval()
    with torch.no_grad():
        # Only the first batch is logged; None means the loader is empty.
        first_batch = next(iter(testloader), None)
        if first_batch is not None:

            # Move data to GPU if available
            inputs, labels = first_batch[0].to(device), first_batch[1].to(device)

            # Calculate model outputs and predictions
            outputs = model(inputs)
            predicted = outputs.argmax(dim=1)

            # Convert to plain Python ints/floats once, so the table receives
            # ordinary values rather than per-element tensors.
            rows = zip(inputs, labels.tolist(), predicted.tolist(), outputs.tolist())
            for image, label_idx, pred_idx, scores in rows:
                pred_table.add_data(wandb.Image(image), classes[label_idx], classes[pred_idx], *scores)

    # Log W&B model artifact
    artifact.add(pred_table, "cifar10_predictions")
    wandb.log_artifact(artifact)
# Sweeps

In [None]:
# W&B sweep definition: 'bayes' search that minimizes val_loss over batch size,
# learning rate and architecture, with the epoch count held at a single value.
sweep_config = dict(
    method='bayes',
    metric=dict(goal='minimize', name='val_loss'),
    parameters=dict(
        batch_size=dict(values=[32, 128]),
        epochs=dict(value=5),
        lr=dict(distribution='uniform', max=1e-2, min=1e-4),
        model=dict(values=['ResNet18', 'ResNet34']),
    ),
)

In [None]:
def train(config=None):
    """Run one sweep trial: train, validate and upload a ResNet.

    Opens a W&B run, reads the hyperparameters from ``wandb.config``, trains
    the selected network, logs the running training loss every 50
    mini-batches plus validation metrics and wall-clock time per epoch, and
    finally uploads the weights through ``upload_model``.

    Args:
        config: Optional default configuration forwarded to ``wandb.init``.
            Inside the run it is replaced by ``wandb.config``.
    """
    # upload_model() serialises the module-level `model`. Binding this trial's
    # network to that name makes sure the uploaded weights are the ones
    # trained here, not those left over from an earlier cell.
    global model

    with wandb.init(project='test-project', config=config):
        config = wandb.config

        trainloader, valloader, testloader = load_data(config)

        if config['model'] == 'ResNet18':
            model = resnet18(3, 10)
        else:
            model = resnet34(3, 10)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=config['lr'], momentum=0.9)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model.to(device)

        step = 0  # W&B step; advanced once per training-loss log
        batch_checkpoint = 50  # mini-batches between two training-loss logs
        num_batches = len(trainloader)
        epoch_durations = []
        for epoch in range(config['epochs']):

            epoch_start_time = time.time()
            running_loss = 0.0
            pending_batches = 0  # batches accumulated since the last log
            model.train()

            for i, data in enumerate(trainloader, 0):

                # Move data to GPU if available
                inputs, labels = data[0].to(device), data[1].to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward + Backward + Optimize
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                pending_batches += 1

                # Log every `batch_checkpoint` batches
                if i % batch_checkpoint == batch_checkpoint - 1:
                    step += 1
                    epoch_progress = epoch + ((i + 1) / num_batches)
                    avg_loss = running_loss / pending_batches
                    print(f'epoch: {epoch_progress:.2f}')
                    wandb.log({"train_loss": avg_loss, "epoch": epoch_progress}, step=step)

                    print('[%d, %5d] loss: %.3f' %
                          (epoch + 1, i + 1, avg_loss))

                    running_loss = 0.0
                    pending_batches = 0

            # Log the batches left over after the last full window. The average
            # is taken over the batches actually accumulated, and nothing is
            # logged when there are none (which would otherwise report 0.0).
            if pending_batches:
                step += 1
                epoch_progress = epoch + ((i + 1) / num_batches)
                avg_loss = running_loss / pending_batches
                print(f'epoch: {epoch_progress:.2f}')
                wandb.log({"train_loss": avg_loss, "epoch": epoch_progress}, step=step)

                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, avg_loss))

            # Log validation metrics (same step as the latest training-loss log)
            val_loss, accuracy = validate_model(model, valloader, criterion, device)
            wandb.log({"val_loss": val_loss, "val_accuracy": accuracy}, step=step)
            print(f"Valid Loss: {val_loss:.3f}, accuracy: {accuracy:.2f}")

            # Epoch duration covers training plus validation
            epoch_duration = time.time() - epoch_start_time
            wandb.log({"epoch_runtime (seconds)": epoch_duration}, step=step)

            epoch_durations.append(epoch_duration)

        avg_epoch_runtime = sum(epoch_durations) / len(epoch_durations)
        wandb.log({"avg epoch runtime (seconds)": avg_epoch_runtime})

        # Upload model artifact to Gradient and log model name to W&B.
        # The Gradient client created earlier in this notebook is `models_client`.
        full_model_name = upload_model(config, models_client)
        wandb.log({"Notes": full_model_name})

    print('Training Finished')

In [None]:
# Register the sweep described by sweep_config under the "test-project" project,
# then start an agent that calls train() for the sweep; count=10 is passed to
# limit the agent (presumably to 10 runs — confirm against the wandb.agent docs).
sweep_id = wandb.sweep(sweep_config, project="test-project")
wandb.agent(sweep_id, function=train, count=10)