# Weights & Biases Walkthrough

# Preface

The goal of this tutorial is to show how to use the Weights & Biases Python library in Gradient to track model runs and model parameters, create artifacts, and compare models to determine the best-performing model.

The context of this tutorial is testing and tracking multiple values of the learning rate and number of epochs for a ResNet model built in PyTorch.


# Installation and Setup

In [None]:
!pip install wandb -q



In [None]:
import wandb
import os

# Expose this notebook's file name to wandb through the WANDB_NOTEBOOK_NAME
# environment variable (presumably so runs started here are attributed to this
# notebook -- see the W&B environment-variable documentation).
os.environ["WANDB_NOTEBOOK_NAME"] = "./train_model_wandb.ipynb"

# Login

In [None]:
# Never hard-code an API key in a notebook: anyone who can read the file can
# use it. The key is taken from the WANDB_API_KEY environment variable instead;
# when the variable is unset, key=None lets wandb fall back to its own
# credential lookup (existing login or interactive prompt).
# NOTE: any key that was previously committed in this cell should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)




[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Replace the placeholder with your own W&B API key before running this cell.
wandb.login(key='<your-wandb-api-key>')

# Initializing a Model Run and Logging

In [None]:
# Hyperparameters for the single-model runs below; the dict is handed to
# wandb.init(config=...) so the values are recorded with each run.
config = dict(
    epochs=5,
    batch_size=128,
    lr=1e-3,
)

### Train ResNet 18

In [None]:
import time
import torch.nn as nn
import torch.optim as optim
import torch

from resnet import resnet18, resnet34
from load_data import trainloader, valloader

Files already downloaded and verified


Files already downloaded and verified


In [None]:
def validate_model(model, valid_dl, loss_func, device):
    """Evaluate ``model`` on ``valid_dl`` and return ``(mean_loss, accuracy)``.

    The model is switched to eval mode and left that way; callers that keep
    training must call ``model.train()`` again.

    Args:
        model: Module to evaluate; called as ``model(images)``.
        valid_dl: Iterable of ``(images, labels)`` batches that also exposes a
            ``dataset`` attribute supporting ``len()``.
        loss_func: Callable ``loss_func(outputs, labels)``. Each batch loss is
            multiplied by the batch size before summing, which assumes the
            callable returns a per-batch mean -- confirm if a different
            reduction is used.
        device: Device the batches are moved to before the forward pass.

    Returns:
        A tuple ``(mean_loss, accuracy)``: the summed weighted loss divided by
        ``len(valid_dl.dataset)`` (a tensor when ``loss_func`` returns tensors)
        and the fraction of correct predictions as a Python float.
    """
    model.eval()

    loss_sum = 0.
    n_correct = 0
    with torch.inference_mode():
        for images, labels in valid_dl:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)

            # Weight the batch loss by its size so a smaller final batch does
            # not distort the dataset-level average.
            loss_sum += loss_func(outputs, labels) * labels.size(0)

            # Index of the largest output along dim 1 is the predicted class.
            predicted = outputs.data.max(1)[1]
            n_correct += (predicted == labels).sum().item()

    n_samples = len(valid_dl.dataset)
    return loss_sum / n_samples, n_correct / n_samples

In [None]:
# Train ResNet18 inside a single W&B run named after the model.
# Names assigned in this cell (model, device, criterion, ...) are notebook
# globals and stay available to later cells after the run closes.
# NOTE(review): config['batch_size'] is recorded with the run but never read
# here; batching is whatever `trainloader` (imported from load_data) provides.
model_name = 'ResNet18'

with wandb.init(project="test-project", config=config, name=model_name):

    # presumably (input channels, number of classes) -- confirm in resnet.py
    model = resnet18(3, 10)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=config['lr'], momentum=0.9)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # `step` counts logged checkpoints (one per 50 mini-batches), not batches;
    # it is the explicit x-axis value passed to every per-epoch wandb.log call.
    step = 0
    epoch_durations = []
    for epoch in range(config['epochs']):

        epoch_start_time = time.time()
        batch_checkpoint=50
        # Reset per epoch: batches after the last full group of 50 in an epoch
        # are trained on but never contribute to a logged train_loss value.
        running_loss = 0.0
        # validate_model() leaves the model in eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()

        for i, data in enumerate(trainloader, 0):

            inputs, labels = data[0].to(device), data[1].to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + Backward + Optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Log the mean loss of the last `batch_checkpoint` mini-batches,
            # together with a fractional epoch position.
            if i % batch_checkpoint == batch_checkpoint-1:    # true on every 50th mini-batch
                step +=1
                print('epoch:', epoch + ((i+1)/len(trainloader)))
                wandb.log({"train_loss": running_loss/batch_checkpoint, "epoch": epoch + ((i+1)/len(trainloader))}, step=step)

                print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / batch_checkpoint))

                running_loss = 0.0

        # Log validation metrics at the same step as the epoch's last training checkpoint
        val_loss, accuracy = validate_model(model, valloader, criterion, device)
        wandb.log({"val_loss": val_loss, "val_accuracy": accuracy}, step=step)
        # NOTE(review): ':3f' is a minimum width of 3 with the default six
        # decimals (the recorded output shows six); '.3f' may have been intended.
        print(f"Valid Loss: {val_loss:3f}, accuracy: {accuracy:.2f}")

        # Epoch runtime covers both the training pass and the validation pass.
        epoch_duration = time.time() - epoch_start_time
        wandb.log({"epoch_runtime (seconds)": epoch_duration}, step=step)

        epoch_durations.append(epoch_duration)

    # Logged once per run, without an explicit step.
    avg_epoch_runtime = sum(epoch_durations) / len(epoch_durations)
    wandb.log({"avg epoch runtime (seconds)": avg_epoch_runtime})

print('Training Finished')

epoch: 0.125
[1,    50] loss: 1.855


epoch: 0.25
[1,   100] loss: 1.515


epoch: 0.375
[1,   150] loss: 1.428


epoch: 0.5
[1,   200] loss: 1.317


epoch: 0.625
[1,   250] loss: 1.249


epoch: 0.75
[1,   300] loss: 1.222


epoch: 0.875
[1,   350] loss: 1.181


epoch: 1.0
[1,   400] loss: 1.129


Valid Loss: 1.187888, accuracy: 0.59


epoch: 1.125
[2,    50] loss: 0.991


epoch: 1.25
[2,   100] loss: 0.960


epoch: 1.375
[2,   150] loss: 0.944


epoch: 1.5
[2,   200] loss: 0.984


epoch: 1.625
[2,   250] loss: 0.938


epoch: 1.75
[2,   300] loss: 0.932


epoch: 1.875
[2,   350] loss: 0.920


epoch: 2.0
[2,   400] loss: 0.918


Valid Loss: 1.026815, accuracy: 0.65


epoch: 2.125
[3,    50] loss: 0.729


epoch: 2.25
[3,   100] loss: 0.719


epoch: 2.375
[3,   150] loss: 0.746


epoch: 2.5
[3,   200] loss: 0.741


epoch: 2.625
[3,   250] loss: 0.772


epoch: 2.75
[3,   300] loss: 0.782


epoch: 2.875
[3,   350] loss: 0.750


epoch: 3.0
[3,   400] loss: 0.794


Valid Loss: 1.016920, accuracy: 0.65


epoch: 3.125
[4,    50] loss: 0.530


epoch: 3.25
[4,   100] loss: 0.563


epoch: 3.375
[4,   150] loss: 0.602


epoch: 3.5
[4,   200] loss: 0.628


epoch: 3.625
[4,   250] loss: 0.640


epoch: 3.75
[4,   300] loss: 0.638


epoch: 3.875
[4,   350] loss: 0.634


epoch: 4.0
[4,   400] loss: 0.643


Valid Loss: 0.990716, accuracy: 0.67


epoch: 4.125
[5,    50] loss: 0.413


epoch: 4.25
[5,   100] loss: 0.403


epoch: 4.375
[5,   150] loss: 0.434


epoch: 4.5
[5,   200] loss: 0.446


epoch: 4.625
[5,   250] loss: 0.489


epoch: 4.75
[5,   300] loss: 0.517


epoch: 4.875
[5,   350] loss: 0.541


epoch: 5.0
[5,   400] loss: 0.547


Valid Loss: 1.019831, accuracy: 0.68


0,1
avg epoch runtime (seconds),▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epoch_runtime (seconds),▁█▃▁▄
train_loss,█▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▂▂▂
val_accuracy,▁▆▆██
val_loss,█▂▂▁▂

0,1
avg epoch runtime (seconds),4.95499
epoch,5.0
epoch_runtime (seconds),4.98119
train_loss,0.54706
val_accuracy,0.6765
val_loss,1.01983


Training Finished


### Train ResNet34

In [None]:
# Train ResNet34 inside a single W&B run named after the model.
# Names assigned in this cell (model, device, criterion, ...) are notebook
# globals; after this cell runs, `model` refers to the ResNet34 instance.
# NOTE(review): config['batch_size'] is recorded with the run but never read
# here; batching is whatever `trainloader` (imported from load_data) provides.
model_name = 'ResNet34'

with wandb.init(project="test-project", config=config, name=model_name):

    # presumably (input channels, number of classes) -- confirm in resnet.py
    model = resnet34(3, 10)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=config['lr'], momentum=0.9)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # `step` counts logged checkpoints (one per 50 mini-batches), not batches;
    # it is the explicit x-axis value passed to every per-epoch wandb.log call.
    step = 0
    epoch_durations = []
    for epoch in range(config['epochs']):

        epoch_start_time = time.time()
        batch_checkpoint=50
        # Reset per epoch: batches after the last full group of 50 in an epoch
        # are trained on but never contribute to a logged train_loss value.
        running_loss = 0.0
        # validate_model() leaves the model in eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()

        for i, data in enumerate(trainloader, 0):

            inputs, labels = data[0].to(device), data[1].to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward + Backward + Optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            # Log the mean loss of the last `batch_checkpoint` mini-batches,
            # together with a fractional epoch position.
            if i % batch_checkpoint == batch_checkpoint-1:    # true on every 50th mini-batch
                step +=1
                print('epoch:', epoch + ((i+1)/len(trainloader)))
                wandb.log({"train_loss": running_loss/batch_checkpoint, "epoch": epoch + ((i+1)/len(trainloader))}, step=step)

                print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / batch_checkpoint))

                running_loss = 0.0

        # Log validation metrics at the same step as the epoch's last training checkpoint
        val_loss, accuracy = validate_model(model, valloader, criterion, device)
        wandb.log({"val_loss": val_loss, "val_accuracy": accuracy}, step=step)
        # NOTE(review): ':3f' is a minimum width of 3 with the default six
        # decimals; '.3f' may have been intended.
        print(f"Valid Loss: {val_loss:3f}, accuracy: {accuracy:.2f}")

        # Epoch runtime covers both the training pass and the validation pass.
        epoch_duration = time.time() - epoch_start_time
        wandb.log({"epoch_runtime (seconds)": epoch_duration}, step=step)

        epoch_durations.append(epoch_duration)

    # Logged once per run, without an explicit step.
    avg_epoch_runtime = sum(epoch_durations) / len(epoch_durations)
    wandb.log({"avg epoch runtime (seconds)": avg_epoch_runtime})

print('Training Finished')

epoch: 0.125
[1,    50] loss: 1.997


epoch: 0.25
[1,   100] loss: 1.594


epoch: 0.375
[1,   150] loss: 1.452


epoch: 0.5
[1,   200] loss: 1.342


epoch: 0.625
[1,   250] loss: 1.284


epoch: 0.75
[1,   300] loss: 1.223


epoch: 0.875
[1,   350] loss: 1.198


epoch: 1.0
[1,   400] loss: 1.109


epoch: 1.125
[2,    50] loss: 0.964


epoch: 1.25
[2,   100] loss: 1.000


epoch: 1.375
[2,   150] loss: 0.952


epoch: 1.5
[2,   200] loss: 0.967


epoch: 1.625
[2,   250] loss: 0.953


epoch: 1.75
[2,   300] loss: 0.942


epoch: 1.875
[2,   350] loss: 0.926


epoch: 2.0
[2,   400] loss: 0.927


epoch: 2.125
[3,    50] loss: 0.731


epoch: 2.25
[3,   100] loss: 0.741


epoch: 2.375
[3,   150] loss: 0.760


epoch: 2.5
[3,   200] loss: 0.752


epoch: 2.625
[3,   250] loss: 0.745


epoch: 2.75
[3,   300] loss: 0.746


epoch: 2.875
[3,   350] loss: 0.743


epoch: 3.0
[3,   400] loss: 0.742


0,1
epoch,▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇██
epoch_runtime (seconds),▁▃█
train_loss,█▆▅▄▄▄▄▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,3.0
epoch_runtime (seconds),23.74247
train_loss,0.74153


Training Finished


# Artifacts

In [None]:
from load_data import testloader

# Human-readable class names, indexed by the integer label / predicted index.
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# Log a table of test-set predictions as a W&B artifact.
# Uses the notebook-global `model` and `device` left behind by the most
# recently executed training cell.
with wandb.init(project='test-project'):
    artifact = wandb.Artifact('cifar10_image_predictions', type='predictions')

    # image / label / prediction, followed by one raw-score column per class.
    columns = ['image', 'label', 'prediction'] + ["score_" + name for name in classes]

    pred_table = wandb.Table(columns=columns)

    # Inference must run in eval mode so layers such as BatchNorm and Dropout
    # use their inference behaviour; the model may still be in training mode
    # if the last training cell was interrupted. Training cells call
    # model.train() themselves, so this does not affect later training.
    model.eval()

    with torch.no_grad():
        for i, data in enumerate(testloader, 0):
            inputs, labels = data[0].to(device), data[1].to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            for j, image in enumerate(inputs, 0):
                pred_table.add_data(
                    wandb.Image(image),
                    classes[labels[j].item()],
                    # .item() for consistency with the label lookup above.
                    classes[predicted[j].item()],
                    # Plain Python floats rather than 0-dim tensors, so the
                    # score columns hold ordinary numeric values.
                    *outputs[j].tolist(),
                )
                # NOTE(review): this break means only the first image of each
                # batch is added to the table -- presumably to keep the
                # artifact small. Remove it to log every test image.
                break

    artifact.add(pred_table, "cifar10_predictions")
    wandb.log_artifact(artifact)

# Sweeps

In [None]:
# Grid sweep definition: every combination of the listed parameter values is
# tried, and runs are compared on the logged 'val_loss' metric (lower is better).
# NOTE(review): the train() function in this notebook never reads
# config['batch_size'], so the two batch_size values appear to produce
# duplicate runs -- confirm against load_data.
sweep_config = {
    "method": "grid",
    "metric": {
        "goal": "minimize",
        "name": "val_loss",
    },
    "parameters": {
        "batch_size": {"values": [16, 128]},
        "epochs": {"values": [6]},
        "lr": {"values": [1e-3, 1e-4]},
        "model": {"values": ["ResNet18", "ResNet34"]},
    },
}

In [None]:

def train(config = None):
    """Train one model inside its own W&B run and log its metrics.

    Hyperparameters are read back from ``wandb.config`` after the run is
    initialised, so values supplied by a sweep agent take effect. The keys
    read are ``model`` ('ResNet18' selects resnet18, anything else resnet34),
    ``lr`` and ``epochs``.

    NOTE(review): ``batch_size`` is never read here; batches come from the
    module-level ``trainloader``, so sweeping over it has no visible effect.

    Args:
        config: Optional default configuration forwarded to ``wandb.init``.
    """
    log_every = 50  # mini-batches averaged into each logged train_loss point

    with wandb.init(project='test-project', config=config):
        hparams = wandb.config

        build_model = resnet18 if hparams['model'] == 'ResNet18' else resnet34
        net = build_model(3, 10)

        loss_fn = nn.CrossEntropyLoss()
        sgd = optim.SGD(net.parameters(), lr=hparams['lr'], momentum=0.9)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        net.to(device)

        # Counts logged checkpoints, not batches; used as the explicit W&B step.
        log_step = 0
        durations = []
        for epoch in range(hparams['epochs']):
            started = time.time()
            window_loss = 0.0
            # validate_model() leaves the network in eval mode.
            net.train()

            for batch_no, batch in enumerate(trainloader, 1):
                inputs = batch[0].to(device)
                labels = batch[1].to(device)

                # Standard optimisation step: clear grads, forward, backward, update.
                sgd.zero_grad()
                outputs = net(inputs)
                loss = loss_fn(outputs, labels)
                loss.backward()
                sgd.step()

                window_loss += loss.item()

                # Only every `log_every`-th mini-batch produces a log entry.
                if batch_no % log_every != 0:
                    continue

                log_step += 1
                progress = epoch + (batch_no / len(trainloader))
                mean_loss = window_loss / log_every
                print('epoch:', progress)
                wandb.log({"train_loss": mean_loss, "epoch": progress}, step=log_step)
                print('[%d, %5d] loss: %.3f' % (epoch + 1, batch_no, mean_loss))
                window_loss = 0.0

            # Validation metrics share the step of the epoch's last checkpoint.
            val_loss, accuracy = validate_model(net, valloader, loss_fn, device)
            wandb.log({"val_loss": val_loss, "val_accuracy": accuracy}, step=log_step)
            print(f"Valid Loss: {val_loss:3f}, accuracy: {accuracy:.2f}")

            # Runtime covers both the training and the validation pass.
            elapsed = time.time() - started
            wandb.log({"epoch_runtime (seconds)": elapsed}, step=log_step)
            durations.append(elapsed)

        wandb.log({"avg epoch runtime (seconds)": sum(durations) / len(durations)})

    print('Training Finished')

In [None]:
# Register the sweep described by sweep_config under "test-project" and keep
# the ID that wandb returns for it.
sweep_id = wandb.sweep(sweep_config, project="test-project")
# Start a sweep agent that calls train() for each run the sweep hands out.
wandb.agent(sweep_id, function=train)

Create sweep with ID: wuz0x8de
Sweep URL: https://wandb.ai/joshuapaperspace/test-project/sweeps/wuz0x8de


[34m[1mwandb[0m: Agent Starting Run: mryjmedd with config:


[34m[1mwandb[0m: 	batch_size: 16


[34m[1mwandb[0m: 	epochs: 6


[34m[1mwandb[0m: 	lr: 0.001


[34m[1mwandb[0m: 	model: ResNet18


[34m[1mwandb[0m: Currently logged in as: [33mjoshuapaperspace[0m (use `wandb login --relogin` to force relogin)




epoch: 0.125
[1,    50] loss: 1.835


epoch: 0.25
[1,   100] loss: 1.514


epoch: 0.375
[1,   150] loss: 1.381


epoch: 0.5
[1,   200] loss: 1.309


epoch: 0.625
[1,   250] loss: 1.220


epoch: 0.75
[1,   300] loss: 1.162


epoch: 0.875
[1,   350] loss: 1.147


epoch: 1.0
[1,   400] loss: 1.136


Valid Loss: 1.095199, accuracy: 0.61


epoch: 1.125
[2,    50] loss: 0.926


epoch: 1.25
[2,   100] loss: 0.962


epoch: 1.375
[2,   150] loss: 0.920


epoch: 1.5
[2,   200] loss: 0.935


epoch: 1.625
[2,   250] loss: 0.936


epoch: 1.75
[2,   300] loss: 0.936


epoch: 1.875
[2,   350] loss: 0.906


epoch: 2.0
[2,   400] loss: 0.912


Valid Loss: 0.976894, accuracy: 0.65


epoch: 2.125
[3,    50] loss: 0.702


epoch: 2.25
[3,   100] loss: 0.715


epoch: 2.375
[3,   150] loss: 0.751


epoch: 2.5
[3,   200] loss: 0.745


epoch: 2.625
[3,   250] loss: 0.729


epoch: 2.75
[3,   300] loss: 0.756


epoch: 2.875
[3,   350] loss: 0.768


epoch: 3.0
[3,   400] loss: 0.733


Valid Loss: 0.936270, accuracy: 0.68


epoch: 3.125
[4,    50] loss: 0.547


epoch: 3.25
[4,   100] loss: 0.542


epoch: 3.375
[4,   150] loss: 0.590


epoch: 3.5
[4,   200] loss: 0.574


epoch: 3.625
[4,   250] loss: 0.603


epoch: 3.75
[4,   300] loss: 0.656


epoch: 3.875
[4,   350] loss: 0.615


epoch: 4.0
[4,   400] loss: 0.611


Valid Loss: 0.961565, accuracy: 0.68


epoch: 4.125
[5,    50] loss: 0.400


epoch: 4.25
[5,   100] loss: 0.396


epoch: 4.375
[5,   150] loss: 0.435


epoch: 4.5
[5,   200] loss: 0.442


epoch: 4.625
[5,   250] loss: 0.485


epoch: 4.75
[5,   300] loss: 0.508


epoch: 4.875
[5,   350] loss: 0.509


epoch: 5.0
[5,   400] loss: 0.519


Valid Loss: 1.036508, accuracy: 0.68


epoch: 5.125
[6,    50] loss: 0.311


epoch: 5.25
[6,   100] loss: 0.284


epoch: 5.375
[6,   150] loss: 0.338


epoch: 5.5
[6,   200] loss: 0.364


epoch: 5.625
[6,   250] loss: 0.386


epoch: 5.75
[6,   300] loss: 0.398


epoch: 5.875
[6,   350] loss: 0.390


epoch: 6.0
[6,   400] loss: 0.423


Valid Loss: 1.068615, accuracy: 0.69


0,1
avg epoch runtime (seconds),▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epoch_runtime (seconds),█▁▁▁▁▁
train_loss,█▇▆▆▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▂▂▂▂▂▁▁▁▁▁▂
val_accuracy,▁▅▇▇▇█
val_loss,█▃▁▂▅▇

0,1
avg epoch runtime (seconds),6.03867
epoch,6.0
epoch_runtime (seconds),5.48477
train_loss,0.42285
val_accuracy,0.692
val_loss,1.06861


Training Finished


[34m[1mwandb[0m: Agent Starting Run: 0axkilal with config:


[34m[1mwandb[0m: 	batch_size: 16


[34m[1mwandb[0m: 	epochs: 6


[34m[1mwandb[0m: 	lr: 0.001


[34m[1mwandb[0m: 	model: ResNet34




epoch: 0.125
[1,    50] loss: 1.987


epoch: 0.25
[1,   100] loss: 1.552


epoch: 0.375
[1,   150] loss: 1.431


epoch: 0.5
[1,   200] loss: 1.314


epoch: 0.625
[1,   250] loss: 1.246


epoch: 0.75
[1,   300] loss: 1.205


epoch: 0.875
[1,   350] loss: 1.191


epoch: 1.0
[1,   400] loss: 1.136


Valid Loss: 1.096109, accuracy: 0.62


epoch: 1.125
[2,    50] loss: 0.952


epoch: 1.25
[2,   100] loss: 0.964


epoch: 1.375
[2,   150] loss: 0.977


epoch: 1.5
[2,   200] loss: 0.941


epoch: 1.625
[2,   250] loss: 0.937


epoch: 1.75
[2,   300] loss: 0.933


epoch: 1.875
[2,   350] loss: 0.912


epoch: 2.0
[2,   400] loss: 0.927


Valid Loss: 0.988397, accuracy: 0.65


epoch: 2.125
[3,    50] loss: 0.721


epoch: 2.25
[3,   100] loss: 0.720


epoch: 2.375
[3,   150] loss: 0.751


epoch: 2.5
[3,   200] loss: 0.743


epoch: 2.625
[3,   250] loss: 0.760


epoch: 2.75
[3,   300] loss: 0.757


epoch: 2.875
[3,   350] loss: 0.743


epoch: 3.0
[3,   400] loss: 0.739


Valid Loss: 1.030522, accuracy: 0.65


epoch: 3.125
[4,    50] loss: 0.533


epoch: 3.25
[4,   100] loss: 0.553


epoch: 3.375
[4,   150] loss: 0.574


epoch: 3.5
[4,   200] loss: 0.591


epoch: 3.625
[4,   250] loss: 0.609


epoch: 3.75
[4,   300] loss: 0.592


epoch: 3.875
[4,   350] loss: 0.639


epoch: 4.0
[4,   400] loss: 0.604


Valid Loss: 0.914169, accuracy: 0.69


epoch: 4.125
[5,    50] loss: 0.398


epoch: 4.25
[5,   100] loss: 0.396


epoch: 4.375
[5,   150] loss: 0.420


epoch: 4.5
[5,   200] loss: 0.449


epoch: 4.625
[5,   250] loss: 0.473


epoch: 4.75
[5,   300] loss: 0.497


epoch: 4.875
[5,   350] loss: 0.506


epoch: 5.0
[5,   400] loss: 0.511


Valid Loss: 0.960225, accuracy: 0.71


epoch: 5.125
[6,    50] loss: 0.268


epoch: 5.25
[6,   100] loss: 0.275


epoch: 5.375
[6,   150] loss: 0.326


epoch: 5.5
[6,   200] loss: 0.347


epoch: 5.625
[6,   250] loss: 0.385


epoch: 5.75
[6,   300] loss: 0.382


epoch: 5.875
[6,   350] loss: 0.403


epoch: 6.0
[6,   400] loss: 0.392


Valid Loss: 1.013630, accuracy: 0.70


0,1
avg epoch runtime (seconds),▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epoch_runtime (seconds),▅▃▃▅▁█
train_loss,█▆▆▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▃▂▂▂▂▂▂▂▂▁▁▁▁▁▂
val_accuracy,▁▃▄▇█▇
val_loss,█▄▅▁▃▅

0,1
avg epoch runtime (seconds),8.15016
epoch,6.0
epoch_runtime (seconds),8.26396
train_loss,0.39234
val_accuracy,0.697
val_loss,1.01363


Training Finished


[34m[1mwandb[0m: Agent Starting Run: omu970ls with config:


[34m[1mwandb[0m: 	batch_size: 16


[34m[1mwandb[0m: 	epochs: 6


[34m[1mwandb[0m: 	lr: 0.0001


[34m[1mwandb[0m: 	model: ResNet18




epoch: 0.125
[1,    50] loss: 2.117


epoch: 0.25
[1,   100] loss: 1.716


epoch: 0.375
[1,   150] loss: 1.588


epoch: 0.5
[1,   200] loss: 1.547


epoch: 0.625
[1,   250] loss: 1.474


epoch: 0.75
[1,   300] loss: 1.471


epoch: 0.875
[1,   350] loss: 1.397


epoch: 1.0
[1,   400] loss: 1.385


Valid Loss: 1.320765, accuracy: 0.52


epoch: 1.125
[2,    50] loss: 1.229


epoch: 1.25
[2,   100] loss: 1.255


epoch: 1.375
[2,   150] loss: 1.237


epoch: 1.5
[2,   200] loss: 1.225


epoch: 1.625
[2,   250] loss: 1.192


epoch: 1.75
[2,   300] loss: 1.190


epoch: 1.875
[2,   350] loss: 1.161


epoch: 2.0
[2,   400] loss: 1.182


Valid Loss: 1.211051, accuracy: 0.56


epoch: 2.125
[3,    50] loss: 1.041


epoch: 2.25
[3,   100] loss: 1.036


epoch: 2.375
[3,   150] loss: 1.038


epoch: 2.5
[3,   200] loss: 1.033


epoch: 2.625
[3,   250] loss: 1.025


epoch: 2.75
[3,   300] loss: 1.027


epoch: 2.875
[3,   350] loss: 1.056


epoch: 3.0
[3,   400] loss: 1.045


Valid Loss: 1.169024, accuracy: 0.59


epoch: 3.125
[4,    50] loss: 0.895


epoch: 3.25
[4,   100] loss: 0.911


epoch: 3.375
[4,   150] loss: 0.901


epoch: 3.5
[4,   200] loss: 0.895


epoch: 3.625
[4,   250] loss: 0.883


epoch: 3.75
[4,   300] loss: 0.896


epoch: 3.875
[4,   350] loss: 0.900


epoch: 4.0
[4,   400] loss: 0.922


Valid Loss: 1.130964, accuracy: 0.60


epoch: 4.125
[5,    50] loss: 0.768


epoch: 4.25
[5,   100] loss: 0.781


epoch: 4.375
[5,   150] loss: 0.774


epoch: 4.5
[5,   200] loss: 0.772


epoch: 4.625
[5,   250] loss: 0.785


epoch: 4.75
[5,   300] loss: 0.768


epoch: 4.875
[5,   350] loss: 0.784


epoch: 5.0
[5,   400] loss: 0.787


Valid Loss: 1.152850, accuracy: 0.60


epoch: 5.125
[6,    50] loss: 0.649


epoch: 5.25
[6,   100] loss: 0.643


epoch: 5.375
[6,   150] loss: 0.623


epoch: 5.5
[6,   200] loss: 0.659


epoch: 5.625
[6,   250] loss: 0.673


epoch: 5.75
[6,   300] loss: 0.660


epoch: 5.875
[6,   350] loss: 0.671


epoch: 6.0
[6,   400] loss: 0.658


Valid Loss: 1.177778, accuracy: 0.62


0,1
avg epoch runtime (seconds),▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epoch_runtime (seconds),▃█▁▃▅▆
train_loss,█▆▆▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁
val_accuracy,▁▄▆▇▇█
val_loss,█▄▂▁▂▃

0,1
avg epoch runtime (seconds),5.39699
epoch,6.0
epoch_runtime (seconds),5.47564
train_loss,0.6579
val_accuracy,0.618
val_loss,1.17778


Training Finished


[34m[1mwandb[0m: Agent Starting Run: jkkntzu2 with config:


[34m[1mwandb[0m: 	batch_size: 16


[34m[1mwandb[0m: 	epochs: 6


[34m[1mwandb[0m: 	lr: 0.0001


[34m[1mwandb[0m: 	model: ResNet34




epoch: 0.125
[1,    50] loss: 2.131


epoch: 0.25
[1,   100] loss: 1.722


epoch: 0.375
[1,   150] loss: 1.641


epoch: 0.5
[1,   200] loss: 1.547


epoch: 0.625
[1,   250] loss: 1.506


epoch: 0.75
[1,   300] loss: 1.451


epoch: 0.875
[1,   350] loss: 1.394


epoch: 1.0
[1,   400] loss: 1.395


Valid Loss: 1.350927, accuracy: 0.53


epoch: 1.125
[2,    50] loss: 1.165


epoch: 1.25
[2,   100] loss: 1.177


epoch: 1.375
[2,   150] loss: 1.159


epoch: 1.5
[2,   200] loss: 1.177


epoch: 1.625
[2,   250] loss: 1.160


epoch: 1.75
[2,   300] loss: 1.163


epoch: 1.875
[2,   350] loss: 1.163


epoch: 2.0
[2,   400] loss: 1.115


Valid Loss: 1.252922, accuracy: 0.55


epoch: 2.125
[3,    50] loss: 0.917


epoch: 2.25
[3,   100] loss: 0.953


epoch: 2.375
[3,   150] loss: 0.935


epoch: 2.5
[3,   200] loss: 0.931


epoch: 2.625
[3,   250] loss: 0.937


epoch: 2.75
[3,   300] loss: 0.946


epoch: 2.875
[3,   350] loss: 0.948


epoch: 3.0
[3,   400] loss: 0.957


Valid Loss: 1.225101, accuracy: 0.57


epoch: 3.125
[4,    50] loss: 0.728


epoch: 3.25
[4,   100] loss: 0.731


epoch: 3.375
[4,   150] loss: 0.740


epoch: 3.5
[4,   200] loss: 0.747


epoch: 3.625
[4,   250] loss: 0.771


epoch: 3.75
[4,   300] loss: 0.729


epoch: 3.875
[4,   350] loss: 0.757


epoch: 4.0
[4,   400] loss: 0.754


Valid Loss: 1.234721, accuracy: 0.58


epoch: 4.125
[5,    50] loss: 0.532


epoch: 4.25
[5,   100] loss: 0.553


epoch: 4.375
[5,   150] loss: 0.536


epoch: 4.5
[5,   200] loss: 0.529


epoch: 4.625
[5,   250] loss: 0.560


epoch: 4.75
[5,   300] loss: 0.582


epoch: 4.875
[5,   350] loss: 0.576


epoch: 5.0
[5,   400] loss: 0.583


Valid Loss: 1.307597, accuracy: 0.58


epoch: 5.125
[6,    50] loss: 0.361


epoch: 5.25
[6,   100] loss: 0.362


epoch: 5.375
[6,   150] loss: 0.368


epoch: 5.5
[6,   200] loss: 0.379


epoch: 5.625
[6,   250] loss: 0.390


epoch: 5.75
[6,   300] loss: 0.387


epoch: 5.875
[6,   350] loss: 0.409


epoch: 6.0
[6,   400] loss: 0.423


Valid Loss: 1.428951, accuracy: 0.57


0,1
avg epoch runtime (seconds),▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epoch_runtime (seconds),██▅▁▂▅
train_loss,█▆▆▆▆▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁
val_accuracy,▁▄▇█▇▇
val_loss,▅▂▁▁▄█

0,1
avg epoch runtime (seconds),8.13255
epoch,6.0
epoch_runtime (seconds),8.14895
train_loss,0.42252
val_accuracy,0.5745
val_loss,1.42895


Training Finished


[34m[1mwandb[0m: Agent Starting Run: 6lka41y7 with config:


[34m[1mwandb[0m: 	batch_size: 128


[34m[1mwandb[0m: 	epochs: 6


[34m[1mwandb[0m: 	lr: 0.001


[34m[1mwandb[0m: 	model: ResNet18




epoch: 0.125
[1,    50] loss: 1.862


epoch: 0.25
[1,   100] loss: 1.527


epoch: 0.375
[1,   150] loss: 1.375


epoch: 0.5
[1,   200] loss: 1.292


epoch: 0.625
[1,   250] loss: 1.234


epoch: 0.75
[1,   300] loss: 1.195


epoch: 0.875
[1,   350] loss: 1.168


epoch: 1.0
[1,   400] loss: 1.132


Valid Loss: 1.093151, accuracy: 0.61


epoch: 1.125
[2,    50] loss: 0.948


epoch: 1.25
[2,   100] loss: 0.954


epoch: 1.375
[2,   150] loss: 0.951


epoch: 1.5
[2,   200] loss: 0.922


epoch: 1.625
[2,   250] loss: 0.957


epoch: 1.75
[2,   300] loss: 0.903


epoch: 1.875
[2,   350] loss: 0.928


epoch: 2.0
[2,   400] loss: 0.903


Valid Loss: 0.957866, accuracy: 0.67


epoch: 2.125
[3,    50] loss: 0.724


epoch: 2.25
[3,   100] loss: 0.717


epoch: 2.375
[3,   150] loss: 0.706


epoch: 2.5
[3,   200] loss: 0.740


epoch: 2.625
[3,   250] loss: 0.772


epoch: 2.75
[3,   300] loss: 0.749


epoch: 2.875
[3,   350] loss: 0.757


epoch: 3.0
[3,   400] loss: 0.741


Valid Loss: 0.905640, accuracy: 0.69


epoch: 3.125
[4,    50] loss: 0.535


epoch: 3.25
[4,   100] loss: 0.535


epoch: 3.375
[4,   150] loss: 0.586


epoch: 3.5
[4,   200] loss: 0.607


epoch: 3.625
[4,   250] loss: 0.608


epoch: 3.75
[4,   300] loss: 0.607


epoch: 3.875
[4,   350] loss: 0.616


epoch: 4.0
[4,   400] loss: 0.624


Valid Loss: 0.988398, accuracy: 0.67


epoch: 4.125
[5,    50] loss: 0.404


epoch: 4.25
[5,   100] loss: 0.399


epoch: 4.375
[5,   150] loss: 0.434


epoch: 4.5
[5,   200] loss: 0.472


epoch: 4.625
[5,   250] loss: 0.476


epoch: 4.75
[5,   300] loss: 0.497


epoch: 4.875
[5,   350] loss: 0.508


epoch: 5.0
[5,   400] loss: 0.529


Valid Loss: 0.939673, accuracy: 0.70


epoch: 5.125
[6,    50] loss: 0.308


epoch: 5.25
[6,   100] loss: 0.296


epoch: 5.375
[6,   150] loss: 0.328


epoch: 5.5
[6,   200] loss: 0.350


epoch: 5.625
[6,   250] loss: 0.374


epoch: 5.75
[6,   300] loss: 0.368


epoch: 5.875
[6,   350] loss: 0.396


epoch: 6.0
[6,   400] loss: 0.433


Valid Loss: 1.014066, accuracy: 0.70


0,1
avg epoch runtime (seconds),▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epoch_runtime (seconds),▂▃█▄▂▁
train_loss,█▆▆▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▂▂▂▂▂▁▁▁▁▁▂
val_accuracy,▁▅▇▆█▇
val_loss,█▃▁▄▂▅

0,1
avg epoch runtime (seconds),5.32836
epoch,6.0
epoch_runtime (seconds),5.26279
train_loss,0.43309
val_accuracy,0.696
val_loss,1.01407


Training Finished


[34m[1mwandb[0m: Agent Starting Run: e32onto0 with config:


[34m[1mwandb[0m: 	batch_size: 128


[34m[1mwandb[0m: 	epochs: 6


[34m[1mwandb[0m: 	lr: 0.001


[34m[1mwandb[0m: 	model: ResNet34




epoch: 0.125
[1,    50] loss: 1.942


epoch: 0.25
[1,   100] loss: 1.604


epoch: 0.375
[1,   150] loss: 1.434


epoch: 0.5
[1,   200] loss: 1.340


epoch: 0.625
[1,   250] loss: 1.257


epoch: 0.75
[1,   300] loss: 1.237


epoch: 0.875
[1,   350] loss: 1.194


epoch: 1.0
[1,   400] loss: 1.152


Valid Loss: 1.158557, accuracy: 0.58


epoch: 1.125
[2,    50] loss: 0.994


epoch: 1.25
[2,   100] loss: 0.968


epoch: 1.375
[2,   150] loss: 0.969


epoch: 1.5
[2,   200] loss: 0.985


epoch: 1.625
[2,   250] loss: 0.949


epoch: 1.75
[2,   300] loss: 0.923


epoch: 1.875
[2,   350] loss: 0.909


epoch: 2.0
[2,   400] loss: 0.951


Valid Loss: 0.981478, accuracy: 0.66


epoch: 2.125
[3,    50] loss: 0.742


epoch: 2.25
[3,   100] loss: 0.762


epoch: 2.375
[3,   150] loss: 0.763


epoch: 2.5
[3,   200] loss: 0.762


epoch: 2.625
[3,   250] loss: 0.795


epoch: 2.75
[3,   300] loss: 0.763


epoch: 2.875
[3,   350] loss: 0.778


epoch: 3.0
[3,   400] loss: 0.747


Valid Loss: 0.970252, accuracy: 0.67


epoch: 3.125
[4,    50] loss: 0.552


epoch: 3.25
[4,   100] loss: 0.550


epoch: 3.375
[4,   150] loss: 0.603


epoch: 3.5
[4,   200] loss: 0.630


epoch: 3.625
[4,   250] loss: 0.623


epoch: 3.75
[4,   300] loss: 0.632


epoch: 3.875
[4,   350] loss: 0.633


epoch: 4.0
[4,   400] loss: 0.640


Valid Loss: 0.951096, accuracy: 0.67


epoch: 4.125
[5,    50] loss: 0.421


epoch: 4.25
[5,   100] loss: 0.411


epoch: 4.375
[5,   150] loss: 0.466


epoch: 4.5
[5,   200] loss: 0.472


epoch: 4.625
[5,   250] loss: 0.510


epoch: 4.75
[5,   300] loss: 0.513


epoch: 4.875
[5,   350] loss: 0.493


epoch: 5.0
[5,   400] loss: 0.536


Valid Loss: 1.004178, accuracy: 0.68


epoch: 5.125
[6,    50] loss: 0.304


epoch: 5.25
[6,   100] loss: 0.315


epoch: 5.375
[6,   150] loss: 0.358


epoch: 5.5
[6,   200] loss: 0.367


epoch: 5.625
[6,   250] loss: 0.386


epoch: 5.75
[6,   300] loss: 0.401


epoch: 5.875
[6,   350] loss: 0.438


epoch: 6.0
[6,   400] loss: 0.402


Valid Loss: 1.094957, accuracy: 0.68


0,1
avg epoch runtime (seconds),▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epoch_runtime (seconds),▇█▆▁▃█
train_loss,█▇▆▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▂▂▂▂▂▁▁▁▁▁▁
val_accuracy,▁▆▇▇██
val_loss,█▂▂▁▃▆

0,1
avg epoch runtime (seconds),7.92971
epoch,6.0
epoch_runtime (seconds),8.25631
train_loss,0.40202
val_accuracy,0.6765
val_loss,1.09496


Training Finished


[34m[1mwandb[0m: Agent Starting Run: 0s18vhcw with config:


[34m[1mwandb[0m: 	batch_size: 128


[34m[1mwandb[0m: 	epochs: 6


[34m[1mwandb[0m: 	lr: 0.0001


[34m[1mwandb[0m: 	model: ResNet18




epoch: 0.125
[1,    50] loss: 2.114


epoch: 0.25
[1,   100] loss: 1.731


epoch: 0.375
[1,   150] loss: 1.599


epoch: 0.5
[1,   200] loss: 1.513


epoch: 0.625
[1,   250] loss: 1.456


epoch: 0.75
[1,   300] loss: 1.422


epoch: 0.875
[1,   350] loss: 1.398


epoch: 1.0
[1,   400] loss: 1.387


Valid Loss: 1.298747, accuracy: 0.53


epoch: 1.125
[2,    50] loss: 1.231


epoch: 1.25
[2,   100] loss: 1.219


epoch: 1.375
[2,   150] loss: 1.207


epoch: 1.5
[2,   200] loss: 1.196


epoch: 1.625
[2,   250] loss: 1.184


epoch: 1.75
[2,   300] loss: 1.161


epoch: 1.875
[2,   350] loss: 1.164


epoch: 2.0
[2,   400] loss: 1.172


Valid Loss: 1.192872, accuracy: 0.56


epoch: 2.125
[3,    50] loss: 1.037


epoch: 2.25
[3,   100] loss: 1.034


epoch: 2.375
[3,   150] loss: 1.035


epoch: 2.5
[3,   200] loss: 1.013


epoch: 2.625
[3,   250] loss: 1.020


epoch: 2.75
[3,   300] loss: 1.011


epoch: 2.875
[3,   350] loss: 1.025


epoch: 3.0
[3,   400] loss: 1.021


Valid Loss: 1.157583, accuracy: 0.59


epoch: 3.125
[4,    50] loss: 0.877


epoch: 3.25
[4,   100] loss: 0.871


epoch: 3.375
[4,   150] loss: 0.901


epoch: 3.5
[4,   200] loss: 0.901


epoch: 3.625
[4,   250] loss: 0.884


epoch: 3.75
[4,   300] loss: 0.903


epoch: 3.875
[4,   350] loss: 0.890


epoch: 4.0
[4,   400] loss: 0.886


Valid Loss: 1.138892, accuracy: 0.60


epoch: 4.125
[5,    50] loss: 0.719


epoch: 4.25
[5,   100] loss: 0.768


epoch: 4.375
[5,   150] loss: 0.764


epoch: 4.5
[5,   200] loss: 0.770


epoch: 4.625
[5,   250] loss: 0.795


epoch: 4.75
[5,   300] loss: 0.780


epoch: 4.875
[5,   350] loss: 0.790


epoch: 5.0
[5,   400] loss: 0.761


Valid Loss: 1.151576, accuracy: 0.59


In [None]:
# NOTE(review): looks like a leftover scratch value (possibly a run name); it is
# not referenced in the visible part of the notebook -- confirm before removing.
hi = 'ResNet34-epochs-10'

In [None]:
pip install wandb -qqq
python train_model_wandb.py --layers=34 --epochs=10

In [None]:
import wandb
import os

In [None]:
os.environ["WANDB_NOTEBOOK_NAME"] = "./train_model_wandb.ipynb"

# Log in to your W&B account.
# The API key is read from the WANDB_API_KEY environment variable rather than
# being hard-coded, so the secret never lands in the notebook or its history.
# When the variable is unset, key=None lets wandb fall back to its own
# credential lookup (existing login or interactive prompt).
# NOTE: any key that was previously committed in this cell should be revoked.
# TODO: consider fetching the key through the platform's secrets SDK instead.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Currently logged in as: [33mjoshuapaperspace[0m (use `wandb login --relogin` to force relogin)




[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
import sys, getopt
import torch.optim as optim
import numpy as np
import torch
import torch.nn as nn
import time

from resnet import resnet18, resnet34
from load_data import trainloader, testloader, valloader
from config.config import model_config


# wandb.init(project="resnet-test")

# Architectures to compare; each one is trained inside its own W&B run.
models = ['ResNet18', 'ResNet34']

# Training loss is printed and logged once every `mini_batch_check` mini-batches.
mini_batch_check = 50

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Number of mini-batches per epoch; used to express progress as a fractional epoch.
batches_per_epoch = len(trainloader)

for model_name in models:
    # Used as a context manager so the run is closed when the block exits.
    with wandb.init(project="resnet-test", config=model_config, name=model_name):

        print(model_config['epochs'])
        print(model_config['batch_size'])
        print(model_config['lr'])
        print(model_name)

        # Keep the architecture name and the network object in separate
        # variables instead of rebinding the loop variable.
        # presumably (input channels=3, classes=10) — confirm against resnet.py
        if model_name == 'ResNet18':
            model = resnet18(3, 10)
        else:
            model = resnet34(3, 10)
        model.to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=model_config['lr'], momentum=0.9)

        # `step` is the shared x-axis for every metric logged in this run; it
        # advances once per logged group of `mini_batch_check` mini-batches.
        step = 0
        epoch_durations = []
        for epoch in range(model_config['epochs']):
            start_epoch_time = time.time()

            print('epoch:', epoch+1)
            running_loss = 0.0
            model.train()

            for i, data in enumerate(trainloader, 0):
                inputs, labels = data[0].to(device), data[1].to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()

                if i % mini_batch_check == mini_batch_check-1:
                    step += 1
                    # Fractional epoch, e.g. 1.5 halfway through the second epoch.
                    epoch_progress = epoch + ((i+1)/batches_per_epoch)
                    mean_train_loss = running_loss / mini_batch_check

                    print('inter-epoch:', epoch_progress)
                    wandb.log({"train_loss": mean_train_loss, "epoch": epoch_progress}, step=step)

                    print('[%d, %5d] loss: %.3f' %
                        (epoch + 1, i + 1, mean_train_loss))

                    running_loss = 0.0

            val_loss, accuracy = validate_model(model, valloader, criterion)

            # Validation metrics share the step of the last training log of the epoch.
            wandb.log({"val_loss": val_loss, "val_accuracy": accuracy}, step=step)
            # NOTE(review): `:3f` is a minimum width of 3 with the default six
            # decimals; `.3f` may have been intended. Left as-is to keep the output.
            print(f"Valid Loss: {val_loss:3f}, accuracy: {accuracy:.2f}")
            epoch_duration = time.time() - start_epoch_time
            wandb.log({"epoch_runtime (seconds)": epoch_duration}, step=step)

            epoch_durations.append(epoch_duration)

        # With zero configured epochs there is nothing to average.
        if epoch_durations:
            avg_epoch_runtime = sum(epoch_durations) / len(epoch_durations)
            wandb.log({"avg epoch runtime (seconds)": avg_epoch_runtime})
print('Finished Training')

20
125
0.001
ResNet18
epoch: 1


inter-epoch: 0.125
[1,    50] loss: 1.864


inter-epoch: 0.25
[1,   100] loss: 1.498


inter-epoch: 0.375
[1,   150] loss: 1.369


inter-epoch: 0.5
[1,   200] loss: 1.305


inter-epoch: 0.625
[1,   250] loss: 1.259


inter-epoch: 0.75
[1,   300] loss: 1.202


inter-epoch: 0.875
[1,   350] loss: 1.129


inter-epoch: 1.0
[1,   400] loss: 1.082


Valid Loss: 1.090106, accuracy: 0.61
epoch: 2


inter-epoch: 1.125
[2,    50] loss: 0.955


inter-epoch: 1.25
[2,   100] loss: 0.937


inter-epoch: 1.375
[2,   150] loss: 0.922


inter-epoch: 1.5
[2,   200] loss: 0.920


inter-epoch: 1.625
[2,   250] loss: 0.902


inter-epoch: 1.75
[2,   300] loss: 0.900


inter-epoch: 1.875
[2,   350] loss: 0.908


inter-epoch: 2.0
[2,   400] loss: 0.891


Valid Loss: 0.915937, accuracy: 0.69
epoch: 3


inter-epoch: 2.125
[3,    50] loss: 0.715


inter-epoch: 2.25
[3,   100] loss: 0.716


inter-epoch: 2.375
[3,   150] loss: 0.697


inter-epoch: 2.5
[3,   200] loss: 0.706


inter-epoch: 2.625
[3,   250] loss: 0.734


inter-epoch: 2.75
[3,   300] loss: 0.743


inter-epoch: 2.875
[3,   350] loss: 0.747


inter-epoch: 3.0
[3,   400] loss: 0.733


Valid Loss: 0.903261, accuracy: 0.68
epoch: 4


inter-epoch: 3.125
[4,    50] loss: 0.554


inter-epoch: 3.25
[4,   100] loss: 0.533


inter-epoch: 3.375
[4,   150] loss: 0.556


inter-epoch: 3.5
[4,   200] loss: 0.583


inter-epoch: 3.625
[4,   250] loss: 0.590


inter-epoch: 3.75
[4,   300] loss: 0.598


inter-epoch: 3.875
[4,   350] loss: 0.605


inter-epoch: 4.0
[4,   400] loss: 0.627


Valid Loss: 0.900701, accuracy: 0.70
epoch: 5


inter-epoch: 4.125
[5,    50] loss: 0.384


inter-epoch: 4.25
[5,   100] loss: 0.374


inter-epoch: 4.375
[5,   150] loss: 0.408


inter-epoch: 4.5
[5,   200] loss: 0.483


inter-epoch: 4.625
[5,   250] loss: 0.490


inter-epoch: 4.75
[5,   300] loss: 0.486


inter-epoch: 4.875
[5,   350] loss: 0.470


inter-epoch: 5.0
[5,   400] loss: 0.530


Valid Loss: 0.933527, accuracy: 0.70
epoch: 6


inter-epoch: 5.125
[6,    50] loss: 0.300


inter-epoch: 5.25
[6,   100] loss: 0.274


inter-epoch: 5.375
[6,   150] loss: 0.298


inter-epoch: 5.5
[6,   200] loss: 0.317


inter-epoch: 5.625
[6,   250] loss: 0.376


inter-epoch: 5.75
[6,   300] loss: 0.395


inter-epoch: 5.875
[6,   350] loss: 0.423


inter-epoch: 6.0
[6,   400] loss: 0.412


Valid Loss: 1.025811, accuracy: 0.70
epoch: 7


inter-epoch: 6.125
[7,    50] loss: 0.230


inter-epoch: 6.25
[7,   100] loss: 0.215


inter-epoch: 6.375
[7,   150] loss: 0.227


inter-epoch: 6.5
[7,   200] loss: 0.283


inter-epoch: 6.625
[7,   250] loss: 0.275


inter-epoch: 6.75
[7,   300] loss: 0.294


inter-epoch: 6.875
[7,   350] loss: 0.311


inter-epoch: 7.0
[7,   400] loss: 0.317


Valid Loss: 1.147455, accuracy: 0.68
epoch: 8


inter-epoch: 7.125
[8,    50] loss: 0.173


inter-epoch: 7.25
[8,   100] loss: 0.142


inter-epoch: 7.375
[8,   150] loss: 0.192


inter-epoch: 7.5
[8,   200] loss: 0.199


inter-epoch: 7.625
[8,   250] loss: 0.231


inter-epoch: 7.75
[8,   300] loss: 0.249


inter-epoch: 7.875
[8,   350] loss: 0.226


inter-epoch: 8.0
[8,   400] loss: 0.256


Valid Loss: 1.130768, accuracy: 0.71
epoch: 9


inter-epoch: 8.125
[9,    50] loss: 0.157


inter-epoch: 8.25
[9,   100] loss: 0.129


inter-epoch: 8.375
[9,   150] loss: 0.136


inter-epoch: 8.5
[9,   200] loss: 0.150


inter-epoch: 8.625
[9,   250] loss: 0.164


inter-epoch: 8.75
[9,   300] loss: 0.165


inter-epoch: 8.875
[9,   350] loss: 0.197


inter-epoch: 9.0
[9,   400] loss: 0.223


Valid Loss: 1.235499, accuracy: 0.71
epoch: 10


inter-epoch: 9.125
[10,    50] loss: 0.142


inter-epoch: 9.25
[10,   100] loss: 0.121


inter-epoch: 9.375
[10,   150] loss: 0.123


inter-epoch: 9.5
[10,   200] loss: 0.130


inter-epoch: 9.625
[10,   250] loss: 0.135


inter-epoch: 9.75
[10,   300] loss: 0.155


inter-epoch: 9.875
[10,   350] loss: 0.181


inter-epoch: 10.0
[10,   400] loss: 0.176


Valid Loss: 1.287606, accuracy: 0.71
epoch: 11


inter-epoch: 10.125
[11,    50] loss: 0.091


inter-epoch: 10.25
[11,   100] loss: 0.090


inter-epoch: 10.375
[11,   150] loss: 0.085


inter-epoch: 10.5
[11,   200] loss: 0.102


inter-epoch: 10.625
[11,   250] loss: 0.107


inter-epoch: 10.75
[11,   300] loss: 0.109


inter-epoch: 10.875
[11,   350] loss: 0.122


inter-epoch: 11.0
[11,   400] loss: 0.131


Valid Loss: 1.388063, accuracy: 0.71
epoch: 12


inter-epoch: 11.125
[12,    50] loss: 0.093


inter-epoch: 11.25
[12,   100] loss: 0.078


inter-epoch: 11.375
[12,   150] loss: 0.085


inter-epoch: 11.5
[12,   200] loss: 0.093


inter-epoch: 11.625
[12,   250] loss: 0.103


inter-epoch: 11.75
[12,   300] loss: 0.110


inter-epoch: 11.875
[12,   350] loss: 0.100


inter-epoch: 12.0
[12,   400] loss: 0.099


Valid Loss: 1.324412, accuracy: 0.72
epoch: 13


inter-epoch: 12.125
[13,    50] loss: 0.061


inter-epoch: 12.25
[13,   100] loss: 0.069


inter-epoch: 12.375
[13,   150] loss: 0.066


inter-epoch: 12.5
[13,   200] loss: 0.064


inter-epoch: 12.625
[13,   250] loss: 0.059


inter-epoch: 12.75
[13,   300] loss: 0.057


inter-epoch: 12.875
[13,   350] loss: 0.071


inter-epoch: 13.0
[13,   400] loss: 0.096


Valid Loss: 1.434931, accuracy: 0.71
epoch: 14


inter-epoch: 13.125
[14,    50] loss: 0.069


inter-epoch: 13.25
[14,   100] loss: 0.048


inter-epoch: 13.375
[14,   150] loss: 0.054


inter-epoch: 13.5
[14,   200] loss: 0.051


inter-epoch: 13.625
[14,   250] loss: 0.054


inter-epoch: 13.75
[14,   300] loss: 0.058


inter-epoch: 13.875
[14,   350] loss: 0.069


inter-epoch: 14.0
[14,   400] loss: 0.065


Valid Loss: 1.327967, accuracy: 0.74
epoch: 15


inter-epoch: 14.125
[15,    50] loss: 0.035


inter-epoch: 14.25
[15,   100] loss: 0.037


inter-epoch: 14.375
[15,   150] loss: 0.033


inter-epoch: 14.5
[15,   200] loss: 0.040


inter-epoch: 14.625
[15,   250] loss: 0.047


inter-epoch: 14.75
[15,   300] loss: 0.055


inter-epoch: 14.875
[15,   350] loss: 0.054


inter-epoch: 15.0
[15,   400] loss: 0.065


Valid Loss: 1.473969, accuracy: 0.71
epoch: 16


inter-epoch: 15.125
[16,    50] loss: 0.057


inter-epoch: 15.25
[16,   100] loss: 0.054


inter-epoch: 15.375
[16,   150] loss: 0.048


inter-epoch: 15.5
[16,   200] loss: 0.046


inter-epoch: 15.625
[16,   250] loss: 0.044


inter-epoch: 15.75
[16,   300] loss: 0.050


inter-epoch: 15.875
[16,   350] loss: 0.065


inter-epoch: 16.0
[16,   400] loss: 0.085


Valid Loss: 1.568885, accuracy: 0.71
epoch: 17


inter-epoch: 16.125
[17,    50] loss: 0.054


inter-epoch: 16.25
[17,   100] loss: 0.047


inter-epoch: 16.375
[17,   150] loss: 0.046


inter-epoch: 16.5
[17,   200] loss: 0.042


inter-epoch: 16.625
[17,   250] loss: 0.050


inter-epoch: 16.75
[17,   300] loss: 0.064


inter-epoch: 16.875
[17,   350] loss: 0.058


inter-epoch: 17.0
[17,   400] loss: 0.084


Valid Loss: 1.480694, accuracy: 0.73
epoch: 18


inter-epoch: 17.125
[18,    50] loss: 0.045


inter-epoch: 17.25
[18,   100] loss: 0.045


inter-epoch: 17.375
[18,   150] loss: 0.049


inter-epoch: 17.5
[18,   200] loss: 0.043


inter-epoch: 17.625
[18,   250] loss: 0.043


inter-epoch: 17.75
[18,   300] loss: 0.057


inter-epoch: 17.875
[18,   350] loss: 0.052


inter-epoch: 18.0
[18,   400] loss: 0.056


Valid Loss: 1.481240, accuracy: 0.73
epoch: 19


inter-epoch: 18.125
[19,    50] loss: 0.038


inter-epoch: 18.25
[19,   100] loss: 0.034


inter-epoch: 18.375
[19,   150] loss: 0.030


inter-epoch: 18.5
[19,   200] loss: 0.029


inter-epoch: 18.625
[19,   250] loss: 0.030


inter-epoch: 18.75
[19,   300] loss: 0.033


inter-epoch: 18.875
[19,   350] loss: 0.037


inter-epoch: 19.0
[19,   400] loss: 0.035


Valid Loss: 1.464637, accuracy: 0.73
epoch: 20


inter-epoch: 19.125
[20,    50] loss: 0.030


inter-epoch: 19.25
[20,   100] loss: 0.025


inter-epoch: 19.375
[20,   150] loss: 0.028


inter-epoch: 19.5
[20,   200] loss: 0.024


inter-epoch: 19.625
[20,   250] loss: 0.022


inter-epoch: 19.75
[20,   300] loss: 0.025


inter-epoch: 19.875
[20,   350] loss: 0.023


inter-epoch: 20.0
[20,   400] loss: 0.033


Valid Loss: 1.527229, accuracy: 0.73


0,1
avg epoch runtime (seconds),▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
epoch_runtime (seconds),▅▂▂▁▁▃▂▅▁▃▄▂▂▅▄▃▁▂█▅
train_loss,█▇▅▅▄▄▃▄▃▃▂▃▂▂▂▂▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▅▅▆▆▆▅▇▆▆▆▇▆█▆▆▇▇█▇
val_loss,▃▁▁▁▁▂▄▃▅▅▆▅▇▅▇█▇▇▇█

0,1
avg epoch runtime (seconds),5.05526
epoch,20.0
epoch_runtime (seconds),5.20448
train_loss,0.03323
val_accuracy,0.728
val_loss,1.52723


20
125
0.001
ResNet34
epoch: 1


inter-epoch: 0.125
[1,    50] loss: 1.990


inter-epoch: 0.25
[1,   100] loss: 1.586


inter-epoch: 0.375
[1,   150] loss: 1.406


inter-epoch: 0.5
[1,   200] loss: 1.343


inter-epoch: 0.625
[1,   250] loss: 1.259


inter-epoch: 0.75
[1,   300] loss: 1.213


inter-epoch: 0.875
[1,   350] loss: 1.190


inter-epoch: 1.0
[1,   400] loss: 1.142


Valid Loss: 1.193628, accuracy: 0.59
epoch: 2


inter-epoch: 1.125
[2,    50] loss: 1.005


inter-epoch: 1.25
[2,   100] loss: 1.007


inter-epoch: 1.375
[2,   150] loss: 0.970


inter-epoch: 1.5
[2,   200] loss: 0.965


inter-epoch: 1.625
[2,   250] loss: 0.971


inter-epoch: 1.75
[2,   300] loss: 0.939


inter-epoch: 1.875
[2,   350] loss: 0.937


inter-epoch: 2.0
[2,   400] loss: 0.909


Valid Loss: 1.014620, accuracy: 0.65
epoch: 3


inter-epoch: 2.125
[3,    50] loss: 0.728


inter-epoch: 2.25
[3,   100] loss: 0.727


inter-epoch: 2.375
[3,   150] loss: 0.757


inter-epoch: 2.5
[3,   200] loss: 0.785


inter-epoch: 2.625
[3,   250] loss: 0.800


inter-epoch: 2.75
[3,   300] loss: 0.765


inter-epoch: 2.875
[3,   350] loss: 0.746


inter-epoch: 3.0
[3,   400] loss: 0.773


Valid Loss: 0.956595, accuracy: 0.68
epoch: 4


inter-epoch: 3.125
[4,    50] loss: 0.538


inter-epoch: 3.25
[4,   100] loss: 0.568


inter-epoch: 3.375
[4,   150] loss: 0.585


inter-epoch: 3.5
[4,   200] loss: 0.613


inter-epoch: 3.625
[4,   250] loss: 0.631


inter-epoch: 3.75
[4,   300] loss: 0.630


inter-epoch: 3.875
[4,   350] loss: 0.627


inter-epoch: 4.0
[4,   400] loss: 0.640


Valid Loss: 0.979833, accuracy: 0.69
epoch: 5


inter-epoch: 4.125
[5,    50] loss: 0.385


inter-epoch: 4.25
[5,   100] loss: 0.410


inter-epoch: 4.375
[5,   150] loss: 0.447


inter-epoch: 4.5
[5,   200] loss: 0.479


inter-epoch: 4.625
[5,   250] loss: 0.503


inter-epoch: 4.75
[5,   300] loss: 0.521


inter-epoch: 4.875
[5,   350] loss: 0.537


inter-epoch: 5.0
[5,   400] loss: 0.526


Valid Loss: 0.990239, accuracy: 0.69
epoch: 6


inter-epoch: 5.125
[6,    50] loss: 0.284


inter-epoch: 5.25
[6,   100] loss: 0.298


inter-epoch: 5.375
[6,   150] loss: 0.344


inter-epoch: 5.5
[6,   200] loss: 0.375


inter-epoch: 5.625
[6,   250] loss: 0.392


inter-epoch: 5.75
[6,   300] loss: 0.406


inter-epoch: 5.875
[6,   350] loss: 0.420


inter-epoch: 6.0
[6,   400] loss: 0.408


Valid Loss: 1.048875, accuracy: 0.70
epoch: 7


inter-epoch: 6.125
[7,    50] loss: 0.226


inter-epoch: 6.25
[7,   100] loss: 0.206


inter-epoch: 6.375
[7,   150] loss: 0.271


inter-epoch: 6.5
[7,   200] loss: 0.266


inter-epoch: 6.625
[7,   250] loss: 0.307


inter-epoch: 6.75
[7,   300] loss: 0.328


inter-epoch: 6.875
[7,   350] loss: 0.331


inter-epoch: 7.0
[7,   400] loss: 0.356


Valid Loss: 1.106977, accuracy: 0.70
epoch: 8


inter-epoch: 7.125
[8,    50] loss: 0.195


inter-epoch: 7.25
[8,   100] loss: 0.166


inter-epoch: 7.375
[8,   150] loss: 0.192


inter-epoch: 7.5
[8,   200] loss: 0.214


inter-epoch: 7.625
[8,   250] loss: 0.273


inter-epoch: 7.75
[8,   300] loss: 0.286


inter-epoch: 7.875
[8,   350] loss: 0.280


inter-epoch: 8.0
[8,   400] loss: 0.257


Valid Loss: 1.081672, accuracy: 0.71
epoch: 9


inter-epoch: 8.125
[9,    50] loss: 0.142


inter-epoch: 8.25
[9,   100] loss: 0.137


inter-epoch: 8.375
[9,   150] loss: 0.149


inter-epoch: 8.5
[9,   200] loss: 0.188


inter-epoch: 8.625
[9,   250] loss: 0.203


inter-epoch: 8.75
[9,   300] loss: 0.211


inter-epoch: 8.875
[9,   350] loss: 0.207


inter-epoch: 9.0
[9,   400] loss: 0.228


Valid Loss: 1.178297, accuracy: 0.71
epoch: 10


inter-epoch: 9.125
[10,    50] loss: 0.134


inter-epoch: 9.25
[10,   100] loss: 0.109


inter-epoch: 9.375
[10,   150] loss: 0.120


inter-epoch: 9.5
[10,   200] loss: 0.136


inter-epoch: 9.625
[10,   250] loss: 0.145


inter-epoch: 9.75
[10,   300] loss: 0.156


inter-epoch: 9.875
[10,   350] loss: 0.168


inter-epoch: 10.0
[10,   400] loss: 0.192


Valid Loss: 1.335845, accuracy: 0.71
epoch: 11


inter-epoch: 10.125
[11,    50] loss: 0.127


inter-epoch: 10.25
[11,   100] loss: 0.098


inter-epoch: 10.375
[11,   150] loss: 0.099


inter-epoch: 10.5
[11,   200] loss: 0.117


inter-epoch: 10.625
[11,   250] loss: 0.138


inter-epoch: 10.75
[11,   300] loss: 0.146


inter-epoch: 10.875
[11,   350] loss: 0.156


inter-epoch: 11.0
[11,   400] loss: 0.156


Valid Loss: 1.249988, accuracy: 0.72
epoch: 12


inter-epoch: 11.125
[12,    50] loss: 0.075


inter-epoch: 11.25
[12,   100] loss: 0.081


inter-epoch: 11.375
[12,   150] loss: 0.088


inter-epoch: 11.5
[12,   200] loss: 0.082


inter-epoch: 11.625
[12,   250] loss: 0.089


inter-epoch: 11.75
[12,   300] loss: 0.108


inter-epoch: 11.875
[12,   350] loss: 0.119


inter-epoch: 12.0
[12,   400] loss: 0.129


Valid Loss: 1.404500, accuracy: 0.71
epoch: 13


inter-epoch: 12.125
[13,    50] loss: 0.090


inter-epoch: 12.25
[13,   100] loss: 0.066


inter-epoch: 12.375
[13,   150] loss: 0.073


inter-epoch: 12.5
[13,   200] loss: 0.086


inter-epoch: 12.625
[13,   250] loss: 0.106


inter-epoch: 12.75
[13,   300] loss: 0.104


inter-epoch: 12.875
[13,   350] loss: 0.119


inter-epoch: 13.0
[13,   400] loss: 0.144


Valid Loss: 1.428244, accuracy: 0.71
epoch: 14


inter-epoch: 13.125
[14,    50] loss: 0.078


inter-epoch: 13.25
[14,   100] loss: 0.072


inter-epoch: 13.375
[14,   150] loss: 0.066


inter-epoch: 13.5
[14,   200] loss: 0.065


inter-epoch: 13.625
[14,   250] loss: 0.069


inter-epoch: 13.75
[14,   300] loss: 0.077


inter-epoch: 13.875
[14,   350] loss: 0.086


inter-epoch: 14.0
[14,   400] loss: 0.095


Valid Loss: 1.471097, accuracy: 0.72
epoch: 15


inter-epoch: 14.125
[15,    50] loss: 0.060


inter-epoch: 14.25
[15,   100] loss: 0.070


inter-epoch: 14.375
[15,   150] loss: 0.065


inter-epoch: 14.5
[15,   200] loss: 0.074


inter-epoch: 14.625
[15,   250] loss: 0.072


inter-epoch: 14.75
[15,   300] loss: 0.070


inter-epoch: 14.875
[15,   350] loss: 0.079


inter-epoch: 15.0
[15,   400] loss: 0.087


Valid Loss: 1.476755, accuracy: 0.71
epoch: 16


inter-epoch: 15.125
[16,    50] loss: 0.068


inter-epoch: 15.25
[16,   100] loss: 0.049


inter-epoch: 15.375
[16,   150] loss: 0.053


inter-epoch: 15.5
[16,   200] loss: 0.058


inter-epoch: 15.625
[16,   250] loss: 0.058


inter-epoch: 15.75
[16,   300] loss: 0.052


inter-epoch: 15.875
[16,   350] loss: 0.056


inter-epoch: 16.0
[16,   400] loss: 0.062


Valid Loss: 1.562123, accuracy: 0.71
epoch: 17


inter-epoch: 16.125
[17,    50] loss: 0.037


inter-epoch: 16.25
[17,   100] loss: 0.046


inter-epoch: 16.375
[17,   150] loss: 0.051


inter-epoch: 16.5
[17,   200] loss: 0.044


inter-epoch: 16.625
[17,   250] loss: 0.050


inter-epoch: 16.75
[17,   300] loss: 0.051


inter-epoch: 16.875
[17,   350] loss: 0.059


inter-epoch: 17.0
[17,   400] loss: 0.056


Valid Loss: 1.585756, accuracy: 0.72
epoch: 18


inter-epoch: 17.125
[18,    50] loss: 0.045


inter-epoch: 17.25
[18,   100] loss: 0.038


inter-epoch: 17.375
[18,   150] loss: 0.045


inter-epoch: 17.5
[18,   200] loss: 0.042


inter-epoch: 17.625
[18,   250] loss: 0.043


inter-epoch: 17.75
[18,   300] loss: 0.057


inter-epoch: 17.875
[18,   350] loss: 0.057


inter-epoch: 18.0
[18,   400] loss: 0.079


Valid Loss: 1.573243, accuracy: 0.72
epoch: 19


inter-epoch: 18.125
[19,    50] loss: 0.053


inter-epoch: 18.25
[19,   100] loss: 0.040


inter-epoch: 18.375
[19,   150] loss: 0.043


inter-epoch: 18.5
[19,   200] loss: 0.055


inter-epoch: 18.625
[19,   250] loss: 0.051


inter-epoch: 18.75
[19,   300] loss: 0.048


inter-epoch: 18.875
[19,   350] loss: 0.040


inter-epoch: 19.0
[19,   400] loss: 0.043


Valid Loss: 1.599669, accuracy: 0.72
epoch: 20


inter-epoch: 19.125
[20,    50] loss: 0.034


inter-epoch: 19.25
[20,   100] loss: 0.029


inter-epoch: 19.375
[20,   150] loss: 0.032


inter-epoch: 19.5
[20,   200] loss: 0.020


inter-epoch: 19.625
[20,   250] loss: 0.032


inter-epoch: 19.75
[20,   300] loss: 0.040


inter-epoch: 19.875
[20,   350] loss: 0.037


inter-epoch: 20.0
[20,   400] loss: 0.052


Valid Loss: 1.638053, accuracy: 0.72


0,1
avg epoch runtime (seconds),▁
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
epoch_runtime (seconds),▃▄▃▅▅▅▄▇▅▃█▇▃▇▅▄▃▆▁▄
train_loss,█▆▅▅▄▄▃▄▃▃▂▃▂▂▂▂▁▂▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▄▆▆▆▇▇▇▇▇█▇▇█▇▇████
val_loss,▃▂▁▁▁▂▃▂▃▅▄▆▆▆▆▇▇▇██

0,1
avg epoch runtime (seconds),7.63705
epoch,20.0
epoch_runtime (seconds),7.6177
train_loss,0.05237
val_accuracy,0.717
val_loss,1.63805


Finished Training


In [None]:
def validate_model(model, valid_dl, loss_func, log_images=False, batch_idx=0, device=None):
    """Compute the model's loss and accuracy on the validation dataset.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        valid_dl: Iterable of ``(images, labels)`` batches that exposes a
            ``dataset`` attribute whose length is the number of samples.
        loss_func: Callable ``loss_func(outputs, labels)``. Its result is
            re-weighted by the batch size, so it is assumed to return the
            batch-mean loss.
        log_images: Currently unused; image-table logging is disabled.
        batch_idx: Currently unused; see ``log_images``.
        device: Device each batch is moved to. When ``None``, the device of
            the model's first parameter is used (CPU if the model has no
            parameters), so no module-level ``device`` variable is required.

    Returns:
        A ``(loss, accuracy)`` pair: the summed, batch-size-weighted loss
        divided by the dataset length, and the fraction of correctly
        classified samples.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    val_loss = 0.
    correct = 0
    with torch.inference_mode():
        for images, labels in valid_dl:
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model(images)
            # Weight by batch size so a smaller final batch is not over-counted.
            val_loss += loss_func(outputs, labels) * labels.size(0)

            # The predicted class is the index of the largest output.
            predicted = outputs.argmax(dim=1)
            correct += (predicted == labels).sum().item()

    num_samples = len(valid_dl.dataset)
    return val_loss / num_samples, correct / num_samples

In [None]:
# Peek at the first training batch only. `break` stops the loop without
# terminating the interpreter/kernel the way `exit()` would.
for data in trainloader:
    print(data)
    break

NameError: name 'trainloader' is not defined

In [None]:
# Length of the training loader — presumably the number of mini-batches per
# epoch if `trainloader` is a torch DataLoader; confirm against load_data.py.
len(trainloader)

12500

In [None]:
from load_data import trainloader

Files already downloaded and verified


Files already downloaded and verified


In [None]:
# Scratch arithmetic — presumably batch size x batches per epoch; confirm.
32*391

12512