# DL1 Assignment2 - Q1.1 draft code

This is a small help from us to save you some coding. This notebook is **not** graded, you are free to edit it.

Further advise:
1. Start with File/Save a copy in Drive
2. Set GPU usage under Runtime/Change runtime type/Hardware accelerator.

In [1]:
!pip install timm

Collecting timm
  Downloading timm-0.9.11-py3-none-any.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.5/60.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from timm)
  Downloading huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)
Collecting safetensors (from timm)
  Downloading safetensors-0.4.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.8 kB)
Downloading timm-0.9.11-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading huggingface_hub-0.19.4-py3-none-any.whl (311 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.7/311.7 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading safetensors-0.4.0-cp311-cp311-macosx_11_0_arm64.whl (425 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m425.4/425.4 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?

In [26]:
import torch
from torch import nn
import timm
from torchvision import models
from matplotlib import pyplot as plt
from typing import Callable
import numpy as np

%matplotlib inline

In [46]:
def vit_s_8():
    """ViT-S/8 is not a default torchvision model, so we provide it by timm"""
    # Accuracy approximation comes from
    # https://openreview.net/pdf?id=LtKcMgGOeLt
    # and DINO
    # https://arxiv.org/abs/2104.14294
    return timm.create_model('vit_small_patch8_224')

def drop_outliers(data):
    Q1 = np.percentile(data, 25)
    Q3 = np.percentile(data, 75)
    IQR = Q3 - Q1

    # Define outlier cutoff
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Filter out outliers
    filtered_data = data[(data >= lower_bound) & (data <= upper_bound)]

    # Calculate the mean
    mean_value = np.mean(filtered_data)

    return mean_value

# Model definitions
# Optional Q: These are uncalled functions. What do you think would happen
# if we called all of them once? Why didn't we do that?
model_defs = [
    vit_s_8,
    models.vit_b_32,
    models.vgg11,
    models.vgg11_bn,
    models.resnet18,
    models.densenet121,
    models.mobilenet_v3_small,
]

# Accuracies per model
model_accs = {
    'vit_s_8': 80., # Approximated
    'vit_b_32' : 75.912,
    'vgg11' : 69.02,
    'vgg11_bn' : 70.37,
    'resnet18' : 69.758,
    'densenet121' : 74.434,
    'mobilenet_v3_small' : 67.668,
}


def measure_runtime_per_forward(model:nn.Module, no_grad:bool, batch_size:int=8):
    """Measures the time for a single pass in milliseconds"""

    # Generate fake RGB input (224x224)
    x = torch.rand((batch_size,3,224,224))

    #try:   
        #start = torch.cuda.Event(enable_timing=True)
        #end = torch.cuda.Event(enable_timing=True)
        #x = x.to('cuda')
    #except:

    start = torch.mps.Event(enable_timing=True)
    end = torch.mps.Event(enable_timing=True)
    x = x.to('mps')
    
    start.record()

    # Run the model
    #weights = 'DEFAULT'
    #model = model(weights=weights)
    model.eval()
    
    if no_grad:
        model.zero_grad()
    
    prediction = model(x)
    #class_id = prediction.argmax().item()
    #score = prediction[class_id].item()
    #category_name = weights.meta["categories"][class_id]
    #print(f"{category_name}: {100 * score:.1f}%")

    end.record()
    #try:
        #torch.cuda.synchronize()
    #except:
    torch.mps.synchronize()
        
    return start.elapsed_time(end)


def evaluate_model(model_def:Callable, no_grad:bool, batch_size:int=8):

    # Retreive initial memory allocation
    #try:
        #initial_vram = torch.cuda.memory_allocated()
    #except:
    initial_vram = torch.mps.current_allocated_memory()
    print(f"Initial mem: {torch.mps.driver_allocated_memory()}")
    initial_vram2 = torch.mps.driver_allocated_memory()


    # Define model
    #try:
        #model = model_def().cuda().eval()
    #except:
    model = model_def().eval().to('mps')

    # Access name as: model.__name__

    # Parameters that need to be filled
    times, vrams = [], []
    mean_time = None
    mean_vram = None



    #######################
    # PUT YOUR CODE HERE  #
    #######################

    # Step 1: Calculate the number of **trainable** parameters

    train_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    # Step 2: Warm up with a few passes

    measure_runtime_per_forward(model, no_grad, batch_size)
    measure_runtime_per_forward(model, no_grad, batch_size)
    measure_runtime_per_forward(model, no_grad, batch_size)
    measure_runtime_per_forward(model, no_grad, batch_size)    

    # Step 3: Run N forward passes and save the runtime +
    #         the vram allocated by the model

    N = 10
    args = {'model': model, 'no_grad': no_grad, 'batch_size': batch_size}

    for _ in range (N):
        time = measure_runtime_per_forward(**args)
        times.append(time)
        #try:
            #allocated_vram = torch.cuda.memory_allocated() - initial_vram
        #except:
        allocated_vram = torch.mps.current_allocated_memory() - initial_vram
        print(f"{torch.mps.driver_allocated_memory() - initial_vram2}")
    
        vrams.append(allocated_vram)

    # Step 4: Take the mean, preferably with dropping possible outliers
    for 
        Q1 = np.percentile(data, 25)
        Q3 = np.percentile(data, 75)
        IQR = Q3 - Q1

        # Define outlier cutoff
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Filter out outliers
        filtered_data = data[(data >= lower_bound) & (data <= upper_bound)]

        # Calculate the mean
        mean_value = np.mean(filtered_data)
        
    mean_time = np.mean(time)
    mean_vram = np.mean(vrams)

    # Clean up space for the model
    del model
    try:
        torch.cuda.empty_cache()
    except:
        torch.mps.empty_cache()

    return mean_time, mean_vram, train_params


In [47]:
#######################
# PUT YOUR CODE HERE  #
#######################

# Make your plots here with matplotlib
#
# Example usage of the above functions:
for model_def in model_defs:
    name = model_def.__name__
    time, vram, n_params = evaluate_model(model_def, no_grad=True)
    print(name, time, vram, n_params)

#######################
# END OF YOUR CODE    #
#######################

Initial mem: 11460739072
169869312
169869312
169869312
169869312
169869312
169869312
169869312
169869312
169869312
169869312
vit_s_8 173.342209 86681088.0 21670272
Initial mem: 11630608384
-37748736
-37748736
-37748736
-37748736
-37748736
-37748736
-37748736
-37748736
-37748736
-37748736
vit_b_32 35.418749999999996 389839360.0 88224232
Initial mem: 11592859648
85983232
85983232
85983232
85983232
85983232
85983232
85983232
85983232
85983232
85983232
vgg11 31.240707999999998 544855040.0 132863336
Initial mem: 11678842880
-12582912
-12582912
-12582912
-12582912
-12582912
-12582912
-12582912
-12582912
-12582912
-12582912
vgg11_bn 34.855790999999996 544902400.0 132868840
Initial mem: 11666259968
60817408
60817408
60817408
60817408
60817408
60817408
60817408
60817408
60817408
60817408
resnet18 11.099375 47971072.0 11689512
Initial mem: 11727077376
-205520896
-205520896
-205520896
-205520896
-205520896
-205520896
-205520896
-205520896
-205520896
-205520896
densenet121 54.0075 35158016.0 79788

In [42]:

time = measure_runtime_per_forward(models.resnet18().to('mps'), no_grad=True)
print(f"Model {name} achieves runtime of {time}")

Model mobilenet_v3_small achieves runtime of 82.28845799999999
