In [None]:
"""
This notebook reports the speed for both LID estimation and density estimation
"""

In [1]:
%load_ext autoreload
%autoreload 2

# add the parent directory to the Python path
import sys
import os
sys.path.append(os.path.abspath("../.."))   # one level up from notebook/
print(sys.path)
from data.utils.dataloaders import get_imagenet_dataloader
import numpy as np
import torch
from flextok.flextok_wrapper import FlexTokFromHub
from flextok.utils.misc import detect_bf16_support, get_bf16_context, get_generator
from flextok.utils.demo import imgs_from_urls, denormalize, batch_to_pil
from LID.fokker_planck_estimator import RectifiedFlowLIDEstimator


device = 'cuda' if torch.cuda.is_available() else 'cpu'
import time

# Detect if bf16 is enabled or not
enable_bf16 = detect_bf16_support()
print('BF16 enabled:', enable_bf16)

['/BS/data_mani_compress/work/miniforge3/envs/dgm_geometry/lib/python311.zip', '/BS/data_mani_compress/work/miniforge3/envs/dgm_geometry/lib/python3.11', '/BS/data_mani_compress/work/miniforge3/envs/dgm_geometry/lib/python3.11/lib-dynload', '', '/BS/data_mani_compress/work/miniforge3/envs/dgm_geometry/lib/python3.11/site-packages', '/BS/data_mani_compress/work/thesis/thesis/external/guided-diffusion', '/BS/data_mani_compress/work/thesis/thesis/external/DenseFlow', '/BS/data_mani_compress/work/thesis/thesis']


  from .autonotebook import tqdm as notebook_tqdm


BF16 enabled: True


In [2]:
# Load a FlexTok d18-d28 model trained on DFN from HuggingFace Hub
flextok = FlexTokFromHub.from_pretrained('EPFL-VILAB/flextok_d18_d18_in1k').to(device)

In [5]:
flextok.count_params_and_size()

Module: vae | Params: 83,821,011 | Size: 319.75 MB (0.312 GB)
Module: encoder | Params: 287,326,086 | Size: 1096.06 MB (1.070 GB)
Module: decoder | Params: 578,587,456 | Size: 2207.14 MB (2.155 GB)
Module: regularizer | Params: 0 | Size: 0.00 MB (0.000 GB)
Module: flow_matching_noise_module | Params: 0 | Size: 0.00 MB (0.000 GB)
------------------------------------------------------------
TOTAL | Params: 949,734,553 | Size: 3622.95 MB (3.538 GB)


In [3]:
lid_estimator = RectifiedFlowLIDEstimator(ambient_dim=65536, model=flextok, device=device)

In [None]:

for p in flextok.parameters():
        p.requires_grad_(False)
flextok.eval()

batch_sizes_to_test = [4, 8, 16]
for batch_size in batch_sizes_to_test:

    # generate random torch of shape 4 256
    random_registers = torch.randn(batch_size, 256).to(device)
    random_input_images = torch.randn(batch_size, 3, 256, 256).to(device)

    token_ids_list = [
                torch.as_tensor(t[:1], dtype=torch.long, device=device).unsqueeze(0)
                for t in random_registers
            ]

    # warm up
    with torch.no_grad():
        for _ in range(10):
            with get_bf16_context(enable_bf16):
                _ = lid_estimator.estimate(random_input_images, t_hyper=0.36, hutchinson_samples=2, token_ids_list=token_ids_list)

    # timing
    torch.cuda.synchronize()  # IMPORTANT if using GPU
    start = time.time()

    with torch.no_grad():
        for _ in range(50):
            with get_bf16_context(enable_bf16):
                _ = lid_estimator.estimate(random_input_images, t_hyper=0.36, hutchinson_samples=2, token_ids_list=token_ids_list)

    torch.cuda.synchronize()
    end = time.time()

    avg_time = (end - start) / 50
    print(f"Average forward pass time per sample: {avg_time/batch_size:.6f} seconds for batch size {batch_size}")


Average forward pass time per sample: 0.269512 seconds for batch size 4


KeyboardInterrupt: 

: 

In [3]:
########################
# Density estimation speed test
########################

In [3]:

for p in flextok.parameters():
        p.requires_grad_(False)
flextok.eval()

batch_sizes_to_test = [4, 8, 16, 32]
for batch_size in batch_sizes_to_test:

    # generate random torch of shape 4 256
    random_registers = torch.randn(batch_size, 256).to(device)
    random_input_images = torch.randn(batch_size, 3, 256, 256).to(device)

    token_ids_list = [
                torch.as_tensor(t[:1], dtype=torch.long, device=device).unsqueeze(0)
                for t in random_registers
            ]

    # warm up
    for _ in range(10):
        with get_bf16_context(enable_bf16):
            _,_ = flextok.estimate_log_density(random_input_images, timesteps=25, hutchinson_samples=2, token_ids_list=token_ids_list)

    # timing
    torch.cuda.synchronize()  # IMPORTANT if using GPU
    start = time.time()

    for _ in range(50):
        with get_bf16_context(enable_bf16):
            _,_ = flextok.estimate_log_density(random_input_images, timesteps=25, hutchinson_samples=2, token_ids_list=token_ids_list)

    torch.cuda.synchronize()
    end = time.time()

    avg_time = (end - start) / 50
    print(f"Average forward pass time per sample: {avg_time/batch_size:.6f} seconds for batch size {batch_size}")

Average forward pass time per sample: 0.866637 seconds for batch size 4
Average forward pass time per sample: 0.818155 seconds for batch size 8


OutOfMemoryError: CUDA out of memory. Tried to allocate 108.00 MiB. GPU 0 has a total capacity of 23.68 GiB of which 141.50 MiB is free. Including non-PyTorch memory, this process has 22.55 GiB memory in use. Of the allocated memory 21.62 GiB is allocated by PyTorch, and 630.06 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)