In [None]:
import os
import time

import torch
from random import shuffle
import numpy as np
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from PIL import Image
from functools import partial
from nvidia.dali.backend import TensorListGPU
from nvidia.dali import pipeline_def, Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
import nvidia.dali.fn as fn

import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt

import ctypes
from ctypes import CFUNCTYPE, POINTER, c_int, c_void_p, byref, pointer
import xnvme.ctypes_bindings as xnvme
from xnvme.ctypes_bindings.api import char_pointer_cast


# Root directory of the dataset; ref_pipeline reads from its "train" subdirectory.
DATADIR = "/data"
# Passed as batch_size to every pipeline and to the external-source iterators.
BATCH_SIZE = 4 # batch size per GPU

In [None]:
def buf_and_view(dev, buffer_size):
    """
    Returns a buffer of the given 'buffer_size' along with a numpy-view

    The view covers the entire buffer and is typed as uint8. It aliases the
    buffer memory (no copy is made), so it must not be used after the buffer
    has been released. The buffer is zero-filled via the view. Thus, the
    pages backing the buffer should be allocated.

    Args:
        dev: Handle passed through to ``xnvme.xnvme_buf_alloc``.
        buffer_size: Number of bytes to allocate.

    Returns:
        Tuple ``(buf, view)``: the handle returned by
        ``xnvme.xnvme_buf_alloc`` and a one-dimensional ``uint8`` array of
        length ``buffer_size`` backed by that memory.

    Raises:
        MemoryError: If ``xnvme.xnvme_buf_alloc`` returns a NULL handle.
    """

    buf = xnvme.xnvme_buf_alloc(dev, buffer_size)
    if not buf:
        # Without this check the NULL handle would be wrapped in a view and
        # the zero-fill below would write through it.
        raise MemoryError(
            "xnvme_buf_alloc() failed to allocate {} bytes".format(buffer_size)
        )

    view = np.ctypeslib.as_array(
        ctypes.cast(buf, ctypes.POINTER(ctypes.c_uint8)),
        shape=(buffer_size,),
    )
    view[:] = 0  # Zero memory and force page allocation

    return buf, view

In [None]:
#https://docs.nvidia.com/deeplearning/dali/user-guide/docs/examples/image_processing/decoder_examples.html
def show_images(image_batch):
    """
    Plot every image of 'image_batch' on a grid that is four columns wide.

    The number of rows is derived from the batch itself, so a batch whose
    size is not a multiple of four still has all of its images shown, and a
    batch smaller than four gets a single row.

    Args:
        image_batch: Batch object supporting ``len()`` and ``.at(index)``,
            where ``.at(index)`` yields something ``plt.imshow`` accepts.

    Returns:
        None. Nothing is plotted for an empty batch.
    """
    columns = 4
    count = len(image_batch)
    if count == 0:
        # GridSpec rejects a zero-row grid; there is nothing to draw anyway.
        return

    rows = -(-count // columns)  # ceiling division
    plt.figure(figsize=(32, (32 // columns) * rows))
    gs = gridspec.GridSpec(rows, columns)
    for j in range(count):
        plt.subplot(gs[j])
        plt.axis("off")
        plt.imshow(image_batch.at(j))

def show_pipeline_output(pipe):
    """
    Build 'pipe', run it once and plot the first of its two outputs.

    When the first output is a TensorListGPU it is converted with
    ``as_cpu()`` before being handed to ``show_images``; the second output
    is discarded.
    """
    pipe.build()
    first_output, _second_output = pipe.run()
    on_gpu = isinstance(first_output, TensorListGPU)
    show_images(first_output.as_cpu() if on_gpu else first_output)

In [None]:
@pipeline_def
def ref_pipeline(crop=224):
    """
    Reference pipeline: read files with fn.readers.file and decode them.

    Files are read from the "train" directory under DATADIR as a single
    shard with shuffling enabled, then decoded to RGB by fn.decoders.image
    on the "mixed" device. 'crop' is accepted but not used by this graph.

    Returns the decoded images and the labels produced by the reader.
    """
    # We are interested in replacing this call with an external source
    reader_options = dict(
        file_root=os.path.join(DATADIR, "train"),
        shard_id=0,
        num_shards=1,
        random_shuffle=True,
        pad_last_batch=True,
    )
    encoded, labels = fn.readers.file(**reader_options)

    decoded = fn.decoders.image(
        encoded,
        device="mixed",
        output_type=types.RGB,
        device_memory_padding=211025920,
        host_memory_padding=140544512,
    )
    return decoded, labels

In [None]:
# Based on https://github.com/NVIDIA/DALI/blob/main/docs/examples/frameworks/pytorch/pytorch-external_input.ipynb
class FileInputIterator(object):
    """
    fn.readers.file reimplemented as an external source.

    Every sub-directory of 'images_dir' is one class; its label is the
    directory's position in the sorted list of sub-directories. Only regular
    files located directly inside a class directory are used. Entries that
    are not directories at the top level (stray files) neither receive a
    label nor shift the labels of the real classes, and nested directories
    inside a class directory are ignored.

    Args:
        batch_size: Number of samples returned by each ``__next__`` call.
        device_id: Index of the shard this iterator serves.
        num_gpus: Total number of shards (world size).
        images_dir: Root directory holding one sub-directory per class.

    Iterating yields ``(batch, labels)`` where ``batch`` is a list of uint8
    arrays with the raw file contents and ``labels`` a list of one-element
    int32 tensors. The last batch of an epoch is padded by wrapping around to
    the start of the (shuffled) shard.
    """

    def __init__(self, batch_size, device_id, num_gpus, images_dir="/data/train/"):
        self.images_dir = images_dir
        self.batch_size = batch_size
        self.files = self._list_files(self.images_dir)
        # whole data set size
        self.data_set_len = len(self.files)
        # based on the device_id and total number of GPUs - world size
        # get proper shard
        first = self.data_set_len * device_id // num_gpus
        last = self.data_set_len * (device_id + 1) // num_gpus
        self.files = self.files[first:last]
        self.n = len(self.files)
        # Defined up front so next() before iter() does not fail on a missing attribute.
        self.i = 0

    @staticmethod
    def _list_files(images_dir):
        """Return a sorted list of (path, label) for the files of every class directory."""
        class_names = sorted(
            name
            for name in os.listdir(images_dir)
            if os.path.isdir(os.path.join(images_dir, name))
        )
        found = []
        for label, name in enumerate(class_names):
            class_dir = os.path.join(images_dir, name)
            # Sorted listing keeps the shard boundaries independent of the
            # order in which the file system happens to return entries.
            for entry in sorted(os.listdir(class_dir)):
                path = os.path.join(class_dir, entry)
                if os.path.isfile(path):
                    found.append((path, label))
        return found

    def __iter__(self):
        """Start a new epoch: rewind the cursor and reshuffle the shard in place."""
        self.i = 0
        shuffle(self.files)
        return self

    def __next__(self):
        """Return the next ``(batch, labels)`` pair or raise StopIteration at epoch end."""
        if self.i >= self.n:
            # Re-arm for the next epoch before signalling exhaustion.
            self.__iter__()
            raise StopIteration

        batch = []
        labels = []
        for _ in range(self.batch_size):
            # The modulo pads the final batch with samples from the shard start.
            jpeg, label = self.files[self.i % self.n]

            batch.append(np.fromfile(jpeg, dtype=np.uint8))
            labels.append(torch.tensor([label], dtype=torch.int32))
            self.i += 1
        return (batch, labels)

    def __len__(self):
        """Size of the whole data set (before sharding)."""
        return self.data_set_len

    next = __next__
    
@pipeline_def
def file_pipeline(crop=224):
    """
    Same decode graph as the reference pipeline, fed by FileInputIterator.

    The iterator is created for shard 0 of 1 with BATCH_SIZE samples per
    batch and supplies two outputs (uint8 encoded data, int32 labels) through
    fn.external_source. 'crop' is accepted but not used by this graph.

    Returns the decoded images and the labels.
    """
    jpegs, labels = fn.external_source(
        source=FileInputIterator(BATCH_SIZE, 0, 1),
        num_outputs=2,
        dtype=[types.UINT8, types.INT32],
    )
    images = fn.decoders.image(jpegs,
        device="mixed",
        output_type=types.RGB,
        device_memory_padding=211025920,
        host_memory_padding=140544512,
    )
    return images, labels

In [None]:
class XNVMEFileInputIterator(object):
    """
    External source that reads the image files through the xNVMe file API.

    Every sub-directory of 'images_dir' is one class; its label is the
    directory's position in the sorted list of sub-directories. Only regular
    files located directly inside a class directory are used. Entries that
    are not directories at the top level (stray files) neither receive a
    label nor shift the labels of the real classes, and nested directories
    inside a class directory are ignored.

    Args:
        batch_size: Number of samples returned by each ``__next__`` call.
        device_id: Index of the shard this iterator serves.
        num_gpus: Total number of shards (world size).
        images_dir: Root directory holding one sub-directory per class.

    Iterating yields ``(batch, labels)`` where ``batch`` is a list of uint8
    arrays with the raw file contents and ``labels`` a list of one-element
    int32 tensors. The last batch of an epoch is padded by wrapping around to
    the start of the (shuffled) shard.
    """

    def __init__(self, batch_size, device_id, num_gpus, images_dir="/data/train/"):
        self.images_dir = images_dir
        self.batch_size = batch_size
        self.files = self._list_files(self.images_dir)
        # whole data set size
        self.data_set_len = len(self.files)
        # based on the device_id and total number of GPUs - world size
        # get proper shard
        first = self.data_set_len * device_id // num_gpus
        last = self.data_set_len * (device_id + 1) // num_gpus
        self.files = self.files[first:last]
        self.n = len(self.files)
        # Defined up front so next() before iter() does not fail on a missing attribute.
        self.i = 0

    @staticmethod
    def _list_files(images_dir):
        """Return a sorted list of (path, label) for the files of every class directory."""
        class_names = sorted(
            name
            for name in os.listdir(images_dir)
            if os.path.isdir(os.path.join(images_dir, name))
        )
        found = []
        for label, name in enumerate(class_names):
            class_dir = os.path.join(images_dir, name)
            # Sorted listing keeps the shard boundaries independent of the
            # order in which the file system happens to return entries.
            for entry in sorted(os.listdir(class_dir)):
                path = os.path.join(class_dir, entry)
                if os.path.isfile(path):
                    found.append((path, label))
        return found

    @staticmethod
    def _read_file(path, opts):
        """
        Read the whole file at 'path' via xNVMe and return it as a uint8 array.

        The returned array is an independent copy; the xNVMe buffer and file
        handle are released before returning, also when a step fails.

        Raises:
            OSError: If the file cannot be opened, the read reports an
                error, or fewer bytes than expected are returned.
        """
        dev = xnvme.xnvme_file_open(char_pointer_cast(path), ctypes.byref(opts))
        if not dev:
            raise OSError("xnvme_file_open() failed for {!r}".format(path))
        try:
            # Size in bytes as reported by the geometry of the opened file.
            size = xnvme.xnvme_dev_get_geo(dev).contents.tbytes
            buf, view = buf_and_view(dev, size)
            try:
                ctx = xnvme.xnvme_file_get_cmd_ctx(dev)
                err = xnvme.xnvme_file_pread(ctypes.byref(ctx), buf, size, 0)
                if err:
                    raise OSError(
                        "xnvme_file_pread() failed with {} for {!r}".format(err, path)
                    )
                if ctx.cpl.result != size:
                    raise OSError(
                        "short read for {!r}: expected {} bytes, got {}".format(
                            path, size, ctx.cpl.result
                        )
                    )
                # Copy out before the buffer backing 'view' is freed below.
                return np.copy(view)
            finally:
                xnvme.xnvme_buf_free(dev, buf)
        finally:
            xnvme.xnvme_file_close(dev)

    def __iter__(self):
        """Start a new epoch: rewind the cursor and reshuffle the shard in place."""
        self.i = 0
        shuffle(self.files)
        return self

    def __next__(self):
        """Return the next ``(batch, labels)`` pair or raise StopIteration at epoch end."""
        if self.i >= self.n:
            # Re-arm for the next epoch before signalling exhaustion.
            self.__iter__()
            raise StopIteration

        opts = xnvme.xnvme_opts()
        xnvme.xnvme_opts_set_defaults(ctypes.byref(opts))

        batch = []
        labels = []
        for _ in range(self.batch_size):
            # The modulo pads the final batch with samples from the shard start.
            jpeg, label = self.files[self.i % self.n]

            batch.append(self._read_file(jpeg, opts))
            labels.append(torch.tensor([label], dtype=torch.int32))
            self.i += 1

        return (batch, labels)

    def __len__(self):
        """Size of the whole data set (before sharding)."""
        return self.data_set_len

    next = __next__
    
@pipeline_def
def xnvme_file_pipeline(crop=224):
    """
    Same decode graph as the reference pipeline, fed by XNVMEFileInputIterator.

    The iterator is created for shard 0 of 1 with BATCH_SIZE samples per
    batch and supplies two outputs (uint8 encoded data, int32 labels) through
    fn.external_source. 'crop' is accepted but not used by this graph.

    Returns the decoded images and the labels.
    """
    jpegs, labels = fn.external_source(
        source=XNVMEFileInputIterator(BATCH_SIZE, 0, 1),
        num_outputs=2, dtype=[types.UINT8, types.INT32],
    )
    images = fn.decoders.image(jpegs,
        device="mixed",
        output_type=types.RGB,
        device_memory_padding=211025920,
        host_memory_padding=140544512,
    )
    return images, labels

In [None]:
class XNVMEBlockInputIterator(object):
    """
    Placeholder for an external source built on the xNVMe block API.

    This will be implemented using the xNVMe API with the
    filename -> block mapping. For now no method does any work: the
    constructor stores nothing and every other method returns None.
    """

    def __init__(self, batch_size, device_id, num_gpus):
        """Accept the iterator arguments; nothing is stored yet."""

    def __iter__(self):
        """Not implemented yet; returns None."""
        return None

    def __next__(self):
        """Not implemented yet; returns None."""
        return None

    def __len__(self):
        """Not implemented yet; returns None."""
        return None

    next = __next__
    
@pipeline_def
def xnvme_block_pipeline(crop=224):
    """
    Same decode graph as the reference pipeline, fed by XNVMEBlockInputIterator.

    The iterator is created for shard 0 of 1 with BATCH_SIZE samples per
    batch and is declared to supply two outputs (uint8 encoded data, int32
    labels) through fn.external_source. 'crop' is accepted but not used by
    this graph.

    Returns the decoded images and the labels.
    """
    jpegs, labels = fn.external_source(
        source=XNVMEBlockInputIterator(BATCH_SIZE, 0, 1),
        num_outputs=2,
        dtype=[types.UINT8, types.INT32],
    )
    images = fn.decoders.image(jpegs,
        device="mixed",
        output_type=types.RGB,
        device_memory_padding=211025920,
        host_memory_padding=140544512,
    )
    return images, labels

In [None]:
# Each measurement brackets show_pipeline_output(), i.e. it covers building
# the pipeline, running one batch and plotting it. time.perf_counter() is
# used because it is monotonic and meant for measuring elapsed intervals.

# The reference using DALI fn.readers.file
ref_pipe = ref_pipeline(batch_size=BATCH_SIZE, num_threads=2, device_id=0)
ref_start = time.perf_counter()
show_pipeline_output(ref_pipe)
ref_end = time.perf_counter()


# Replacing fn.readers.file with an external source
file_pipe = file_pipeline(batch_size=BATCH_SIZE, num_threads=2, device_id=0)
file_start = time.perf_counter()
show_pipeline_output(file_pipe)
file_end = time.perf_counter()


# Using xNVMe file with the external source
xnvme_file_pipe = xnvme_file_pipeline(batch_size=BATCH_SIZE, num_threads=2, device_id=0)
xnvme_file_start = time.perf_counter()
show_pipeline_output(xnvme_file_pipe)
xnvme_file_end = time.perf_counter()

print("Reference:", ref_end - ref_start)
print("File:", file_end - file_start)
print("xNVMe File:", xnvme_file_end - xnvme_file_start)


# Using xNVMe block with the mapping layer
# xnvme_block_pipe = xnvme_block_pipeline(batch_size=BATCH_SIZE, num_threads=2, device_id=0)
# show_pipeline_output(xnvme_block_pipe)