In [1]:
import os
import pandas as pd
import torch
import cv2
from torch.utils.data import DataLoader, Dataset
from dataloader import custom_collate, PicklebotDataset
import time
from tqdm import tqdm
import numpy as np

dtype = torch.bfloat16

annotations_file = '/home/henry/Documents/PythonProjects/picklebot_2m/picklebot_130k_val.csv'
video_paths = '/home/henry/Documents/PythonProjects/picklebot_2m/picklebot_130k_all_together'
batch_size = 16

class PicklebotDataset(Dataset):
    def __init__(self, annotations_file, video_dir, transform=None, target_transform=None, dtype=torch.bfloat16, backend='opencv'):
        self.video_labels = pd.read_csv(annotations_file, engine='pyarrow', encoding='ISO-8859-1')
        self.video_dir = video_dir
        self.transform = transform
        self.target_transform = target_transform
        self.dtype = dtype
        self.backend = backend

    def __len__(self):
        return self.video_labels.shape[0]
        
    def __getitem__(self, idx):
        video_path = os.path.join(self.video_dir, self.video_labels['filename'][idx])

        if self.backend == 'newcv':
            cap = cv2.VideoCapture(video_path)
            cap.set(cv2.CAP_PROP_BUFFERSIZE, 1024)

            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))

            frames = np.empty((frame_count, frame_height, frame_width,3), dtype=np.uint8) #channel last so we only have to permute once at the end

            for i in range(frame_count):
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames[i] = frame

            cap.release()
            video = torch.from_numpy(frames).permute(0,3,1,2).to(self.dtype) / 255

        elif self.backend == 'opencv':
            cap = cv2.VideoCapture(video_path)
            frames = []
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(frame)
            cap.release()
            video = torch.stack(frames).to('cuda').permute(0,3,1,2).to(self.dtype)/255

        label = self.video_labels["zone"][idx]
        if self.transform:
            video = self.transform(video)
        if self.target_transform:
            label = self.target_transform(label)
        return video, label




for be in ['newcv','opencv']:
    #start the timer
    start = time.time()    
    dataset = PicklebotDataset(annotations_file,video_paths,dtype=dtype,backend=be)
    loader = DataLoader(dataset, batch_size=batch_size,shuffle=False,collate_fn=custom_collate,num_workers=8,pin_memory=True)


    for i in tqdm(loader):
        pass
    print(f'time for {be}:',time.time()-start)
        

  6%|▋         | 52/811 [00:34<08:18,  1.52it/s] 


KeyboardInterrupt: 

In [2]:
import torch
import time

# Create a large tensor
tensor_cpu = torch.rand(50000, 50000)
tensor_gpu = tensor_cpu.to('cuda')

# Perform pointwise operation on CPU
start_cpu = time.time()
result_cpu = tensor_cpu / 255
end_cpu = time.time()
cpu_time = end_cpu - start_cpu

# Perform pointwise operation on GPU
start_gpu = time.time()
result_gpu = tensor_gpu / 255
end_gpu = time.time()
gpu_time = end_gpu - start_gpu

print(f"CPU time: {cpu_time:.5f} seconds")
print(f"GPU time: {gpu_time:.5f} seconds")

CPU time: 0.54661 seconds
GPU time: 0.00087 seconds


In [9]:
#for numpy.permute, is it faster to do a ton of small permutes or one big one at the end?
#we test it here:
import numpy as np
import time

a = np.random.rand(1000,224,224,3)
b = np.empty((1000,3,224,224))
start = time.time()
for i in range(len(a)):
    b[i] = a[i].transpose(2,0,1)
print('time for 100 small permutes:',time.time()-start)

a = np.random.rand(1000,224,224,3)
b = np.empty((1000,224,224,3))
start = time.time()
for i in range(1000):
    b[i] = a[i]
b = b.transpose(0,2,3,1)
print('time for one big permute:',time.time()-start)

time for 100 small permutes: 0.2436676025390625
time for one big permute: 0.19919157028198242


In [None]:
#all batch_size=16
#num_workers = 4
# 100%|██████████| 6483/6483 [1:02:53<00:00,  1.72it/s]time for opencv: 3773.560480117798

#num_workers = 6



# num_workers = 8,

# 100%|██████████| 6483/6483 [45:50<00:00,  2.36it/s]  
# time for torchvision: 2751.082986831665
# 100%|██████████| 6483/6483 [39:01<00:00,  2.77it/s] 
# time for opencv: 2341.418078660965

# num_workers = 12 (yes, opencv is practically the same!)
# 100%|██████████| 6483/6483 [39:31<00:00,  2.73it/s] 
# time for torchvision: 2372.112513780594
# 100%|██████████| 6483/6483 [39:01<00:00,  2.77it/s] 
# time for opencv: 2341.3904235363007

In [10]:
from transformers import MobileViTV2Config, MobileViTV2Model
import torch

# Initializing a mobilevitv2-small style configuration
configuration = MobileViTV2Config()

# Initializing a model from the mobilevitv2-small style configuration
model = MobileViTV2Model(configuration)

# Accessing the model configuration
configuration = model.config
print(sum(p.numel() for p in model.parameters() if p.requires_grad))

4388841


In [1]:
from mobilevit import MobileViTV2
import torch
ownmodel = MobileViTV2(num_classes=512)
A = torch.rand(2, 3, 256, 256)
print(ownmodel(A).shape)
print(sum(p.numel() for p in ownmodel.parameters()))



torch.Size([2, 512])
9471273


In [12]:
model

MobileViTV2Model(
  (conv_stem): MobileViTV2ConvLayer(
    (convolution): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (normalization): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (activation): SiLU()
  )
  (encoder): MobileViTV2Encoder(
    (layer): ModuleList(
      (0): MobileViTV2MobileNetLayer(
        (layer): ModuleList(
          (0): MobileViTV2InvertedResidual(
            (expand_1x1): MobileViTV2ConvLayer(
              (convolution): Conv2d(32, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (normalization): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (activation): SiLU()
            )
            (conv_3x3): MobileViTV2ConvLayer(
              (convolution): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
              (normalization): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, tr

In [14]:
from datasets import load_dataset
dataset = load_dataset("imagenet-1k", split="train",trust_remote_code=True)

Loading dataset shards:   0%|          | 0/257 [00:00<?, ?it/s]

In [10]:
from nvidia.dali.plugin.pytorch import DALIClassificationIterator, LastBatchPolicy
from nvidia.dali import pipeline_def
import nvidia.dali.fn as fn
import os
import torch
#information for the dali pipeline
sequence_length = 130 #longest videos in our dataset 
initial_prefetch_size = 20
batch_size = 16

@pipeline_def
def dali_video_pipeline(filenames, sequence_length, initial_prefetch_size,mean,std):
    videos, labels = fn.experimental.readers.video(device="cpu", filenames=filenames, sequence_length=sequence_length,
                              shard_id=0, num_shards=1, random_shuffle=False, initial_fill=initial_prefetch_size)
    videos = fn.normalize(videos,mean=mean,stddev=std)
    return videos, labels

#video paths
train_video_paths = '/home/henry/Documents/PythonProjects/picklebotdataset/train_all_together'
val_video_paths = '/home/henry/Documents/PythonProjects/picklebotdataset/val_all_together'
train_files = [f"{train_video_paths}/{video}" for video in os.listdir(train_video_paths)]
val_files = [f"{val_video_paths}/{video}" for video in os.listdir(val_video_paths)]


num_train_videos = len(train_files)
num_val_videos = len(val_files)
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
#multiply mean and val by 255 to convert to 0-255 range
mean = (torch.tensor(mean)*255)[None,None,None,:]
std = (torch.tensor(std)*255)[None,None,None,:]

print("Building DALI pipelines...")

#build our pipelines
train_pipe = dali_video_pipeline(batch_size=batch_size, num_threads=24, device_id=None, filenames=train_files,sequence_length=sequence_length,initial_prefetch_size=initial_prefetch_size,mean=mean*255,std=std*255)
# val_pipe = dali_video_pipeline(batch_size=batch_size, num_threads=24, device_id=None, filenames=val_files, sequence_length=sequence_length,initial_prefetch_size=initial_prefetch_size,mean=mean,std=std)

train_pipe.build()
# val_pipe.build()


train_loader = DALIClassificationIterator(train_pipe, auto_reset=True,last_batch_policy=LastBatchPolicy.PARTIAL, size=num_train_videos)
val_loader = DALIClassificationIterator(val_pipe, auto_reset=True,last_batch_policy=LastBatchPolicy.PARTIAL, size=num_val_videos)

In [1]:
from nvidia.dali.plugin.pytorch import DALIClassificationIterator, LastBatchPolicy
from nvidia.dali import pipeline_def
import nvidia.dali.types as types
import nvidia.dali.fn as fn
import os
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import time
from tqdm import tqdm
import numpy as np

def custom_collate(batch, memory_format):
    """Based on fast_collate from the APEX example
       https://github.com/NVIDIA/apex/blob/5b5d41034b506591a316c308c3d2cd14d5187e23/examples/imagenet/main_amp.py#L265
    """
    imgs = [img[0] for img in batch]
    targets = torch.tensor([target[1] for target in batch], dtype=torch.int64)
    w = imgs[0].size[0]
    h = imgs[0].size[1]
    tensor = torch.zeros( (len(imgs), 3, h, w), dtype=torch.uint8).contiguous(memory_format=memory_format)
    for i, img in enumerate(imgs):
        nump_array = np.asarray(img, dtype=np.uint8)
        if(nump_array.ndim < 3):
            nump_array = np.expand_dims(nump_array, axis=-1)
        nump_array = np.rollaxis(nump_array, 2)
        tensor[i] += torch.from_numpy(nump_array)
    return tensor, targets


@pipeline_def
def create_dali_pipeline(data_dir, crop, size, shard_id, num_shards, dali_cpu=True, is_training=True):
    images, labels = fn.readers.file(file_root=data_dir,
                                     shard_id=shard_id,
                                     num_shards=num_shards,
                                     random_shuffle=is_training,
                                     pad_last_batch=True,
                                     name="Reader")
    dali_device = 'cpu' if dali_cpu else 'gpu'
    decoder_device = 'cpu' if dali_cpu else 'mixed'
    # ask nvJPEG to preallocate memory for the biggest sample in ImageNet for CPU and GPU to avoid reallocations in runtime
    device_memory_padding = 211025920 if decoder_device == 'mixed' else 0
    host_memory_padding = 140544512 if decoder_device == 'mixed' else 0
    # ask HW NVJPEG to allocate memory ahead for the biggest image in the data set to avoid reallocations in runtime
    preallocate_width_hint = 5980 if decoder_device == 'mixed' else 0
    preallocate_height_hint = 6430 if decoder_device == 'mixed' else 0
    if is_training:
        images = fn.decoders.image_random_crop(images,
                                               device=decoder_device, output_type=types.RGB,
                                               device_memory_padding=device_memory_padding,
                                               host_memory_padding=host_memory_padding,
                                               preallocate_width_hint=preallocate_width_hint,
                                               preallocate_height_hint=preallocate_height_hint,
                                               random_aspect_ratio=[0.8, 1.25],
                                               random_area=[0.1, 1.0],
                                               num_attempts=100)
        images = fn.resize(images,
                           device=dali_device,
                           resize_x=crop,
                           resize_y=crop,
                           interp_type=types.INTERP_TRIANGULAR)
        mirror = fn.random.coin_flip(probability=0.5)
    else:
        images = fn.decoders.image(images,
                                   device=decoder_device,
                                   output_type=types.RGB)
        images = fn.resize(images,
                           device=dali_device,
                           size=size,
                           mode="not_smaller",
                           interp_type=types.INTERP_TRIANGULAR)
        mirror = False

    images = fn.crop_mirror_normalize(images,
                                      dtype=types.FLOAT,
                                      output_layout="CHW",
                                      crop=(crop, crop),
                                      mean=[0.485 * 255,0.456 * 255,0.406 * 255],
                                      std=[0.229 * 255,0.224 * 255,0.225 * 255],
                                      mirror=mirror)
    labels = labels
    return images, labels


crop_size = 256
val_size = 256
batch_size = 256
num_threads = 24
device_id = None
dali_cpu = True

traindir = '/home/henry/Documents/imagenet/train/'
torch_dataset = datasets.ImageFolder(traindir, transforms.Compose([transforms.RandomResizedCrop(crop_size),transforms.RandomHorizontalFlip()]))
torch_loader = torch.utils.data.DataLoader(torch_dataset,
                                            batch_size=batch_size,
                                            shuffle=False,
                                            num_workers=num_threads,
                                            pin_memory=True,
                                            collate_fn= lambda b: custom_collate(b,torch.channels_last))
train_loader = None
val_loader = None
train_pipe = create_dali_pipeline(batch_size=batch_size,
                                    num_threads=num_threads,
                                    device_id=device_id,
                                    seed=-1,
                                    data_dir=traindir,
                                    crop=crop_size,
                                    size=val_size,
                                    dali_cpu=dali_cpu,
                                    shard_id=0,
                                    num_shards=1,
                                    is_training=True)
train_pipe.build()
train_loader = DALIClassificationIterator(train_pipe, reader_name="Reader",
                                            last_batch_policy=LastBatchPolicy.PARTIAL,
                                            auto_reset=True)

#start the timer
start = time.time()
for data in tqdm(train_loader):
    pass
print('dali time:',time.time()-start)

#start the timer
start = time.time()
for data in tqdm(torch_loader):
    pass
print('torch time:',time.time()-start)

100%|██████████| 5005/5005 [12:06<00:00,  6.89it/s]


dali time: 726.4026355743408


  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(nump_array)
  tensor[i] += torch.from_numpy(

torch time: 254.73878645896912





In [1]:
import torch
from nvidia.dali import pipeline_def
from nvidia.dali import fn
from nvidia.dali.plugin.pytorch import DALIClassificationIterator, LastBatchPolicy
from psutil import cpu_count
import pandas as pd
#information for the dali pipeline
sequence_length = 130 #longest videos in our dataset 
initial_prefetch_size = 1 
batch_size = 16

#video paths
video_paths = '/home/henry/Documents/PythonProjects/picklebot_2m/picklebot_130k_all_together'
train_df = pd.read_csv('/home/henry/Documents/PythonProjects/picklebot_2m/picklebot_130k_train.csv', engine='pyarrow', encoding='ISO-8859-1')
val_df = pd.read_csv('/home/henry/Documents/PythonProjects/picklebot_2m/picklebot_130k_val.csv', engine='pyarrow', encoding='ISO-8859-1')
train_files = [f"{video_paths}/{video}" for video in train_df['filename']]
val_files = [f"{video_paths}/{video}" for video in val_df['filename']]
train_labels = train_df['zone'].to_list()
val_labels = val_df['zone'].to_list()


num_train_videos = len(train_files)
num_val_videos = len(val_files)

#multiply mean and val by 255 to convert to 0-255 range
mean = [0.485, 0.456, 0.406] 
std = [0.229, 0.224, 0.225]
mean = (torch.tensor(mean)*255)[None,None,None,:]
std = (torch.tensor(std)*255)[None,None,None,:]

@pipeline_def
def dali_video_pipeline(filenames, labels,sequence_length, initial_prefetch_size,mean,std):
    videos, labels = fn.experimental.readers.video(device="cpu", filenames=filenames, sequence_length=sequence_length, labels=labels,
                              random_shuffle=False, initial_fill=initial_prefetch_size)
    videos = fn.normalize(videos,mean=mean,stddev=std)
    return videos, labels



print("Building DALI pipelines...")

#build our pipelines
train_pipe = dali_video_pipeline(batch_size=batch_size, num_threads=cpu_count()//2, device_id=None, filenames=train_files, labels=train_labels,
                            sequence_length=sequence_length,initial_prefetch_size=initial_prefetch_size,mean=mean*255,std=std*255)
val_pipe = dali_video_pipeline(batch_size=batch_size, num_threads=cpu_count()//2, device_id=None, filenames=val_files, labels=val_labels,
                        sequence_length=sequence_length,initial_prefetch_size=initial_prefetch_size,mean=mean,std=std)

train_pipe.build()
val_pipe.build()


train_loader = DALIClassificationIterator(train_pipe, auto_reset=True,last_batch_policy=LastBatchPolicy.PARTIAL, size=num_train_videos)
val_loader = DALIClassificationIterator(val_pipe, auto_reset=True,last_batch_policy=LastBatchPolicy.PARTIAL, size=num_val_videos)


for i in train_loader:
    print(i.shape)

In [1]:
import os
import numpy as np

from nvidia.dali import pipeline_def
import nvidia.dali.fn as fn
import nvidia.dali.types as types

batch_size=2
sequence_length=8
initial_prefetch_size=16
video_directory = '/home/henry/Documents/PythonProjects/picklebot_2m/picklebot_130k_all_together'
video_files=[video_directory + '/' + f for f in os.listdir(video_directory)]
n_iter=6


@pipeline_def
def video_pipe(filenames):
    videos = fn.experimental.readers.video(device="cpu", filenames=filenames, sequence_length=sequence_length,
                              shard_id=None, num_shards=1, random_shuffle=True, initial_fill=initial_prefetch_size)
    return videos


pipe = video_pipe(batch_size=batch_size, num_threads=2, device_id=None, filenames=video_files, seed=123456)
pipe.build()
print('about to start loop')
for i in range(n_iter):
    pipe_out = pipe.run()
    sequences_out = pipe_out[0]
    print(sequences_out.shape)
