In [2]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
import cv2
import torch.nn.functional as F
from torchvision.io import read_video
from torch.utils.data import DataLoader
from dataloader import custom_collate
import time
from tqdm import tqdm
import gc

dtype = torch.bfloat16
val_annotations_file = '/home/henry/Documents/PythonProjects/picklebot_2m/picklebot_130k_val.csv'
video_paths = '/home/henry/Documents/PythonProjects/picklebot_2m/picklebot_130k_all_together'
batch_size = 16


class cv2PicklebotDataset(Dataset):
    def __init__(self, annotations_file, video_dir, transform=None, target_transform=None, dtype=torch.bfloat16):
        self.video_labels = pd.read_csv(annotations_file, engine='pyarrow', encoding='ISO-8859-1')
        self.video_dir = video_dir
        self.transform = transform
        self.target_transform = target_transform
        self.dtype = dtype

    def __len__(self):
        return self.video_labels.shape[0]
        
    def __getitem__(self, idx):
        video_path = os.path.join(self.video_dir, self.video_labels['filename'][idx])
        cap = cv2.VideoCapture(video_path)
        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = torch.from_numpy(frame).permute(2, 0, 1).to(self.dtype) / 255
            frames.append(frame)
        cap.release()
        video = torch.stack(frames)
        label = self.video_labels["zone"][idx]
        if self.transform:
            video = self.transform(video)
        if self.target_transform:
            label = self.target_transform(label)
        return video, label
    
class PicklebotDataset(Dataset):
    def __init__(self, annotations_file, video_dir, transform=None,target_transform=None,dtype=torch.bfloat16):
        self.video_labels = pd.read_csv(annotations_file,engine='pyarrow',encoding='ISO-8859-1')
        self.video_dir = video_dir
        self.transform = transform
        self.target_transform = target_transform
        self.dtype = dtype

    def __len__(self):
        return self.video_labels.shape[0]
        
    def __getitem__(self,idx):
        video_path = os.path.join(self.video_dir, self.video_labels['filename'][idx])
        video = ((read_video(video_path,output_format="TCHW",pts_unit='sec')[0]).to(self.dtype))/255
        label = self.video_labels["zone"][idx]
        if self.transform:
            video = self.transform(video)
        if self.target_transform:
            label = self.target_transform(label)
        return video, label
    


cv2val_dataset = cv2PicklebotDataset(val_annotations_file,video_paths,dtype=dtype)
val_dataset = PicklebotDataset(val_annotations_file,video_paths,dtype=dtype)
cv2val_loader = DataLoader(cv2val_dataset, batch_size=batch_size,shuffle=False,collate_fn=custom_collate,num_workers=16,pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size,shuffle=False,collate_fn=custom_collate,num_workers=16,pin_memory=True)


#test how long it takes each loader to load the data, passing in the loop, and running both loaders twice
for i in range(2):
    print('cv2 loader')
    start = time.time()
    for batch in tqdm(cv2val_loader):
        pass
        batch = None
    print('time:',time.time()-start)
    
    gc.collect()
    print('old loader')
    start = time.time()
    for batch in tqdm(val_loader):
        pass
        batch = None
    
    print('time:',time.time()-start)
    gc.collect()

cv2 loader


100%|██████████| 811/811 [05:54<00:00,  2.28it/s] 


time: 354.9956748485565
old loader


  0%|          | 0/811 [00:00<?, ?it/s]

In [None]:
import os
import pandas as pd
import torch
import cv2
from torch.utils.data import DataLoader
from dataloader import custom_collate, PicklebotDataset
import time
from tqdm import tqdm

dtype = torch.bfloat16

annotations_file = '/home/henry/Documents/PythonProjects/picklebot_2m/picklebot_130k_train.csv'
video_paths = '/home/henry/Documents/PythonProjects/picklebot_2m/picklebot_130k_all_together'
batch_size = 16

for be in ['opencv']: #,'torchvision']:
    #start the timer
    start = time.time()    
    dataset = PicklebotDataset(annotations_file,video_paths,dtype=dtype,backend=be)
    loader = DataLoader(dataset, batch_size=batch_size,shuffle=False,collate_fn=custom_collate,num_workers=6,pin_memory=True)


    for i in tqdm(loader):
        pass
    print(f'time for {be}:',time.time()-start)
        

In [None]:
#all batch_size=16
#num_workers = 4
# 100%|██████████| 6483/6483 [1:02:53<00:00,  1.72it/s]time for opencv: 3773.560480117798

#num_workers = 6



# num_workers = 8,

# 100%|██████████| 6483/6483 [45:50<00:00,  2.36it/s]  
# time for torchvision: 2751.082986831665
# 100%|██████████| 6483/6483 [39:01<00:00,  2.77it/s] 
# time for opencv: 2341.418078660965

# num_workers = 12 (yes, opencv is practically the same!)
# 100%|██████████| 6483/6483 [39:31<00:00,  2.73it/s] 
# time for torchvision: 2372.112513780594
# 100%|██████████| 6483/6483 [39:01<00:00,  2.77it/s] 
# time for opencv: 2341.3904235363007



In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from mobilenet import Bottleneck2D, Bottleneck3D
from einops import rearrange
from einops.layers.torch import Reduce
from typing import Union, Tuple, List
from mobilevit import MobileViTBlock, conv_nxn_bn, LinearAttention, LinearFeedForward
init_dim = 16
expansion = 4
depths = (2,4,5)
kernel_size = 3
patch_size = (2,2)
use_linear_attention = False
image_size=(256,256)
dims=[96,120,144]
channels=[16, 32, 48, 48, 64, 64, 80, 80, 96, 96, 384]

conv1 = conv_nxn_bn(3, init_dim, stride=2) #3 channels to 16
bn1 = Bottleneck2D(channels[0],channels[1],expanded_channels=channels[0]*expansion,stride=1)
bn2 = Bottleneck2D(channels[1],channels[2],expanded_channels=channels[1]*expansion,stride=2)
bn3 = Bottleneck2D(channels[2],channels[3],expanded_channels=channels[2]*expansion,stride=1)
bn4 = Bottleneck2D(channels[2],channels[3],expanded_channels=channels[2]*expansion,stride=1)
bn5 = Bottleneck2D(channels[3],channels[4],expanded_channels=channels[3]*expansion,stride=2)
tf1 = MobileViTBlock(dims[0],depths[0],channels[5],kernel_size,patch_size, int(dims[0]*2),use_linear_attention=use_linear_attention)

ln1 = nn.GroupNorm(num_groups=1,num_channels=dims[0], eps=1e-05)
attn = LinearAttention(embed_dim=dims[0], dropout=0.)
dropout = nn.Dropout(p=0.)
ln2 = nn.GroupNorm(num_groups=1,num_channels=dims[0], eps=1e-05)
ffw = LinearFeedForward(embed_dim=dims[0], ffw_dim=dims[0], dropout=0.)




tf2 = MobileViTBlock(dims[0],depths[0],channels[5],kernel_size,patch_size, int(dims[0]*2),use_linear_attention=True)

A = torch.randn(2,3,256,256)

A = conv1(A)

A = bn1(A)
A = bn2(A)
A = bn3(A)
A = bn4(A)
A = bn5(A)
A1 = tf1(A)
B = A.clone()
print(A.shape)
A = ln1(A)
A = attn(A)
A = dropout(A)
A = A + B
A =ln2(A)
A = ffw(A)
A = A + B

conv1 shape: torch.Size([2, 64, 32, 32])
torch.Size([2, 64, 32, 32])


RuntimeError: Expected weight to be a vector of size equal to the number of channels in input, but got weight of shape [96] and input of shape [2, 64, 32, 32]

In [1]:
from mobilevitv2 import *
import torch

model = MobileViTV2()
A = torch.rand(2,3,256,256)
A = model(A)


shape after bottleneck 3.5: torch.Size([2, 128, 64, 64])
