In [4]:
import torch
from torch.utils.data import DataLoader
import numpy as np


# Four samples with three values each; the last column looks like a 0/1 label.
data = np.array(
    [
        (0.1, 7.4, 0),
        (-0.2, 5.3, 0),
        (0.2, 8.2, 1),
        (0.2, 7.7, 1),
    ]
)

# A plain NumPy array is accepted as a dataset: the loader indexes it row by
# row and the default collate function stacks the rows into a float64 tensor.
loader = DataLoader(dataset=data, batch_size=2, shuffle=False)

# Fetch only the first batch (rows 0 and 1) and display it.
batch = next(iter(loader))
batch

tensor([[ 0.1000,  7.4000,  0.0000],
        [-0.2000,  5.3000,  0.0000]], dtype=torch.float64)

In [5]:
# The same four samples as the array example, now stored as one dict per
# sample with named fields.
dict_data = [
    {'x1': 0.1, 'x2': 7.4, 'y': 0},
    {'x1': -0.2, 'x2': 5.3, 'y': 0},
    {'x1': 0.2, 'x2': 8.2, 'y': 1},
    # Label fixed from 10 to 1: the labels are binary and the matching row of
    # the array example is [0.2, 7.7, 1].
    {'x1': 0.2, 'x2': 7.7, 'y': 1},
]

# With dict samples the default collate function batches each key separately,
# returning a dict that maps every key to a tensor of batch_size values.
loader = DataLoader(dict_data, batch_size=2, shuffle=False)
batch = next(iter(loader))
batch

{'x1': tensor([ 0.1000, -0.2000], dtype=torch.float64),
 'x2': tensor([7.4000, 5.3000], dtype=torch.float64),
 'y': tensor([0, 0])}

In [6]:
# Token-id sequences of different lengths (6, 9, 7 and 3 ids), each with a label.
nlp_data = [
    {'tokenized_input': [1, 4, 5, 9, 3, 2], 'label':0},
    {'tokenized_input': [1, 7, 3, 14, 48, 7, 23, 154, 2], 'label':0},
    {'tokenized_input': [1, 30, 67, 117, 21, 15, 2], 'label':1},
    {'tokenized_input': [1, 17, 2], 'label':0},
]
loader = DataLoader(nlp_data, batch_size=2, shuffle=False)

# The default collate function cannot batch lists of unequal length, so
# fetching a batch raises. The error is the point of this cell, so it is caught
# and printed instead of aborting a Restart & Run All.
try:
    batch = next(iter(loader))
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: each element in list of batch should be of equal size

In [7]:
from torch.nn.utils.rnn import pad_sequence


def custom_collate(data):
    """Collate variable-length token sequences into one padded batch.

    Args:
        data: list of samples, each a dict with a 'tokenized_input' list of
            token ids and a 'label' value.

    Returns:
        dict with
            'tokenized_input': tensor of shape (batch, longest_sequence);
                shorter sequences are right-padded with 0 (the default
                padding value of ``pad_sequence``),
            'label': 1-D tensor holding one label per sample.
    """
    # First pass: turn every token list into its own 1-D tensor.
    token_seqs = []
    for sample in data:
        token_seqs.append(torch.tensor(sample['tokenized_input']))

    # Second pass: gather the labels in the same order.
    targets = [sample['label'] for sample in data]

    return {
        'tokenized_input': pad_sequence(token_seqs, batch_first=True),
        'label': torch.tensor(targets),
    }


# Same dataset as before, but batches are now assembled by custom_collate,
# which pads each batch to the length of its longest sequence.
loader = DataLoader(
    dataset=nlp_data,
    batch_size=2,
    shuffle=False,
    collate_fn=custom_collate,
)

# Draw both batches from one iterator so the second call continues where the
# first stopped, then print them one per line.
iter_loader = iter(loader)
batch1 = next(iter_loader)
batch2 = next(iter_loader)
print(batch1, batch2, sep="\n")

{'tokenized_input': tensor([[  1,   4,   5,   9,   3,   2,   0,   0,   0],
        [  1,   7,   3,  14,  48,   7,  23, 154,   2]]), 'label': tensor([0, 0])}
{'tokenized_input': tensor([[  1,  30,  67, 117,  21,  15,   2],
        [  1,  17,   2,   0,   0,   0,   0]]), 'label': tensor([1, 0])}


In [8]:
# Placeholder image of uniform random values with shape (100, 100, 3); every
# sample below refers to this same tensor.
img = torch.rand([100, 100, 3])

# Token ids are built with torch.tensor, which infers an integer (int64) dtype
# from the integer lists. The legacy torch.Tensor(...) constructor always
# produces float32, which does not match the integer ids used in the earlier
# collate example and cannot be used as embedding indices.
caption_data = [
    {'tokenized_input': torch.tensor([1, 4, 5, 9, 3, 2]), 'image': img},
    {'tokenized_input': torch.tensor([1, 7, 3, 14, 48, 7, 23, 154, 2]), 'image': img},
    {'tokenized_input': torch.tensor([1, 30, 67, 117, 21, 15, 2]), 'image': img},
    {'tokenized_input': torch.tensor([1, 17, 2]), 'image': img},
]

In [9]:
def collate_v2(batch):
    """Collate image/caption samples into an image batch and padded captions.

    Args:
        batch: list of samples, each a dict with an 'image' tensor and a 1-D
            'tokenized_input' tensor. All images must share one shape.

    Returns:
        Tuple ``(images, captions)``:
            images: the sample images stacked along a new leading batch axis.
            captions: the token sequences padded with 0 to the longest one,
                laid out as (longest_sequence, batch) because
                ``batch_first=False``.
    """
    # Stacking adds the batch axis in front, as unsqueeze(0) + cat would.
    images = torch.stack([sample['image'] for sample in batch], dim=0)

    # Captions differ in length, so they are padded rather than stacked.
    captions = pad_sequence(
        [sample['tokenized_input'] for sample in batch],
        batch_first=False,
    )
    return images, captions


# Batch the captioning samples with collate_v2, which returns a tuple of
# (stacked images, padded captions) instead of a dict.
loader = DataLoader(
    dataset=caption_data,
    batch_size=2,
    shuffle=False,
    collate_fn=collate_v2,
)

# Only the first batch (samples 0 and 1) is fetched and displayed.
batch1 = next(iter(loader))
batch1

(tensor([[[[0.1041, 0.2867, 0.3715],
           [0.4931, 0.8088, 0.2303],
           [0.3669, 0.2837, 0.0822],
           ...,
           [0.3211, 0.9675, 0.4290],
           [0.8574, 0.9144, 0.8274],
           [0.5688, 0.2943, 0.0994]],
 
          [[0.5970, 0.7365, 0.6103],
           [0.3946, 0.5017, 0.0118],
           [0.5527, 0.0566, 0.7331],
           ...,
           [0.7194, 0.5310, 0.6839],
           [0.4039, 0.2468, 0.8682],
           [0.4612, 0.5002, 0.9753]],
 
          [[0.6160, 0.0323, 0.6637],
           [0.1150, 0.9734, 0.4589],
           [0.6361, 0.9561, 0.1510],
           ...,
           [0.9925, 0.8388, 0.8565],
           [0.0661, 0.9182, 0.3989],
           [0.7344, 0.2131, 0.7458]],
 
          ...,
 
          [[0.1396, 0.2027, 0.4563],
           [0.7817, 0.1654, 0.7436],
           [0.6120, 0.3277, 0.9097],
           ...,
           [0.7917, 0.9720, 0.0761],
           [0.4049, 0.7703, 0.1730],
           [0.5654, 0.4673, 0.3991]],
 
          [[0.3920,