In [1]:
from torch.utils.data import IterableDataset, Dataset, SequentialSampler, Sampler, SubsetRandomSampler, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler, BatchSampler, SequentialSampler, RandomSampler
import numpy as np
import pandas as pd

### Dataset 
    - Map Style Dataset
    - Iterable Style dataset
### DataLoader
### Sampler

# Map Style Datasets
A dataset whose behaviour is controlled by `__getitem__()` and `__len__()`.

In [2]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs each sample with the label at the same index."""

    def __init__(self, data, labels):
        """Keep references to the sample and label containers."""
        self.data, self.labels = data, labels

    def __len__(self):
        """Number of samples, taken from ``data``."""
        return len(self.data)

    def __getitem__(self, ix):
        """Return the ``(sample, label)`` pair stored at position ``ix``."""
        sample = self.data[ix]
        label = self.labels[ix]
        return sample, label


In [3]:
# Toy data: features and labels are both 0..999, so sample i is the pair (i, i).
X, y = list(range(1000)), list(range(1000))
ds = CustomDataset(X, y)

# Indexing goes through __getitem__, len() through __len__.
print(ds[0], ds[1], len(ds), sep="\n")

(0, 0)
(1, 1)
1000


In [298]:
# No sampler or shuffle flag is passed, so the loader falls back to its default sampler.
dl = DataLoader(ds, batch_size=1)
print(dl.sampler)

# Fetch only the first batch rather than walking the whole dataset.
b = next(iter(dl))
print(b)

<torch.utils.data.sampler.SequentialSampler object at 0x128644890>
[tensor([0]), tensor([0])]


In [299]:
# Same dataset, now collated 16 samples at a time.
dl = DataLoader(ds, batch_size=16)

# Each batch unpacks into the collated samples and the collated labels.
data, target = next(iter(dl))
print(data, target)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]) tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])


# Samplers 
Samplers control which items we pick to assemble a batch.

In [300]:
# A sampler is an iterable of dataset indices; show the first ten it yields.
print(list(RandomSampler(y))[:10])

[285, 756, 870, 239, 564, 813, 76, 385, 321, 90]


In [301]:
# Same view for the sequential sampler: the first ten indices it yields.
print(list(SequentialSampler(y))[:10])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [302]:
# shuffle=True changes which sampler the loader uses (printed below).
dl = DataLoader(ds, batch_size=16, shuffle=True)
print(dl.sampler)

# First batch only: samples on one line block, labels on the next.
b = next(iter(dl))
print(b[0], b[1], sep="\n")

<torch.utils.data.sampler.RandomSampler object at 0x12adc1a10>
tensor([379, 917, 713, 731,   8, 720,  74,  58, 197, 449, 287, 461, 608, 640,
        536,  51])
tensor([379, 917, 713, 731,   8, 720,  74,  58, 197, 449, 287, 461, 608, 640,
        536,  51])


# Batching and collate_fn

If batching is enabled, the DataLoader combines multiple samples into a batch. The default collate function is quite powerful, but there are scenarios where you might need to adjust it.

In [4]:
import torch

class CustomRecordDataset(Dataset):
    """Map-style dataset that serves pre-built records straight from a sequence."""

    def __init__(self, records):
        """Keep a reference to the indexable collection of records."""
        self.records = records

    def __len__(self):
        """Number of records available."""
        return len(self.records)

    def __getitem__(self, ix):
        """Return the record stored at position ``ix`` unchanged."""
        record = self.records[ix]
        return record

If a record field's value is a tensor or NumPy array, the values are stacked along an additional batch dimension (dim=0).

In [304]:

# A single record dict repeated 100 times (every list entry is the same object).
R = 100 * [{'feature_vector': torch.rand(3, 224, 224), 'label': 1}]

# Take the first batch of four records and inspect the collated feature shape.
b = next(iter(DataLoader(CustomRecordDataset(R), batch_size=4)))
b['feature_vector'].shape

torch.Size([4, 3, 224, 224])

In [5]:

# Same experiment with a NumPy array as the feature value.
R = 100 * [{'feature_vector': np.array([1, 2, 3]), 'label': 1}]

loader = DataLoader(CustomRecordDataset(R), batch_size=4)
b = next(iter(loader))

# Only the final expression of a cell is rendered, so the label batch is what shows.
b['feature_vector']
b['label']

tensor([1, 1, 1, 1])

In [8]:
def custom_collate(*args):
    """Print every sample handed to the collate step, then pass the samples through.

    Used as a DataLoader ``collate_fn``, the first positional argument is the
    collection of samples drawn for one batch.

    Args:
        *args: positional arguments; only ``args[0]`` (the samples) is used.

    Returns:
        ``args[0]`` unchanged - no stacking or tensor conversion is applied.
    """
    batch = args[0]
    for sample in batch:
        print(sample)
    # Without a return value the DataLoader would yield None for every batch.
    return batch

In [9]:
# Run one batch of 12 records through the custom collate function.
b = next(iter(DataLoader(CustomRecordDataset(R), batch_size=12, collate_fn=custom_collate)))

{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}


## Exercise 1

Given the dataframe below, create a custom dataset (**DataFrameDataset**) that you will be able to use with a DataLoader.   Be aware of the fact that data is stored as strings!


In [55]:
# 100 rows, each holding a 128-element random vector and a class id drawn from
# 0..4, both serialised as strings so the exercise dataset has to parse them back.
# ndarray.tolist() produces plain Python floats, so the text is a parseable list
# literal ("[0.12, -1.3, ...]") on every NumPy version; str(list(array)) renders
# "np.float64(...)" entries on NumPy >= 2, which cannot be parsed back.
df = pd.DataFrame([
    {
        'vector': str(np.random.randn(128).tolist()),
        'cls': str(np.random.randint(5)),
    }
    for _ in range(100)
])
df

Unnamed: 0,vector,cls
0,"[-0.6455748076557228, 0.14581910048898597, 0.7...",2
1,"[0.30972425360634626, 0.5334698343817319, 0.18...",0
2,"[2.00549490085801, 0.24793027917198568, -0.833...",2
3,"[1.7075235609806672, 1.0314500868555092, -1.51...",1
4,"[1.035192799215285, -0.40234613870886565, 1.35...",2
...,...,...
95,"[-2.0379258586075757, 0.9748072853267833, 0.16...",2
96,"[-0.5632616120688013, 0.9628132799409411, 0.98...",2
97,"[0.9415353378508489, -0.2491448249107736, -1.1...",2
98,"[0.37183559847150327, 0.08097419444312148, 1.8...",4


In [51]:
class DataFrameDataset(Dataset):
    """Exercise stub: map-style dataset backed by the exercise DataFrame.

    The exercise text states that the frame stores its values as strings, so a
    finished implementation has to convert them before a DataLoader can batch
    them. The expected-result cell lists, for batch_size=32, a vector batch of
    shape (32, 128) and a class batch of shape (32,).
    """
    
    def __init__(self, df):
        """Receive the source DataFrame (left for the reader to implement)."""
        # your code
        pass
        
        
    def __len__(self):
        """Should return the number of samples; the stub currently returns None."""
        # your code
        pass
        
        
    def __getitem__(self, ix):
        """Should return the sample at position ``ix``; the stub currently returns None."""
    
        # your code
        pass
        

## Expected result


In [53]:

ds = DataFrameDataset(df)

# Print only the first batch of 32 samples.
b = next(iter(DataLoader(ds, batch_size=32)))
print(b)

    
# expected result:

# b[0].shape = (32, 128)
# b[1].shape = (32,)
# len(df) = 100


# [tensor([[ 1.2975, -0.5364,  0.2255,  ...,  0.2322,  0.7593, -0.7148],
#          [ 1.1905,  2.5280, -1.0700,  ..., -1.2392,  0.7593,  0.8753],
#          [ 0.7722, -1.0419,  0.3524,  ...,  1.2047,  0.8558, -1.8945],
#          ...,
#          [-0.7296, -0.5408,  0.7186,  ..., -0.9272, -0.9017, -1.9553],
#          [ 0.7755,  1.1396, -0.9595,  ..., -0.5878, -0.3832, -0.8104],
#          [ 2.7820, -0.7205,  1.3589,  ..., -0.4224,  0.2663,  0.9374]],
#         dtype=torch.float64),
#  tensor([3, 1, 4, 4, 4, 2, 4, 1, 0, 2, 3, 3, 1, 1, 1, 2, 1, 4, 3, 4, 1, 4, 4, 3,
#          0, 3, 0, 3, 1, 1, 2, 2])]