In [12]:
from torch.utils.data import IterableDataset, Dataset, SequentialSampler, Sampler, SubsetRandomSampler, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler, BatchSampler, SequentialSampler, RandomSampler
import numpy as np
import pandas as pd

### Dataset 
    - Map Style Dataset
    - Iterable Style dataset
### DataLoader
### Sampler

# Map Style Datasets
A dataset where its bebeviour is controlled by \__getitem__() and \__len__()

In [13]:
class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
    
    def __getitem__(self, ix):
        return self.data[ix], self.labels[ix]
    
    def __len__(self):
        return len(self.data)


In [14]:
X = list(range(1000))
y = list(range(1000))

ds = CustomDataset(X, y)

print(ds[0])
print(ds[1])
print(len(ds))

(0, 0)
(1, 1)
1000


In [15]:
dl =DataLoader(ds, batch_size=1)
print(dl.sampler)

for b in dl:
    break
print(b)

<torch.utils.data.sampler.SequentialSampler object at 0x7f335e0afd90>
[tensor([0]), tensor([0])]


In [16]:
dl =DataLoader(ds, batch_size=16)

for data, target in dl:
    break
print(data, target)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]) tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])


# Samplers 
control wich items we choose to collect a batch

In [17]:
print([s for s in RandomSampler(y)][:10])

[165, 445, 140, 984, 858, 17, 997, 886, 65, 229]


In [18]:
print([s for s in SequentialSampler(y)][:10])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [19]:
dl =DataLoader(ds, batch_size=16, shuffle=True)
print(dl.sampler)
for b in dl: break
print(b[0])
print(b[1])

<torch.utils.data.sampler.RandomSampler object at 0x7f335e0cd050>
tensor([950,  53, 387,  20, 878,  87, 789, 307, 724, 153, 823,  97, 597, 319,
         62, 533])
tensor([950,  53, 387,  20, 878,  87, 789, 307, 724, 153, 823,  97, 597, 319,
         62, 533])


In [20]:
b[0].shape

torch.Size([16])

# Batching and collate_fn

If batching enabled dataloader will combine multiple samples into a batch. The default collate functions i quite powerful but there are scenarios where you might need to adjust it

In [21]:
import torch

class CustomRecordDataset(Dataset):
    def __init__(self,records):
        self.records = records
        
    def __getitem__(self, ix):
        return self.records[ix]
    
    def __len__(self):
        return len(self.records)

If a record-field value is a tensor or numpy array they will be concataneted on a additional batch_dimension (dim=0)

In [23]:

R = [{'feature_vector': torch.rand(3,224,224), 'label':1}] * 100

for b in DataLoader(CustomRecordDataset(R), batch_size=4):
    break
b['feature_vector'].shape

torch.Size([4, 3, 224, 224])

In [5]:

R = [{'feature_vector': np.array([1,2,3]), 'label':1}] * 100

for b in DataLoader(CustomRecordDataset(R), 
                    batch_size=4):
    break
b['feature_vector']
b['label']

tensor([1, 1, 1, 1])

In [8]:
def custom_collate(*args):
    for a in args[0]:
        print(a)

In [9]:
for b in DataLoader(CustomRecordDataset(R), batch_size=12, collate_fn=custom_collate):
    break

{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}
{'feature_vector': array([1, 2, 3]), 'label': 1}


## Exercise 1

Given the dataframe below, create a custom dataset (**DataFrameDataset**) that you will be able to use with a DataLoader.   Be aware of the fact that data is stored as strings!


In [24]:
df = pd.DataFrame([{'vector': str(list(np.random.randn(128))), 
                    'cls': str(np.random.randint(5))} for i in range(100)])
df

Unnamed: 0,vector,cls
0,"[-1.699801475934545, 0.09268444777898342, -0.2...",2
1,"[-0.747517082449728, 1.208480231677957, -0.867...",4
2,"[0.7402412279486025, 0.02220018439142956, -0.9...",3
3,"[-1.4197976716370841, 0.7137226157746042, -0.6...",2
4,"[-0.5567741318539665, 0.3752355258290632, -0.6...",1
...,...,...
95,"[-1.0709808154174603, -1.1544218628423701, -1....",2
96,"[1.8072831572288406, 0.5660121141064004, -0.55...",2
97,"[-0.8834171967780721, -1.3738268928543496, -0....",1
98,"[-0.02968978529547735, -0.25260334522969147, -...",0


In [30]:
eval(df.iloc[0].vector)

[-1.699801475934545,
 0.09268444777898342,
 -0.279765963828878,
 0.8599558291589802,
 0.4431945401025378,
 -0.1748647228299461,
 0.9722227564984574,
 -0.6392266325552667,
 -0.26969222491609945,
 -0.42886011833264054,
 0.09217335638997833,
 -0.8992504183881692,
 -0.6775889359777681,
 -0.7265122879993583,
 1.071740693225215,
 -0.7135544097037698,
 -0.24655306178083627,
 -0.7496031244987634,
 0.6371118555962387,
 -1.4680690356960726,
 -1.506858504096004,
 -0.17249433759681707,
 -0.37690362973154207,
 1.3864814205892606,
 -0.1189823489648525,
 -0.674990877121737,
 -0.5465502255568178,
 0.8869418034749424,
 0.6283752971954946,
 1.2506436892769404,
 0.1441429311299944,
 -0.07780328340474876,
 0.0898707803579152,
 -1.1878094502494259,
 2.2105732918727394,
 -0.022199736683702363,
 0.61190478749321,
 -0.0825132374145533,
 -0.1232848066939051,
 0.42307841766139176,
 -0.09334858368291528,
 0.6262091977695436,
 1.7243142808397094,
 -1.1977454837799588,
 -0.7306917745230376,
 0.12211896962565941,
 

In [36]:
class DataFrameDataset(Dataset):
    
    def __init__(self, df):
        self.df = df
        
    def __len__(self):
        return len(self.df)
        
    def __getitem__(self, ix):
        row = self.df.iloc[ix]
        v = np.array(eval(row.vector))
        c = int(row.cls)
        
        return v, c

In [37]:
np.array(eval(df.iloc[0].vector))

array([-1.69980148,  0.09268445, -0.27976596,  0.85995583,  0.44319454,
       -0.17486472,  0.97222276, -0.63922663, -0.26969222, -0.42886012,
        0.09217336, -0.89925042, -0.67758894, -0.72651229,  1.07174069,
       -0.71355441, -0.24655306, -0.74960312,  0.63711186, -1.46806904,
       -1.5068585 , -0.17249434, -0.37690363,  1.38648142, -0.11898235,
       -0.67499088, -0.54655023,  0.8869418 ,  0.6283753 ,  1.25064369,
        0.14414293, -0.07780328,  0.08987078, -1.18780945,  2.21057329,
       -0.02219974,  0.61190479, -0.08251324, -0.12328481,  0.42307842,
       -0.09334858,  0.6262092 ,  1.72431428, -1.19774548, -0.73069177,
        0.12211897,  0.82519516, -0.44115936, -1.17311137,  1.50296651,
       -0.74609928,  0.26618131, -0.22295859, -0.28860316,  1.17216325,
        1.24491673,  0.81181419,  1.03277007, -0.7812265 ,  1.51447292,
       -1.0477604 ,  0.56842073,  0.66640195, -0.51244861, -3.71480701,
       -0.43795522, -0.32598926,  0.05734935,  1.02062956, -0.45

## Expected result


In [38]:

ds = DataFrameDataset(df)

for b in DataLoader(ds, batch_size=32):
    print(b)
    break

    
# expect result: 

# b[0].shape = (32, 128)
# b[1].shape = (32,)
# len(df) = 128


# [tensor([[ 1.2975, -0.5364,  0.2255,  ...,  0.2322,  0.7593, -0.7148],
#          [ 1.1905,  2.5280, -1.0700,  ..., -1.2392,  0.7593,  0.8753],
#          [ 0.7722, -1.0419,  0.3524,  ...,  1.2047,  0.8558, -1.8945],
#          ...,
#          [-0.7296, -0.5408,  0.7186,  ..., -0.9272, -0.9017, -1.9553],
#          [ 0.7755,  1.1396, -0.9595,  ..., -0.5878, -0.3832, -0.8104],
#          [ 2.7820, -0.7205,  1.3589,  ..., -0.4224,  0.2663,  0.9374]],
#         dtype=torch.float64),
#  tensor([3, 1, 4, 4, 4, 2, 4, 1, 0, 2, 3, 3, 1, 1, 1, 2, 1, 4, 3, 4, 1, 4, 4, 3,
#          0, 3, 0, 3, 1, 1, 2, 2])]

[tensor([[-1.6998,  0.0927, -0.2798,  ...,  0.0354, -0.0410, -0.3734],
        [-0.7475,  1.2085, -0.8676,  ..., -0.4179, -0.7688,  0.5704],
        [ 0.7402,  0.0222, -0.9671,  ...,  0.7752,  0.2379, -0.7153],
        ...,
        [-2.3917,  1.2423,  1.6472,  ...,  0.8940, -0.4067, -0.3080],
        [-0.0841,  0.4208,  0.5599,  ...,  0.2798,  1.0094,  0.3988],
        [-0.2963,  0.5013,  0.3362,  ..., -0.6216, -0.3353,  0.3426]],
       dtype=torch.float64), tensor([2, 4, 3, 2, 1, 4, 1, 2, 3, 1, 0, 1, 1, 3, 4, 2, 0, 1, 0, 0, 1, 4, 1, 1,
        3, 2, 4, 2, 1, 1, 4, 1])]


In [41]:
b[1].shape

torch.Size([32])