In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F

**Get Data**

In [3]:
# Remote MNIST archive and the local location it is cached at.
MNIST_URL = 'https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
path_gz = path_data.joinpath('mnist.pkl.gz')
# Create the cache directory up front; a no-op when it already exists.
path_data.mkdir(exist_ok=True)

In [4]:
from urllib.request import urlretrieve
# Download only when the archive is not cached yet. The data is fetched into a
# temporary ".part" file and renamed into place afterwards, so an interrupted
# download never leaves a truncated archive that the exists() check would
# mistake for a complete one on the next run.
if not path_gz.exists():
  path_tmp = path_gz.with_name(path_gz.name + '.part')
  try:
    urlretrieve(MNIST_URL, path_tmp)
    path_tmp.replace(path_gz)
  finally:
    # Only present here if the download or rename failed part-way.
    if path_tmp.exists(): path_tmp.unlink()

In [5]:
import random

# Display configuration for tensors and images.
torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
mpl.rcParams['image.cmap'] = 'gray'

# Seed every RNG this notebook draws from. Shuffling further down uses Python's
# `random` module, which torch.manual_seed does not affect, so it is seeded too.
torch.manual_seed(1)
random.seed(1)

# `path_gz` is defined in the setup cell at the top of this section.
# NOTE: pickle.load can execute arbitrary code while unpickling; this is only
# acceptable because the archive comes from a known, trusted source.
with gzip.open(path_gz, 'rb') as f:
  ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
# Convert the unpickled arrays to torch tensors.
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

**Dataset Class**

In [6]:
class Dataset:
  """Pairs an indexable collection of inputs with its targets.

  Indexing returns an `(x, y)` tuple; whatever index the underlying
  collections accept (int, slice, ...) is passed through unchanged.
  """

  def __init__(self, x, y):
    self.x = x
    self.y = y

  def __len__(self):
    # The length is taken from the inputs only.
    return len(self.x)

  def __getitem__(self, i):
    inputs = self.x[i]
    targets = self.y[i]
    return inputs, targets

In [8]:
# Wrap each split in a Dataset and check that no rows were lost.
train_ds = Dataset(x_train, y_train)
valid_ds = Dataset(x_valid, y_valid)
assert len(x_train) == len(train_ds)
assert len(x_valid) == len(valid_ds)

In [9]:
# Slicing the dataset returns the matching slices of inputs and targets.
xb, yb = train_ds[:5]
assert tuple(xb.shape) == (5, 28*28)
assert tuple(yb.shape) == (5,)
xb, yb

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([5, 0, 4, 1, 9]))

**DataLoader Class**

In [10]:
class DataLoader:
  """Iterates over a dataset in consecutive slices of `bs` items.

  The dataset only needs to support `len()` and slice indexing. The final
  slice is shorter when the dataset length is not a multiple of `bs`.
  """

  def __init__(self, ds, bs):
    self.ds = ds
    self.bs = bs

  def __iter__(self):
    for start in range(0, len(self.ds), self.bs):
      stop = start + self.bs
      yield self.ds[start:stop]

In [11]:
# Loader over the training set that yields batches of 16 items.
train_dl = DataLoader(ds=train_ds, bs=16)

In [13]:
# Pull only the first batch from the loader to check its shapes and contents.
xb, yb = next(iter(train_dl))
print(xb.shape, yb.shape)
# Last expression: display the batch itself below the printed shapes.
xb, yb

torch.Size([16, 784]) torch.Size([16])


(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 tensor([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7]))

**Random Sampling**

We want the training data to be visited in a random order, so we need to add shuffling to the data loader.

In [14]:
import random

class Sampler:
  """Produces the indices `0 .. len(ds)-1`, optionally in random order.

  Only the length of `ds` is stored. With `shuffle=True` a new permutation
  is drawn from Python's `random` module every time iteration starts.
  """

  def __init__(self, ds, shuffle=False):
    self.n = len(ds)
    self.shuffle = shuffle

  def __iter__(self):
    order = [*range(self.n)]
    if self.shuffle:
      random.shuffle(order)
    return iter(order)

In [22]:
# `islice` is imported in this cell because, in notebook order, this cell runs
# before the later cell that imports it; without the import a fresh
# Restart & Run All stops here with a NameError.
from itertools import islice

# Small 10-item toy dataset whose targets are simply 0..9.
x, y = torch.randn(10,2), torch.arange(10)
ds_x = Dataset(x, y)

# Shuffled sampler over the toy dataset; show its first 10 indices.
samp_x = Sampler(ds_x, shuffle=True)
list(islice(samp_x, 10))

[4, 9, 1, 5, 3, 2, 6, 0, 8, 7]

In [15]:
from itertools import islice

# Without shuffling, the sampler counts up from zero.
ss = Sampler(train_ds, shuffle=False)
list(islice(ss, 5))

[0, 1, 2, 3, 4]

In [17]:
# With shuffle=True, the indices come out as a random permutation.
ss_shuffle = Sampler(ds=train_ds, shuffle=True)
list(islice(ss_shuffle, 5))

[38594, 7872, 42339, 17660, 44081]

In [36]:
class BatchSampler:
  """Groups the indices produced by `sampler` into lists of `bs` items.

  A trailing group with fewer than `bs` items is yielded as well, unless
  `drop_last` is true.
  """

  def __init__(self, sampler, bs, drop_last=False):
    self.sampler = sampler
    self.bs = bs
    self.drop_last = drop_last

  def __iter__(self):
    pending = []
    for index in self.sampler:
      pending.append(index)
      if len(pending) != self.bs:
        continue
      yield pending
      pending = []
    # Whatever is left over did not fill a whole batch.
    if self.drop_last or not pending:
      return
    yield pending

In [39]:
# Group the shuffled toy indices into batches of three, keeping the remainder.
batches = BatchSampler(sampler=samp_x, bs=3, drop_last=False)

In [45]:
# Print one batch per line; the last one holds the leftover index.
for batch in batches: print(batch)

[1, 9, 0]
[5, 3, 4]
[2, 8, 7]
[6]
