In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
#export
from fastml.core import *

In [9]:
#export
def normalize(x, mean, std): return (x-mean)/std

In [10]:
#export
def normalize_data(x_train, y_train, x_valid, y_valid): 
    train_mean,train_std = x_train.mean(),x_train.std()
    x_train = normalize(x_train, train_mean, train_std)
    x_valid = normalize(x_valid, train_mean, train_std)
    return x_train, y_train, x_valid, y_valid

In [11]:
#export
def get_data(path):
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    return map(tensor, (x_train,y_train,x_valid,y_valid))

In [12]:
#export
class Dataset():
    def __init__(self, x, y): self.x,self.y = x,y
    def __len__(self): return len(self.x)
    def __getitem__(self, i): return self.x[i],self.y[i]

In [52]:
#export
def collate(batches):
    xs,ys = zip(*batches)
    return torch.stack(xs),torch.stack(ys)

In [85]:
#export
class Sampler():
    def __init__(self, dataset, batch_size, shuffle=False):
        self.n,self.batch_size,self.shuffle = len(dataset),batch_size,shuffle
        
    def __iter__(self):
        self.idxs = torch.randperm(self.n) if self.shuffle else torch.arange(self.n)
        for batch in range(0, self.n, self.batch_size): yield self.idxs[batch:batch+self.batch_size]

In [115]:
sampler = Sampler([1, 2, 3, 4], 2, True);
[s for s in sampler]

[tensor([3, 1]), tensor([2, 0])]

In [174]:
#export
class DataLoader():
    def __init__(self, dataset, batch_size, collate_fn=collate, shuffle=False, sampler=None): 
        self.dataset, self.batch_size, self.collate_fn, self.shuffle = dataset, batch_size, collate_fn, shuffle
        if sampler == None:
            self.sampler = Sampler(dataset, batch_size, shuffle)
        else: 
            self.sampler = sampler
    
    def __len__(self): return len(self.dataset)//self.batch_size
            
    def __iter__(self):
        for sample in self.sampler:
            yield self.collate_fn([self.dataset[i] for i in sample])

In [166]:
#export
def get_data_loaders(train_ds, valid_ds, batch_size, shuffle_train=True):
    return DataLoader(train_ds, batch_size, shuffle=shuffle_train), DataLoader(valid_ds, batch_size, shuffle=False)

In [101]:
#export
class Datasets:
    
    @classmethod
    def MNIST(cls):
        path = download_data('http://deeplearning.net/data/mnist/mnist.pkl', ext='.gz')
        x_train, y_train, x_valid, y_valid = get_data(path)
        x_train, y_train, x_valid, y_valid = normalize_data(x_train, y_train, x_valid, y_valid)
        return Dataset(x_train, y_train), Dataset(x_valid, y_valid)

In [102]:
train_ds,valid_ds = Datasets.MNIST()

In [103]:
test_near_zero(train_ds.x.mean())
test_near_zero(1-train_ds.x.std())

In [162]:
train_dl,valid_dl = get_data_loaders(train_ds,valid_ds, 64)

In [163]:
_, yb1 = next(iter(train_dl))
_, yb2 = next(iter(train_dl))
assert yb1[0] != yb2[0]

In [164]:
_, yb1 = next(iter(valid_dl))
_, yb2 = next(iter(valid_dl))
assert yb1[0] == yb2[0]

In [213]:
#export
class DataBunch():
    def __init__(self, train_dl, valid_dl, c=None):
        self.train_dl, self.valid_dl, self.c = train_dl,valid_dl,c
        
    @property
    def train_ds(self): return self.train_dl.dataset
        
    @property
    def valid_ds(self): return self.valid_dl.dataset

In [1]:
#export
def data_bunch(train_ds, valid_ds, batch_size, c=None):
    return DataBunch(*get_data_loaders(train_ds,valid_ds, batch_size), c)

In [223]:
data = data_bunch(train_ds,valid_ds, 64)
len(data.train_dl)

781

## Export

In [2]:
!python notebook2script.py data_dev.ipynb fastml/data/data.py

Converted datasets_dev.ipynb to fastml/data/datasets.py
