In [1]:
from fastbook import *

# Data In fastai

One of the most important things to understand in fastai is how you prepare your data for a model.  The main workhorse for accomplishing this is the `DataBlock` API.  Here is a hello-world example of how it works:

## Hello World DataBlock

The `get_x` and `get_y` arguments are functions that are applied to each item of an iterable.  Let's define an iterable as our data:

In [2]:
# Toy dataset: the integers 0 through 99.
data = [*range(100)]

In [3]:
def get_x(r):
    "Use the item `r` itself, unchanged, as the input."
    return r
def get_y(r):
    "Derive the target from the item `r` by adding 10 to it."
    target = r + 10
    return target
# Build a DataBlock from the two getters, then create datasets from the raw items.
dblock = DataBlock(
    get_x=get_x,
    get_y=get_y,
)
dsets = dblock.datasets(data)

You can see a dataset like so:

In [4]:
# Index into the training split to look at one (x, y) pair.
dsets.train[0]

(89, 99)

You can also see a DataLoader like so:

In [5]:
# Same block and data, now requested as DataLoaders with a batch size of 5 (`bs=5`).
dls = dblock.dataloaders(data, bs=5)

In [6]:
# Pull one batch from the training loader.
next(iter(dls.train))

(tensor([57, 66, 73, 30, 14]), tensor([67, 76, 83, 40, 24]))

### With A DataFrame

Similarly, you can operate on one row at a time:

In [7]:
import pandas as pd

# Two identical integer columns, 0..99, so row i holds x == y == i.
df = pd.DataFrame({col: range(100) for col in ('x', 'y')})
df.head()

Unnamed: 0,x,y
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


In [8]:
def get_x(r):
    "Read the input from the `x` attribute of the row `r`."
    return r.x
def get_y(r):
    "Derive the target from the `y` attribute of the row `r` by adding 10."
    target = r.y + 10
    return target
# Build a DataBlock from the row-based getters, then create datasets from the DataFrame.
dblock = DataBlock(
    get_x=get_x,
    get_y=get_y,
)
dsets = dblock.datasets(df)

In [9]:
# Index into the training split to look at one (x, y) pair.
dsets.train[0]

(78, 88)

In [10]:
# Request DataLoaders over the DataFrame with a batch size of 3, then pull one training batch.
dls = dblock.dataloaders(df, bs=3)
next(iter(dls.train))

(tensor([90, 55, 11]), tensor([100,  65,  21]))

In [134]:
def tracer(nm, verbose=False):
    """Build a stand-in transform, labelled `nm`, that converts its input to a string.

    nm: label bound to the returned callable (for example the name of the argument
        it is going to be passed as).
    verbose: when True, every call prints `nm` and the incoming value before
        returning. The default (False) keeps calls silent.

    Returns a `partial` with `nm` already bound; calling it with a value `x`
    returns `str(x)`.
    """
    def f(x, nm):
        if verbose:
            # Opt-in tracing, so the prints no longer have to be toggled by
            # commenting lines in and out.
            print(f'{nm}:')
            print(f'\tinput: {x}')
        return str(x)
    return partial(f, nm=nm)

In [207]:
# A block whose item-level transform is the string-converting tracer.
tb = TransformBlock(item_tfms=tracer('item_tfms'))

# Inputs are the item multiplied by zero; every label is the constant 2.
db = DataBlock(
    blocks=(TransformBlock, CategoryBlock),
    get_x=lambda x: x * 0,
    get_y=lambda x: 2,
)

In [208]:
# NOTE(review): `data` is rebound here from the earlier 100-element list to an `L` over range(10).
data = L(range(10))
result = db.datasets(data)

In [209]:
# Index into the training split to look at one (x, y) pair.
result.train[0]

(0, TensorCategory(0))

In [210]:
# NOTE(review): rebinds `result` (previously the datasets) to DataLoaders requested with `bs=3`.
result = db.dataloaders(data, bs=3)

In [211]:
# Keep the iterator in a variable so successive batches can be pulled from it.
thing = iter(result.train)

In [212]:
# Pull a batch from the iterator.
next(thing)

(tensor([0, 0, 0]), TensorCategory([0, 0, 0]))

In [213]:
# Pull the following batch from the same iterator.
next(thing)

(tensor([0, 0, 0]), TensorCategory([0, 0, 0]))

In [108]:
# Use `tb` as the second block, label each item with its string form,
# and attach the tracer as the batch-level transform as well.
db = DataBlock(
    blocks=(TransformBlock, tb),
    get_y=lambda x: str(x),
    batch_tfms=tracer('batch_tfms'),
)

In [95]:
# NOTE(review): the first assignment is overwritten by the next line, so only the
# DataLoaders remain bound to `result` after this cell.
result = db.datasets(data)
result = db.dataloaders(data, bs=3)

In [96]:
# Display the object currently bound to `result`.
result

<fastai.data.core.DataLoaders at 0x7f9faa4c49a0>

In [74]:
# Iterator over the training loader, kept so batches can be pulled one at a time.
thing = iter(result.train)

In [75]:
# Pull one batch from the iterator.
# NOTE(review): the saved output of this cell is an ipdb session, i.e. it was produced by
# an earlier version of `tracer` that called `ipdb.set_trace()`; re-run to refresh it.
next(thing)

batch_tfms:
	input: tensor([5, 0, 8])
> /var/folders/jj/xl1rktcs6mn7ms6b8vvx8k4r0000gn/T/ipykernel_35435/4244414374.py(6)f()
      5         import ipdb; ipdb.set_trace()
----> 6         return x
      7     return partial(f, nm=nm)


ipdb>  x


tensor([5, 0, 8])


ipdb>  c


batch_tfms:
	input: 5
> /var/folders/jj/xl1rktcs6mn7ms6b8vvx8k4r0000gn/T/ipykernel_35435/4244414374.py(6)f()
      5         import ipdb; ipdb.set_trace()
----> 6         return x
      7     return partial(f, nm=nm)


ipdb>  x


'5'


ipdb>  x


'5'


ipdb>  x


'5'


ipdb>  c


batch_tfms:
	input: 0
> /var/folders/jj/xl1rktcs6mn7ms6b8vvx8k4r0000gn/T/ipykernel_35435/4244414374.py(6)f()
      5         import ipdb; ipdb.set_trace()
----> 6         return x
      7     return partial(f, nm=nm)


ipdb>  x


'0'


ipdb>  exit


BdbQuit: 

In [27]:
# Keep the first element returned by `aug_transforms()` for inspection.
f = aug_transforms()[0]

In [29]:
# Display the object currently bound to `f`.
f

Flip -- {'size': None, 'mode': 'bilinear', 'pad_mode': 'reflection', 'mode_mask': 'nearest', 'align_corners': True, 'p': 0.5}:
encodes: (TensorImage,object) -> encodes
(TensorMask,object) -> encodes
(TensorBBox,object) -> encodes
(TensorPoint,object) -> encodes
decodes: 