### Data Block API
This notebook collects a set of sample datasets and, for each one, the Data Block API calls that create its `DataBunch`.

https://docs.fast.ai/data_block.html

#### Import

In [1]:
from pathlib import *
from six.moves import urllib

from fastai.vision import *
from fastai.collab import *
from fastai.tabular import *
from fastai.text import *

from fastai.metrics import error_rate

%load_ext autoreload
%autoreload 2

#### Download sample images

In [2]:
# Local folder and file name used for the sample image.
img_path = Path('samples')

img_name = 'car_bbox.jpg'

# urlretrieve does not create missing parent folders, so make sure the
# target folder exists before downloading into it.
img_path.mkdir(parents=True, exist_ok=True)

# Download only when the file is not already present locally.
if not (img_path/img_name).exists():
    urllib.request.urlretrieve('https://github.com/fastai/fastai/blob/master/docs/imgs/car_bbox.jpg?raw=true', img_path/img_name)
       

### Image Single Label Classification

#### MNIST (ImageNet style)

<img src="nb_images/imagenet_style.jpg" style="width:250px;height:250px;">

**train/valid folders containing subfolders whose names are the labels**

In [None]:
# Fetch the `URLs.MNIST_TINY` sample with `untar_data`; the returned `path`
# is used as the dataset root in the cells below.
path = untar_data(URLs.MNIST_TINY)
# Transforms with flipping switched off (do_flip=False).
tfms = get_transforms(do_flip=False)
# Cell output: the entries found directly under `path`.
path.ls()

In [None]:
# Walk through the Data Block steps one at a time and print the type
# returned by each step.
tmp = ImageItemList.from_folder(path)
print(type(tmp))

tmp = tmp.split_by_folder()
print(type(tmp))

tmp = tmp.label_from_folder()
print(type(tmp))

# Cell output: the `data` of the last label in the training set.
tmp.train.y[-1].data

In [None]:
# The full Data Block pipeline for the folder-per-label layout, as one chained expression.
data = (ImageItemList.from_folder(path)   # collect the items found under `path`
       .split_by_folder()                 # train/valid split taken from the folder layout
       .label_from_folder()               # labels taken from the folder names
       .add_test_folder()                 # attach a test folder (default arguments)
       .transform(tfms, size=64)          # apply `tfms`, with size=64
       .databunch())                      # finish the pipeline with `databunch()`

In [None]:
# Preview 3 rows of samples from `data`, keeping the axes visible.
data.show_batch(rows=3, figsize=(6, 6), hide_axis=False)

#### Pets

<img src="nb_images/label_embedded_in_filenames.jpg" style="width:250px;height:250px;">

**Labels embedded in the filenames.**

In [None]:
# Fetch the `URLs.PETS` dataset with `untar_data`; `path` is rebound to its root.
path = untar_data(URLs.PETS)
# Cell output: the entries found directly under `path`.
path.ls()

In [None]:
# Folder holding the pet images, and the image files found inside it.
path_img = path/'images'
fnames = get_image_files(path_img)
tfms = get_transforms()

# Seed NumPy's global RNG -- presumably so that the `random_split_by_pct()`
# call in the next cell produces a repeatable split.
np.random.seed(2)

# The label is the part of the file name that precedes the trailing
# `_<digits>.jpg`. The dot is escaped so that it matches a literal '.'
# rather than any character.
pat = r'/([^/]+)_\d+\.jpg$'

In [None]:
# Data Block pipeline for labels that are embedded in the file names.
data = (ImageItemList.from_folder(path_img)       # collect the items found under `path_img`
        .random_split_by_pct()                    # random train/valid split (default arguments)
        .label_from_re(pat)                       # label extracted with the regex `pat`
        .transform(tfms, size=224)                # apply `tfms`, with size=224
        .databunch(bs=64, num_workers=0))         # build with bs=64 and num_workers=0

In [None]:
# Preview 3 rows of samples from `data`.
data.show_batch(rows=3, figsize=(8, 8))

### Image Multi Label classification

#### Planets

<img src="nb_images/multi_label.jpg" style="width:250px;height:250px;">

In [None]:
# Root folder of the `URLs.PLANET_TINY` sample.
planet = untar_data(URLs.PLANET_TINY)

# Transform settings used for the planet images.
planet_tfms = get_transforms(
    flip_vert=True,
    max_lighting=0.1,
    max_zoom=1.05,
    max_warp=0.,
)

In [None]:
# Data Block pipeline for a multi-label dataset described by a CSV file.
data = (ImageItemList.from_csv(planet, 'labels.csv', folder='train', suffix='.jpg')  # items listed in labels.csv
        .random_split_by_pct()              # random train/valid split (default arguments)
        .label_from_df(label_delim=' ')     # labels from the CSV; ' ' separates multiple labels
        .transform(planet_tfms, size=128)   # apply `planet_tfms`, with size=128
        .databunch())                       # finish the pipeline with `databunch()`

In [None]:
# Preview 2 rows of samples from the planet `data`.
data.show_batch(rows=2, figsize=(9,7))

### Image Segmentation

#### Camvid

<img src="nb_images/image_segmentation.jpg" style="width:250px;height:250px;">

In [None]:
# Root folder of the `URLs.CAMVID_TINY` sample, plus its `labels` and `images` subfolders.
camvid = untar_data(URLs.CAMVID_TINY)
path_lbl = camvid/'labels'   # used by `get_y_fn` to locate the mask files
path_img = camvid/'images'   # rebinds `path_img`; the input images are read from here

In [None]:
# Read `codes.txt` as an array of strings (passed as `classes` further down),
# then display it as the cell output.
codes = np.loadtxt(camvid/'codes.txt', dtype=str)
codes

In [None]:
def get_y_fn(x):
    """Return the mask path for image path `x`.

    File pattern for the corresponding masks: same stem with `_P` appended,
    same suffix, located under `path_lbl`.
    """
    mask_name = f'{x.stem}_P{x.suffix}'
    return path_lbl/mask_name

In [None]:
# Data Block pipeline for segmentation: every image is paired with the mask file returned by `get_y_fn`.
data = (SegmentationItemList.from_folder(path_img)           # collect the items found under `path_img`
        .random_split_by_pct()                               # random train/valid split (default arguments)
        .label_from_func(get_y_fn, classes=codes)            # label = get_y_fn(item); `codes` passed as the classes
        .transform(get_transforms(), tfm_y=True, size=128)   # tfm_y=True: presumably transforms the masks too -- confirm in fastai docs
        .databunch(num_workers=0)                            # build with num_workers=0
        )

In [None]:
# Preview 2 rows of samples from the segmentation `data`.
data.show_batch(rows=2, figsize=(7,7))

### Image Detection

#### Coco

In [None]:
# Root folder of the `URLs.COCO_TINY` sample and the annotations read from its train.json.
coco = untar_data(URLs.COCO_TINY)
images, lbl_bbox = get_annotations(coco/'train.json')

# Lookup table: entry of `images` -> matching entry of `lbl_bbox`.
img2bbox = {img_key: annotation for img_key, annotation in zip(images, lbl_bbox)}


def get_y_func(o):
    """Return the `img2bbox` entry stored under the file name (`o.name`) of `o`."""
    return img2bbox[o.name]

In [None]:
# Cell output: the type of the lookup table built with dict(zip(...)).
type(img2bbox)

In [None]:
# Data Block pipeline for object detection: labels are looked up per file through `get_y_func`.
data = (ObjectItemList.from_folder(coco)          # collect the items found under `coco`
 .random_split_by_pct()                           # random train/valid split (default arguments)
 .label_from_func(get_y_func)                     # label = get_y_func(item)
 .transform(get_transforms(), tfm_y=True)         # tfm_y=True: presumably transforms the boxes too -- confirm in fastai docs
 .databunch(bs=16, collate_fn=bb_pad_collate, num_workers=0)  # bs=16, batches collated with `bb_pad_collate`
)

In [None]:
# Preview 2 rows of samples, taken from the validation set (ds_type=DatasetType.Valid).
data.show_batch(rows=2, ds_type=DatasetType.Valid, figsize=(6,6))

In [None]:
# Cell output: first element (index 0) of the training item at index 2.
data.train_ds[2][0]

In [None]:
# Cell output: second element (index 1) of the same training item at index 2.
data.train_ds[2][1]

In [None]:
# Open the sample image from the local `samples` folder (`img_path`).
img = open_image(img_path/'car_bbox.jpg')
# Alternative box, kept for reference (disabled):
#bbox = ImageBBox.create(*img.size, [[96, 155, 270, 351]], labels=[0], classes=['car'])

# Build a bounding-box object sized to the image, holding one box labelled 'car' (labels=[0] indexes into `classes`).
# NOTE(review): the coordinate order of the 4-element box is not visible here -- confirm against the fastai docs.
bbox = ImageBBox.create(*img.size, [[165, 249, 166, 250]], labels=[0], classes=['car'])

In [None]:
# Cell output: the size of the opened sample image.
img.size

In [None]:
# Cell output: the `data` attribute of the bounding-box object created above.
bbox.data

### Text

#### Sentiment Analysis (csv)

In [None]:
# Root folder of the `URLs.IMDB_SAMPLE` dataset.
imdb = untar_data(URLs.IMDB_SAMPLE)

# Sibling path: the same location with `_nano` appended to its string form.
imdb_nano = Path(f'{imdb}_nano')
imdb_nano

In [None]:
# tmp = TextList.from_csv(imdb, 'texts.csv', cols='text'); print(type(tmp))
# tmp = tmp.random_split_by_pct(); print(type(tmp))
# tmp = tmp.label_for_lm(); print(type(tmp))
# tmp = tmp.databunch(); print(type(tmp))

In [None]:
# Language model: Data Block pipeline over the `text` column of texts.csv.

data_lm = (TextList.from_csv(imdb, 'texts.csv', cols='text')   # texts read from the `text` column
 .random_split_by_pct()   # random train/valid split (default arguments)
 .label_for_lm()      # label is just the text lagged by 1, for the unsupervised language model
 .databunch(bptt=70)      # build with bptt=70
)

In [None]:
# Preview a batch of the language-model data.
data_lm.show_batch()

In [None]:
# Uncomment to print the shape of every training batch:
# for x, _ in data_lm.train_dl:
#     print(x.shape)
# Cell output: the `vocab` attribute of the language-model data.
data_lm.vocab

In [None]:
# Binary classification (sentiment analysis): same CSV, now with labels and a predefined split.
data_clas = (TextList.from_csv(imdb, 'texts.csv', cols='text')   # texts read from the `text` column
 .split_from_df(col='is_valid')    # train/valid split driven by the `is_valid` column
 .label_from_df(cols='label')      # labels read from the `label` column
 .databunch())                     # finish the pipeline with `databunch()`

In [None]:
# Preview a batch of the classification data.
data_clas.show_batch()

In [None]:
# Peek at the second training batch (k == 1): print its shape and contents.
# Stop right after printing; without the `break` the loop would go on
# pulling every remaining batch from the DataLoader with no visible effect.
for k, (x, _) in enumerate(data_clas.train_dl):
    if k == 1:
        print(x.shape)
        print(x)
        break

In [None]:
# Cell output: the entry at index 5 of the vocabulary's `itos` sequence.
data_clas.vocab.itos[5]

### Tabular

In [None]:
# Root folder of the `URLs.ADULT_SAMPLE` dataset and its CSV loaded as a DataFrame.
adult = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(adult/'adult.csv')

# Column used as the label.
dep_var = 'salary'

# Columns passed as categorical variables.
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Columns passed as continuous variables.
cont_names = [
    'education-num',
    'hours-per-week',
    'age',
    'capital-loss',
    'fnlwgt',
    'capital-gain',
]

# Processors handed to the tabular list, in this order.
procs = [FillMissing, Categorify, Normalize]

In [None]:
# Data Block pipeline for tabular data built straight from the DataFrame `df`.
data = (TabularList.from_df(df, path=adult, cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_by_idx(valid_idx=range(800,1000))   # indices 800..999 form the validation set
                           .label_from_df(cols=dep_var)               # label read from the `dep_var` column
                           .databunch())                              # finish the pipeline with `databunch()`

In [None]:
# Preview a batch of the tabular data.
data.show_batch()