In [None]:
#default_exp training.data

# Data
> The data API

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.test import *

## The API

AdaptNLP's data API takes inspiration from [fastai](https://docs.fast.ai)'s `DataBlock` API, and is partially compatible with it.

There is a medium- and high-level API through `TaskDatasets` and the individual task wrappers.

In [None]:
#export
from transformers import AutoTokenizer

from fastcore.foundation import mask2idxs, L
from fastcore.meta import delegates
from fastcore.xtras import Path, range_of
from fastai.data.core import DataLoaders

from torch.utils.data import DataLoader

import pandas as pd
from typing import Union
import os, torch

In [None]:
#export
class ColReader:
    """
    Pulls the value(s) of `cols` out of a row, optionally wrapping each value
    with `pref`/`suff` or splitting it on `label_delim`.
    Based on the fastai class
    """
    def __init__(
        self,
        cols, # Some column names to use
        pref:str='', # A prefix
        suff:str='', # A suffix
        label_delim:str=None, # A label delimiter
    ):
        # A `Path` prefix is turned into a string ending with the OS path separator
        if isinstance(pref, Path):
            pref = str(pref) + os.path.sep
        self.pref = pref
        self.suff = suff
        self.label_delim = label_delim
        self.cols = L(cols)

    def _do_one(self, r, c):
        "Read column `c` from row `r`, then apply the prefix/suffix or the delimiter split"
        # Integer columns and the names 'name'/'cat' are read by indexing; everything else by attribute
        by_index = isinstance(c, int) or c == 'name' or c == 'cat'
        value = r[c] if by_index else getattr(r, c)

        nothing_to_apply = len(self.pref) == 0 and len(self.suff) == 0 and self.label_delim is None
        if nothing_to_apply:
            return value
        if self.label_delim is not None:
            # An empty value yields no labels rather than `['']`
            return value.split(self.label_delim) if len(value) > 0 else []
        return f'{self.pref}{value}{self.suff}'

    def __call__(self, o):
        "Return the single value when there is one column, otherwise an `L` with one value per column"
        if len(self.cols) != 1:
            return L(self._do_one(o, col) for col in self.cols)
        return self._do_one(o, self.cols[0])

In [None]:
#export
class Categorize:
    """
    Vocabulary of categories in `classes`, with the reverse lookup in `o2i`
    Based on the fastai class
    """
    def __init__(
        self, 
        names, # An iterable collection of items to create a vocab from
        sort=True # Whether to make the items sorted
    ):
        uniques = L(names).unique()
        # `o == o` is False for NaN, so such values never become a class
        vocab = L(o for o in uniques if o == o)
        self.classes = vocab.sorted() if sort else vocab
        self.o2i = dict(self.classes.val2idx())

    def map_objs(
        self, 
        objs # Some iterable collection
    ):
        "Map `objs` to IDs"
        return L([self.o2i[obj] for obj in objs])

    def map_ids(
        self, 
        ids # Some ids correlating to `self.classes`
    ):
        "Map `ids` to objects in vocab"
        return L([self.classes[i] for i in ids])

    def __call__(self, o):
        "Encode a single object `o` as a tensor holding its ID"
        idx = self.o2i[o]
        return torch.tensor(idx)

    def decode(self, o):
        "Look up the class stored at ID `o`"
        return self.classes[o]

In [None]:
#export
def RandomSplitter(valid_pct=0.2, seed=None):
    """
    Build a splitter that randomly sends `valid_pct` of the items to validation and the rest to training.
    The returned function takes a sized collection and gives back `(train_idxs, valid_idxs)`.
    """
    def _inner(o):
        # Seeding (when requested) happens on every call, so repeated calls give the same permutation
        if seed is not None:
            torch.manual_seed(seed)
        n_items = len(o)
        shuffled = L(list(torch.randperm(n_items).numpy()))
        n_valid = int(valid_pct * n_items)
        # The first `n_valid` shuffled indexes are validation; the remainder is training
        train_idxs = shuffled[n_valid:]
        valid_idxs = shuffled[:n_valid]
        return train_idxs, valid_idxs
    return _inner

In [None]:
#export
class TaskDatasets:
    """
    A set of datasets for a particular task, with a simple API.
    
    Note: This is the base API, `items` should be a set of regular text and model-ready labels,
          including label or one-hot encoding being applied.
    """
    @delegates(AutoTokenizer.from_pretrained)
    def __init__(
        self,
        items, # An array of (input, label)
        splits = None, # Indexes to split the data with
        tokenizer_name:str = None, # The string name of a `HuggingFace` tokenizer or model. If `None`, will not tokenize the dataset.
        tokenize:bool = True, # Whether to tokenize the dataset immediately
        **kwargs, # kwargs to go to `AutoTokenizer.from_pretrained`
    ):
        self.items = L(items)
        # Without `splits`, every item is training data and the validation indexes are empty
        splits = L([slice(None), []] if splits is None else splits).map(mask2idxs)
        self.train_idxs, self.valid_idxs = splits[0], splits[1]
        self.tokenizer = None
        if tokenizer_name is not None: self.set_tokenizer(tokenizer_name, **kwargs)
        if tokenize and self.tokenizer is not None: self._tokenize()
        elif tokenize:
            # Best-effort: the dataset stays untokenized and the caller is told how to finish the setup
            print("Tried to tokenize a dataset without a tokenizer. Please set a tokenizer with `set_tokenizer` and call `_tokenize()`")

    def __getitem__(self, idx):
        "Index directly into `self.items`"
        return self.items[idx]
    
    @property
    def train(self):
        "The items selected by `self.train_idxs`"
        return self.items[self.train_idxs]
    
    @property
    def valid(self):
        "The items selected by `self.valid_idxs`"
        return self.items[self.valid_idxs]
    
    def _tokenize(self):
        "Tokenize dataset in `self.items`; raises `ValueError` if no tokenizer has been set"
        if self.tokenizer is None:
            raise ValueError("Tried to tokenize a dataset without a tokenizer. Please add a tokenizer with `set_tokenizer(tokenizer_name)` and try again")
        def _inner(item):
            data = self.tokenizer(item[0], padding=True, truncation=True)
            # Items that carry a second element keep it as the label
            if len(item) > 1: data['labels'] = item[1]
            return data
        self.items = self.items.map(_inner)
    
    @delegates(AutoTokenizer.from_pretrained)
    def set_tokenizer(
        self,
        tokenizer_name:str, # A string name of a `HuggingFace` tokenizer or model
        override_existing:bool = False, # Whether to override an existing tokenizer
        **kwargs # kwargs to go to `AutoTokenizer.from_pretrained`
    ):
        """
        Sets a new `AutoTokenizer` to `self.tokenizer`

        An existing tokenizer is only replaced when `override_existing=True`; otherwise a
        warning is printed and nothing changes. Raises `ValueError` if the tokenizer
        cannot be loaded from `tokenizer_name`.
        """
        if self.tokenizer is not None:
            if not override_existing:
                print(f'Warning! You are trying to override an existing tokenizer: {self.tokenizer.name_or_path}. Pass `override_existing=True` to use a new tokenizer')
                return
            print(f'Setting new tokenizer to {tokenizer_name}')
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, **kwargs)
        except Exception as e:
            # Only real errors are converted (not KeyboardInterrupt/SystemExit), and the cause is kept
            raise ValueError(f'{tokenizer_name} is not a valid pretrained model on the HuggingFace Hub or a local model') from e
    
    # NOTE(review): the signature delegates to fastai `DataLoaders`, but `kwargs` are forwarded
    # to torch `DataLoader` below -- confirm which one is intended.
    @delegates(DataLoaders)
    def dataloaders(
        self, 
        batch_size=8, # A batch size
        shuffle_train=True, # Whether to shuffle the training dataset
        collate_fn = None, # A custom collation function
        **kwargs): # Torch DataLoader kwargs
        "Creates `DataLoaders` from the dataset, one loader over `self.train` and one over `self.valid`"
        if collate_fn is None:
            # Default collation pads each batch using the dataset's tokenizer
            from transformers import DataCollatorWithPadding
            collate_fn = DataCollatorWithPadding(self.tokenizer)
        train_dl = DataLoader(self.train, shuffle=shuffle_train, collate_fn=collate_fn, batch_size=batch_size, **kwargs)
        valid_dl = DataLoader(self.valid, shuffle=False, collate_fn=collate_fn, batch_size=batch_size, **kwargs)
        return DataLoaders(train_dl, valid_dl)

In [None]:
#export
class SequenceClassificationDatasets(TaskDatasets):
    """
    Datasets for sequence classification: texts and labels are read from `items`
    with `get_x`/`get_y`, and the labels are encoded through a `Categorize` vocab
    stored in `self.categorize`.
    """
    def __init__(
        self,
        items, # Some items we can pull x's and y's from
        get_x = ColReader('text'), # A function taking in one item and extracting the text
        get_y = ColReader('label'), # A function taking in one item and extracting the label(s)
        splits = None, # Indexes to split the data from
        tokenizer_name:str = None, # The string name of a HuggingFace tokenizer or model. If `None`, will not tokenize immediately
        tokenize:bool=True, # Whether to tokenize the dataset immediately
        **kwargs # kwargs to go to `AutoTokenizer.from_pretrained`
    ):
        def _read(getter):
            # Apply `getter` over `L(items)` and unpack the `.values` of the first result into an `L`
            return L(L(items).map(getter)[0].values, use_list=True)

        texts = _read(get_x)
        raw_labels = _read(get_y)
        # The vocab is built from the raw labels; each label is then encoded with it
        self.categorize = Categorize(raw_labels)
        encoded = [self.categorize(label) for label in raw_labels]
        pairs = L(zip(texts, encoded))
        super().__init__(pairs, splits, tokenizer_name, tokenize, **kwargs)

    @delegates(AutoTokenizer.from_pretrained)
    @classmethod
    def from_df(
        cls,
        df:Union[pd.DataFrame, Path], # A Pandas Dataframe or Path to a DataFrame
        text_col:str = 'text', # Name of the column the text is stored
        label_col:str = 'labels', # Name of the column the label(s) are stored
        splits = None, # Indexes to split the data with
        tokenizer_name:str = None, # The string name of a huggingFace tokenizer or model. If `None`, will not tokenize the dataset
        tokenize:bool = True, # Whether to tokenize the dataset immediately
        **kwargs
    ):
        "Builds `SequenceClassificationDatasets` from a `DataFrame` or file path"
        # Anything that is not already a DataFrame is read as a CSV
        if not isinstance(df, pd.DataFrame):
            df = pd.read_csv(df)
        text_reader = ColReader(text_col)
        label_reader = ColReader(label_col)
        # Default to a random 80/20 train/validation split over the rows
        if splits is None:
            splitter = RandomSplitter(0.2)
            splits = splitter(range_of(df))
        return cls(df, text_reader, label_reader, splits, tokenizer_name, tokenize, **kwargs)

    @delegates(DataLoaders)
    def dataloaders(
        self, 
        batch_size=8, # A batch size
        shuffle_train=True, # Whether to shuffle the training dataset
        collate_fn = None, # A custom collation function
        **kwargs): # Torch DataLoader kwargs
        "Creates `DataLoaders` as the parent class does, then attaches `self.categorize` to the first loader"
        loaders = super().dataloaders(batch_size, shuffle_train, collate_fn, **kwargs)
        first_dl = loaders[0]
        first_dl.categorize = self.categorize
        return loaders

In [None]:
#slow
# Demo: build `SequenceClassificationDatasets` from the IMDB sample's `texts.csv`
from fastai.data.external import URLs, untar_data
# `untar_data` gives back the folder that contains `texts.csv`
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')
# No `splits` are passed, so `from_df` falls back to its random 80/20 split
dsets = SequenceClassificationDatasets.from_df(
    df,
    'text',
    'label',
    tokenizer_name = "bert-base-uncased",
    tokenize=True
)
# Show one item of the dataset (tokenized, since `tokenize=True`)
dsets[1]

{'input_ids': [101, 2023, 2003, 1037, 5186, 2092, 1011, 2081, 2143, 1012, 1996, 3772, 1010, 5896, 1998, 4950, 1011, 2147, 2024, 2035, 2034, 1011, 3446, 1012, 1996, 2189, 2003, 2204, 1010, 2205, 1010, 2295, 2009, 2003, 3262, 2220, 1999, 1996, 2143, 1010, 2043, 2477, 2024, 2145, 4659, 15138, 2100, 1012, 2045, 2024, 2053, 2428, 18795, 2015, 1999, 1996, 3459, 1010, 2295, 2195, 5344, 2097, 2022, 5220, 1012, 1996, 2972, 3459, 2515, 2019, 6581, 3105, 2007, 1996, 5896, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2021, 2009, 2003, 2524, 2000, 3422, 1010, 2138, 2045, 2003, 2053, 2204, 2203, 2000, 1037, 3663, 2066, 1996, 2028, 3591, 1012, 2009, 2003, 2085, 19964, 2000, 7499, 1996, 2329, 2005, 4292, 18221, 1998, 7486, 2114, 2169, 2060, 1010, 1998, 2059, 10311, 2135, 14443, 2068, 2046, 2048, 3032, 1012, 2045, 2003, 2070, 7857, 1999, 2023, 3193, 1010, 2021, 2009, 1005, 1055, 2036, 2995, 2008, 2053, 2028, 3140, 18221, 1998, 7486, 1999, 1996, 2555, 2000, 11094, 16416, 2102, 2169, 2060, 2004,

## Export -

In [None]:
#hide
# Run nbdev's `notebook2script`; the output below lists the notebooks it converted
from nbdev.export import notebook2script
notebook2script()

Converted 00_file_utils.ipynb.
Converted 01_callback.ipynb.
Converted 02_model_hub.ipynb.
Converted 03_model.ipynb.
Converted 04_embeddings.ipynb.
Converted 04a_tutorial.embeddings.ipynb.
Converted 05_token_classification.ipynb.
Converted 05a_tutorial.token_tagging.ipynb.
Converted 06_sequence_classification.ipynb.
Converted 06a_tutorial.easy_sequence_classifier.ipynb.
Converted 07_summarization.ipynb.
Converted 07a_tutorial.summarization.ipynb.
Converted 08_translation.ipynb.
Converted 08a_tutorial.translation.ipynb.
Converted 09_text_generation.ipynb.
Converted 09a_tutorial.easy_text_generator.ipynb.
Converted 10_question_answering.ipynb.
Converted 10a_tutorial.question_answering.ipynb.
Converted 13a_transformers.squad_metrics.ipynb.
Converted 14_result.ipynb.
Converted 15_training.data.ipynb.
Converted 16_training.tuner.ipynb.
Converted 20_tutorial.tuner.sequence_classification.ipynb.
Converted index.ipynb.
