In [None]:
#default_exp training.sequence_classification

# Sequence Classification Tuning
> Data and Tuning API for Sequence Classification Tasks

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#export
import pandas as pd
from fastcore.foundation import L
from fastcore.meta import delegates
from fastcore.xtras import Path, range_of

from fastai.basics import * # TODO: Replace with absolutes in fastai_minima

from datasets import Dataset
from transformers import AutoModelForSequenceClassification, default_data_collator, AutoTokenizer

from adaptnlp.training.core import * # Core has everything we need so you should always import * with it

from adaptnlp.inference.sequence_classification import TransformersSequenceClassifier, SequenceResult, DetailLevel
from typing import List

## Datasets

In [None]:
#export
class SequenceClassificationDatasets(TaskDatasets):
    """
    A set of datasets designed for sequence classification

    Texts and labels are pulled out of `items` with `get_x` / `get_y`, the labels
    are encoded through a `Categorize` built from all of them, and the result is
    split into a training and a validation `datasets.Dataset` (columns `text` and
    `labels`) that are handed to `TaskDatasets`.
    """
    def __init__(
        self,
        items, # Some items we can pull x's and y's from
        get_x = ColReader('text'), # A function taking in one item and extracting the text
        get_y = ColReader('label'), # A function taking in one item and extracting the label(s)
        splits = None, # Indexes to split the data with. If `None`, a random 80/20 split is used
        tokenizer_name:str = None, # The string name of a `HuggingFace` tokenizer or model. If `None`, will not tokenize the dataset.
        tokenize_func:callable = None, # Optional custom tokenize function for a single item, such as `def _inner(item): return self.tokenizer(item['text'])`
        tokenize:bool = True, # Whether to tokenize the dataset immediately
        tokenize_kwargs:dict = {'padding':True}, # Some kwargs for when we call the tokenizer
        auto_kwargs:dict = {}, # Some kwargs when calling `AutoTokenizer.from_pretrained`
        remove_columns:list = None, # Names of columns to remove from the dataset, such as `text`
    ):
        # `get_x` / `get_y` are applied to `items` wrapped in an `L`; the first
        # result is expected to expose `.values` (e.g. a pandas Series).
        xs = L(L(items).map(get_x)[0].values, use_list=True)
        ys = L(L(items).map(get_y)[0].values, use_list=True)

        # `splits` defaults to `None`; indexing it directly would raise a `TypeError`,
        # so fall back to the same random 80/20 split that `from_df` uses.
        if splits is None: splits = RandomSplitter(0.2)(range_of(xs))

        # Kept on the instance so `dataloaders` can attach it to the training loader
        self.categorize = Categorize(ys)
        ys = L([self.categorize(y) for y in ys], use_list=True)
        train_xs, train_ys = xs[splits[0]], ys[splits[0]]
        valid_xs, valid_ys = xs[splits[1]], ys[splits[1]]

        train_dset = Dataset.from_dict({
            'text':train_xs,
            'labels':train_ys
        })

        valid_dset = Dataset.from_dict({
            'text':valid_xs,
            'labels':valid_ys
        })

        super().__init__(train_dset, valid_dset, tokenizer_name, tokenize, tokenize_func, tokenize_kwargs, auto_kwargs, remove_columns)

    @classmethod
    def from_df(
        cls,
        df:pd.DataFrame, # A Pandas Dataframe
        text_col:str = 'text', # Name of the column the text is stored
        label_col:str = 'labels', # Name of the column the label(s) are stored
        splits = None, # Indexes to split the data with. If `None`, a random 80/20 split is used
        tokenizer_name:str = None, # The string name of a `HuggingFace` tokenizer or model. If `None`, will not tokenize the dataset.
        tokenize:bool = True, # Whether to tokenize the dataset immediately
        tokenize_func:callable = None, # Optional custom tokenize function for a single item, such as `def _inner(item): return self.tokenizer(item['text'])`
        tokenize_kwargs:dict = {'padding':True}, # Some kwargs for when we call the tokenizer
        auto_kwargs:dict = {}, # Some kwargs when calling `AutoTokenizer.from_pretrained`
        remove_columns:list = None, # Names of columns to remove from the dataset, such as `text`
    ):
        "Builds `SequenceClassificationDatasets` from a `DataFrame`. Note: `df` is used as-is, a file path is not loaded here"
        get_x = ColReader(text_col)
        get_y = ColReader(label_col)
        if splits is None: splits = RandomSplitter(0.2)(range_of(df))
        # Keyword arguments: `tokenize` / `tokenize_func` are ordered differently
        # here than in `__init__`, so naming them avoids positional mix-ups.
        return cls(
            df,
            get_x=get_x,
            get_y=get_y,
            splits=splits,
            tokenizer_name=tokenizer_name,
            tokenize_func=tokenize_func,
            tokenize=tokenize,
            tokenize_kwargs=tokenize_kwargs,
            auto_kwargs=auto_kwargs,
            remove_columns=remove_columns,
        )

    @delegates(DataLoaders)
    def dataloaders(
        self,
        batch_size=8, # A batch size
        shuffle_train=True, # Whether to shuffle the training dataset
        collate_fn = None, # A custom collation function
        **kwargs): # Torch DataLoader kwargs
        "Build the `DataLoaders` through the parent class and attach `self.categorize` to the first loader"
        dls = super().dataloaders(batch_size, shuffle_train, collate_fn, **kwargs)
        # Lets consumers of the loaders recover the label encoding
        dls[0].categorize = self.categorize
        return dls

Currently `SequenceClassificationDatasets` only supports reading in data from a `DataFrame` or similar item.

When passing in kwargs, anything intended for the `tokenize` function should go in `tokenize_kwargs`, and anything intended for the `Auto` class constructor should go in `auto_kwargs`.

In [None]:
from fastai.data.external import URLs, untar_data

# Fetch the IMDB sample and read its reviews into a DataFrame
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')

def tok_func(text, tokenizer, tokenize_kwargs):
    "Tokenize the `text` field of a single item, forwarding `tokenize_kwargs` to `tokenizer`"
    return tokenizer(text['text'], **tokenize_kwargs)

dset = SequenceClassificationDatasets.from_df(
    df,
    text_col='text',
    label_col='label',
    tokenizer_name="bert-base-uncased",
    tokenize=True,
    tokenize_func=tok_func,
    tokenize_kwargs={'max_length':512, 'truncation':True},
    remove_columns=['text'],
)

# No `splits` were given, so `from_df` falls back to a random 80/20 split
test_eq(len(dset.train), 800)
test_eq(len(dset.valid), 200)
# `text` was removed, leaving the tokenizer outputs and the labels
test_eq(len(dset.train[0]), 4)
test_eq(dset.train[0].keys(), ['attention_mask', 'input_ids', 'labels', 'token_type_ids'])

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




## Sequence Classification Tuner

In [None]:
#export
class SequenceClassificationTuner(AdaptiveTuner):
    """
    An `AdaptiveTuner` with good defaults for Sequence Classification tasks
    
    **Valid kwargs and defaults:**
      - `lr`:float = 0.001
      - `splitter`:function = `trainable_params`
      - `cbs`:list = None
      - `path`:Path = None
      - `model_dir`:Path = 'models'
      - `wd`:float = None
      - `wd_bn_bias`:bool = False
      - `train_bn`:bool = True
      - `moms`: tuple(float) = (0.95, 0.85, 0.95)
    
    """
    def __init__(
        self,
        dls:DataLoaders, # A set of DataLoaders
        model_name:str, # A HuggingFace model
        tokenizer = None, # A HuggingFace tokenizer
        loss_func = CrossEntropyLossFlat(), # A loss function
        metrics = [accuracy, F1Score()], # Metrics to monitor the training with
        opt_func = Adam, # A fastai or torch Optimizer
        additional_cbs = None, # Additional Callbacks to have always tied to the Tuner,
        expose_fastai_api = False, # Whether to expose the fastai API
        num_classes:int=None, # The number of classes, used when it cannot be read from `dls[0].categorize.classes`
        **kwargs, # kwargs for `Learner.__init__`
    ):
        additional_cbs = listify(additional_cbs)
        # Drop any kwargs that would collide with the explicit arguments passed to the parent below
        for arg in 'dls,model,loss_func,metrics,opt_func,cbs,expose_fastai'.split(','):
            kwargs.pop(arg, None)

        # Classes found on the training loader take precedence; when none are
        # available, keep whatever the caller passed instead of discarding it.
        inferred = getattr(getattr(dls[0], 'categorize', None), 'classes', None)
        if inferred is not None: num_classes = inferred
        if num_classes is None: raise ValueError("Could not extrapolate number of classes, please pass it in as a param")
        # A collection of classes was found/passed rather than a count
        if not isinstance(num_classes, int): num_classes = len(num_classes)

        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
        if tokenizer is None: tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        super().__init__(
            expose_fastai_api,
            dls = dls, 
            model = model, 
            tokenizer = tokenizer,
            loss_func = loss_func, 
            metrics = metrics, 
            opt_func = opt_func, 
            cbs=additional_cbs, 
            **kwargs
        )
    
    @delegates(Learner.__init__)
    @classmethod
    def from_df(
        cls,
        df:pd.DataFrame, # A Pandas Dataframe
        text_col:str = 'text', # Name of the column the text is stored
        label_col:str = 'labels', # Name of the column the label(s) are stored
        remove_columns:Union[str,List[str]] = None, # Name of columns to be removed after tokenizing
        model_name:str = None, # The string name of a HuggingFace model
        split_func:callable = RandomSplitter(), # A function which splits the data
        loss_func = CrossEntropyLossFlat(), # A loss function
        metrics = [accuracy, F1Score()], # Metrics to monitor the training with
        batch_size=8, # A batch size
        collate_fn=default_data_collator, # An optional custom collate function
        opt_func = Adam, # A fastai or torch Optimizer
        additional_cbs = None, # Additional Callbacks to have always tied to the Tuner,
        expose_fastai_api = False, # Whether to expose the fastai API
        tokenize_func:callable = None, # Optional custom tokenize function for a single item, such as `def _inner(item): return self.tokenizer(item['text'])`
        tokenize_kwargs:dict = {'padding':True}, # Some kwargs for when we call the tokenizer
        auto_kwargs:dict = {}, # Some kwargs when calling `AutoTokenizer.from_pretrained`
        **kwargs # Learner kwargs, forwarded to `SequenceClassificationTuner.__init__`
    ):
        "Convenience method to build a `SequenceClassificationTuner` from a Pandas Dataframe"
        # Try the splitter on the frame itself first, then on its row indexes.
        # `Exception` (not a bare `except`) so Ctrl-C / SystemExit still propagate.
        try:
            splits = split_func(df)
        except Exception:
            splits = split_func(range_of(df))
        dset = SequenceClassificationDatasets.from_df(
            df,
            text_col,
            label_col,
            splits,
            tokenizer_name=model_name,
            tokenize_kwargs=tokenize_kwargs,
            auto_kwargs=auto_kwargs,
            tokenize_func=tokenize_func,
            remove_columns=remove_columns
        )
        
        # Reuse the tokenizer the datasets were built with
        tokenizer = dset.tokenizer
        
        # `collate_fn` must be passed by keyword: the second positional parameter
        # of `dataloaders` is `shuffle_train`, not `collate_fn`.
        dls = dset.dataloaders(batch_size, collate_fn=collate_fn)
        
        return cls(dls, model_name, tokenizer, loss_func, metrics, opt_func, additional_cbs, expose_fastai_api, **kwargs)
    
    def predict(
        self,
        text:Union[List[str], str], # Some text or list of texts to do inference with
        bs:int=64, # A batch size to use for multiple texts
        detail_level:DetailLevel = DetailLevel.Low, # A detail level to return on the predictions
    ):
        "Predict some `text` for sequence classification with the currently loaded model"
        # Build the inference wrapper once and reuse it on later calls
        if getattr(self, '_inferencer', None) is None: self._inferencer = TransformersSequenceClassifier(self.tokenizer, self.model)
        preds = self._inferencer.predict(text,bs)
        # `categorize` is set on `dls[0]` by `SequenceClassificationDatasets.dataloaders`;
        # presumably `self.dls` delegates the lookup to that loader - TODO confirm
        cat = getattr(self.dls, 'categorize', None)
        vocab = cat.classes if cat is not None else None
        return SequenceResult(preds, vocab).to_dict(detail_level)

In [None]:
# Render the documentation for `SequenceClassificationTuner.from_df`
show_doc(SequenceClassificationTuner.from_df)

<h4 id="SequenceClassificationTuner.from_df" class="doc_header"><code>SequenceClassificationTuner.from_df</code><a href="__main__.py#L52" class="source_link" style="float:right">[source]</a></h4>

> <code>SequenceClassificationTuner.from_df</code>(**`df`**:`DataFrame`, **`text_col`**:`str`=*`'text'`*, **`label_col`**:`str`=*`'labels'`*, **`remove_columns`**:`Union`\[`str`, `List`\[`str`\]\]=*`None`*, **`model_name`**:`str`=*`None`*, **`split_func`**:`callable`=*`_inner`*, **`loss_func`**=*`CrossEntropyLoss()`*, **`metrics`**=*`[<function accuracy at 0x7fddb3ae2c10>, <fastai.metrics.AccumMetric object at 0x7fdda899c190>]`*, **`batch_size`**=*`8`*, **`collate_fn`**=*`default_data_collator`*, **`opt_func`**=*`Adam`*, **`additional_cbs`**=*`None`*, **`expose_fastai_api`**=*`False`*, **`tokenize_func`**:`callable`=*`None`*, **`tokenize_kwargs`**:`dict`=*`{'padding': True}`*, **`auto_kwargs`**:`dict`=*`{}`*, **`lr`**=*`0.001`*, **`splitter`**=*`trainable_params`*, **`cbs`**=*`None`*, **`path`**=*`None`*, **`model_dir`**=*`'models'`*, **`wd`**=*`None`*, **`wd_bn_bias`**=*`False`*, **`train_bn`**=*`True`*, **`moms`**=*`(0.95, 0.85, 0.95)`*)

Convience method to build a [`SequenceClassificationTuner`](/adaptnlp/training.sequence_classification.html#SequenceClassificationTuner) from a Pandas Dataframe



In [None]:
# Render the documentation for `SequenceClassificationTuner.predict`
show_doc(SequenceClassificationTuner.predict)

<h4 id="SequenceClassificationTuner.predict" class="doc_header"><code>SequenceClassificationTuner.predict</code><a href="__main__.py#L97" class="source_link" style="float:right">[source]</a></h4>

> <code>SequenceClassificationTuner.predict</code>(**`text`**:`Union`\[`List`\[`str`\], `str`\], **`bs`**:`int`=*`64`*, **`detail_level`**:`DetailLevel`=*`'low'`*)

Predict some `text` for sequence classification with the currently loaded model

**Function Arguments**:
* `text` (`Union[List[str], str]`): Some text or list of texts to do inference with
* `bs` (`int`): A batch size to use for multiple texts
* `detail_level` (`DetailLevel `): A detail level to return on the predictions
