In [None]:
#default_exp training.language_model

# Language Model Tuning
> Data and Tuning API for Language Model Fine-Tuning

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#export
from transformers import DataCollatorForLanguageModeling, default_data_collator

from adaptnlp.training.core import *

from fastai.data.core import DataLoaders

from fastcore.basics import mk_class

from fastai.basics import *

import pandas as pd
from fastcore.meta import delegates

## Datasets

In [None]:
#exporti
def _group_texts(examples, block_size):
    # Concatenate all texts, based on code by Transformers
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
#export
class LanguageModelDatasets(TaskDatasets):
    """
    A set of datasets designed for language model fine-tuning

    The texts are split into a training and a validation `Dataset` (each with a
    single `text` column), handed to `TaskDatasets.__init__`, and then regrouped
    into `block_size`-long chunks with `_group_texts`.
    """
    def __init__(
        self,
        items, # Some items we can pull x's and y's from
        get_x = ColReader('text'), # A function taking in one item and extracting the text
        block_size:int = 512, # A block size to split up the data with. Note: this is different than `max_len`
        masked_lm:bool=False, # Whether this is a Masked Language Model
        splits = None, # Indexes to split the data with. If `None`, `RandomSplitter(0.2)` is used
        tokenizer_name:str = None, # The string name of a `HuggingFace` tokenizer or model. If `None`, will not tokenize the dataset.
        tokenize:bool = True, # Whether to tokenize the dataset immediately
        tokenize_kwargs:dict = {}, # Some kwargs for when we call the tokenizer
        auto_kwargs:dict = {}, # Some kwargs when calling `AutoTokenizer.from_pretrained`
    ):
        xs = L(L(items).map(get_x)[0].values, use_list=True)
        # `splits` defaults to `None`, which cannot be indexed below; fall back to
        # the same `RandomSplitter(0.2)` split that `from_df` uses.
        if splits is None: splits = RandomSplitter(0.2)(range_of(xs))
        train_xs = xs[splits[0]]
        valid_xs = xs[splits[1]]

        train_dset = Dataset.from_dict({
            'text':train_xs.items
        })

        valid_dset = Dataset.from_dict({
            'text':valid_xs.items
        })

        super().__init__(train_dset, valid_dset, tokenizer_name, tokenize, tokenize_kwargs, auto_kwargs)
        self.masked_lm = masked_lm
        self.block_size = block_size
        # Regroup both splits into `block_size`-long chunks.
        # NOTE(review): `self.train` / `self.valid` are presumably set by `TaskDatasets.__init__` — confirm.
        f = partial(_group_texts, block_size=self.block_size)
        self.train = self.train.map(f, batched=True)
        self.valid = self.valid.map(f, batched=True)

    @classmethod
    def from_df(
        cls,
        df:pd.DataFrame, # A Pandas DataFrame
        text_col:str = 'text', # Name of the column the text is stored
        splits = None, # Indexes to split the data with. If `None`, `RandomSplitter(0.2)` is used
        block_size:int = 512, # A block size to split up the data with. Note: this is different than `max_len`
        masked_lm:bool=False, # Whether this is a Masked Language Model
        tokenizer_name:str = None, # The string name of a `HuggingFace` tokenizer or model. If `None`, will not tokenize the dataset.
        tokenize:bool = True, # Whether to tokenize the dataset immediately
        tokenize_kwargs:dict = {}, # Some kwargs for when we call the tokenizer
        auto_kwargs:dict = {}, # Some kwargs when calling `AutoTokenizer.from_pretrained`
    ):
        "Builds `LanguageModelDatasets` from a `DataFrame`"
        get_x = ColReader(text_col)
        if splits is None: splits = RandomSplitter(0.2)(range_of(df))
        return cls(df, get_x, block_size, masked_lm, splits, tokenizer_name, tokenize, tokenize_kwargs, auto_kwargs)

    @delegates(DataLoaders)
    def dataloaders(
        self,
        batch_size=8, # A batch size
        shuffle_train=True, # Whether to shuffle the training dataset
        collate_fn = default_data_collator, # A custom collation function. Replaced when `self.masked_lm` is set
        mlm_probability:float = 0.15, # Token masking probability for Masked Language Models
        **kwargs): # Torch DataLoader kwargs
        "Builds `DataLoaders` via `TaskDatasets.dataloaders`, using a masking collator when `masked_lm` is set"
        # For masked language models the passed `collate_fn` is always replaced by a masking collator.
        if self.masked_lm: collate_fn = DataCollatorForLanguageModeling(tokenizer=self.tokenizer, mlm_probability=mlm_probability)
        return super().dataloaders(batch_size, shuffle_train, collate_fn, **kwargs)

In [None]:
from transformers import AutoModelForMaskedLM, AutoModelForCausalLM, AutoModelForSeq2SeqLM

# Cell
# NOTE(review): this notebook cell carries no `#export` flag, unlike the cells above —
# confirm whether `LMType` and `_constructors` are meant to reach the exported module.
mk_class(
    'LMType',
    Masked='masked',
    Causal='causal',
    Seq2Seq='seq2seq',
    doc="All valid language model classes with typo-proofing",
)

# Maps each `LMType` value to the `from_pretrained` loader of the matching auto-model class.
_constructors = dict(
    masked=AutoModelForMaskedLM.from_pretrained,
    causal=AutoModelForCausalLM.from_pretrained,
    seq2seq=AutoModelForSeq2SeqLM.from_pretrained,
)

In [None]:
class LanguageModelTuner(AdaptiveTuner):
    """
    An `AdaptiveTuner` with good defaults for Language Model fine-tuning
    **Valid kwargs and defaults:**
      - `lr`:float = 0.001
      - `splitter`:function = `trainable_params`
      - `cbs`:list = None
      - `path`:Path = None
      - `model_dir`:Path = 'models'
      - `wd`:float = None
      - `wd_bn_bias`:bool = False
      - `train_bn`:bool = True
      - `moms`: tuple(float) = (0.95, 0.85, 0.95)
    """
    def __init__(
        self,
        dls:DataLoaders, # A set of DataLoaders
        model_name, # A HuggingFace model
        language_model_type:LMType = LMType.Causal, # The type of language model to use
        loss_func = CrossEntropyLossFlat(), # A loss function
        metrics = [Perplexity()], # Metrics to monitor the training with
        opt_func = Adam, # A fastai or torch Optimizer
        additional_cbs = None, # Additional Callbacks to have always tied to the Tuner,
        expose_fastai_api = False, # Whether to expose the fastai API
        **kwargs, # kwargs for `Learner.__init__`
    ):
        additional_cbs = listify(additional_cbs)
        # These values are passed explicitly to `super().__init__` below, so any
        # duplicates in `kwargs` are discarded.
        for arg in 'dls,model,loss_func,metrics,opt_func,cbs,expose_fastai'.split(','):
            kwargs.pop(arg, None)

        if language_model_type is None: raise ValueError("Please specify the type of language model you want to use, such as `masked` or `causal`")
        if language_model_type not in _constructors.keys():
            raise ValueError(
                """
                Please enter a valid Langauge Model Type of:
                  * `masked` or `LMType.Masked`
                  * `causal` or `LMType.Causal`
                  * `seq2seq` or `LMType.Seq2Seq`
                """
            )
        try:
            model = _constructors[language_model_type](model_name)
        except Exception as e:
            # `e.args` may be empty or hold a non-string first item (e.g. an
            # `OSError` errno). Indexing and concatenating it blindly would raise
            # a second error and hide the real loading failure.
            detail = str(e.args[0]) if len(e.args) == 1 else str(e)
            m = f"Was not able to create a {language_model_type} instance of {model_name}. Please use a valid model for your task:"
            e.args = (m + detail,)
            raise

        super().__init__(
            expose_fastai_api,
            dls = dls,
            model = model,
            loss_func = loss_func,
            metrics = metrics,
            opt_func = opt_func,
            cbs=additional_cbs,
            **kwargs
        )

    @delegates(Learner.__init__)
    @classmethod
    def from_df(
        cls,
        df:pd.DataFrame, # A Pandas DataFrame
        text_col:str = 'text', # Name of the column the text is stored
        model_name:str = None, # The string name of a HuggingFace model
        language_model_type:LMType = LMType.Causal, # The type of language model to use
        split_func:callable = RandomSplitter(), # A function which splits the data
        loss_func = CrossEntropyLossFlat(), # A loss function
        metrics = [Perplexity()], # Metrics to monitor the training with
        batch_size=8, # A batch size
        collate_fn=default_data_collator, # An optional custom collate function
        opt_func = Adam, # A fastai or torch Optimizer
        additional_cbs = None, # Additional Callbacks to have always tied to the Tuner,
        expose_fastai_api = False, # Whether to expose the fastai API
        dataset_kwargs:dict = {}, # Kwargs for LanguageModelDatasets.from_df
        tokenize_kwargs:dict = {}, # kwargs for the tokenize function
        auto_kwargs:dict = {}, # Some kwargs when calling `AutoTokenizer.from_pretrained`
        **kwargs # Learner kwargs
    ):
        """
        Convenience method to build a `LanguageModelTuner` from a Pandas Dataframe

        `model_name` is used both as the tokenizer name for the datasets and as the
        model to load. `masked_lm` is not derived from `language_model_type`; pass it
        through `dataset_kwargs` when fine-tuning a masked language model.
        """
        splits = split_func(range_of(df))
        dls = LanguageModelDatasets.from_df(
            df,
            text_col,
            splits,
            tokenizer_name=model_name,
            tokenize_kwargs=tokenize_kwargs,
            auto_kwargs=auto_kwargs,
            **dataset_kwargs
        # `collate_fn` must go by keyword: the second positional parameter of
        # `LanguageModelDatasets.dataloaders` is `shuffle_train`.
        ).dataloaders(batch_size, collate_fn=collate_fn)

        return cls(
            dls,
            model_name,
            language_model_type,
            loss_func,
            metrics,
            opt_func,
            additional_cbs,
            expose_fastai_api,
            **kwargs
        )

## Export - 

In [None]:
#hide
from nbdev.export import notebook2script
# Run nbdev's notebook-to-module export; the cell output lists each notebook it converted.
notebook2script()

Converted 00_file_utils.ipynb.
Converted 01_callback.ipynb.
Converted 02_model_hub.ipynb.
Converted 03_model.ipynb.
Converted 04_embeddings.ipynb.
Converted 04a_tutorial.embeddings.ipynb.
Converted 05_token_classification.ipynb.
Converted 05a_tutorial.token_tagging.ipynb.
Converted 06_sequence_classification.ipynb.
Converted 06a_tutorial.easy_sequence_classifier.ipynb.
Converted 07_summarization.ipynb.
Converted 07a_tutorial.summarization.ipynb.
Converted 08_translation.ipynb.
Converted 08a_tutorial.translation.ipynb.
Converted 09_text_generation.ipynb.
Converted 09a_tutorial.easy_text_generator.ipynb.
Converted 10_question_answering.ipynb.
Converted 10a_tutorial.question_answering.ipynb.
Converted 13a_transformers.squad_metrics.ipynb.
Converted 14_result.ipynb.
Converted 14_training.core.ipynb.
Converted 15_training.sequence_classification.ipynb.
Converted 16_training.language_model.ipynb.
Converted 20_tutorial.tuner.sequence_classification.ipynb.
Converted 21_tutorial.training.langua