In [None]:
#all_slow

In [None]:
# default_exp language_model

# Language Models
> Language Models within the AdaptNLP library

In [None]:
#export
# coding=utf-8
# This file uses code from the language modeling examples in the huggingface Transformer's repo

import os
import logging
import math
from typing import Dict, Union
from pathlib import Path

import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForMaskedLM,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    TextDataset,
    DataCollatorForLanguageModeling,
    # TODO: For XLNet, will be available in Transformers release 3.0.2+
    # DataCollatorForPermutationLanguageModeling,
    LineByLineTextDataset,
)

from adaptnlp.model_hub import HFModelResult

from fastcore.basics import mk_class

In [None]:
#export
# Module-level logger named after this module; `LMFineTuner` uses it for progress and result messages.
logger = logging.getLogger(__name__)

In [None]:
#export
# Attribute name on `LMClass` -> language-model tag string checked by `LMFineTuner`
_types = dict(Causal='causal-lm', Masked='masked-lm', Seq2Seq='seq2seq')
mk_class(
    'LMClass',
    doc="All possible Language Model types as attributes to get tab-completion and typo-proofing",
    **_types,
)

In [None]:
#export
class LMFineTuner:
    """
    A Language Model Fine Tuner object you can set language model configurations and then train and evaluate

    Usage:

    ```python
    >>> finetuner = adaptnlp.LMFineTuner("gpt2", "causal-lm")
    >>> finetuner.train(training_args, train_file, eval_file)
    >>> finetuner.evaluate()
    ```

    **Parameters:**

    * **model_name_or_path** - The model checkpoint for weights initialization. Either a model name/path string,
      or an `HFModelResult` (its `name` is loaded and its `tags` decide the language model type).
    * **language_model_class** - The type of language model you are trying to train: "causal-lm", "masked-lm"
      or "seq2seq". Used when `model_name_or_path` does not provide any `tags`.
    """

    def _get_automodel_func(self, tags):
        """Pick the `AutoModelFor*` class matching `tags`.

        `tags` is a collection of tag strings. The first of 'causal-lm', 'masked-lm', 'seq2seq'
        found in it (checked in that order) wins.

        Returns a `(model_class, language_model_type)` tuple.
        Raises `ValueError` when `tags` contains none of the supported types.
        """
        if 'causal-lm' in tags: return AutoModelForCausalLM, 'causal-lm'
        elif 'masked-lm' in tags: return AutoModelForMaskedLM, 'masked-lm'
        elif 'seq2seq' in tags: return AutoModelForSeq2SeqLM, 'seq2seq'
        else:
            # Do not blindly index `tags[0]`: an empty collection would raise IndexError and hide
            # the real problem. A single tag is reported bare, anything else as a list.
            shown = tags[0] if len(tags) == 1 else list(tags)
            raise ValueError(f'Not a valid Language Model type: {shown}')

    def __init__(
        self,
        model_name_or_path:Union[str, HFModelResult]="bert-base-cased",
        language_model_class:Union[str, LMClass] = 'causal-lm',
    ):
        """Load the model and tokenizer and move the model to GPU when one is available.

        Raises `ValueError` when a plain model string is given with `language_model_class=None`,
        or when no supported language model type can be determined.
        """

        logger.info(
            "This is the new updated `LMFineTuner` class object for 0.2.0+. If you're looking for `LMFineTuner` from <=0.1.6, you can instantiate it with LMFineTunerManual"
        )
        # Load model and tokenizer
        # An `HFModelResult` carries the checkpoint name in `.name`; a plain string is the name itself
        name = getattr(model_name_or_path, 'name', model_name_or_path)
        if not isinstance(model_name_or_path, HFModelResult) and language_model_class is None:
            raise ValueError("""
            No `language_model_class` was passed in with a model string. 
            Please specify the type of language model it is (Either causal, masked, or seq2seq). 
            
            To find the proper type, search your model on the HuggingFaceHub and you will see its tag near the top,
              such as "causal-lm" or "seq2seq"
            """)

        # `getattr(obj, attr_name, default)`: the object comes first. With the arguments swapped,
        # passing an `HFModelResult` raised TypeError (attribute name must be a string) and its
        # tags were never consulted.
        tags = getattr(model_name_or_path, 'tags', None)
        if not tags:
            # Plain string (or a result without tags): rely on the explicitly requested type
            tags = [language_model_class]
        model_constructor, self.lm_class = self._get_automodel_func(tags)
        self.model = model_constructor.from_pretrained(name)

        self.tokenizer = AutoTokenizer.from_pretrained(
            name, use_fast=True
        )
        self.trainer = None

        # Setup cuda and automatic allocation of model
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def train(
        self,
        training_args: TrainingArguments,
        train_file: Union[str, Path],
        eval_file: Union[str, Path],
        line_by_line: bool = False,
        mlm: bool = False,
        mlm_probability: float = 0.15,
        plm_probability: float = 1 / 6,
        max_span_length: int = 5,
        block_size: int = -1,
        overwrite_cache: bool = False,
    ):
        """Train and fine-tune the loaded language model

        * **training_args** - The `TrainingArguments` handed to the `Trainer`; the model and tokenizer are saved to its `output_dir`.
        * **train_file** - The input training data file (a text file).
        * **eval_file** - The input evaluation data file to evaluate the perplexity on (a text file).
        * **line_by_line** - Whether distinct lines of text in the dataset are to be handled as distinct sequences.
        * **mlm** - Train with masked-language modeling loss instead of language modeling.
        * **mlm_probability** - Ratio of tokens to mask for masked language modeling loss
        * **plm_probability** - Ratio of length of a span of masked tokens to surrounding context length for permutation language modeling.
                            Currently unused: XLNet fine-tuning is rejected.
        * **max_span_length** - Maximum length of a span of masked tokens for permutation language modeling.
                            Currently unused: XLNet fine-tuning is rejected.
        * **block_size** - Optional input sequence length after tokenization.
                            The training dataset will be truncated in block of this size for training.
                            `-1` will default to the model max input length for single sentence inputs (take into account special tokens).
        * **overwrite_cache** - Overwrite the cached training and evaluation sets

        Raises `ValueError` for BERT/RoBERTa-like models when `mlm` is False, and for XLNet models.
        """

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
        )
        logger.warning(
            f"""Process rank: {training_args.local_rank},
                device: {training_args.device},
                n_gpu: {training_args.n_gpu},
                distributed training: {bool(training_args.local_rank != -1)},
                16-bits training: {training_args.fp16}
            """
        )
        logger.info(f"Training/evaluation parameters: {training_args.to_json_string()}")

        # Validate the model/objective combination up front, before any dataset is built
        model_type = self.model.config.model_type

        # Check if masked language model or not
        if (
            model_type
            in ["bert", "roberta", "distilbert", "camembert"]
            and not mlm
        ):
            raise ValueError(
                """BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run with
                mlm set as True(masked language modeling)."""
            )

        # TODO: DataCollatorForPermutationLanguageModeling not available until release 3.0.2+;
        # once it is, build it here from `plm_probability` and `max_span_length`.
        if model_type == "xlnet":
            logger.info("Cannot currently finetune XLNet model")
            raise ValueError(
                "Use another language model besides XLNet for LM finetuning"
            )

        # Check block size for Dataset
        if block_size <= 0:
            block_size = self.tokenizer.model_max_length
        else:
            block_size = min(block_size, self.tokenizer.model_max_length)

        # Get datasets
        train_dataset = self._get_dataset(
            file_path=train_file,
            line_by_line=line_by_line,
            block_size=block_size,
            overwrite_cache=overwrite_cache,
        )
        eval_dataset = self._get_dataset(
            file_path=eval_file,
            line_by_line=line_by_line,
            block_size=block_size,
            overwrite_cache=overwrite_cache,
        )
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset

        # Get Collator
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=mlm, mlm_probability=mlm_probability
        )

        # Initialize Trainer
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )

        # Train and serialize
        self.trainer.train()
        self.trainer.save_model()
        self.tokenizer.save_pretrained(training_args.output_dir)

    def evaluate(self) -> Union[Dict[str, float], None]:
        """Evaluate the trained model and report its perplexity.

        Runs the trainer's evaluation, computes `exp(eval_loss)` and writes the result to
        `eval_results_lm.txt` inside the trainer's `output_dir`.

        Returns `{"perplexity": <float>}`, or `None` when `train(...)` has not been run yet.
        """

        if self.trainer is None:
            logger.info(
                "No trainer loaded, you should probably run `LMFineTuner.train(...)` first"
            )
            return None

        logger.info("*** Evaluate ***")

        eval_output = self.trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(
            self.trainer.args.output_dir, "eval_results_lm.txt"
        )

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        return result

    def _get_dataset(
        self,
        file_path: Union[str, Path],
        line_by_line: bool,
        block_size: int,
        overwrite_cache: bool,
    ) -> Dataset:
        """Build the dataset for `file_path` using this fine tuner's tokenizer.

        Returns a `LineByLineTextDataset` when `line_by_line` is set, otherwise a `TextDataset`.
        `overwrite_cache` is only passed on to `TextDataset`.
        """
        if line_by_line:
            return LineByLineTextDataset(
                tokenizer=self.tokenizer, file_path=file_path, block_size=block_size
            )
        else:
            return TextDataset(
                tokenizer=self.tokenizer,
                file_path=file_path,
                block_size=block_size,
                overwrite_cache=overwrite_cache,
            )

In [None]:
#hide
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
!unzip wikitext-2-raw-v1.zip

train_file = "./wikitext-2-raw/wiki.train.raw"
eval_file = "./wikitext-2-raw/wiki.test.raw"

--2021-06-18 16:30:35--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.21.45
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.21.45|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4721645 (4.5M) [application/zip]
Saving to: ‘wikitext-2-raw-v1.zip’


2021-06-18 16:30:35 (162 MB/s) - ‘wikitext-2-raw-v1.zip’ saved [4721645/4721645]

Archive:  wikitext-2-raw-v1.zip
   creating: wikitext-2-raw/
  inflating: wikitext-2-raw/wiki.test.raw  
  inflating: wikitext-2-raw/wiki.valid.raw  
  inflating: wikitext-2-raw/wiki.train.raw  


In [None]:
#hide
from transformers import TrainingArguments

# Configuration for the single-epoch fine-tuning run in the cells below
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="../models",
    logging_dir="../logs",
    # Optimisation schedule
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    # Evaluation / checkpoint cadence
    evaluation_strategy="no",
    eval_steps=100,
    save_steps=2500,
)

In [None]:
#hide
# `LMClass.Causal` is the 'causal-lm' tag string, spelled via the typo-proof attribute
finetuner = LMFineTuner(
    model_name_or_path='gpt2',
    language_model_class=LMClass.Causal,
)

In [None]:
#hide
# NOTE(review): `train_file=eval_file` fine-tunes on the same file that is passed as the
# evaluation file, and the `train_file` variable defined earlier is never used. Presumably a
# shortcut to keep this hidden run fast — confirm; if so, the perplexity reported by
# `evaluate()` is measured on data the model was trained on.
finetuner.train(
    training_args=training_args,
    train_file=eval_file,
    eval_file=eval_file,
    mlm=False,
    overwrite_cache=False
)

                device: cuda:0,
                n_gpu: 1,
                distributed training: False,
                16-bits training: False
            
06/18/2021 17:05:36 - INFO - __main__ -   Training/evaluation parameters: {
  "output_dir": "../models",
  "overwrite_output_dir": false,
  "do_train": false,
  "do_eval": false,
  "do_predict": false,
  "evaluation_strategy": "no",
  "prediction_loss_only": false,
  "per_device_train_batch_size": 1,
  "per_device_eval_batch_size": 1,
  "per_gpu_train_batch_size": null,
  "per_gpu_eval_batch_size": null,
  "gradient_accumulation_steps": 1,
  "eval_accumulation_steps": null,
  "learning_rate": 5e-05,
  "weight_decay": 0.01,
  "adam_beta1": 0.9,
  "adam_beta2": 0.999,
  "adam_epsilon": 1e-08,
  "max_grad_norm": 1.0,
  "num_train_epochs": 1,
  "max_steps": -1,
  "lr_scheduler_type": "linear",
  "warmup_ratio": 0.0,
  "warmup_steps": 500,
  "logging_dir": "../logs",
  "logging_strategy": "steps",
  "logging_first_step": false,
  "loggin

Step,Training Loss


In [None]:
# Returns {"perplexity": exp(eval_loss)} and also writes it to `eval_results_lm.txt` in the trainer's output_dir
finetuner.evaluate()

06/18/2021 17:08:00 - INFO - __main__ -   *** Evaluate ***


06/18/2021 17:08:35 - INFO - __main__ -   ***** Eval results *****
06/18/2021 17:08:35 - INFO - __main__ -     perplexity = 19.988802908297547


{'perplexity': 19.988802908297547}

In [None]:
#hide
from adaptnlp import EasyTextGenerator

# Prompt shared by the two generation cells below (local `../models` checkpoint vs. `gpt2`)
text = "China and the U.S. will begin to"

generator = EasyTextGenerator()

In [None]:
#hide
# Generate from the checkpoint in `../models`, the `output_dir` the fine-tuning run saved to
generated_text = generator.generate(
    text, 
    model_name_or_path="../models", 
    num_tokens_to_produce=50
)

print(generated_text)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
06/18/2021 17:09:16 - INFO - adaptnlp.text_generation -   Running text generator on 1 text sequences
06/18/2021 17:09:16 - INFO - adaptnlp.text_generation -   Batch size = 32


['China and the U.S. will begin to work together to develop a new energy source for the country.\n\nThe U.S. is also working with China on a new energy source for the country, the South China Sea, which China has said is a "strategic and economic priority']


In [None]:
#hide
# Same prompt, but with the `gpt2` model name instead of the local `../models` checkpoint, for comparison
generated_text = generator.generate(
    text, 
    model_name_or_path="gpt2", 
    num_tokens_to_produce=50
)

print(generated_text)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
06/18/2021 17:09:37 - INFO - adaptnlp.text_generation -   Running text generator on 1 text sequences
06/18/2021 17:09:37 - INFO - adaptnlp.text_generation -   Batch size = 32


['China and the U.S. will begin to see the effects of the new sanctions on the Russian economy.\n\n"The U.S. is going to be the first to see the effects of the new sanctions," said Michael O\'Hanlon, a senior fellow at the Center for Strategic']


## Export - 

In [None]:
#hide
from nbdev.export import notebook2script
# Convert the project's notebooks into library modules (one "Converted ..." line is printed per notebook)
notebook2script()

Converted 00_file_utils.ipynb.
Converted 01_callback.ipynb.
Converted 02_model_hub.ipynb.
Converted 03_model.ipynb.
Converted 04_embeddings.ipynb.
Converted 04a_tutorial.embeddings.ipynb.
Converted 05_token_classification.ipynb.
Converted 05a_tutorial.token_tagging.ipynb.
Converted 06_sequence_classification.ipynb.
Converted 06a_tutorial.easy_sequence_classifier.ipynb.
Converted 07_summarization.ipynb.
Converted 07a_tutorial.summarization.ipynb.
Converted 08_translation.ipynb.
Converted 08a_tutorial.translation.ipynb.
Converted 09_text_generation.ipynb.
Converted 09a_tutorial.easy_text_generator.ipynb.
Converted 10_question_answering.ipynb.
Converted 10a_tutorial.question_answering.ipynb.
Converted 11_language_model.ipynb.
Converted 12_training.ipynb.
Converted 13a_transformers.squad_metrics.ipynb.
Converted 13b_transformers.finetuning.ipynb.
Converted 13c_transformers.utils_squad_evaluate.ipynb.
Converted 20a_tutorial.fine_tuning_lm.ipynb.
Converted 20b_tutorial.fine_tuning_manual.ipy