In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional
from packaging import version

import datasets
import nltk  # Here to have a nice missing dependency error message early on
import numpy as np
from datasets import load_dataset, load_metric
import yaml
from easydict import EasyDict

import torch
import transformers
from filelock import FileLock
from optimum.graphcore import IPUSeq2SeqTrainer#, IPUConfig
from model.ipu_configuration import IPUConfig

from optimum.graphcore import IPUSeq2SeqTrainingArguments as Seq2SeqTrainingArguments
from optimum.graphcore.modeling_utils import to_pipelined

from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    MBart50Tokenizer,
    MBart50TokenizerFast,
    MBartTokenizer,
    MBartTokenizerFast,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version as tf_check_min_version
from transformers.utils import is_offline_mode
from transformers.utils.versions import require_version



from sum_dataloader import SummaryCollator, get_dataloader, get_train_sampler, ipu_dataloader


In [4]:
__version__ = "0.2.4.dev"

def check_min_version(min_version):
    """Raise ImportError when the module-level `__version__` is older than `min_version`.

    Args:
        min_version: Version string that `__version__` is compared against
            (both are parsed with `packaging.version.parse`).

    Raises:
        ImportError: If `__version__` parses to a lower version than `min_version`.
            The message asks for a source install when `min_version` contains "dev",
            and quotes the required minimum version otherwise.
    """
    # Guard clause: nothing to report when the requirement is already met.
    if version.parse(__version__) >= version.parse(min_version):
        return

    if "dev" in min_version:
        requirement = "This example requires a source install from HuggingFace Optimum-Graphcore"
    else:
        requirement = f"This example requires a minimum version of {min_version},"
    raise ImportError(requirement + f" but the version found is {__version__}.\n")

In [5]:
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
# (Currently disabled: the call below is commented out.)
#tf_check_min_version("4.19.0.dev0")

# Will error if the minimal version of Optimum Graphcore is not installed. Remove at your own risk.
# NOTE(review): `check_min_version` compares against the `__version__` constant defined in this
# notebook, not against an installed package, so this check passes as long as that constant
# is kept at or above the requested version.
check_min_version("0.2.4.dev")

# Fails with the given hint if the installed `datasets` package is older than 1.8.0.
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")


In [6]:
# Module-level logger used by the following cells (shows up as `__main__` in the captured output).
logger = logging.getLogger(__name__)


In [7]:
# Make sure the NLTK "punkt" tokenizer data can be found; fetch it when it is missing.
try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    if not is_offline_mode():
        # The download runs while holding the ".lock" file lock.
        with FileLock(".lock") as lock:
            nltk.download("punkt", quiet=True)
    else:
        # Offline: the data cannot be fetched, so surface an actionable error instead.
        raise LookupError(
            "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
        )

# Multilingual tokenizer classes; a tokenizer of one of these types needs a language to be set.
MULTILINGUAL_TOKENIZERS = [
    MBartTokenizer,
    MBartTokenizerFast,
    MBart50Tokenizer,
    MBart50TokenizerFast,
]

In [8]:

# Logging setup: send records to stdout and apply one verbosity level everywhere.
# The level is hard-coded because `training_args` does not exist yet at this point;
# -1 is not one of the named `logging` levels, and the captured output shows both
# INFO and DEBUG records being emitted with it.
log_level = -1 # training_args.get_process_log_level()

logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
logger.setLevel(log_level)

# Apply the same verbosity to the `datasets` and `transformers` library loggers.
for hf_logging in (datasets.utils.logging, transformers.utils.logging):
    hf_logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

logger.info("Training/evaluation parameters  ")

04/29/2022 07:03:26 - INFO - __main__ - Training/evaluation parameters  


In [9]:
# Read the multi-document YAML configuration into `argus`.
# Each YAML document contributes one entry: its first key (the argument-group name)
# mapped to that key's value.
# NOTE(review): the file name is spelled "summrization"; it must match the file on disk,
# so confirm before renaming either side.
argus = {}
with open('configuration/summrization.yaml') as f:
    # NOTE(review): `yaml.FullLoader` can construct more than plain data types. If this file
    # could ever come from an untrusted source, switch to `yaml.safe_load_all`.
    hparams = yaml.load_all(f, Loader=yaml.FullLoader)
    for argu in hparams:
        if not argu:
            # Empty documents (e.g. a trailing `---`) load as None; skip them instead of crashing.
            continue
        group_name, group_values = next(iter(argu.items()))
        argus[group_name] = group_values


In [10]:
# Turn the three configuration groups into argument objects:
# model and data settings become attribute-style dicts, while the training settings
# are expanded as keyword arguments into the IPU seq2seq training arguments class.
model_args = EasyDict(argus['ModelArguments'])
data_args = EasyDict(argus['DataTrainingArguments'])
training_args = Seq2SeqTrainingArguments(**argus['IPUSeq2SeqTrainingArguments'])

In [11]:
# Look for a checkpoint left in the output directory by a previous training run.
# Only relevant when training into an existing directory that must not be overwritten.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
    last_checkpoint = get_last_checkpoint(training_args.output_dir)
    if last_checkpoint is None:
        # No checkpoint found: refuse to continue if the directory holds anything else.
        if os.listdir(training_args.output_dir):
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
    elif training_args.resume_from_checkpoint is None:
        # A checkpoint exists and no explicit resume path was configured: report it.
        logger.info(
            f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
            "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
        )

In [13]:
# Load the model configuration, IPU configuration, tokenizer and seq2seq model.
#
# Every `from_pretrained` call below receives the same cache directory, revision and
# auth-token setting, so those keyword arguments are built once and shared.
_hub_kwargs = dict(
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    use_auth_token=True if model_args.use_auth_token else None,
)

# Each dedicated name (config / IPU config / tokenizer) falls back to the model name or path
# when it is not set.
config = AutoConfig.from_pretrained(
    model_args.config_name or model_args.model_name_or_path,
    **_hub_kwargs,
)
ipu_config = IPUConfig.from_pretrained(
    training_args.ipu_config_name or model_args.model_name_or_path,
    **_hub_kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name or model_args.model_name_or_path,
    use_fast=model_args.use_fast_tokenizer,
    **_hub_kwargs,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_args.model_name_or_path,
    # `from_tf` is switched on when the path contains ".ckpt".
    from_tf=".ckpt" in model_args.model_name_or_path,
    config=config,
    **_hub_kwargs,
)

[INFO|configuration_utils.py:654] 2022-04-29 07:03:33,990 >> loading configuration file https://huggingface.co/facebook/bart-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f5310d276a6d1648d00c32fadc8bf7b4607e0fbd5b404fc4a0045960aa2bdfdb.a243ed957122436adb0b8d8e9d20f896f45c174b6324d625ca0a20a84f72a910
[INFO|configuration_utils.py:690] 2022-04-29 07:03:33,992 >> Model config BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  

In [14]:
# Resize the model's token embeddings to the tokenizer's vocabulary size (`len(tokenizer)`).
# The returned embedding module is displayed as the cell output.
model.resize_token_embeddings(len(tokenizer))


Embedding(50265, 768, padding_idx=1)

In [15]:
from model.pipeline_bart import PipelinedBartForConditionalGeneration

In [16]:
# Rebind `model` to the project-local pipelined BART wrapper built from the loaded model
# and the IPU configuration.
model = PipelinedBartForConditionalGeneration.from_transformers(model, ipu_config)

In [18]:
# The generic `to_pipelined` helper is not used; the model was wrapped explicitly in the cell above.
#model = to_pipelined(model, ipu_config, force=False)
# NOTE(review): `parallelize()` is defined by the project-local pipelined model; presumably it
# places the layers across IPUs according to `ipu_config` — confirm in model/pipeline_bart.
model.parallelize()
# Unless full precision was requested, switch the model to half precision.
if not training_args.fp32:
    model = model.half()

# Data preparation

In [19]:
# Known summarization datasets, keyed by dataset name.
# Each value is a (text column, summary column) pair for that dataset.
summarization_name_mapping = dict(
    amazon_reviews_multi=("review_body", "review_title"),
    big_patent=("description", "abstract"),
    cnn_dailymail=("article", "highlights"),
    orange_sum=("text", "summary"),
    pn_summary=("article", "summary"),
    psc=("extract_text", "summary_text"),
    samsum=("dialogue", "summary"),
    thaisum=("body", "summary"),
    xglue=("news_body", "news_title"),
    xsum=("document", "summary"),
    wiki_summary=("article", "highlights"),
)

In [20]:
data_args.dataset_name

'cnn_dailymail'

In [21]:
# Seed the random number generators for reproducibility.
# NOTE(review): in this notebook the model is loaded in an earlier cell, so this call does
# not affect model initialization; move it above the model-loading cell if that is intended.
set_seed(training_args.seed)

# Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
# (the dataset will be downloaded automatically from the datasets Hub).
#
# For CSV/JSON files this script will use the first column for the full texts and the second column for the
# summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments).
#
# In distributed training, the load_dataset function guarantees that only one local process can concurrently
# download the dataset.
if data_args.dataset_name is not None:
    # Downloading and loading a dataset from the hub.
    raw_datasets = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config_name,
        cache_dir=model_args.cache_dir,
        use_auth_token=True if model_args.use_auth_token else None,
    )
else:
    # Loading local files: the loader type is taken from the file extension of the last
    # configured file (train, then validation, then test).
    data_files = {}
    extension = None
    if data_args.train_file is not None:
        data_files["train"] = data_args.train_file
        extension = data_args.train_file.split(".")[-1]
    if data_args.validation_file is not None:
        data_files["validation"] = data_args.validation_file
        extension = data_args.validation_file.split(".")[-1]
    if data_args.test_file is not None:
        data_files["test"] = data_args.test_file
        extension = data_args.test_file.split(".")[-1]
    if extension is None:
        # Without this guard `extension` would be unbound here (or silently reuse a stale
        # value from an earlier kernel execution).
        raise ValueError("Need either a dataset name or a training/validation/test file.")
    raw_datasets = load_dataset(
        extension,
        data_files=data_files,
        cache_dir=model_args.cache_dir,
        use_auth_token=True if model_args.use_auth_token else None,
    )


04/29/2022 07:19:42 - DEBUG - datasets.load - Checking /root/.cache/huggingface/datasets/downloads/12660d351f66522858b33549dc51dcb9e2175c19106780ee8f29e3dc4af71195.ef876a56bbc40513db0e303f5b9afb0e07995a4dd89f5f5a9a339ffdb1da19a5.py for additional imports.
04/29/2022 07:19:42 - DEBUG - datasets.utils.filelock - Attempting to acquire lock 140450581330800 on /root/.cache/huggingface/modules/datasets_modules/datasets/cnn_dailymail.lock
04/29/2022 07:19:42 - DEBUG - datasets.utils.filelock - Lock 140450581330800 acquired on /root/.cache/huggingface/modules/datasets_modules/datasets/cnn_dailymail.lock
04/29/2022 07:19:42 - DEBUG - datasets.utils.filelock - Attempting to release lock 140450581330800 on /root/.cache/huggingface/modules/datasets_modules/datasets/cnn_dailymail.lock
04/29/2022 07:19:42 - DEBUG - datasets.utils.filelock - Lock 140450581330800 released on /root/.cache/huggingface/modules/datasets_modules/datasets/cnn_dailymail.lock
04/29/2022 07:19:42 - DEBUG - datasets.load - Crea

  0%|          | 0/3 [00:00<?, ?it/s]

In [22]:
# Source-text prefix from the data arguments; an unset (None) prefix becomes the empty string.
prefix = data_args.source_prefix if data_args.source_prefix is not None else ""


In [23]:
# Preprocessing the datasets.
# Take the column names from the split that matches the first requested action
# (train, then eval, then predict).
if training_args.do_train:
    column_names = raw_datasets["train"].column_names
elif training_args.do_eval:
    column_names = raw_datasets["validation"].column_names
elif training_args.do_predict:
    column_names = raw_datasets["test"].column_names
else:
    # Stop with an explicit error: a bare `raise` has no active exception at this point
    # and would only surface an unrelated RuntimeError.
    nothing_to_do = "There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`."
    logger.info(nothing_to_do)
    raise ValueError(nothing_to_do)

if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
    # Multilingual tokenizers need to know the language of the data.
    assert (
        data_args.lang is not None
    ), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"

    # Source and target share one language.
    tokenizer.src_lang = data_args.lang
    tokenizer.tgt_lang = data_args.lang

    # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token
    # as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument.
    forced_bos_token_id = (
        tokenizer.lang_code_to_id[data_args.forced_bos_token] if data_args.forced_bos_token is not None else None
    )
    model.config.forced_bos_token_id = forced_bos_token_id


In [24]:
prefix

''

In [21]:

# # Get the column names for input/target.
# dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None)
# if data_args.text_column is None:
#     text_column = dataset_columns[0] if dataset_columns is not None else column_names[0]
# else:
#     text_column = data_args.text_column
#     if text_column not in column_names:
#         raise ValueError(
#             f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}"
#         )
# if data_args.summary_column is None:
#     summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1]
# else:
#     summary_column = data_args.summary_column
#     if summary_column not in column_names:
#         raise ValueError(
#             f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}"
#         )

# # Temporarily set max_target_length for training.
# max_target_length = data_args.max_target_length
# padding = "max_length" if data_args.pad_to_max_length else False

# if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"):
#     logger.warning(
#         "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for"
#         f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory"
#     )

# def preprocess_function(examples):
#     # remove pairs where at least one record is None

#     inputs, targets = [], []
#     for i in range(len(examples[text_column])):
#         if examples[text_column][i] is not None and examples[summary_column][i] is not None:
#             inputs.append(examples[text_column][i])
#             targets.append(examples[summary_column][i])

#     inputs = [prefix + inp for inp in inputs]
#     model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)

#     # Setup the tokenizer for targets
#     with tokenizer.as_target_tokenizer():
#         labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)

#     # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
#     # padding in the loss.
#     if padding == "max_length" and data_args.ignore_pad_token_for_loss:
#         labels["input_ids"] = [
#             [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
#         ]

#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs

In [22]:
# if training_args.do_train:
#     if "train" not in raw_datasets:
#         raise ValueError("--do_train requires a train dataset")
#     train_dataset = raw_datasets["train"]
#     if data_args.max_train_samples is not None:
#         max_train_samples = min(len(train_dataset), data_args.max_train_samples)
#         train_dataset = train_dataset.select(range(max_train_samples))
#     #with training_args.main_process_first(desc="train dataset map pre-processing"):
#     train_dataset = train_dataset.map(
#         preprocess_function,
#         batched=True,
#         num_proc=data_args.preprocessing_num_workers,
#         remove_columns=column_names,
#         load_from_cache_file=not data_args.overwrite_cache,
#         desc="Running tokenizer on train dataset",
#     )

In [23]:
# if training_args.do_eval:
#     max_target_length = data_args.val_max_target_length
#     if "validation" not in raw_datasets:
#         raise ValueError("--do_eval requires a validation dataset")
#     eval_dataset = raw_datasets["validation"]
#     if data_args.max_eval_samples is not None:
#         max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
#         eval_dataset = eval_dataset.select(range(max_eval_samples))
#     #with training_args.main_process_first(desc="validation dataset map pre-processing"):
#     eval_dataset = eval_dataset.map(
#         preprocess_function,
#         batched=True,
#         num_proc=data_args.preprocessing_num_workers,
#         remove_columns=column_names,
#         load_from_cache_file=not data_args.overwrite_cache,
#         desc="Running tokenizer on validation dataset",
#     )

# if training_args.do_predict:
#     max_target_length = data_args.val_max_target_length
#     if "test" not in raw_datasets:
#         raise ValueError("--do_predict requires a test dataset")
#     predict_dataset = raw_datasets["test"]
#     if data_args.max_predict_samples is not None:
#         max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
#         predict_dataset = predict_dataset.select(range(max_predict_samples))
#     #with training_args.main_process_first(desc="prediction dataset map pre-processing"):
#     predict_dataset = predict_dataset.map(
#         preprocess_function,
#         batched=True,
#         num_proc=data_args.preprocessing_num_workers,
#         remove_columns=column_names,
#         load_from_cache_file=not data_args.overwrite_cache,
#         desc="Running tokenizer on prediction dataset",
#     )


In [24]:
# label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
# data_collator = DataCollatorForSeq2Seq(
#     tokenizer,
#     model=model,
#     label_pad_token_id=label_pad_token_id,
#     pad_to_multiple_of=None,
# )

# # Metric
# metric = load_metric("rouge")

# def postprocess_text(preds, labels):
#     preds = [pred.strip() for pred in preds]
#     labels = [label.strip() for label in labels]

#     # rougeLSum expects newline after each sentence
#     preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
#     labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

#     return preds, labels

# def compute_metrics(eval_preds):
#     preds, labels = eval_preds
#     if isinstance(preds, tuple):
#         preds = preds[0]
#     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
#     if data_args.ignore_pad_token_for_loss:
#         # Replace -100 in the labels as we can't decode them.
#         labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     # Some simple post-processing
#     decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

#     result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
#     # Extract a few results from ROUGE
#     result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

#     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
#     result["gen_len"] = np.mean(prediction_lens)
#     result = {k: round(v, 4) for k, v in result.items()}
#     return result


In [25]:
# Build the project-local collator and a dataloader over the raw training split.
# NOTE(review): the dataset name is hard-coded here; it matches the `data_args.dataset_name`
# value displayed above, but would go stale if the configuration changes.
collator = SummaryCollator(tokenizer, 'cnn_dailymail')
# NOTE(review): `dataloader` is not referenced again in the visible cells.
dataloader = get_dataloader(raw_datasets["train"], collator)

In [27]:
# data_collator = DataCollatorForSeq2Seq(
#         tokenizer,
#         model=model,
#         label_pad_token_id=label_pad_token_id,
#         pad_to_multiple_of=None,
#     )

In [28]:
# Earlier variant that fed a pre-tokenized `train_dataset` with a `data_collator` (kept for reference).
#ipu_train_dataloader = ipu_dataloader(train_dataset, tokenizer, ipu_config, training_args, data_collator, shuffle=False)
# Current variant: feed the raw training split and let the project-local collator prepare the batches,
# with shuffling disabled.
ipu_train_dataloader = ipu_dataloader(raw_datasets["train"], tokenizer, ipu_config, training_args, collator, shuffle=False)

In [29]:
# Pull the first batch from the IPU dataloader; later cells reuse it as `a`
# for compilation and for the training-step calls.
a = next(iter(ipu_train_dataloader))

In [30]:
a

{'input_ids': tensor([[    0,   243,    18,  ...,  4139,    19,     2],
         [    0,  1640, 16256,  ...,    71,   292,     2],
         [    0, 38959,   412,  ...,  3603,     7,     2],
         ...,
         [    0,  1640, 16256,  ...,  1707,     9,     2],
         [    0,  1640, 16256,  ...,  2964,    30,     2],
         [    0, 23122,    36,  ...,     1,     1,     1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([[    0, 38291,   781,  ...,  -100,  -100,  -100],
         [    0, 16419,  1851,  ...,  -100,  -100,  -100],
         [    0,   133,  3200,  ...,  -100,  -100,  -100],
         ...,
         [    0,  1620,   391,  ...,  -100,  -100,  -100],
         [    0, 35021,  1928,  ...,  -100,  -100,  -100],
         [    0, 15685,    34,  ...,  -100,  -100,  -1

# Training

In [31]:
from train import get_optimizer_scheduler, wrap_model


In [32]:
# Build the optimizer and scheduler with the project-local helper from `train`.
optimizer, scheduler = get_optimizer_scheduler(model, ipu_config, training_args, ipu_train_dataloader)

In [33]:
import poptorch
from poptorch import DataLoaderMode, PoplarExecutor
import time
from torch import nn, optim


In [34]:
# poptorch.trainingModel is given this optimizer below, so reject anything that is not a
# poptorch optimizer up front instead of failing later.
if optimizer is not None and not isinstance(optimizer, poptorch.optim.Optimizer):
    #optimizer = self._pytorch_optimizer_to_poptorch(self.optimizer, model, self.model)
    raise TypeError(
        f"Expected a poptorch.optim.Optimizer but got {type(optimizer).__name__}; "
        "convert the optimizer to its poptorch equivalent first."
    )

In [35]:
# Convert the IPU configuration into the options object handed to poptorch below.
opts = ipu_config.to_options()


In [36]:
# Wrap the model (put into training mode) for IPU training with the options and optimizer built above.
training_model = poptorch.trainingModel(
    model.train(), options=opts, optimizer=optimizer
)
# NOTE(review): `wrap_model` is a project-local helper from `train`; what it adds on top of the
# poptorch wrapper is not visible here.
training_model = wrap_model(training_model, opts)

In [37]:
# Compile the training model ahead of the first step, unless it is already compiled.
if not training_model.isCompiled():
    logger.info("Compiling Model...")
    start_compile = time.perf_counter()

    # Compile against a batch drawn here so this cell does not depend on a variable
    # left behind by another cell.
    sample_batch = next(iter(ipu_train_dataloader))

    if isinstance(sample_batch, tuple):
        # Tuple batches are passed positionally.
        training_model.compile(*sample_batch)
    else:
        # Mapping batches are passed by keyword (one keyword per batch field).
        training_model.compile(**sample_batch)
    duration_compilation = time.perf_counter() - start_compile
    logger.info(f"Compiled/Loaded model in {duration_compilation} secs")


04/28/2022 11:02:55 - INFO - __main__ - Compiling Model...


  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
  if input_shape[-1] > 1:
Graph compilation: 100%|██████████| 100/100 [00:22<00:00]


04/28/2022 11:03:40 - INFO - __main__ - Compiled/Loaded model in 45.3809361460153 secs


In [38]:
# Alias for the sample batch.
# NOTE(review): `data` is not referenced again in the visible cells.
data = a

In [39]:
# Run the compiled training model on the sample batch; the cell output shows the returned
# float16 tensor (presumably the loss — confirm against the pipelined model's forward).
training_model(**a)

tensor(3.0820, dtype=torch.float16)

In [122]:
# Repeat of the previous call on the same batch.
# NOTE(review): the execution count of this cell (122) is out of sequence with the rest of the
# notebook, so its output may not be reproducible with Restart & Run All.
training_model(**a)

tensor(3.0820, dtype=torch.float16)