In [1]:
import datasets
import transformers
import rouge

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# CNN/DailyMail, config "3.0.0": the complete training split plus the first 10%
# of the validation split (the slice is requested through the split string).
DATASET_NAME = "cnn_dailymail"
DATASET_CONFIG = "3.0.0"

raw_train_data = datasets.load_dataset(DATASET_NAME, DATASET_CONFIG, split="train")
raw_validation_data = datasets.load_dataset(
    DATASET_NAME, DATASET_CONFIG, split="validation[:10%]"
)

Reusing dataset cnn_dailymail (/Users/jeroen/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)
Reusing dataset cnn_dailymail (/Users/jeroen/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


### Sample of the data

In [3]:
def show_examples(dataset, num_samples=3, seed=42):
    """Display a few records taken from a seeded shuffle of ``dataset``.

    ``dataset`` is shuffled with ``seed`` and the first ``num_samples`` records
    are kept. For each kept record the article, the highlights, the id and a
    separator line are handed to ``display``, in that order.
    """
    shuffled = dataset.shuffle(seed=seed)
    chosen = shuffled.select(range(num_samples))

    position = 0
    for record in chosen:
        display(f'sample {position}: {record["article"]} \n')
        display(f'highlight {position}: {record["highlights"]} \n')
        display(f'id: {record["id"]}')
        display('-------')
        position += 1
        
def get_samples(dataset, num_samples=10, seed=1):
    """Return the first ``num_samples`` records of a seeded shuffle of ``dataset``.

    Args:
        dataset: Object exposing ``shuffle(seed=...)`` and ``select(indices)``
            (the notebook passes the loaded CNN/DailyMail splits).
        num_samples: Number of records to keep after shuffling.
        seed: Shuffle seed. Defaults to 1, the value that was previously
            hard-coded, so existing calls keep returning the same subset.

    Returns:
        Whatever ``select`` returns for ``range(num_samples)``.
    """
    return dataset.shuffle(seed=seed).select(range(num_samples))

def get_random_sample(dataset, seed=1):
    """Return one ``[article, highlights]`` pair picked by a seeded shuffle.

    Despite the name, the pick is deterministic: the same ``seed`` always
    yields the same record. Pass a different ``seed`` to see another one.

    Args:
        dataset: Object exposing ``shuffle(seed=...)``, ``select(indices)`` and
            column access by name (``dataset["article"]`` returning a sequence).
        seed: Shuffle seed. Defaults to 1, the value that was previously
            hard-coded, so existing calls return the same record as before.

    Returns:
        A two-element list: the article text followed by its highlights.
    """
    picked = dataset.shuffle(seed=seed).select(range(1))
    return [picked["article"][0], picked["highlights"][0]]

In [4]:
# Peek at one [article, highlights] pair from the training split.
get_random_sample(raw_train_data)

Loading cached shuffled indices for dataset at /Users/jeroen/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234/cache-d65aa25a11484207.arrow


 'Denise Hendry was ill for seven years after her botched liposuction .\nShe eventually died in 2009 from an infection .\nHer daughter Rheagan Hendry is campaigning for better regulation .\nRheagan says that clinic owners ‘answer to no one’\nDoctors must be properly trained in the procedures they undertake.\nA regulatory body be established to register all cosmetic surgeons.\nPatients should receive a 30-minute pre-surgery\xa0 consultation and two-week cooling-off period.\nHard-sell advertising is banned.\nBotox professionals be registered with the regulator.']

### Tokenizer

In [15]:
# Batch size used both when tokenizing the datasets and by the trainer.
batch_size = 4  # change to 16 for full training

# Maximum token counts: articles feed the encoder, highlights feed the decoder.
encoder_max_length, decoder_max_length = 512, 128

# Tokenizer belonging to the "bert-base-uncased" checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_data_to_model_input(batch):
    """Tokenize a batch of articles and highlights into encoder-decoder inputs.

    Args:
        batch: Mapping with "article" and "highlights" entries holding the
            texts of one batch (this function is used with
            ``Dataset.map(batched=True)``).

    Returns:
        The same ``batch`` with these keys added:
        ``input_ids`` / ``attention_mask`` for the articles (padded and
        truncated to ``encoder_max_length``), ``decoder_input_ids`` /
        ``decoder_attention_mask`` for the highlights (padded and truncated to
        ``decoder_max_length``), and ``labels`` - the highlight token ids with
        every PAD position replaced by -100.
    """
    inputs = tokenizer(
        batch["article"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
    )
    outputs = tokenizer(
        batch["highlights"],
        padding="max_length",
        truncation=True,
        max_length=decoder_max_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["decoder_attention_mask"] = outputs.attention_mask

    # The PAD token must be ignored by the loss, so padded label positions are
    # set to -100 (the metric function maps -100 back to PAD before decoding).
    # The comprehension builds fresh lists, leaving `decoder_input_ids` intact.
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in outputs.input_ids
    ]

    # NOTE(review): newer transformers releases (>= 4.12, per their release
    # notes) build `decoder_input_ids` from `labels` inside EncoderDecoderModel;
    # confirm whether passing the unshifted `decoder_input_ids` is still wanted.
    return batch

def _tokenize_split(raw_split):
    """Tokenize ``raw_split`` and expose the model columns as torch tensors."""
    tokenized = raw_split.map(
        tokenize_data_to_model_input,
        batched=True,
        batch_size=batch_size,
        remove_columns=["article", "highlights", "id"],
    )
    tokenized.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
    )
    return tokenized


# For now only a 32-example subsample of the training split is used.
raw_train_data = raw_train_data.select(range(32))
train_data = _tokenize_split(raw_train_data)

# Only use 16 validation examples for the notebook - DELETE LINE FOR FULL TRAINING
raw_validation_data = raw_validation_data.select(range(16))
val_data = _tokenize_split(raw_validation_data)
    

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /Users/jeroen/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/re

### Encoder - Decoder

In [6]:
# Build the encoder-decoder model with both halves initialised from the same
# "bert-base-uncased" checkpoint (first argument: encoder, second: decoder).
bert2bert = transformers.EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased",
    "bert-base-uncased",
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relatio

In [7]:
# Set special tokens.
# The BERT tokenizer defines no BOS/EOS tokens (`bos_token_id` and
# `eos_token_id` are None), so [CLS] starts decoding and [SEP] ends it.
bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
bert2bert.config.eos_token_id = tokenizer.sep_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id

# Expose the decoder's vocabulary size on the top-level config.
bert2bert.config.vocab_size = bert2bert.config.decoder.vocab_size

# Sensible parameters for beam search.
bert2bert.config.max_length = 142
bert2bert.config.min_length = 56
bert2bert.config.no_repeat_ngram_size = 3
bert2bert.config.early_stopping = True
bert2bert.config.length_penalty = 2.0
bert2bert.config.num_beams = 4

### Evaluation Metric

In [8]:
from rouge import Rouge 

# Shared scorer instance; compute_evaluation_metric calls its get_scores().
rouge_scorer = Rouge()

def compute_evaluation_metric(prediction):
    """Compute the mean ROUGE-2 F-measure of generated summaries.

    Args:
        prediction: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids in which ignored positions may
            be marked with -100). The notebook passes this function to the
            trainer as ``compute_metrics``.

    Returns:
        A dict with the single key ``"rouge2_fmeasure"``: the ROUGE-2 F-score
        averaged over all prediction/reference pairs, rounded to 4 decimals
        (0.0 when there are no pairs).
    """
    pred_ids = prediction.predictions
    # Work on a copy so the caller's label array is left unmodified.
    labels_ids = prediction.label_ids.copy()

    # -100 is only a loss-masking marker, not a vocabulary id; map it back to
    # PAD so that decoding with skip_special_tokens drops those positions.
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id

    # All special tokens are removed while decoding.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # get_scores expects hypotheses first and references second, and returns
    # one score dict per pair.
    # NOTE(review): the rouge package appears to raise on an empty hypothesis
    # string - confirm whether that can occur early in training.
    scores = rouge_scorer.get_scores(pred_str, label_str)
    if not scores:
        return {"rouge2_fmeasure": 0.0}

    mean_f = sum(pair["rouge-2"]["f"] for pair in scores) / len(scores)
    return {"rouge2_fmeasure": round(mean_f, 4)}

### Training

In [17]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

learning_rate = 2e-5
weight_decay = 0.01

# Training configuration. The step counts below are scaled down for a quick
# notebook run; the inline comments give the values meant for full training.
training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    # Evaluate every `eval_steps` steps. With "epoch" the `eval_steps` value
    # configured below would be ignored.
    evaluation_strategy="steps",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Run generation during evaluation so the metric sees generated summaries.
    predict_with_generate=True,
    weight_decay=weight_decay,
    logging_steps=2,  # set to 1000 for full training
    save_steps=16,  # set to 500 for full training
    eval_steps=4,  # set to 8000 for full training
    warmup_steps=1,  # set to 2000 for full training
    max_steps=16,  # delete for full training
    overwrite_output_dir=True,
    save_total_limit=3,
    # fp16=True,  # mixed precision is left disabled in this notebook
)

# Wire the model, the training arguments, both tokenized datasets and the
# ROUGE metric into the trainer, then start fine-tuning.
trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=compute_evaluation_metric,
)
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs


TypeError: Instance and class checks can only be used with @runtime_checkable protocols