# Testing & Evaluation

Computing the metrics during training uses too much RAM, so the evaluation is run separately; this makes it possible to evaluate after each epoch. On the test set we do this only once, for the final epoch, so that the test set is never seen before the model has been selected. The final epoch is chosen based on the training and evaluation loss recorded during training.

We can also run the evaluation on the validation set to get a score.

In [1]:
!pip install transformers datasets
!pip3 install rouge_score
!pip install bert_score



In [31]:
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import re
import transformers
import nltk
nltk.download('punkt')
from sklearn.model_selection import train_test_split

from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer, DataCollatorForSeq2Seq, BigBirdPegasusPreTrainedModel
import datasets
from datasets import load_dataset, list_metrics, load_metric
from datasets import Features, Sequence, Value
from transformers import TrainingArguments, Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from torch.utils.checkpoint import checkpoint

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Test paths for the different extractive methods

In [3]:
# Mount Google Drive: the dataset TSVs and the model checkpoints referenced below
# are read from (and results written to) /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
# Epoch of the checkpoint to evaluate; it selects the checkpoint directory and is
# part of the names of the prediction/metric files written at the end.
EPOCH = 6

In [33]:
# Active configuration: model variant 2, trained on the paragraph-selection data.
MODEL_VERSION = 2
TRAIN_PATH = '/content/drive/MyDrive/Text-Mining/Data/paragraph_selection/train_with_shortened_3072.tsv'
TEST_PATH = '/content/drive/MyDrive/Text-Mining/Data/paragraph_selection/test_with_shortened_3072.tsv'
# Select the columns by name instead of by position: the column names are what the
# `Features` schema further down relies on, whereas positions are specific to one file.
test_df = pd.read_csv(TEST_PATH, sep='\t', usecols=['id', 'highlights', 'shortened_articles'])

In [6]:
#MODEL_VERSION = 3
#TRAIN_PATH = '/content/drive/MyDrive/Text-Mining/Data/sentence_selection/train_with_shortened_sent_sel_3072.tsv'
#TEST_PATH = '/content/drive/MyDrive/Text-Mining/Data/sentence_selection/test_with_shortened_sent_sel_3072.tsv'
#test_df = pd.read_csv(TEST_PATH, sep='\t', usecols=[2, 6, 12])

In [7]:
#MODEL_VERSION = 4
#TRAIN_PATH = '/content/drive/MyDrive/Text-Mining/Data/paragraph_selection_random/train_with_random_paragraph_sel_3072.tsv'
#TEST_PATH = '/content/drive/MyDrive/Text-Mining/Data/paragraph_selection_random/test_with_random_paragraph_sel_3072.tsv'
#test_df = pd.read_csv(TEST_PATH, sep='\t', usecols=[1, 5, 7])

In [8]:
#MODEL_VERSION = 5
#TRAIN_PATH = '/content/drive/MyDrive/Text-Mining/Data/sentence_selection_random/train_with_random_sent_sel_3072.tsv'
#TEST_PATH = '/content/drive/MyDrive/Text-Mining/Data/sentence_selection_random/test_with_random_sent_sel_3072.tsv'
#test_df = pd.read_csv(TEST_PATH, sep='\t', usecols=[1, 5, 7])

In [9]:
#MODEL_VERSION = 6
#TRAIN_PATH = '/content/drive/MyDrive/Text-Mining/Data/before_shortening/train_with_rouge_sent.tsv'
#TEST_PATH = '/content/drive/MyDrive/Text-Mining/Data/before_shortening/test_with_rouge_sent.tsv'
#test_df = pd.read_csv(TEST_PATH, sep='\t', usecols=[1, 4, 5])

In [10]:
!nvidia-smi -L 

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-da7beb16-2e99-3f00-2b4c-26da46a18a2e)


In [34]:
# The train/validation split below is only needed for experiments on the validation
# set, so it is kept commented out for the final test evaluation.
#train_df = pd.read_csv(TRAIN_PATH, sep='\t', usecols=[2, 6, 12])

#train_df, val_df = train_test_split(train_df.dropna(), test_size=0.2, shuffle=False, random_state=42) # shuffle later

#val_df.to_csv('clean_val.tsv', sep='\t', index=False) # get part of validation dataset to do some experiments for the metrics on it
# Drop rows with missing values and write the cleaned test set to a local TSV,
# which is the file `load_dataset` reads below.
test_df.dropna().to_csv('clean_test.tsv', sep='\t', index=False)

In [35]:
test_df.head()

Unnamed: 0,id,highlights,shortened_articles
0,S0003687013000549,We quantified the effect of four wrist posture...,The hand and wrist was modeled as a four-segme...
1,S0003687013000550,The relationship of task variation during dent...,Objectives We aimed to investigate the relatio...
2,S0003687013000562,We carried out an evaluation of a set of pilot...,We report a study which aimed to provide furth...
3,S0003687013000574,We describe ergonomic changes implemented with...,Meat cutters face higher risks of injury and m...
4,S0003687013000586,"SoS differ from systems with subsystems, becau...",There is a particular difficulty in defining t...


In [36]:
# Schema for the TSV loader: every column is read as a plain string.
features = Features(
    {column: Value('string') for column in ('id', 'highlights', 'shortened_articles')}
)

In [37]:
# Only the cleaned test split is loaded; add `'val': 'clean_val.tsv'` to this
# mapping to evaluate on the validation split as well.
split_files = {'test': 'clean_test.tsv'}
dataset = load_dataset('csv', data_files=split_files, delimiter='\t', features=features)

Using custom data configuration default-4c549256f57309e7


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-4c549256f57309e7/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-4c549256f57309e7/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [38]:
# Tokenizer of the pretrained BigBird-Pegasus arXiv checkpoint; used for tokenising
# inputs/targets and for decoding generated ids.
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")

loading configuration file https://huggingface.co/google/bigbird-pegasus-large-arxiv/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/0c5c2a21485ba0e75fd41928cbb901586887479c8fad3f3965b9bcae7632825b.c65855e5554b00a37b55e85d3a9f9dd66ca2c3f276ee79e8daea2165fe581bbf
Model config BigBirdPegasusConfig {
  "_name_or_path": "google/bigbird-pegasus-large-arxiv",
  "activation_dropout": 0.0,
  "activation_function": "gelu_new",
  "architectures": [
    "BigBirdPegasusForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 2,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 16,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 16,
  "eos_token_id": 1,
  "gradient_checkpointing": fal

In [39]:
def tokenize_function(examples, max_input_length=3072, max_target_length=128):
    """Tokenize a batch of articles together with their reference highlights.

    Args:
        examples: Batch mapping with the keys "shortened_articles" (model inputs)
            and "highlights" (reference summaries).
        max_input_length: Articles are truncated to at most this many tokens.
        max_target_length: Highlights are truncated to at most this many tokens.

    Returns:
        The tokenizer output for the articles, extended with a "labels" entry that
        holds the token ids of the highlights.
    """
    # No padding is requested here, so sequences keep their own (truncated) length.
    inputs = tokenizer(
        examples["shortened_articles"],
        max_length=max_input_length,
        truncation=True,
    )

    # Targets are tokenised inside the tokenizer's target context.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["highlights"],
            max_length=max_target_length,
            truncation=True,
        )

    inputs["labels"] = labels["input_ids"]
    return inputs

In [40]:
# Tokenise every split in batches; the original text columns are kept next to the new token columns.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [41]:
#val_dataset = tokenized_datasets['val'].shuffle(seed=42)
# Shuffled with a fixed seed; the ids saved with the predictions later are read from
# this shuffled dataset, so they stay aligned with the model outputs.
test_dataset = tokenized_datasets["test"].shuffle(seed=42)

In [42]:
# ROUGE metric object; used inside `compute_metrics`.
rouge = load_metric('rouge')

In [43]:
# BERTScore metric object; used inside `compute_metrics` with lang='en'.
bertscore = load_metric("bertscore")

In [44]:
def compute_metrics(eval_pred):
    """Compute ROUGE, BERTScore and the mean prediction length for generated summaries.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. ``predictions``
            may also be a tuple whose first element holds the generated ids.
            Entries equal to -100 are treated as padding in both arrays.

    Returns:
        dict containing
        - the ROUGE mid F-measures scaled to 0-100 and rounded to 2 decimals,
        - the BERTScore values averaged over all examples and rounded to 4
          decimals (the 'hashcode' entry is dropped),
        - "mean_prediction_length": mean number of non-pad tokens per prediction.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs next to the generated ids.
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and cannot be decoded, so map it to the pad token
    # in the predictions as well as in the labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds_newline = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels_newline = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result_rouge = rouge.compute(predictions=decoded_preds_newline, references=decoded_labels_newline, use_stemmer=True)

    # It is suggested to remove multiple white spaces from the predictions
    # before computing BERTScore: https://github.com/Tiiiger/bert_score
    decoded_preds = [re.sub(r' +', ' ', pred) for pred in decoded_preds]
    result_bert = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang='en')

    # average over results
    result_rouge = {key: round(value.mid.fmeasure * 100, 2) for key, value in result_rouge.items()}
    result_bert = {key: round((sum(value)/len(value)), 4) for key, value in result_bert.items() if key != 'hashcode'}

    results = {**result_rouge, **result_bert}

    # Add mean generated length (padding excluded)
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    results["mean_prediction_length"] = np.mean(prediction_lens)

    return results

In [45]:
# Arguments for the stand-alone evaluation run, grouped by purpose.
training_args_eval = Seq2SeqTrainingArguments(
    output_dir="eval",
    # --- evaluation / prediction ---
    predict_with_generate=True,      # metrics are computed on generated sequences
    per_device_eval_batch_size=5,
    eval_accumulation_steps=1,
    evaluation_strategy="steps",
    eval_steps=10,
    dataloader_num_workers=4,
    # --- optimisation ---
    learning_rate=2e-7,
    weight_decay=0.01,
    num_train_epochs=1,
    per_device_train_batch_size=3,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [46]:
# Checkpoint of the selected model variant and epoch.
checkpoint_dir = f"/content/drive/MyDrive/Text-Mining/model_v{MODEL_VERSION}/epoch{EPOCH}"

# Generation settings handed to `from_pretrained` together with the checkpoint path.
generation_settings = dict(
    repetition_penalty=1.3,
    min_length=50,
    max_length=128,
    do_sample=True,
    top_k=100,
    top_p=0.95,
    temperature=0.95,
)

model_for_eval = BigBirdPegasusForConditionalGeneration.from_pretrained(
    checkpoint_dir,
    attention_type="block_sparse",
    **generation_settings,
)

loading configuration file /content/drive/MyDrive/Text-Mining/model_v2/epoch6/config.json
Model config BigBirdPegasusConfig {
  "_name_or_path": "/content/drive/MyDrive/Text-Mining/model_v2/epoch4",
  "activation_dropout": 0.0,
  "activation_function": "gelu_new",
  "architectures": [
    "BigBirdPegasusForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "attention_type": "block_sparse",
  "block_size": 64,
  "bos_token_id": 2,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 16,
  "decoder_start_token_id": 2,
  "do_sample": true,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 16,
  "eos_token_id": 1,
  "gradient_checkpointing": false,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "length_penalty": 0.8,
  "max_length": 128,
  "max_position_embeddings": 4096,
  "min_length": 50,
  "model_t

In [47]:
# Collator that assembles the tokenised examples into batches for the trainer.
data_collator_eval = DataCollatorForSeq2Seq(tokenizer, model=model_for_eval)

In [48]:
# Trainer used for prediction only, so no train/eval datasets are attached.
trainer_eval = Seq2SeqTrainer(
    args=training_args_eval,
    model=model_for_eval,
    data_collator=data_collator_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [49]:
# The model is loaded with do_sample=True, so the generated summaries (and therefore
# the metrics) depend on the RNG state. Re-seed right before predicting so that
# re-running this cell reproduces the same predictions.
transformers.set_seed(training_args_eval.seed)
model_output = trainer_eval.predict(test_dataset)

The following columns in the test set  don't have a corresponding argument in `BigBirdPegasusForConditionalGeneration.forward` and have been ignored: id, highlights, shortened_articles. If id, highlights, shortened_articles are not expected by `BigBirdPegasusForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 150
  Batch size = 5
Input ids are automatically padded from 3055 to 3072 to be a multiple of `config.block_size`: 64
  * num_indices_to_pick_from
Input ids are automatically padded from 3055 to 3072 to be a multiple of `config.block_size`: 64


Input ids are automatically padded from 3006 to 3008 to be a multiple of `config.block_size`: 64
Input ids are automatically padded from 3006 to 3008 to be a multiple of `config.block_size`: 64
Input ids are automatically padded from 2994 to 3008 to be a multiple of `config.block_size`: 64
Input ids are automatically padded from 2994 to 3008 to be a multiple of `config.block_size`: 64
Input ids are automatically padded from 3048 to 3072 to be a multiple of `config.block_size`: 64
Input ids are automatically padded from 3048 to 3072 to be a multiple of `config.block_size`: 64
Input ids are automatically padded from 3023 to 3072 to be a multiple of `config.block_size`: 64
Input ids are automatically padded from 3023 to 3072 to be a multiple of `config.block_size`: 64
Input ids are automatically padded from 3034 to 3072 to be a multiple of `config.block_size`: 64
Input ids are automatically padded from 3034 to 3072 to be a multiple of `config.block_size`: 64
Input ids are automatically pa

(150, 128)


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/roberta-large/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_c

In [50]:
predictions, labels, metrics = model_output
#predictions = np.argmax(logits_pred[0], axis=-1)
# -100 is a padding/ignore marker, not a valid token id; replace it with the pad
# token in both arrays before decoding.
predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

In [51]:
metrics 

{'test_f1': 0.8485,
 'test_loss': 4.979737281799316,
 'test_mean_prediction_length': 87.23333333333333,
 'test_precision': 0.8364,
 'test_recall': 0.8613,
 'test_rouge1': 29.77,
 'test_rouge2': 6.94,
 'test_rougeL': 19.68,
 'test_rougeLsum': 25.86,
 'test_runtime': 344.9986,
 'test_samples_per_second': 0.435,
 'test_steps_per_second': 0.087}

In [52]:
# One row per test example: article id, generated summary and reference summary.
d = dict(id=test_dataset['id'], prediction=decoded_predictions, reference=decoded_labels)
df = pd.DataFrame(d)
predictions_path = f'/content/drive/MyDrive/Text-Mining/model_v{MODEL_VERSION}/model_v{MODEL_VERSION}-predictions-epoch{EPOCH}.tsv'
df.to_csv(predictions_path, sep='\t', index=False)

In [53]:
# Store the metric dictionary as its plain-text representation next to the predictions.
metrics_path = Path(f'/content/drive/MyDrive/Text-Mining/model_v{MODEL_VERSION}/model_v{MODEL_VERSION}-metrics-epoch{EPOCH}.txt')
metrics_path.write_text(str(metrics), encoding='utf8')