In [1]:
from datasets import load_metric

In [3]:
# Load the ROUGE metric; the bare `metric` expression on the last line makes
# the notebook display its feature schema and usage text.
metric = load_metric("rouge")
metric

Metric(name: "rouge", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}, usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each predictions
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLSum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/datasets/issues/617
    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
    use_agregator: Return aggregates if this is set to True
Retu

In [7]:
# Toy sanity check: the prediction differs from the reference by a single word.
predictions = ["Hello.\nThis is a second sentence."]
references = ["Hello.\nThis is a first sentence."]
# One list entry per ROUGE variant (not a single comma-joined string). The
# summary-level variant is spelled "rougeLsum", matching the key in the result.
rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# Pass the requested variants explicitly so the list above is actually used.
metric.compute(predictions=predictions, references=references, rouge_types=rouge_types, use_stemmer=False)

{'rouge1': AggregateScore(low=Score(precision=0.8333333333333334, recall=0.8333333333333334, fmeasure=0.8333333333333334), mid=Score(precision=0.8333333333333334, recall=0.8333333333333334, fmeasure=0.8333333333333334), high=Score(precision=0.8333333333333334, recall=0.8333333333333334, fmeasure=0.8333333333333334)),
 'rouge2': AggregateScore(low=Score(precision=0.6, recall=0.6, fmeasure=0.6), mid=Score(precision=0.6, recall=0.6, fmeasure=0.6), high=Score(precision=0.6, recall=0.6, fmeasure=0.6)),
 'rougeL': AggregateScore(low=Score(precision=0.8333333333333334, recall=0.8333333333333334, fmeasure=0.8333333333333334), mid=Score(precision=0.8333333333333334, recall=0.8333333333333334, fmeasure=0.8333333333333334), high=Score(precision=0.8333333333333334, recall=0.8333333333333334, fmeasure=0.8333333333333334)),
 'rougeLsum': AggregateScore(low=Score(precision=0.8333333333333334, recall=0.8333333333333334, fmeasure=0.8333333333333334), mid=Score(precision=0.8333333333333334, recall=0.833

In [8]:
from datasets import load_dataset

# CNN/DailyMail 3.0.0: the "article" column is the model input and the
# "highlights" column is the reference summary.
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Training split.
train_texts = dataset["train"]["article"]
train_labels = dataset["train"]["highlights"]

# Validation split.
val_texts = dataset["validation"]["article"]
val_labels = dataset["validation"]["highlights"]

Reusing dataset cnn_dailymail (/home/jvidakovic/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
train_labels[0]

'Syrian official: Obama climbed to the top of the tree, "doesn\'t know how to get down"\nObama sends a letter to the heads of the House and Senate .\nObama to seek congressional approval on military action against Syria .\nAim is to determine whether CW were used, not by whom, says U.N. spokesman .'

In [11]:
train_labels[0].split("\n")

['Syrian official: Obama climbed to the top of the tree, "doesn\'t know how to get down"',
 'Obama sends a letter to the heads of the House and Senate .',
 'Obama to seek congressional approval on military action against Syria .',
 'Aim is to determine whether CW were used, not by whom, says U.N. spokesman .']

In [12]:
from transformers import PegasusTokenizerFast

# Checkpoint identifier; the same name is reused when the model is loaded.
model_name = "google/pegasus-large"

# Fast tokenizer matching the checkpoint above.
tokenizer = PegasusTokenizerFast.from_pretrained(model_name)

In [14]:
from transformers import PegasusForConditionalGeneration

# Load the pretrained weights and move the model to the GPU.
# NOTE(review): the device is hard-coded, so this line needs a CUDA device.
model: PegasusForConditionalGeneration = PegasusForConditionalGeneration.from_pretrained(model_name).to("cuda")

In [19]:
import torch
from torch.utils.data import Dataset

class PegasusDataset(Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    Args:
        encodings: Mapping from field name to a per-example indexable
            container (tensor or nested list); every field is returned for
            each item.
        labels: Mapping with an ``input_ids`` entry holding one row of target
            token ids per example. Only ``input_ids`` is read.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return an independent tensor holding ``value``.

        ``torch.tensor(t)`` on an existing tensor emits a copy-construct
        ``UserWarning``, so tensors are copied with ``clone().detach()``;
        anything else (lists, scalars) goes through ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field for ``idx`` plus ``labels``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        """Number of examples, taken from the number of label rows."""
        return len(self.labels['input_ids'])


In [39]:
from transformers import BatchEncoding

# Small slice used to try out tokenization and generation.
texts, labels = train_texts[:10], train_labels[:10]
max_input_length = 512
max_target_length = 128
# Use the named limit instead of repeating the literal so the two cannot drift apart.
encodings = tokenizer(texts, max_length=max_input_length, truncation=True, padding="longest", return_tensors="pt")

# Reference summaries are tokenized inside the target-tokenizer context.
with tokenizer.as_target_tokenizer():
    encoded_labels = tokenizer(labels, max_length=max_target_length, truncation=True, padding="longest", return_tensors="pt")

# tokenized_dataset = dataset.map(preprocessing_function, batched=True)

In [40]:
# NOTE(review): this rebinds `dataset`, which earlier held the result of
# load_dataset("cnn_dailymail", ...). After this cell runs, expressions such as
# dataset["train"] no longer refer to the raw splits unless the loading cell is
# executed again.
dataset = PegasusDataset(encodings, encoded_labels)

In [26]:
# Move the inputs to whichever device the model is on rather than assuming
# "cuda", so inputs and weights always end up on the same device.
encodings = encodings.to(model.device)
output = model.generate(**encodings)

In [27]:
len(output)

10

In [28]:
output[0]

tensor([    0,  1276, 12998,  3531,  1728, 11895,   112,  9835,   115,   124,
          682,   112,   207,  2002,  1937,   115,  6881,   107,  3531,  1406,
          114,  1801,   112,   109,  4082,   113,   109,  1087,   111,  4533,
          124,  1327,   565,   108,   539,   244, 13501,   120,   178,  3999,
         2002,   918,   464, 10298,  5128,   117,   109,   268,   863,   112,
          248,   204,   109,  6854,   207,   113,  3568,  4841,   107,   139,
         2962,  4024,   135,  3531,  6937,  3108,   112,  9572,   109,   207,
          113,  2002,  1937,   198,   497, 20438,   108, 17027,   108,  1585,
          111, 30189,   109,   866,   118,   533,  1481,   113,  3568,  4841,
          132,   176,  4841,   113,  2977,  7601,   496,   168,   131,   116,
          114,   863,   120,   117,   323,   112,   795,   142,   942,  3533,
          190,   114, 13598,  2970,  1488,  2949,   107, 23195,   518,  6881,
          108,  4069,  1812,   120,   138,  1735,   682,  3568, 

In [33]:
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)

In [36]:
decoded_output[2]

'"And when factoring all of those in, it was determined that he was the best candidate, even in light of the cost that would be incurred." Klumb called the GSA\'s teleworking program "a successful program that\'s going to lead to cost savings for taxpayers." But a GSA spokeswoman said, "We are not going to defend this type of travel." And a GSA employee in Kansas City, who requested anonymity, said that hiring someone in Hawaii to work for the Kansas City region was ludicrous. It would have reduced the cost of travel by at least 70 percent when you look at just the airfare of what it takes to from Honolulu to Washington, D.C., where a lot of business is done." Dan Tangherlini, who was appointed acting GSA administrator this year, said the agency was examining the cost of the entire teleworking program.'

In [38]:
labels

{'input_ids': tensor([[10298,  1571,   151,  3531, 14026,   112,   109,   349,   113,   109,
          1681,   108,   198, 56019,   131,   144,   235,   199,   112,   179,
           308,   194,  3531,  9274,   114,  1801,   112,   109,  4082,   113,
           109,  1087,   111,  4533,   110,   107,  3531,   112,  2395, 17250,
          3619,   124,  2002,   918,   464,  6881,   110,   107, 22848,   117,
           112,  1735,   682, 17783,   195,   263,   108,   146,   141,  2901,
           108,   649,   475,   107,  1400,   107,  9619,   110,   107,     1,
             0,     0],
        [84434, 23093,  4777,   776,  1460,   113,   278,  6906,   110,   107,
         22371,   116, 14236,   112,   384,   757,  5391,   208, 11414,  3669,
           110,   107, 44517,  1460,   134,   109, 17461,   118, 23093,   110,
           107, 14236,  1394,   164,   115,   652,   131,   116,   384,   757,
          5391,   208, 11414,   110,   107,     1,     0,     0,     0,     0,
             0

In [50]:
# Decode the tokenized reference summaries back to text to see what the model
# is expected to produce.
# NOTE(review): the target-tokenizer context is presumably only relevant when
# encoding targets - confirm whether it is needed around batch_decode.
with tokenizer.as_target_tokenizer():
    supposed_output = tokenizer.batch_decode(encoded_labels["input_ids"], skip_special_tokens=True)

supposed_output[1]

"Usain Bolt wins third gold of world championship. Anchors Jamaica to 4x100m relay victory. Eighth gold at the championships for Bolt. Jamaica double up in women's 4x100m relay."

In [43]:
encoded_labels.keys()

dict_keys(['input_ids', 'attention_mask'])

In [48]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /home/jvidakovic/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [53]:
nltk.sent_tokenize(supposed_output[3])

['NEW: A Canadian doctor says she was part of a team examining Harry Burkhart in 2010.',
 'NEW: Diagnosis: "autism, severe anxiety, post-traumatic stress disorder and depression" Burkhart is also suspected in a German arson probe, officials say.',
 'Prosecutors believe the German national set a string of fires in Los Angeles.']

In [None]:
# okay this works

def compute_metrics(eval_pred):
    """Compute ROUGE F-measures (scaled by 100) for a set of predictions.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Either
            array may contain ``-100`` placeholders, which are mapped to the
            tokenizer's pad id before decoding.

    Returns:
        dict mapping each ROUGE key to ``mid.fmeasure * 100``, plus
        ``"rougecomb"`` = rouge1 + 2 * rouge2 + rougeLsum.
    """
    predictions, labels = eval_pred
    # Predictions may be delivered as a tuple; the token ids are assumed to be
    # its first element - TODO confirm against the trainer in use.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is not a valid token id and cannot be decoded, so replace it first.
    pad_id = tokenizer.pad_token_id
    decoded_preds = tokenizer.batch_decode(_replace_ignore_index(predictions, pad_id), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_replace_ignore_index(labels, pad_id), skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)  # not sure if using stemmer is correct here

    # Keep only the mid F-measure of each ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    # Single scalar used for model selection; rouge2 is weighted double.
    result["rougecomb"] = result["rouge1"] + 2 * result["rouge2"] + result["rougeLsum"]
    return result


def _replace_ignore_index(token_ids, pad_id):
    """Return ``token_ids`` as a NumPy array with every -100 replaced by ``pad_id``."""
    import numpy as np  # local import: numpy is not imported elsewhere in the visible notebook

    # Torch tensors (possibly on GPU) are brought to host memory first.
    if hasattr(token_ids, "detach"):
        token_ids = token_ids.detach().cpu().numpy()
    token_ids = np.asarray(token_ids)
    return np.where(token_ids != -100, token_ids, pad_id)

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
from transformers.training_args import OptimizerNames
from transformers import Seq2SeqTrainingArguments, IntervalStrategy, SchedulerType

batch_size = 1
output_dir = "few-shot-results"
args = Seq2SeqTrainingArguments(
    output_dir=output_dir,  # output directory
    max_steps=2000,
    per_device_train_batch_size=batch_size,  # batch size per device during training, can increase if memory allows
    per_device_eval_batch_size=batch_size,  # batch size for evaluation, can increase if memory allows
    gradient_accumulation_steps=128,
    eval_accumulation_steps=128,
    # NOTE(review): saving and evaluating after every single update step looks
    # like a debugging setting (100 was the earlier value) - confirm before a
    # full run.
    save_steps=1,
    save_total_limit=1,  # limit the total amount of checkpoints and deletes the older checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="rougecomb",
    greater_is_better=True,
    evaluation_strategy=IntervalStrategy.STEPS,  # evaluation strategy to adopt during training
    eval_steps=1,  # number of update steps before evaluation
    # Evaluate with generate() so the metric function receives generated token
    # ids it can decode, which the ROUGE-based metric_for_best_model relies on.
    predict_with_generate=True,
    logging_dir='./test-logs',  # directory for storing logs
    logging_steps=10,
    # Adafactor is selected through `optim` only; the separate `adafactor=True`
    # flag was redundant with it.
    optim=OptimizerNames.ADAFACTOR,
    lr_scheduler_type=SchedulerType.CONSTANT,
)

