## Model1

In [1]:
! pip install transformers datasets evaluate rouge_score

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ev

In [35]:
from datasets import load_dataset

cnn = load_dataset("cnn_dailymail", "3.0.0", split="train[:1000]")

In [36]:
cnn = cnn.train_test_split(test_size=0.2)

In [37]:
from transformers import AutoTokenizer

checkpoint = "t5-small" #https://huggingface.co/t5-small
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [38]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["highlights"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [39]:
tokenized_cnn = cnn.map(preprocess_function, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [40]:
tokenized_cnn

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

In [41]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [42]:
import evaluate

rouge = evaluate.load("rouge")

In [43]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [44]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [12]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
The token `nlp token` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-au

In [45]:
training_args = Seq2SeqTrainingArguments(
    output_dir="report_model_t5",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_cnn["train"],
    eval_dataset=tokenized_cnn["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [46]:
#start training
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,1.966717,0.2383,0.0947,0.1944,0.1944,19.0
2,No log,1.877096,0.24,0.098,0.1953,0.1954,19.0
3,No log,1.854153,0.2432,0.0981,0.1968,0.1967,19.0
4,No log,1.848633,0.2454,0.096,0.1964,0.1961,19.0




TrainOutput(global_step=200, training_loss=2.1305841064453124, metrics={'train_runtime': 221.6281, 'train_samples_per_second': 14.439, 'train_steps_per_second': 0.902, 'total_flos': 866187529420800.0, 'train_loss': 2.1305841064453124, 'epoch': 4.0})

In [47]:
text = "LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in 'Harry Potter and the Order of the Phoenix' To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. 'I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar,' he told an Australian interviewer earlier this month. 'I don't think I'll be particularly extravagant. 'The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs.' At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film 'Hostel: Part II,' currently six places below his number one movie on the UK box office chart. Details of how he'll mark his landmark birthday are under wraps. His agent and publicist had no comment on his plans. 'I'll definitely have some sort of party,' he said in an interview. 'Hopefully none of you will be reading about it.' Radcliffe's earnings from the first five Potter films have been held in a trust fund which he has not been able to touch. Despite his growing fame and riches, the actor says he is keeping his feet firmly on the ground. 'People are always looking to say 'kid star goes off the rails,'' he told reporters last month. 'But I try very hard not to go that way because it would be too easy for them.' His latest outing as the boy wizard in 'Harry Potter and the Order of the Phoenix' is breaking records on both sides of the Atlantic and he will reprise the role in the last two films. Watch I-Reporter give her review of Potter's latest » . There is life beyond Potter, however. The Londoner has filmed a TV movie called 'My Boy Jack,' about author Rudyard Kipling and his son, due for release later this year. He will also appear in 'December Boys,' an Australian film about four boys who escape an orphanage. Earlier this year, he made his stage debut playing a tortured teenager in Peter Shaffer's 'Equus.' Meanwhile, he is braced for even closer media scrutiny now that he's legally an adult: 'I just think I'm going to be more sort of fair game,' he told Reuters. E-mail to a friend . Copyright 2007 Reuters. All rights reserved.This material may not be published, broadcast, rewritten, or redistributed."

In [48]:
inputs = tokenizer(text, return_tensors="pt").input_ids

Token indices sequence length is longer than the specified maximum sequence length for this model (660 > 512). Running this sequence through the model will result in indexing errors


In [49]:
outputs = model.generate(inputs.cuda(), max_new_tokens=100, do_sample=False)

In [50]:
outputs

tensor([[    0, 32099,     3, 18844,  1636,  8929, 16023,  2213,  4173,  6324,
         12591,    15,    19,     3,     9, 23980,  3996,  1755,   770,  8785,
           591, 11039,   770,    61, 13462,    38,     3,    88,  5050,   507,
            30,  2089,     3,     5,  6324, 12591,    15,   845,     3,    88,
            65,   150,  1390,    12,  9030,    17,   449,   112,  1723,   550,
            30,  1006,  2948,     6,  3281,    11, 17086,  2251,     3,     5,
            37,  1524,    49,    65,     3, 25403,     3,     9,  1424,  1974,
           718,     3,    31,  7008,  7508,  4496,    31,    81,  2291, 17806,
          6636,  4320,   102,   697,    11,   112,   520,     6,   788,    21,
          1576,   865,    48,   215,     3,     5,     1]], device='cuda:0')

In [51]:
#Decode the generated token ids back into text:
tokenizer.decode(outputs[0], skip_special_tokens=True)

"Reuters -- Harry Potter star Daniel Radcliffe is averaging £20 million ($41.1 million) fortune as he turns 18 on Monday. Radcliffe says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. The Londoner has filmed a TV movie called 'My Boy Jack' about author Rudyard Kipling and his son, due for release later this year."

## Model2

In [20]:
from datasets import load_dataset

cnn = load_dataset("cnn_dailymail", "3.0.0", split="train[:1000]")

In [21]:
cnn = cnn.train_test_split(test_size=0.2)

In [22]:
from transformers import AutoTokenizer

checkpoint = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [23]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["highlights"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [24]:
tokenized_cnn = cnn.map(preprocess_function, batched=True)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [25]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [26]:
import evaluate

rouge = evaluate.load("rouge")

In [27]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [28]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [29]:
training_args = Seq2SeqTrainingArguments(
    output_dir="report_model_bart",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_cnn["train"],
    eval_dataset=tokenized_cnn["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [30]:
#start training
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.196114,0.2251,0.0838,0.1782,0.2091,20.0
2,No log,2.118857,0.2318,0.0906,0.1852,0.2145,20.0




Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.196114,0.2251,0.0838,0.1782,0.2091,20.0
2,No log,2.118857,0.2318,0.0906,0.1852,0.2145,20.0
3,No log,2.064595,0.231,0.0895,0.186,0.2147,20.0
4,No log,2.061769,0.2316,0.0907,0.1883,0.2145,20.0




TrainOutput(global_step=200, training_loss=2.4183062744140624, metrics={'train_runtime': 242.5077, 'train_samples_per_second': 13.195, 'train_steps_per_second': 0.825, 'total_flos': 1949575162429440.0, 'train_loss': 2.4183062744140624, 'epoch': 4.0})

In [31]:
text = "LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in 'Harry Potter and the Order of the Phoenix' To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. 'I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar,' he told an Australian interviewer earlier this month. 'I don't think I'll be particularly extravagant. 'The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs.' At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film 'Hostel: Part II,' currently six places below his number one movie on the UK box office chart. Details of how he'll mark his landmark birthday are under wraps. His agent and publicist had no comment on his plans. 'I'll definitely have some sort of party,' he said in an interview. 'Hopefully none of you will be reading about it.' Radcliffe's earnings from the first five Potter films have been held in a trust fund which he has not been able to touch. Despite his growing fame and riches, the actor says he is keeping his feet firmly on the ground. 'People are always looking to say 'kid star goes off the rails,'' he told reporters last month. 'But I try very hard not to go that way because it would be too easy for them.' His latest outing as the boy wizard in 'Harry Potter and the Order of the Phoenix' is breaking records on both sides of the Atlantic and he will reprise the role in the last two films. Watch I-Reporter give her review of Potter's latest » . There is life beyond Potter, however. The Londoner has filmed a TV movie called 'My Boy Jack,' about author Rudyard Kipling and his son, due for release later this year. He will also appear in 'December Boys,' an Australian film about four boys who escape an orphanage. Earlier this year, he made his stage debut playing a tortured teenager in Peter Shaffer's 'Equus.' Meanwhile, he is braced for even closer media scrutiny now that he's legally an adult: 'I just think I'm going to be more sort of fair game,' he told Reuters. E-mail to a friend . Copyright 2007 Reuters. All rights reserved.This material may not be published, broadcast, rewritten, or redistributed."

In [32]:
inputs = tokenizer(text, return_tensors="pt").input_ids

In [33]:
outputs = model.generate(inputs.cuda(), max_new_tokens=100, do_sample=False)

In [34]:
#Decode the generated token ids back into text:
tokenizer.decode(outputs[0], skip_special_tokens=True)

"LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him .\nDaniel Radcliffe as Harry Potter in 'Harry Potter and the Order of the Phoenix' To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink"