<a href="https://colab.research.google.com/github/jaredmontierth/summarizer/blob/main/fine_tuning_bart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets 

In [None]:
!pip install rouge.score nltk py7zr

In [None]:
import transformers
from datasets import load_dataset, load_metric, load_from_disk
import numpy as np
import nltk
# Download the 'punkt' tokenizer data for nltk.
# NOTE(review): no later cell visible in this notebook calls nltk — confirm
# whether this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Download the dataset from the Hub and write a copy to local disk.
# NOTE(review): later cells read the 'dialogue', 'summary' and 'id' columns and
# a 'validation' split — confirm that 'argilla/news-summary' provides them.
data = load_dataset('argilla/news-summary')
data.save_to_disk('/content/argilla')


In [None]:
# Checkpoint name that both the tokenizer and the model are loaded from.
model_checkpoints = 'facebook/bart-large-xsum'
# ROUGE scorer used by compute_rouge during evaluation.
metric = load_metric('rouge')
# Reload the dataset copy that was written to local disk.
data = load_from_disk("/content/argilla")

  metric = load_metric('rouge')


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [None]:
# Token budgets: source texts are truncated/padded to max_input tokens,
# summaries to max_target tokens.
max_input, max_target = 512, 128
tokenizer = transformers.AutoTokenizer.from_pretrained(model_checkpoints)

In [None]:
def preprocess_data(data_to_process, label_pad_token_id=None):
  """Tokenize a batch of source texts and their summaries.

  Args:
    data_to_process: batch mapping with a 'dialogue' column (source texts)
      and a 'summary' column (target texts).
    label_pad_token_id: optional sentinel (typically -100) written over the
      padding positions of the labels so they can be excluded from the loss.
      Defaults to None, which keeps the tokenizer's own pad id in the labels.
      When a sentinel is used, metric code must map it back to the pad id
      before decoding the labels.

  Returns:
    The tokenizer output for the source texts (input_ids, attention_mask)
    with an added 'labels' entry holding the summary token ids.
  """
  # Source side: truncate/pad every text to max_input tokens.
  model_inputs = tokenizer(list(data_to_process['dialogue']), max_length=max_input, padding='max_length', truncation=True)

  # Target side: `text_target` replaces the deprecated
  # `tokenizer.as_target_tokenizer()` context manager.
  targets = tokenizer(text_target=data_to_process['summary'], max_length=max_target, padding='max_length', truncation=True)

  labels = targets['input_ids']
  if label_pad_token_id is not None:
    pad_id = tokenizer.pad_token_id
    labels = [
        [label_pad_token_id if token == pad_id else token for token in row]
        for row in labels
    ]

  model_inputs['labels'] = labels
  return model_inputs

In [None]:
# Tokenize every split in batches; the raw columns are dropped so that only
# the fields returned by preprocess_data remain.
tokenize_data = data.map(
    preprocess_data,
    batched=True,
    remove_columns=['id', 'dialogue', 'summary'],
)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
# Draw a fixed-seed random subset from each split: 1000 train, 500 validation
# and 200 test rows.
train_sample, validation_sample, test_sample = (
    tokenize_data[split_name].shuffle(seed=123).select(range(row_count))
    for split_name, row_count in (('train', 1000), ('validation', 500), ('test', 200))
)

In [None]:
# Swap each full split for its sampled subset.
for split_name, subset in (
    ('train', train_sample),
    ('validation', validation_sample),
    ('test', test_sample),
):
    tokenize_data[split_name] = subset

In [None]:
# Display the splits, their features and row counts after subsampling.
tokenize_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
})

In [None]:
# Load the pretrained sequence-to-sequence model for the chosen checkpoint.
model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints)

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/309 [00:00<?, ?B/s]

In [None]:
# NOTE(review): no later cell shown in this notebook reads batch_size — the
# training arguments hard-code a per-device batch size of 1. Confirm whether
# this should be wired in or removed.
batch_size = 4

In [None]:
# Batch collator built from the tokenizer and model; passed to the trainer as
# its data_collator.
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
def compute_rouge(pred):
  """Score generated summaries against the reference summaries with ROUGE.

  Args:
    pred: pair ``(predictions, labels)`` of token-id arrays. Either array may
      contain -100 (the ignore index used for masked labels and for padding
      when batches are concatenated); those positions are treated as padding.

  Returns:
    dict with one entry per score returned by ``metric.compute`` (mid
    F-measure scaled to 0-100) plus 'gen_len', the mean number of non-pad
    tokens per prediction. All values are rounded to 4 decimals.
  """
  predictions, labels = pred
  if isinstance(predictions, tuple):
    # Some models return extra outputs next to the generated ids.
    predictions = predictions[0]

  # -100 is not a valid vocabulary id, so decoding it fails; map it to the pad
  # id, which skip_special_tokens then removes from the decoded text.
  pad_id = tokenizer.pad_token_id
  predictions = np.asarray(predictions)
  labels = np.asarray(labels)
  predictions = np.where(predictions != -100, predictions, pad_id)
  labels = np.where(labels != -100, labels, pad_id)

  decode_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  res = metric.compute(predictions=decode_predictions, references=decode_labels, use_stemmer=True)
  res = {key: value.mid.fmeasure * 100 for key, value in res.items()}

  # Length of each generated sequence, ignoring padding.
  pred_lens = [np.count_nonzero(sequence != pad_id) for sequence in predictions]
  res['gen_len'] = np.mean(pred_lens)
  return {k: round(v, 4) for k, v in res.items()}

In [None]:
# Training configuration, collected as keyword options before construction.
training_options = dict(
    output_dir='news-summary',
    # Evaluate once per epoch, generating summaries so ROUGE can be computed.
    evaluation_strategy='epoch',
    predict_with_generate=True,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    # One example per step on the device, with gradients accumulated over two steps.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=1,
    fp16=True,
    # Keep at most two checkpoints on disk.
    save_total_limit=2,
)
args = transformers.Seq2SeqTrainingArguments(**training_options)

In [None]:
# Assemble the trainer: model and arguments, the sampled train/validation
# splits, the batch collator, and the ROUGE callback used at evaluation time.
trainer = transformers.Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=collator,
    tokenizer=tokenizer,
    train_dataset=tokenize_data['train'],
    eval_dataset=tokenize_data['validation'],
    compute_metrics=compute_rouge,
)

In [None]:
# Report the attached GPU and its current memory usage before training.
!nvidia-smi

Mon Apr 17 18:37:59 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   74C    P0    31W /  70W |   9165MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Run fine-tuning; evaluation (with compute_rouge) is configured to run once per epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.1969,0.378427,50.3319,25.0298,40.8275,40.8181,26.448
2,0.1407,0.420369,50.969,25.0171,40.8411,40.8084,28.684
3,0.0792,0.49264,51.0857,25.2011,40.9193,40.8647,29.162


TrainOutput(global_step=1500, training_loss=0.13892815907796224, metrics={'train_runtime': 1752.3723, 'train_samples_per_second': 1.712, 'train_steps_per_second': 0.856, 'total_flos': 3250656903168000.0, 'train_loss': 0.13892815907796224, 'epoch': 3.0})

In [None]:
article = """
First lady Jill Biden's walk-back of her suggestion that runner-up Iowa should join NCAA women's basketball champion LSU for a visit to the White House didn't sit well with Tigers star Angel Reese.
Prompted by a discussion of Biden's comments during her Wednesday appearance on "The Paper Route Podcast," Reese said the Tigers should celebrate their title with former President Barack Obama and former first lady Michelle Obama rather than Joe and Jill Biden.
Jill Biden, at an appearance in Denver on Monday, had praised Iowa's sportsmanship and congratulated both teams. She also said that as part of the longstanding tradition of having champions visit the White House, Iowa should come as well "because they played such a good game."
The Tigers defeated Iowa 102-85 for the title in Dallas on Sunday.
EDITOR'S PICKS
Clark: Reese flak unfair, WH not for 'runner-ups'
1hM.A. Voepel
White House clarifies LSU women's invitation
2h
'In my moment': LSU star Reese explains gesture
3dM.A. Voepel
Reese on Monday called Jill Biden's suggestion "a joke."
Joe Biden invited LSU and men's champion UConn to the White House on Tuesday with no mention of Iowa. Vanessa Valdivia, a spokesperson for Jill Biden, said the first lady had meant no disrespect to LSU and that her comments were intended to applaud the historic game and all women athletes.
"I'm not going to lie to you, I don't accept the apology because of, you said what you said. I said what I said. And like, you can't go back on certain things that you say," Reese told podcast hosts Brandon Marshall and Ashley Nicole Moss.
"I mean, you felt like they should've came because of sportsmanship, right?" Reese added. "They can have that spotlight. We'll go to the Obamas. We'll see Michelle. We'll see Barack."
Some social media commenters noted the racial dynamics involved, saying that only winners should be rewarded with a White House visit and that hosting both teams would detract from the achievement by LSU's team, which is predominantly Black. The Iowa team is largely white. Others noted the important role of Black women in Democratic Party politics.
Following LSU's victory, coach Kim Mulkey said she would go to the White House if invited. Reese said Wednesday she was uncertain if she would go.
Reese faced criticism on social media for waving her hand in front of Iowa star Caitlin Clark's face while staring down Clark during the game. Clark, The Associated Press Player of the Year, made a similar gesture to no one in particular during Iowa's victory over Louisville in the Elite Eight.
Reese said she didn't think LSU, had it lost to Iowa, would have gotten the same praise from Jill Biden as the Hawkeyes did.
"If we were to lose, we would not be getting invited to the White House," she added. "I remember she made a comment about both teams should be invited because of sportsmanship. And I'm like, 'Are you saying that because of what I did?' Stuff like that, it bothers me because you are a woman at the end of the day. White, Black, it doesn't matter, you're a woman, you're supposed to be standing behind us before anything."

"""

In [None]:
# Encode the sample article with the same length/padding settings used for the
# training inputs.
model_inputs = tokenizer(article,  max_length=max_input, padding='max_length', truncation=True)

In [None]:
# Inspect the encoded article.
model_inputs

In [None]:
# Generate a summary for the single encoded article. The predictions field is
# read by name instead of tuple-unpacking into `_`, which IPython reserves for
# the most recent cell output.
raw_pred = trainer.predict([model_inputs]).predictions

In [None]:
# Inspect the raw prediction output before decoding.
raw_pred

In [None]:
# Decode the generated ids into the summary text, dropping special tokens in
# the same way compute_rouge does when scoring.
tokenizer.decode(raw_pred[0], skip_special_tokens=True)

In [None]:
# Decode the generated ids; skip_special_tokens is not passed here, so any
# special tokens presumably remain in the returned string.
tokenizer.decode(raw_pred[0])

In [None]:
# Persist the fine-tuned weights together with the tokenizer files, so the
# directory can be reloaded on its own with from_pretrained.
model.save_pretrained("/content")
tokenizer.save_pretrained("/content")