In [1]:
import os
# Single cache location shared by the dataset download and the Transformers
# weight cache; the environment variable is derived from the same value.
cache_dir = "/scratches/dialfs/alta/hln35/.cache"
os.environ['TRANSFORMERS_CACHE'] = cache_dir

In [2]:
from tqdm.auto import tqdm
import torch

In [9]:
from datasets import load_dataset
from evaluate import load

# Load the XSum dataset (cached under cache_dir) and the ROUGE metric that
# every evaluation loop in this notebook scores against.
raw_datasets = load_dataset("xsum", cache_dir=cache_dir)
metric = load("rouge")



In [10]:
# Baseline checkpoint, and the device used for the locally fine-tuned models:
# the GPU when CUDA is available, the CPU otherwise.
model_checkpoint = "google/flan-t5-small"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
    
# Tokenizer and seq2seq model for the baseline checkpoint.
# NOTE(review): unlike the fine-tuned models loaded later, this model is never
# moved to `device`, so it presumably stays on the CPU — confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)



In [13]:
# Truncation limits (in tokens) for the tokenized documents and summaries;
# max_target_length is also the generation budget (max_new_tokens) later on.
max_input_length = 1024
max_target_length = 128
# Task prefix prepended to every document before tokenization.
prefix = "summarize: "

def preprocess_function(examples):
    """Tokenize a batch of rows into model inputs plus target labels.

    Args:
        examples: Batch mapping with a "document" list and a "summary" list
            (the batched form this notebook passes via `Dataset.map`).

    Returns:
        The tokenizer output for the prefixed documents, truncated to
        `max_input_length`, with an extra "labels" entry holding the token
        ids of the summaries truncated to `max_target_length`.
    """
    prompts = [prefix + article for article in examples["document"]]
    encoded = tokenizer(prompts, truncation=True, max_length=max_input_length)

    # Summaries are tokenized as targets via `text_target`.
    target_encoding = tokenizer(
        text_target=examples["summary"],
        truncation=True,
        max_length=max_target_length,
    )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded


In [14]:
# Show the splits, columns and row counts of the raw dataset.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

In [15]:
# Peek at a single raw training example.
raw_datasets["train"][5]

{'document': 'Simone Favaro got the crucial try with the last move of the game, following earlier touchdowns by Chris Fusaro, Zander Fagerson and Junior Bulumakau.\nRynard Landman and Ashton Hewitt got a try in either half for the Dragons.\nGlasgow showed far superior strength in depth as they took control of a messy match in the second period.\nHome coach Gregor Townsend gave a debut to powerhouse Fijian-born Wallaby wing Taqele Naiyaravoro, and centre Alex Dunbar returned from long-term injury, while the Dragons gave first starts of the season to wing Aled Brew and hooker Elliot Dee.\nGlasgow lost hooker Pat McArthur to an early shoulder injury but took advantage of their first pressure when Rory Clegg slotted over a penalty on 12 minutes.\nIt took 24 minutes for a disjointed game to produce a try as Sarel Pretorius sniped from close range and Landman forced his way over for Jason Tovey to convert - although it was the lock\'s last contribution as he departed with a chest injury shor

In [16]:
# Apply preprocess_function to every split in batches; the result keeps the
# original columns and gains input_ids, attention_mask and labels.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [17]:
# The same training example after tokenization.
tokenized_datasets["train"][5]

{'document': 'Simone Favaro got the crucial try with the last move of the game, following earlier touchdowns by Chris Fusaro, Zander Fagerson and Junior Bulumakau.\nRynard Landman and Ashton Hewitt got a try in either half for the Dragons.\nGlasgow showed far superior strength in depth as they took control of a messy match in the second period.\nHome coach Gregor Townsend gave a debut to powerhouse Fijian-born Wallaby wing Taqele Naiyaravoro, and centre Alex Dunbar returned from long-term injury, while the Dragons gave first starts of the season to wing Aled Brew and hooker Elliot Dee.\nGlasgow lost hooker Pat McArthur to an early shoulder injury but took advantage of their first pressure when Rory Clegg slotted over a penalty on 12 minutes.\nIt took 24 minutes for a disjointed game to produce a try as Sarel Pretorius sniped from close range and Landman forced his way over for Jason Tovey to convert - although it was the lock\'s last contribution as he departed with a chest injury shor

In [18]:
# Confirm the tokenized columns were added to every split.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11334
    })
})

In [19]:
# Decode the tokenized test summaries back to text. These strings (after the
# sentence splitting in the next cell) are the references for every ROUGE score.
labels = tokenizer.batch_decode(tokenized_datasets["test"]["labels"], skip_special_tokens=True)
labels

['There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.',
 'A man has appeared in court after firearms, ammunition and cash were seized by police in Edinburgh.',
 'Four people accused of kidnapping and torturing a mentally disabled man in a "racially motivated" attack streamed on Facebook have been denied bail.',
 'West Brom have appointed Nicky Hammond as technical director, ending his 20-year association with Reading.',
 'The pancreas can be triggered to regenerate itself through a type of fasting diet, say US researchers.',
 'Since their impending merger was announced in January, there has been remarkably little comment about the huge proposed deal to combine Essilor and Luxottica.',
 'A "medal at any cost" approach created a "culture of fear" at British Cycling, says former rider Wendy Houvenaghel.',
 'Have you heard the one about the computer programmer who bought a failing comedy club in Texas and turned it into a million dollar a year bu

In [20]:
import nltk
nltk.download('punkt')
# Rewrite each reference with one sentence per line.
# NOTE(review): presumably done because rougeLsum treats newlines as sentence
# boundaries — confirm against the ROUGE metric documentation.
labels = [
    "\n".join(nltk.sent_tokenize(reference.strip()))
    for reference in labels
]
labels

[nltk_data] Downloading package punkt to /home/mifs/hln35/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.',
 'A man has appeared in court after firearms, ammunition and cash were seized by police in Edinburgh.',
 'Four people accused of kidnapping and torturing a mentally disabled man in a "racially motivated" attack streamed on Facebook have been denied bail.',
 'West Brom have appointed Nicky Hammond as technical director, ending his 20-year association with Reading.',
 'The pancreas can be triggered to regenerate itself through a type of fasting diet, say US researchers.',
 'Since their impending merger was announced in January, there has been remarkably little comment about the huge proposed deal to combine Essilor and Luxottica.',
 'A "medal at any cost" approach created a "culture of fear" at British Cycling, says former rider Wendy Houvenaghel.',
 'Have you heard the one about the computer programmer who bought a failing comedy club in Texas and turned it into a million dollar a year bu

In [21]:
import torch
import numpy as np

In [22]:
# Print the token count of the first ten test inputs to check truncation at
# max_input_length. The rows are sliced before the column is selected so only
# ten examples are materialised rather than the entire input_ids column.
for token_ids in tokenized_datasets["test"][:10]["input_ids"]:
    print(len(token_ids))

777
65
364
69
825
1024
1024
1024
576
589


In [23]:
# tokenizer2 = AutoTokenizer.from_pretrained(model_checkpoint)
# inputs = tokenizer2(raw_datasets["test"]["document"], return_tensors = "pt").input_ids

In [24]:
# Token ids of every test input; the evaluation loops index this by example.
test_input_ids = tokenized_datasets["test"]["input_ids"]

In [17]:
# for input_id in tokenized_datasets["test"]["input_ids"]:
#     print(input_id)
#     output = model.generate(input_id, max_new_tokens=max_target_length, do_sample=False)
# test_input_ids = tokenized_datasets["test"]["input_ids"]
# Greedy-decode a summary for every test input with the baseline model and
# collect per-example ROUGE scores. `results` maps each ROUGE key to a flat
# list with one score per test example (use_aggregator=False returns lists).
results = {}
group_len = 20  # NOTE(review): not referenced by any visible cell
for i in tqdm(range(len(test_input_ids))):
    # Batch of one sequence, placed on whichever device the model lives on so
    # the cell keeps working if the model is moved to a GPU.
    test_tensor = torch.tensor([test_input_ids[i]]).to(model.device)
    preds = model.generate(test_tensor, max_new_tokens=max_target_length, do_sample=False)
    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    result = metric.compute(predictions=preds, references=[labels[i]], use_stemmer=True, use_aggregator=False)
    for key, value in result.items():
        # Each value is a one-element list; append it to the running list.
        results.setdefault(key, []).extend(value)

import json
# Persist the raw per-example scores (JSON content despite the .txt suffix).
with open("rouge_small.txt", "w") as fp:
    json.dump(results, fp)





In [18]:
# Shape of the last input fed to generate: (1, sequence_length).
print(test_tensor.shape)

torch.Size([1, 77])


In [19]:
# result = metric.compute(predictions=preds, references=labels, use_stemmer=True, use_aggregator=False)
# # Extract a few results
# result = {key: value for key, value in result.items()}

In [25]:
# Display the raw per-example scores of the baseline model.
# NOTE(review): this prints one value per test example for every ROUGE key;
# consider showing a slice or the aggregated means instead.
results


{'rouge1': [0.5625,
  0.48648648648648646,
  0.46511627906976744,
  0.3333333333333333,
  0.3636363636363636,
  0.1702127659574468,
  0.33333333333333337,
  0.20512820512820512,
  0.08163265306122448,
  0.48888888888888893,
  0.27027027027027023,
  0.16666666666666663,
  0.380952380952381,
  0.1702127659574468,
  0.31111111111111117,
  0.24242424242424243,
  0.3902439024390244,
  0.36363636363636365,
  0.4285714285714286,
  0.20408163265306123,
  0.43478260869565216,
  0.20512820512820512,
  0.3,
  0.3243243243243243,
  0.0,
  0.3913043478260869,
  0.16666666666666666,
  0.10526315789473685,
  0.2553191489361702,
  0.34285714285714286,
  0.27777777777777773,
  0.3157894736842105,
  0.30303030303030304,
  0.06896551724137931,
  0.2758620689655172,
  0.34285714285714286,
  0.3243243243243243,
  0.6875,
  0.5,
  0.20833333333333331,
  0.17777777777777776,
  0.4489795918367347,
  0.7058823529411765,
  0.25,
  0.39999999999999997,
  0.4,
  0.23255813953488372,
  0.3,
  0.19999999999999998,


In [21]:
# Load the larger flan-t5 checkpoint that the small models are compared with.
# The checkpoint id gets its own name so `model_large` is only ever bound to
# the model object, never to a string.
# device_map="auto" leaves weight placement to from_pretrained rather than an
# explicit .to(device).
model_large_checkpoint = "google/flan-t5-large"
tokenizer_large = AutoTokenizer.from_pretrained(model_large_checkpoint)
model_large = AutoModelForSeq2SeqLM.from_pretrained(model_large_checkpoint, device_map="auto")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [24]:
# Greedy-decode a summary for every test input with the large model and
# collect per-example ROUGE scores. `results_large` maps each ROUGE key to a
# flat list with one score per test example.
results_large = {}
for i in tqdm(range(len(test_input_ids))):
    # model_large was loaded with device_map="auto", so it may not be on the
    # CPU; send the one-sequence batch to the model's own device.
    test_tensor = torch.tensor([test_input_ids[i]]).to(model_large.device)
    preds = model_large.generate(test_tensor, max_new_tokens=max_target_length, do_sample=False)
    preds = tokenizer_large.batch_decode(preds, skip_special_tokens=True)
    result = metric.compute(predictions=preds, references=[labels[i]], use_stemmer=True, use_aggregator=False)
    for key, value in result.items():
        # Each value is a one-element list; append it to the running list.
        results_large.setdefault(key, []).extend(value)

import json
# Persist the raw per-example scores (JSON content despite the .txt suffix).
with open("rouge_large.txt", "w") as fp:
    json.dump(results_large, fp)

In [26]:
# Reduce each model's per-example score lists to one mean per ROUGE key.
results_small_agg = {
    rouge_key: np.average(scores) for rouge_key, scores in results.items()
}
results_large_agg = {
    rouge_key: np.average(scores) for rouge_key, scores in results_large.items()
}

In [27]:
# Mean ROUGE scores of the large model over the test set.
results_large_agg

{'rouge1': 0.3955498559703465,
 'rouge2': 0.16193015093937965,
 'rougeL': 0.31558695378514784,
 'rougeLsum': 0.31558695378514784}

In [28]:
# Mean ROUGE scores of the baseline small model over the test set.
results_small_agg

{'rouge1': 0.29616736139261934,
 'rouge2': 0.08533474057046328,
 'rougeL': 0.2299264804204968,
 'rougeLsum': 0.2299264804204968}

In [17]:
# Load two locally stored flan-t5-small variants onto `device`.
# The directory paths get their own names so the model variables are only
# ever bound to model objects, never to strings.
model_small_fintuned_path = "model/flant5_small_lr_10-4_qa_finetuning"
model_small_distill_qa_path = "model/flant5_small_lr_10-5_qa_distill_match_large_output_abcd"
# local_files_only=True: load from disk without contacting the Hub.
model_small_fintuned = AutoModelForSeq2SeqLM.from_pretrained(model_small_fintuned_path, local_files_only=True).to(device)
model_small_distill_qa = AutoModelForSeq2SeqLM.from_pretrained(model_small_distill_qa_path, local_files_only=True).to(device)

In [21]:
# Greedy-decode a summary for every test input with the model currently bound
# to model_small_fintuned and collect per-example ROUGE scores
# (one flat list of scores per ROUGE key).
results_small_fintuned = {}
progress_bar = tqdm(range(len(test_input_ids)))
# Iterating over the bar advances it automatically and closes it at the end.
for i in progress_bar:
    test_tensor = torch.tensor([test_input_ids[i]]).to(device)
    preds = model_small_fintuned.generate(test_tensor, max_new_tokens=max_target_length, do_sample=False)
    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    result = metric.compute(predictions=preds, references=[labels[i]], use_stemmer=True, use_aggregator=False)
    for key, value in result.items():
        # Each value is a one-element list; append it to the running list.
        results_small_fintuned.setdefault(key, []).extend(value)

  0%|          | 0/11334 [00:00<?, ?it/s]

In [22]:
# Greedy-decode a summary for every test input with model_small_distill_qa and
# collect per-example ROUGE scores (one flat list of scores per ROUGE key).
results_small_distill_qa = {}
progress_bar = tqdm(range(len(test_input_ids)))
# Iterating over the bar advances it automatically and closes it at the end.
for i in progress_bar:
    test_tensor = torch.tensor([test_input_ids[i]]).to(device)
    preds = model_small_distill_qa.generate(test_tensor, max_new_tokens=max_target_length, do_sample=False)
    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    result = metric.compute(predictions=preds, references=[labels[i]], use_stemmer=True, use_aggregator=False)
    for key, value in result.items():
        # Each value is a one-element list; append it to the running list.
        results_small_distill_qa.setdefault(key, []).extend(value)

  0%|          | 0/11334 [00:00<?, ?it/s]

In [25]:
import numpy as np

In [27]:
# Reduce the per-example score lists to one mean per ROUGE key and print the
# distilled model's means first, then the fine-tuned model's.
results_small_distill_qa_agg = {
    rouge_key: np.average(scores)
    for rouge_key, scores in results_small_distill_qa.items()
}
results_small_fintuned_agg = {
    rouge_key: np.average(scores)
    for rouge_key, scores in results_small_fintuned.items()
}
print(results_small_distill_qa_agg)
print(results_small_fintuned_agg)

{'rouge1': 0.29362024977235945, 'rouge2': 0.08371227026867438, 'rougeL': 0.22779907316663986, 'rougeLsum': 0.22779907316663986}
{'rouge1': 0.2758923727640912, 'rouge2': 0.0746756340070741, 'rougeL': 0.21298393905533938, 'rougeLsum': 0.21298393905533938}


In [25]:
# Replace model_small_fintuned with another locally stored checkpoint.
# The directory path gets its own name so the model variable is only ever
# bound to a model object, never to a string.
model_small_fintuned_path = "model/flant5_small_lr_10-4_race_finetuning_epoch11"
model_small_fintuned = AutoModelForSeq2SeqLM.from_pretrained(model_small_fintuned_path, local_files_only=True).to(device)


In [26]:
# Greedy-decode a summary for every test input with the model currently bound
# to model_small_fintuned and collect per-example ROUGE scores
# (one flat list of scores per ROUGE key).
results_small_fintuned = {}
progress_bar = tqdm(range(len(test_input_ids)))
# Iterating over the bar advances it automatically and closes it at the end.
for i in progress_bar:
    test_tensor = torch.tensor([test_input_ids[i]]).to(device)
    preds = model_small_fintuned.generate(test_tensor, max_new_tokens=max_target_length, do_sample=False)
    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    result = metric.compute(predictions=preds, references=[labels[i]], use_stemmer=True, use_aggregator=False)
    for key, value in result.items():
        # Each value is a one-element list; append it to the running list.
        results_small_fintuned.setdefault(key, []).extend(value)

  0%|          | 0/11334 [00:00<?, ?it/s]

In [27]:
# Reduce the per-example score lists to one mean per ROUGE key and print them.
results_small_fintuned_agg = {
    rouge_key: np.average(scores)
    for rouge_key, scores in results_small_fintuned.items()
}
print(results_small_fintuned_agg)

{'rouge1': 0.007300105121465678, 'rouge2': 5.435154870098594e-06, 'rougeL': 0.007279419578049619, 'rougeLsum': 0.007279419578049619}


In [28]:
# Evaluate several saved checkpoints of the same fine-tuning run: for each one,
# load it, greedy-decode every test input, and print the mean ROUGE scores.
for t in range(2,9, 3):  # t takes the values 2, 5 and 8
    # Keep the path in its own name so model_small_fintuned is only ever bound
    # to a model object, never to a string.
    checkpoint_path = f"model/flant5_small_lr_10-4_race_finetuning_epoch{t}"
    model_small_fintuned = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path, local_files_only=True).to(device)
    results_small_fintuned = {}
    progress_bar = tqdm(range(len(test_input_ids)))
    # Iterating over the bar advances it automatically and closes it at the end.
    for i in progress_bar:
        test_tensor = torch.tensor([test_input_ids[i]]).to(device)
        preds = model_small_fintuned.generate(test_tensor, max_new_tokens=max_target_length, do_sample=False)
        preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        result = metric.compute(predictions=preds, references=[labels[i]], use_stemmer=True, use_aggregator=False)
        for key, value in result.items():
            # Each value is a one-element list; append it to the running list.
            results_small_fintuned.setdefault(key, []).extend(value)

    # One mean per ROUGE key for this checkpoint.
    results_small_fintuned_agg = {k: np.average(v) for k, v in results_small_fintuned.items()}
    # NOTE(review): the checkpoint is named epoch{t} but reported as epoch
    # t+1, which presumably means the suffix is zero-based — confirm against
    # the training script.
    print(f"After epoch {t+1}, the average score is: ")
    print(results_small_fintuned_agg)


  0%|          | 0/11334 [00:00<?, ?it/s]

After epoch 3, the average score is: 
{'rouge1': 0.012380044142516632, 'rouge2': 0.0003337182771979503, 'rougeL': 0.012345590798918346, 'rougeLsum': 0.012345590798918346}


  0%|          | 0/11334 [00:00<?, ?it/s]

After epoch 6, the average score is: 
{'rouge1': 0.015371783662934367, 'rouge2': 2.298035075528833e-05, 'rougeL': 0.015364444694882955, 'rougeLsum': 0.015364444694882955}


  0%|          | 0/11334 [00:00<?, ?it/s]

After epoch 9, the average score is: 
{'rouge1': 0.002325819743691711, 'rouge2': 0.0, 'rougeL': 0.0023185964559691214, 'rougeLsum': 0.0023185964559691214}
