In [1]:
import os
cache_dir = "/scratches/dialfs/alta/hln35/.cache"
os.environ['TRANSFORMERS_CACHE'] = '/scratches/dialfs/alta/hln35/.cache'

In [2]:
import torch
model_small = "google/flan-t5-small"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
device

device(type='cuda')

In [4]:
from datasets import load_dataset

books = load_dataset("wmt14", "fr-en", split='test', cache_dir=cache_dir)

In [5]:
books

Dataset({
    features: ['translation'],
    num_rows: 3003
})

In [6]:
books[0]

{'translation': {'en': 'Spectacular Wingsuit Jump Over Bogota',
  'fr': 'Spectaculaire saut en "wingsuit" au-dessus de Bogota'}}

In [7]:
source_lang = "en"
target_lang = "fr"
prefix = "translate English to French: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    # model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    
    return model_inputs

In [8]:
import evaluate

metric = evaluate.load("sacrebleu", cache_dir=cache_dir)

In [9]:
from transformers import AutoTokenizer
import torch
from transformers import AutoModelForQuestionAnswering, AutoModel, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained(model_small)
model = AutoModelForSeq2SeqLM.from_pretrained(model_small).to(device)



In [10]:
import json

In [10]:
# for i in range(len(books)):
scores = []
for i in range(len(books)):
    text = prefix + books[i]["translation"][source_lang]
    ref = books[i]["translation"][target_lang]
    inputs = tokenizer(text, return_tensors="pt").input_ids
    preds_tokenized = model.generate(inputs, max_new_tokens=128, do_sample=False) 
    preds = tokenizer.batch_decode(preds_tokenized)
    
    bleu_score = metric.compute(predictions=preds, references=[ref])
    scores.append(bleu_score["score"])
import json
with open("translate_bleu_small.txt", "w") as fp:
    json.dump(scores, fp)
print(sum(scores)/len(books))

9.442637790527135


In [11]:
model_large = "google/flan-t5-large"
tokenizer_large = AutoTokenizer.from_pretrained(model_large)
model_large = AutoModelForSeq2SeqLM.from_pretrained(model_large)

In [12]:
scores_large = []
for i in range(len(books)):
    text = prefix + books[i]["translation"][source_lang]
    ref = books[i]["translation"][target_lang]
    inputs = tokenizer_large(text, return_tensors="pt").input_ids
    preds_tokenized = model_large.generate(inputs, max_new_tokens=128, do_sample=False) 
    preds = tokenizer_large.batch_decode(preds_tokenized)
    
    bleu_score = metric.compute(predictions=preds, references=[ref])
    scores_large.append(bleu_score["score"])
import json
with open("translate_bleu_large.txt", "w") as fp:
    json.dump(scores_large, fp)
print(sum(scores_large)/len(books))

15.373898826506785


In [33]:
model_small_fintuned = "model/flant5_small_lr_10-4_qa_finetuning"
model_small_distill_qa = "model/flant5_small_lr_10-5_qa_distill_match_large_output_abcd"
model_small_fintuned = AutoModelForSeq2SeqLM.from_pretrained(model_small_fintuned, local_files_only=True).to(device)
model_small_distill_qa = AutoModelForSeq2SeqLM.from_pretrained(model_small_distill_qa, local_files_only=True).to(device)



In [11]:
from tqdm.auto import tqdm

In [34]:
scores_distill = []
for i in range(len(books)):
    text = prefix + books[i]["translation"][source_lang]
    ref = books[i]["translation"][target_lang]
    inputs = tokenizer(text, return_tensors="pt").to(device).input_ids
    preds_tokenized = model_small_distill_qa.generate(inputs, max_new_tokens=128, do_sample=False) 
    preds = tokenizer.batch_decode(preds_tokenized)
    
    bleu_score = metric.compute(predictions=preds, references=[ref])
    scores_distill.append(bleu_score["score"])

print(sum(scores_distill)/len(books))

9.464688062977212


In [37]:
scores_fintuned = []
progress_bar = tqdm(range(len(books)))
for i in range(len(books)):
    text = prefix + books[i]["translation"][source_lang]
    ref = books[i]["translation"][target_lang]
    inputs = tokenizer(text, return_tensors="pt").to(device).input_ids
    preds_tokenized = model_small_fintuned.generate(inputs, max_new_tokens=128, do_sample=False) 
    preds = tokenizer.batch_decode(preds_tokenized)
    
    bleu_score = metric.compute(predictions=preds, references=[ref])
    scores_fintuned.append(bleu_score["score"])
    progress_bar.update(1)
print(sum(scores_fintuned)/len(books))


  0%|          | 0/3003 [00:00<?, ?it/s]

8.798674983992763


In [13]:
model_small_fintuned = "model/flant5_small_lr_10-4_race_finetuning_epoch11"
model_small_fintuned = AutoModelForSeq2SeqLM.from_pretrained(model_small_fintuned, local_files_only=True).to(device)

In [14]:
scores_fintuned = []
progress_bar = tqdm(range(len(books)))
for i in range(len(books)):
    text = prefix + books[i]["translation"][source_lang]
    ref = books[i]["translation"][target_lang]
    inputs = tokenizer(text, return_tensors="pt").to(device).input_ids
    preds_tokenized = model_small_fintuned.generate(inputs, max_new_tokens=128, do_sample=False) 
    preds = tokenizer.batch_decode(preds_tokenized)
    
    bleu_score = metric.compute(predictions=preds, references=[ref])
    scores_fintuned.append(bleu_score["score"])
    progress_bar.update(1)
print(sum(scores_fintuned)/len(books))

  0%|          | 0/3003 [00:00<?, ?it/s]

0.00694616067271619


In [17]:
for t in range(2,12,3):
    model_small_fintuned = f"model/flant5_small_lr_10-4_race_finetuning_epoch{t}"
    model_small_fintuned = AutoModelForSeq2SeqLM.from_pretrained(model_small_fintuned, local_files_only=True).to(device)
    scores_fintuned = []
    progress_bar = tqdm(range(len(books)))
    for i in range(len(books)):
        text = prefix + books[i]["translation"][source_lang]
        ref = books[i]["translation"][target_lang]
        inputs = tokenizer(text, return_tensors="pt").to(device).input_ids
        preds_tokenized = model_small_fintuned.generate(inputs, max_new_tokens=128, do_sample=False) 
        preds = tokenizer.batch_decode(preds_tokenized)
        
        bleu_score = metric.compute(predictions=preds, references=[ref])
        scores_fintuned.append(bleu_score["score"])
        progress_bar.update(1)
    print(f"After epoch {t+1} the average score on the test set is {sum(scores_fintuned)/len(books)}")

  0%|          | 0/3003 [00:00<?, ?it/s]

After epoch 3 the average score on the test set is 0.08576216846026037


  0%|          | 0/3003 [00:00<?, ?it/s]

After epoch 6 the average score on the test set is 0.07576191818667471


  0%|          | 0/3003 [00:00<?, ?it/s]

After epoch 9 the average score on the test set is 0.0016246515293337417


  0%|          | 0/3003 [00:00<?, ?it/s]

After epoch 12 the average score on the test set is 0.00694616067271619


In [13]:
for t in range(2,3,1):
    model_small_distill = f"model/flant5_small_lr_10-4_race_distill_epoch{t}"
    model_small_distill = AutoModelForSeq2SeqLM.from_pretrained(model_small_distill, local_files_only=True).to(device)
    scores_distill = []
    progress_bar = tqdm(range(len(books)))
    for i in range(len(books)):
        text = prefix + books[i]["translation"][source_lang]
        ref = books[i]["translation"][target_lang]
        inputs = tokenizer(text, return_tensors="pt").to(device).input_ids
        preds_tokenized = model_small_distill.generate(inputs, max_new_tokens=128, do_sample=False) 
        preds = tokenizer.batch_decode(preds_tokenized)
        
        bleu_score = metric.compute(predictions=preds, references=[ref])
        scores_distill.append(bleu_score["score"])
        progress_bar.update(1)
    print(f"After epoch {t+1} the average score on the test set is {sum(scores_distill)/len(books)}")

  0%|          | 0/3003 [00:00<?, ?it/s]

After epoch 3 the average score on the test set is 0.32699462441924343


In [12]:
for importance in [1e-4, 1e-2, 1e-0]:
    model_small_ewc = f"model/flant5_small_lr_10-4_race_ewc_importance_{'{:.0e}'.format(importance)}_epoch2"
    model_small_ewc = AutoModelForSeq2SeqLM.from_pretrained(model_small_ewc, local_files_only=True).to(device)
    scores_ewc = []
    progress_bar = tqdm(range(len(books)))
    for i in range(len(books)):
        text = prefix + books[i]["translation"][source_lang]
        ref = books[i]["translation"][target_lang]
        inputs = tokenizer(text, return_tensors="pt").to(device).input_ids
        preds_tokenized = model_small_ewc.generate(inputs, max_new_tokens=128, do_sample=False) 
        preds = tokenizer.batch_decode(preds_tokenized)
        
        bleu_score = metric.compute(predictions=preds, references=[ref])
        scores_ewc.append(bleu_score["score"])
        progress_bar.update(1)
    print(f"For importance {importance} the average score on the test set is {sum(scores_ewc)/len(books)}")

  0%|          | 0/3003 [00:00<?, ?it/s]

For importance 0.0001 the average score on the test set is 0.35055862012379657


  0%|          | 0/3003 [00:00<?, ?it/s]

For importance 0.01 the average score on the test set is 0.5082259002421322


  0%|          | 0/3003 [00:00<?, ?it/s]

For importance 1.0 the average score on the test set is 0.2778737692090612
