In [16]:
!pip install transformers -q

In [17]:
import os
os.chdir('/content/drive/MyDrive/doutorado/P_IA368DD_2023S/aula4.5')

import gc

In [18]:
import glob

import pickle

import numpy as np

In [19]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, default_data_collator, TrainerCallback
from transformers import EarlyStoppingCallback

from transformers import pipeline

In [20]:
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [11]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, samples_filenames, sampling_size=None):
        self.samples_filenames = samples_filenames
        self.sampling_size = sampling_size

        self.count = -1

        self.data = None

        self.read_sample()

    def read_sample(self):
      self.count = self.count+1

      if self.count >= len(self.samples_filenames):
        self.count = 0
      
      with open(self.samples_filenames[self.count], "rb") as inputFile:
        data = pickle.load(inputFile)
      selected_samples = np.random.choice(list(range(len(data))), self.sampling_size, replace=False)
      self.data = [data[i] for i in selected_samples]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):

        return {'input_ids': self.data[idx]['input_ids'],
                'attention_mask': self.data[idx]['attention_mask'],
                'labels': self.data[idx]['input_ids'].copy()}

In [7]:
data_files = glob.glob("./normalized/*")

train_dataset = CustomDataset(data_files[:-1], sampling_size=3000)

#fix one file to be used as validation
eval_dataset = CustomDataset(data_files[-1:], sampling_size=1000)

In [8]:
class DatasetUpdaterCallback(TrainerCallback):

    def on_epoch_end(self, args, state, control, **kwargs):
        train_dataset.read_sample()
trainer_callback = DatasetUpdaterCallback()

In [9]:
batch_size=12

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device)
training_params = TrainingArguments(output_dir="./trained_model_default_loss",
                                    num_train_epochs=25,
                                    per_device_train_batch_size=batch_size,
                                    per_device_eval_batch_size=batch_size,
                                    evaluation_strategy='epoch',
                                    save_strategy='epoch',
                                    logging_strategy='steps',
                                    logging_steps=10,
                                    save_total_limit=10,
                                    # report_to='comet_ml',
                                    learning_rate=2e-4,
                                    weight_decay=1e-2,
                                    dataloader_num_workers=4,
                                    dataloader_pin_memory=False,
                                    optim='adamw_torch',
                                    fp16=True,
                                    load_best_model_at_end=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/651 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [10]:
trainer = Trainer(model=model,
                     args=training_params,
                     train_dataset=train_dataset,
                     eval_dataset=eval_dataset,
                     callbacks=[trainer_callback,EarlyStoppingCallback(early_stopping_patience=3)]
                     )
train_result = trainer.train()



Epoch,Training Loss,Validation Loss
1,2.987,2.895562
2,2.8154,2.803
3,2.7983,2.72731
4,2.7469,2.682158
5,2.7691,2.637743
6,2.7571,2.597018
7,2.6495,2.571693
8,2.5778,2.543132
9,2.5988,2.519186
10,2.6198,2.492544




In [14]:
from torch import nn
from transformers import Trainer

perps = []

class CustomTrainer(Trainer):
    global perps;
    def compute_loss(self, model, inputs, return_outputs=False):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.
        Subclass and override for custom behavior.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            #if unwrap_model(model)._get_name() in super().MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
            #    loss = self.label_smoother(outputs, labels, shift_labels=True)
            #else:
            loss = self.label_smoother(outputs, labels)
        else:
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        exp_loss = torch.exp(loss)
        
        perps.append(exp_loss)
        return (exp_loss, outputs) if return_outputs else exp_loss

In [15]:
batch_size=12

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device)
training_params = TrainingArguments(output_dir="./trained_model_exp_loss",
                                    num_train_epochs=25,
                                    per_device_train_batch_size=batch_size,
                                    per_device_eval_batch_size=batch_size,
                                    evaluation_strategy='epoch',
                                    save_strategy='epoch',
                                    logging_strategy='steps',
                                    logging_steps=10,
                                    save_total_limit=10,
                                    # report_to='comet_ml',
                                    learning_rate=2e-4,
                                    weight_decay=1e-2,
                                    dataloader_num_workers=4,
                                    dataloader_pin_memory=False,
                                    optim='adamw_torch',
                                    fp16=True,
                                    load_best_model_at_end=True)


In [16]:
#resume_from_checkpoint="trained_model/checkpoint-5000-1.9466"
checkpoints = glob.glob("./trained_model_exp_loss/*")
checkpoints

['./trained_model_exp_loss/runs',
 './trained_model_exp_loss/checkpoint-250',
 './trained_model_exp_loss/checkpoint-500',
 './trained_model_exp_loss/checkpoint-750',
 './trained_model_exp_loss/checkpoint-1000',
 './trained_model_exp_loss/checkpoint-1250']

In [17]:
trainer = CustomTrainer(model=model,
                     args=training_params,
                     train_dataset=train_dataset,
                     eval_dataset=eval_dataset,
                     callbacks=[trainer_callback,EarlyStoppingCallback(early_stopping_patience=3)]
                     )
train_result = trainer.train(resume_from_checkpoint=checkpoints[-1])



0it [00:00, ?it/s]



Epoch,Training Loss,Validation Loss
6,14.8874,14.002252
7,12.6591,13.496645
8,13.5714,13.160523
9,12.6765,12.818531
10,12.4387,12.553758
11,12.1006,12.281002
12,12.6543,12.037523
13,13.0456,11.825619
14,11.4386,11.654285
15,11.2986,11.496914




In [4]:
%%markdown
# Visualize results

# Visualize results


In [21]:
frase = "Eu não entreguei a atividade porque,"


#load model with custom Loss

checkpoints = glob.glob("./trained_model_exp_loss/*")
checkpoints

model_exp = AutoModelForCausalLM.from_pretrained(checkpoints[-1])

[{'generated_text': 'Eles se atrasaram para o treino porque o time de futebol não conseguiu. O time de futebol não conseguiu. O time de futebol n'}]

In [25]:
generator = pipeline('text-generation', model=model_exp, tokenizer=tokenizer, max_length=30)
print(generator(frase))

[{'generated_text': 'Eu não entreguei a atividade porque, por exemplo, tenho que fazer o que eu quero'}]

In [None]:
#load model with default Loss

checkpoints = glob.glob("./trained_model_default_loss/*")
checkpoints

model_default = AutoModelForCausalLM.from_pretrained(checkpoints[-1])

In [31]:
generator = pipeline('text-generation', model=model_default, tokenizer=tokenizer, max_length=37)
print(generator(frase))

[{'generated_text': 'Eu não entreguei a atividade porque, como eu, não tenho nenhuma experiência de trabalho.'}]

In [32]:
#load model from "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

Downloading pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [35]:
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, max_length=35)
print(generator(frase))

[{'generated_text': 'Eu não entreguei a atividade porque, porque não sei, não sei, não sei, não se'}]