In [2]:
import sys

# Make modules under ../src importable (this extends sys.path; it does not change the working directory).
# NOTE(review): later cells import both `model.*` and `src.*`; the latter presumably needs the
# repository root on sys.path as well -- confirm.
sys.path.append("../src")

In [2]:
from model.load_model import load_model_for_training
import random
import tempfile
import json

from torch.utils.data import DataLoader
from src.dataset.dataset import DataCollatorForCoLLIE

import logging
import os


import torch
import torch.nn as nn


from src.dataset.dataset import CollieDataset
from transformers import (
    PreTrainedTokenizerBase,
)
from transformers.modeling_utils import unwrap_model
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES

logging.basicConfig(level=logging.INFO)

In [3]:
# Load the GPT-Neo 125M checkpoint in float32, with int8 quantization and LoRA both disabled.
model, tokenizer = load_model_for_training(
    model_weights_name_or_path="EleutherAI/gpt-neo-125m",
    int8_quantization=False,
    use_lora=False,
    torch_dtype="float32",
)

INFO:root:Loading model model from EleutherAI/gpt-neo-125m
INFO:root:Loading model with dtype: torch.float32
INFO:root:---> Trainable params: 125198592 || all params: 125198592 || trainable%: 100.0



In [4]:
def get_dataset(
    tokenizer: PreTrainedTokenizerBase,
    is_encoder_decoder: bool,
    inference: bool,
    prompt_loss_weight: float,
    num_epochs: int = -1,
) -> (CollieDataset, str, str):
    """Build a one-example ``CollieDataset`` from a fixed event-extraction sample.

    The sample is serialised as a JSON line into a temporary directory, loaded
    through ``CollieDataset`` and returned together with the prompt part and
    the expected-result part of the same text.

    Args:
        tokenizer: Tokenizer handed to ``CollieDataset``.
        is_encoder_decoder: Forwarded unchanged to ``CollieDataset``.
        inference: Forwarded unchanged to ``CollieDataset``.
        prompt_loss_weight: Forwarded unchanged to ``CollieDataset``.
        num_epochs: ``-1`` writes a single ``tmp.ee.train.jsonl`` file; any
            other value writes ``num_epochs`` files named
            ``tmp.ee.train.<seed>.jsonl`` with randomly sampled seeds.

    Returns:
        The tuple ``(dataset, prompt, result)``.
    """
    text = """@dataclass
class EnergyAndInfrastructureEvent:
    \"\"\"This class is used to instantiate events that involve Chinese energy and infrastructure projects.\"\"\"
    meeting_attendees: Union[List[str], None] # Persons or organizations that attended the meeting.
    meeting_location: Union[List[str], None] # Location where the meeting happened.
    meeting_topic: Union[List[str], None] # Topic discussed on the meeting
    project_location: Union[List[str], None] # Location of the project
    project_name: Union[List[str], None] # Name of the project

# This is the sentence to analyze
sentence = "The Chinese and Rongovian delegations met at the sidelines of the Berlin Development Futures conference to discuss Rongovia's proposed Pangean Reunification Facility.

# The following list contains the events instances that happens in the sentence defined above
result = [
    EnergyAndInfrastructureEvent(
        meeting_attendees=["Chinese", "Rongovian"],
        meeting_location=["Berlin"],
        meeting_topic=["Pangean Reunification Facility"],
        project_location=["Rongovia"],
        project_name=["Pangean Reunification Facility"]
    ),
]"""

    prompt = """@dataclass
class EnergyAndInfrastructureEvent:
    \"\"\"This class is used to instantiate events that involve Chinese energy and infrastructure projects.\"\"\"
    meeting_attendees: Union[List[str], None] # Persons or organizations that attended the meeting.
    meeting_location: Union[List[str], None] # Location where the meeting happened.
    meeting_topic: Union[List[str], None] # Topic discussed on the meeting
    project_location: Union[List[str], None] # Location of the project
    project_name: Union[List[str], None] # Name of the project

# This is the sentence to analyze
sentence = "The Chinese and Rongovian delegations met at the sidelines of the Berlin Development Futures conference to discuss Rongovia's proposed Pangean Reunification Facility.

# The following list contains the events instances that happens in the sentence defined above
result = ["""
    result = """
    EnergyAndInfrastructureEvent(
        meeting_attendees=["Chinese", "Rongovian"],
        meeting_location=["Berlin"],
        meeting_topic=["Pangean Reunification Facility"],
        project_location=["Rongovia"],
        project_name=["Pangean Reunification Facility"]
    ),
]"""

    # Every file written below holds the same single JSON line.
    record = json.dumps({"text": text}, ensure_ascii=False)

    single_file = num_epochs == -1
    # One randomly sampled suffix per epoch; not needed for the single-file layout.
    epoch_seeds = [] if single_file else random.sample(range(0, 100000), num_epochs)

    with tempfile.TemporaryDirectory() as workdir:
        if single_file:
            filenames = ["tmp.ee.train.jsonl"]
        else:
            filenames = [f"tmp.ee.train.{seed}.jsonl" for seed in epoch_seeds]

        for filename in filenames:
            with open(os.path.join(workdir, filename), "w", encoding="utf8") as handle:
                print(record, file=handle)

        # NOTE(review): in the per-epoch layout only `tmp.ee.train.<seed>.jsonl` files exist, yet the
        # un-suffixed path is still passed; presumably CollieDataset resolves the per-epoch files
        # itself -- confirm.
        # The dataset is built while the directory still exists; the directory is removed on exit.
        dataset = CollieDataset(
            tokenizer=tokenizer,
            dataset_path=os.path.join(workdir, "tmp.ee.train.jsonl"),
            is_encoder_decoder=is_encoder_decoder,
            max_length=2048,
            inference=inference,
            prompt_loss_weight=prompt_loss_weight,
        )

    return dataset, prompt, result

In [6]:
# Build the single-example dataset with a prompt loss weight of 0.0.
dataset, prompt, result = get_dataset(
    tokenizer=tokenizer,
    is_encoder_decoder=False,
    inference=False,
    prompt_loss_weight=0.0,
)
# Collate into PyTorch tensors, padding the length to a multiple of 9 and padding labels with -100.
datacollator = DataCollatorForCoLLIE(
    tokenizer,
    pad_to_multiple_of=9,
    return_tensors="pt",
    padding=True,
    label_pad_token_id=-100,
)
dataloader = DataLoader(dataset, batch_size=1, collate_fn=datacollator, shuffle=False)
# Only the first batch is needed, so fetch it without materialising the whole loader.
inputs = next(iter(dataloader))
inputs

INFO:root:Loaded [1] examples from /tmp/tmpbfnhlt4f/tmp.ee.train.jsonl


{'input_ids': tensor([[50256, 50256, 50256, 50256, 50256, 50256,    31, 19608,   330, 31172,
           198,  4871,  6682,  1870, 18943,  6410,  9237,    25,   198,   220,
           220,   220, 37227,  1212,  1398,   318,   973,   284,  9113,  9386,
          2995,   326,  6211,  3999,  2568,   290,  6884,  4493,   526, 15931,
           198,   220,   220,   220,  3249,    62,  1078,   437,  2841,    25,
          4479,    58,  8053,    58,  2536,  4357,  6045,    60,  1303, 32884,
           393,  5745,   326,  9141,   262,  3249,    13,   198,   220,   220,
           220,  3249,    62, 24886,    25,  4479,    58,  8053,    58,  2536,
          4357,  6045,    60,  1303, 13397,   810,   262,  3249,  3022,    13,
           198,   220,   220,   220,  3249,    62, 26652,    25,  4479,    58,
          8053,    58,  2536,  4357,  6045,    60,  1303, 47373,  6693,   319,
           262,  3249,   198,   220,   220,   220,  1628,    62, 24886,    25,
          4479,    58,  8053,    58,  

In [7]:
# Split the loss-only tensors off the batch so that `inputs` holds only what is passed to the model.
# Note that this mutates `inputs`: re-running the cell on the same batch raises the ValueError below.
if "labels" not in inputs:
    raise ValueError("You should supply a labels key to compute the loss")
labels = inputs.pop("labels")

if "loss_weight_mask" not in inputs:
    raise ValueError("You should supply a loss_weight_mask key to compute the loss")
loss_weight_mask = inputs.pop("loss_weight_mask")

print(labels.size(), loss_weight_mask.size(), sep="\n")

torch.Size([1, 342])
torch.Size([1, 342])


In [8]:
# Forward pass; `labels` and `loss_weight_mask` were popped from `inputs` in the previous cell,
# so only the remaining tensors are passed to the model.
outputs = model(**inputs)

In [9]:
# Dict-like outputs expose the logits by key; otherwise they are taken from the first position.
if isinstance(outputs, dict):
    logits = outputs["logits"]
else:
    logits = outputs[0]
logits.size()

torch.Size([1, 342, 50257])

In [10]:
# For causal LMs the logits at position t are scored against the label at t + 1:
# drop the last logit and the first label / loss weight so the three tensors line up.
model_class_name = unwrap_model(model)._get_name()
if model_class_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
    logits = logits[..., :-1, :].contiguous()
    labels = labels[..., 1:].contiguous()
    loss_weight_mask = loss_weight_mask[..., 1:].contiguous()

for tensor in (logits, labels, loss_weight_mask):
    print(tensor.size())

torch.Size([1, 341, 50257])
torch.Size([1, 341])
torch.Size([1, 341])


In [11]:
# Collapse all leading dimensions: logits keep only their last dimension, labels and weights become 1-D.
last_dim = logits.shape[-1]
logits = logits.view(-1, last_dim)
labels = labels.view(-1)
loss_weight_mask = loss_weight_mask.view(-1)
for tensor in (logits, labels, loss_weight_mask):
    print(tensor.size())

torch.Size([341, 50257])
torch.Size([341])
torch.Size([341])


In [12]:
# reduction="none" keeps one loss value per position so the values can be weighted afterwards;
# positions whose label is -100 are ignored.
loss_fct = nn.CrossEntropyLoss(reduction="none", ignore_index=-100)

In [13]:
# Unreduced cross-entropy over the flattened logits and labels.
loss = loss_fct(logits, labels)

In [14]:
# Inspect the per-position losses (the leading zeros are presumably the ignored padding positions -- confirm).
loss

tensor([0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 9.8315e+00,
        1.6074e+01, 1.6043e+01, 1.0794e+01, 8.3698e+00, 1.1856e+01, 1.9735e+01,
        1.6643e+01, 1.3181e+01, 8.2464e+00, 1.3900e+01, 9.9957e+00, 5.1374e+00,
        5.3391e+00, 9.6167e-03, 3.7920e-03, 9.6166e+00, 8.7823e+00, 6.6886e+00,
        1.2510e+00, 4.2652e+00, 4.9724e-01, 7.0480e+00, 1.5290e-01, 7.4130e+00,
        2.5229e+00, 7.7785e+00, 1.4144e+01, 8.8306e+00, 5.5952e+00, 1.0215e+01,
        5.8465e+00, 5.6785e+00, 1.2618e+00, 3.1476e+00, 1.0898e-01, 1.9258e-02,
        2.0246e-03, 1.7474e+01, 3.1969e+00, 5.5917e+00, 2.4213e+00, 3.8305e+00,
        1.2278e+00, 1.3705e+01, 6.0360e+00, 4.1070e+00, 2.2169e-01, 9.5657e+00,
        2.9601e+00, 5.6839e+00, 1.3160e+00, 5.4705e+00, 1.2867e+01, 6.4238e+00,
        8.1995e+00, 2.7307e+00, 8.1028e+00, 2.7980e+00, 6.8936e-02, 1.8049e+00,
        5.0291e-01, 2.7355e-02, 6.3982e-03, 7.1464e-04, 7.8822e-01, 2.3601e-01,
        9.1245e+00, 5.8066e-02, 1.6049e+

In [15]:
# Weighted mean: each position's loss is scaled by its weight and normalised by the total weight.
weighted_total = (loss * loss_weight_mask).sum()
weight_total = loss_weight_mask.sum()
loss_w = weighted_total / weight_total
print(loss_w)

tensor(0.8267, grad_fn=<DivBackward0>)


In [16]:
# Unweighted average over every position in the flattened loss tensor.
num_positions = loss.shape[0]
loss_m = loss.sum() / num_positions
print(loss_m)

tensor(2.7123, grad_fn=<DivBackward0>)


# Test trainer loss

In [3]:
from src.tests.test_dataset import get_dataset
from src.trainer import CollieTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, Seq2SeqTrainingArguments, Trainer
import torch
from torch.utils.data import DataLoader
from src.dataset.dataset import DataCollatorForCoLLIE
import os
from model.load_model import load_model_for_training

In [30]:
# Load the model and tokenizer for the trainer-loss comparison (same checkpoint and settings as above).
model, tokenizer = load_model_for_training(
    model_weights_name_or_path="EleutherAI/gpt-neo-125m",
    int8_quantization=False,
    use_lora=False,
    torch_dtype="float32",
)



In [32]:
# Only the output directory is set; every other training argument keeps its default.
training_args = Seq2SeqTrainingArguments(
    output_dir="test",
)

In [33]:
# Both trainers wrap the same `model` object; each is moved to the CPU right after construction.
collie_trainer = CollieTrainer(model=model, args=training_args)
collie_trainer.model = collie_trainer.model.to("cpu")
trainer = Trainer(model=model, args=training_args)
trainer.model = trainer.model.to("cpu")

# Single-example dataset with a prompt loss weight of 0.0.
dataset, prompt, result = get_dataset(
    tokenizer=tokenizer,
    is_encoder_decoder=False,
    inference=False,
    prompt_loss_weight=0.0,
)

# Pad the batch length to a multiple of 2048; labels are padded with -100.
datacollator = DataCollatorForCoLLIE(
    tokenizer,
    pad_to_multiple_of=2048,
    return_tensors="pt",
    padding=True,
    label_pad_token_id=-100,
)

dataloader = DataLoader(dataset, batch_size=1, collate_fn=datacollator, shuffle=False)
# Keep only the first batch.
inputs = list(dataloader)[0]

Output()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [35]:
# Peek at the padded input ids of the batch.
inputs.input_ids

tensor([[50256, 50256, 50256,  ...,   198,    60, 50256]])

In [36]:
# Loss as computed by CollieTrainer. A copy of the batch is passed so that `inputs` keeps all
# of its keys for the cells below (presumably compute_loss pops keys from its argument -- confirm).
collie_loss = collie_trainer.compute_loss(model=model, inputs=inputs.copy(), return_outputs=False)
collie_loss

tensor(1.1928, grad_fn=<DivBackward0>)

In [40]:
# Print, position by position: the decoded label, its id, its loss weight, and the id the tokenizer
# produces for `prompt + result`.
# The labels are read straight from `inputs`, so this cell does not depend on a `labels` variable
# that is only defined in a later cell.
# NOTE(review): the reference ids are offset by one, presumably because the dataset appends an EOS
# token that the plain tokenizer call does not -- confirm.
reference_ids = tokenizer(
    prompt + result, return_tensors=None, add_special_tokens=True, padding=True, pad_to_multiple_of=2048
)["input_ids"][1:]

for label, weight, reference_id in zip(inputs.labels[0], inputs.loss_weight_mask[0], reference_ids):
    label_id = label.item()
    # Negative ids (-100) mark ignored positions; id 0 is a regular vocabulary entry and is decoded.
    token = tokenizer.decode(label_id) if label_id >= 0 else "PAD"
    print(repr(token), label_id, weight.item(), reference_id)

'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.0 50256
'PAD' -100 0.

In [41]:
# Work on a copy so the labels stored in `inputs` stay untouched.
labels = inputs.labels.clone()
# Every position whose loss weight is below 1.0 is masked out with the ignore index.
not_result_mask = inputs["loss_weight_mask"] < 1.0
labels[not_result_mask] = -100

# NOTE(review): the reference ids are offset by one, presumably because the dataset appends an EOS
# token that the plain tokenizer call does not -- confirm.
reference_ids = tokenizer(
    prompt + result, return_tensors=None, add_special_tokens=True, padding=True, pad_to_multiple_of=2048
)["input_ids"][1:]

# Show only the positions that still carry a real label after masking.
for label, weight, reference_id in zip(labels[0], inputs.loss_weight_mask[0], reference_ids):
    label_id = label.item()
    if label_id < 0:
        continue
    # Every id that reaches this point is a valid token id (including 0), so it is always decoded.
    print(repr(tokenizer.decode(label_id)), label_id, weight.item(), reference_id)

'\n' 198 1.0 198
' ' 220 1.0 220
' ' 220 1.0 220
' ' 220 1.0 220
' Energy' 6682 1.0 6682
'And' 1870 1.0 1870
'Inf' 18943 1.0 18943
'rastructure' 6410 1.0 6410
'Event' 9237 1.0 9237
'(' 7 1.0 7
'\n' 198 1.0 198
' ' 220 1.0 220
' ' 220 1.0 220
' ' 220 1.0 220
' ' 220 1.0 220
' ' 220 1.0 220
' ' 220 1.0 220
' ' 220 1.0 220
' meeting' 3249 1.0 3249
'_' 62 1.0 62
'att' 1078 1.0 1078
'end' 437 1.0 437
'ees' 2841 1.0 2841
'=' 28 1.0 28
'["' 14692 1.0 14692
'Chinese' 23604 1.0 23604
'",' 1600 1.0 1600
' "' 366 1.0 366
'R' 49 1.0 49
'ong' 506 1.0 506
'ov' 709 1.0 709
'ian' 666 1.0 666
'"],' 33116 1.0 33116
'\n' 198 1.0 198
' ' 220 1.0 220
' ' 220 1.0 220
' ' 220 1.0 220
' ' 220 1.0 220
' ' 220 1.0 220
' ' 220 1.0 220
' ' 220 1.0 220
' meeting' 3249 1.0 3249
'_' 62 1.0 62
'location' 24886 1.0 24886
'=' 28 1.0 28
'["' 14692 1.0 14692
'Ber' 24814 1.0 24814
'lin' 2815 1.0 2815
'"],' 33116 1.0 33116
'\n' 198 1.0 198
' ' 220 1.0 220
' ' 220 1.0 220
' ' 220 1.0 220
' ' 220 1.0 220
' ' 220 1.0 220
' ' 

In [42]:
# Reference value: the model's own loss on labels where every position with loss weight < 1.0 is
# set to -100. The printed value matches `collie_loss` above (both 1.1928).
model_loss = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, labels=labels).loss
model_loss

tensor(1.1928, grad_fn=<NllLossBackward0>)