In [2]:
from datasets import load_dataset

# Locations of the JSON files, one per split
train_file = 'data_tp_train.json'
validation_file = 'data_tp_validation.json'
test_file = 'data_tp_test.json'


def _read_json_split(path):
    """Load a single JSON file and return its contents as one dataset.

    The 'train' split is requested for every file, exactly as the three
    direct `load_dataset` calls this helper replaces did.
    """
    return load_dataset("json", data_files=path, split='train')


# One dataset per JSON file
train_dataset = _read_json_split(train_file)
validation_dataset = _read_json_split(validation_file)
test_dataset = _read_json_split(test_file)

# Keep the three splits together, keyed by split name
split_datasets = dict(
    train=train_dataset,
    validation=validation_dataset,
    test=test_dataset,
)

# Report how many examples each split holds
print(f"Number of training examples: {len(split_datasets['train'])}")
print(f"Number of validation examples: {len(split_datasets['validation'])}")
print(f"Number of test examples: {len(split_datasets['test'])}")


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Number of training examples: 7088
Number of validation examples: 917
Number of test examples: 948


In [5]:
# prepare tokenizer
# AutoTokenizer is imported here so this cell also runs on a fresh kernel;
# no other visible cell brings the name into scope.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-base")
# register "<SEP>" as the tokenizer's separator special token
added_toks = {'sep_token': "<SEP>"}
tokenizer.add_special_tokens(added_toks)
# every input and target sequence is truncated/padded to this many tokens
max_length = 256

# preprocess function tokenizes the input and target fields
def preprocess_function(examples):
    """Tokenize one batch of (tag_description -> thing_property) pairs.

    Parameters
    ----------
    examples : mapping
        A batch whose 'translation' entry is a sequence of records, each with
        a 'tag_description' (model input text) and a 'thing_property'
        (target text).

    Returns
    -------
    The tokenizer output for the input texts (truncated and padded to the
    module-level `max_length`), extended with a 'labels' entry holding the
    target token ids. Padding positions in 'labels' are set to -100 so they
    are ignored by the loss instead of being trained on as real tokens.
    """
    pairs = examples['translation']
    inputs = [ex['tag_description'] for ex in pairs]
    targets = [ex['thing_property'] for ex in pairs]

    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    target_ids = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length").input_ids

    # The targets are padded to a fixed length; without masking, every pad
    # position would contribute to the loss. -100 is the ignore marker that
    # compute_metrics already expects to find in the labels.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in target_ids
    ]
    return model_inputs

# Run the preprocessing over every split; the raw columns are dropped so that
# only what preprocess_function returns is kept
tokenized_datasets = dict(
    (
        name,
        raw_split.map(
            preprocess_function,
            batched=True,
            remove_columns=raw_split.column_names,
        ),
    )
    for name, raw_split in split_datasets.items()
)

# Sanity check: show the first tokenized example of each split
for split in tokenized_datasets:
    print(f"First tokenized example from {split}:")
    print(tokenized_datasets[split][0])

Map:   0%|          | 0/7088 [00:00<?, ? examples/s]

Map:   0%|          | 0/917 [00:00<?, ? examples/s]

Map:   0%|          | 0/948 [00:00<?, ? examples/s]

First tokenized example from train:
{'input_ids': [283, 87, 427, 332, 87, 254, 391, 6218, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [6]:

# Everything this stage needs, imported up front
import evaluate
import numpy as np
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Start from the pre-trained t5-base weights
# NOTE(review): the tokenizer had "<SEP>" added to it, but the model's
# embeddings are not resized here — confirm the new token id is covered.
model_checkpoint = "t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Collator that assembles batches for this tokenizer/model pair
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Metric used by compute_metrics
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    """Decode predictions and references and score them with `metric`.

    Parameters
    ----------
    eval_preds : pair
        `(preds, labels)`. `preds` may itself be a tuple, in which case its
        first element is taken as the predicted token ids. Both arrays may
        contain -100 at positions that carry no token.

    Returns
    -------
    dict
        `{"bleu": <score>}` where the score is the "score" entry of the
        result returned by `metric.compute`.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 cannot be decoded. Labels carry it at ignored positions, and
    # predictions can carry it too when batches of different lengths are
    # padded together, so both are mapped back to the pad token first.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: trim whitespace, and wrap each reference in
    # its own list because the metric takes a list of references per prediction
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

import os

from transformers import Seq2SeqTrainingArguments

# Environment switches that disable GPU p2p mode, for multi-gpu training on
# machines without p2p support. Single-gpu training does not need them.
os.environ.update({
    'NCCL_P2P_DISABLE': '1',
    'NCCL_IB_DISABLE': '1',
})

args = Seq2SeqTrainingArguments(
    output_dir="tag_description_to_thing",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=40,  # number of epochs to train
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    bf16=True,  # disable if gpu doesn't support bfloat16
    # evaluation / generation
    evaluation_strategy="no",  # no evaluation is scheduled during training
    predict_with_generate=True,
    # logging and checkpoints
    logging_dir="tensorboard-log",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,  # sets number of checkpoints to save
    push_to_hub=False,
)

from transformers import Seq2SeqTrainer

# Wire the model, training arguments, tokenized splits, collator and metric
# function into a single seq2seq trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Start fine-tuning
trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/8880 [00:00<?, ?it/s]