In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Identifier of the pretrained CodeT5 checkpoint to fine-tune.
# ('Salesforce/codet5-small' can be substituted here for a smaller model.)
model_checkpoint = 'Salesforce/codet5-base'

# The tokenizer and the seq2seq model are both resolved from the same
# checkpoint identifier.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint,
)

2023-02-09 00:34:35.990379: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import os
from datasets import Dataset

# Root folder of the preprocessed methods2test data, under the current user's
# home directory. os.path.expanduser('~') is used rather than
# os.getenv('HOME'): getenv returns None when HOME is not set (e.g. on
# Windows), which would make os.path.join raise a TypeError.
save_root = os.path.join(os.path.expanduser('~'), 'datasets', 'methods2test', 'preprocessed')

# Training split, read from <save_root>/train.
train_data_folder = os.path.join(save_root, 'train')
train_dataset = Dataset.load_from_disk(train_data_folder)

# Evaluation split, read from <save_root>/eval.
eval_data_folder = os.path.join(save_root, 'eval')
eval_dataset = Dataset.load_from_disk(eval_data_folder)

In [3]:
from datasets import DatasetDict

# Bundle both splits under named keys so later steps can process them together.
dataset = DatasetDict(
    {
        "train": train_dataset,
        "eval": eval_dataset,
    }
)

In [4]:
# Token budgets passed to the tokenizer as max_length (with truncation=True):
# one for the 'source' text, one for the 'target' text.
max_input_length = 128
max_target_length = 256


def tokenize_function(example):
    """Tokenize a source/target pair into model inputs with labels.

    Args:
        example: Mapping with a 'source' entry (tokenized as the model input)
            and a 'target' entry (tokenized as the labels).

    Returns:
        The tokenizer output for 'source', with an added "labels" entry
        holding the "input_ids" produced by tokenizing 'target'.
    """
    model_inputs = tokenizer(example['source'], max_length=max_input_length, truncation=True)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager. It is a separate call (not merged with the one above) because
    # source and target use different max_length values.
    labels = tokenizer(text_target=example['target'], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Sanity check on a single training row: show the raw encoding, then decode
# the input ids and the label ids back to text, each separated by a blank line.
example_input = tokenize_function(dataset['train'][666])
print(example_input, end='\n\n')
print(tokenizer.decode(example_input['input_ids']), end='\n\n')
print(tokenizer.decode(example_input['labels']))

{'input_ids': [1, 36, 6618, 1071, 918, 1086, 861, 12212, 12, 780, 1981, 16, 514, 17833, 13, 1216, 25793, 321, 2278, 10215, 503, 288, 25852, 1719, 1482, 273, 13024, 1482, 12, 4937, 16, 17833, 1769, 309, 16051, 23422, 18, 20305, 18, 14963, 12, 4688, 1482, 3719, 288, 604, 394, 25793, 321, 2278, 10215, 503, 12, 3589, 67, 5572, 67, 16234, 67, 5519, 16, 514, 18, 2139, 2932, 169, 112, 103, 165, 126, 255, 165, 121, 253, 9275, 87, 13, 165, 121, 240, 166, 102, 231, 165, 123, 241, 20305, 168, 237, 119, 167, 227, 228, 176, 125, 239, 165, 121, 240, 169, 230, 126, 168, 109, 238, 166, 240, 116, 167, 236, 105, 169, 99, 239, 3113, 17833, 10019, 289, 987, 32, 2278, 2081, 1482, 34, 1719, 11913, 1482, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1



In [5]:
# Tokenize every split. batched=True passes batches of rows to
# tokenize_function, so the tokenizer encodes many texts per call instead of
# being invoked once per row; the resulting columns are the same.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets



DatasetDict({
    train: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 148570
    })
    eval: Dataset({
        features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 78534
    })
})

In [6]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

batch_size = 4
# Last path component of the checkpoint id; used to name the output directory.
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-junit",
    evaluation_strategy="epoch",
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    # Validation loss can start rising while training loss keeps falling.
    # Without these settings the final weights are those of the last epoch,
    # and save_total_limit may rotate out the best checkpoint. Track
    # eval_loss (lower is better), keep the best checkpoint, and reload it
    # when training finishes. This requires save_strategy to match
    # evaluation_strategy, which it does above.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Collator that assembles batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Wire the model, training arguments, tokenized splits, and collator together.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    tokenizer=tokenizer,
)

Using cuda_amp half precision backend


In [7]:
# Run fine-tuning with the trainer configured above. As the cell's last
# expression, the returned training summary is displayed as the cell output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source. If target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 148570
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 371430
  Number of trainable parameters = 222882048
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,1.3327,1.573798
2,1.1309,1.556517
3,1.0226,1.563772
4,0.9429,1.581386
5,0.8766,1.587648
6,0.822,1.595827
7,0.7725,1.610999
8,0.7726,1.617696
9,0.7301,1.628309
10,0.7047,1.634362


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: target, source. If target, source are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 78534
  Batch size = 4
Saving model checkpoint to codet5-base-finetuned-junit/checkpoint-37143
Configuration saved in codet5-base-finetuned-junit/checkpoint-37143/config.json
Configuration saved in codet5-base-finetuned-junit/checkpoint-37143/generation_config.json
Model weights saved in codet5-base-finetuned-junit/checkpoint-37143/pytorch_model.bin
tokenizer config file saved in codet5-base-finetuned-junit/checkpoint-37143/tokenizer_config.json
Special tokens file saved in codet5-base-finetuned-junit/checkpoint-37143/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have bee

TrainOutput(global_step=371430, training_loss=0.9517405450868833, metrics={'train_runtime': 59643.5073, 'train_samples_per_second': 24.91, 'train_steps_per_second': 6.228, 'total_flos': 2.109574760398848e+17, 'train_loss': 0.9517405450868833, 'epoch': 10.0})