In [1]:
import gc
import os
from pathlib import Path
from dataclasses import replace
import json

In [2]:
import transformers
import evaluate
from transformers import T5ForConditionalGeneration, T5TokenizerFast
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from transformers import TrainerCallback

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [3]:
import numpy as np
import torch
from datasets import load_dataset
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader

In [4]:
class Vetorizer(IterableDataset):
    """Iterable dataset that tokenizes summarization examples on the fly.

    Each source example is expected to be a mapping with a ``'text'`` entry
    (a list of strings) and a ``'label'`` entry (the reference summary).
    The text pieces are joined with single spaces and prefixed with
    ``"summarize: "`` before tokenization.

    Args:
        tokenizer: Callable tokenizer; called with ``padding='max_length'``,
            ``max_length=...`` and ``truncation=True`` and expected to return
            a mapping with ``'input_ids'`` and ``'attention_mask'``.
        dataset: Re-iterable source of examples (``iter(dataset)`` is called
            once per pass over the data).
        seq_length: Mapping with ``'encoder'`` and ``'decoder'`` maximum
            lengths.
        total_count: Number of examples reported by ``len()`` and yielded per
            call to ``__iter__``. ``None`` means "cycle forever".
    """

    # Label value ignored by the cross-entropy loss of Hugging Face seq2seq models.
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, dataset, seq_length, total_count):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.seq_length = seq_length
        self.total_count = total_count

    def _encode(self, example):
        """Tokenize one raw example into ``input_ids`` / ``attention_mask`` / ``labels``."""
        # Build the prompt without touching ``example['text']``: mutating it in
        # place would prepend the prefix again every time an in-memory dataset
        # is cycled.
        source_text = "summarize: " + " ".join(example['text'])
        # Use the tokenizer handed to this instance, not a notebook global.
        encoder_inputs = self.tokenizer(
            source_text,
            padding='max_length',
            max_length=self.seq_length['encoder'],
            truncation=True,
        )
        decoder_targets = self.tokenizer(
            example['label'],
            padding='max_length',
            max_length=self.seq_length['decoder'],
            truncation=True,
        )
        # Mask padded label positions so the loss is not dominated by padding.
        labels = [
            token_id if keep else self.IGNORE_INDEX
            for token_id, keep in zip(
                decoder_targets['input_ids'], decoder_targets['attention_mask']
            )
        ]
        return {
            'input_ids': encoder_inputs['input_ids'],
            'attention_mask': encoder_inputs['attention_mask'],
            'labels': labels,
        }

    def __iter__(self):
        """Yield ``total_count`` encoded examples, cycling the source if it is shorter.

        Stopping after ``total_count`` items keeps iteration consistent with
        ``__len__``, so loops that run a dataloader to exhaustion (such as
        evaluation) terminate.
        """
        limit = self.total_count
        emitted = 0
        while limit is None or emitted < limit:
            produced_this_pass = False
            for example in self.dataset:
                yield self._encode(example)
                produced_this_pass = True
                emitted += 1
                if limit is not None and emitted >= limit:
                    return
            if not produced_this_pass:
                # Empty source: stop instead of spinning forever.
                return

    def __len__(self):
        return self.total_count

In [5]:
def create_dataset(tokenizer, domain_data, args, seq_length):
    """Build streaming train/eval ``Vetorizer`` datasets from JSON files.

    Args:
        tokenizer: Tokenizer forwarded to ``Vetorizer``.
        domain_data: Mapping of split name to JSON/JSONL path; the ``'train'``
            and ``'valid'`` splits are read here.
        args: Unused; kept so existing callers keep working.
        seq_length: Mapping with ``'encoder'`` / ``'decoder'`` max lengths,
            forwarded to ``Vetorizer``.

    Returns:
        Tuple ``(train_dataset, eval_dataset)``.
    """

    def _count_examples(split):
        # A streaming dataset has no length, so load the split once in
        # non-streaming mode just to count it, then release it right away.
        materialized = load_dataset('json', data_files=domain_data, split=split, streaming=False)
        count = len(materialized)
        del materialized
        gc.collect()
        return count

    train_data = load_dataset('json', data_files=domain_data, split='train', streaming=True)
    total_train_data_cnt = _count_examples('train')

    eval_data = load_dataset('json', data_files=domain_data, split='valid', streaming=True)
    total_eval_data_cnt = _count_examples('valid')

    train_dataset = Vetorizer(tokenizer, train_data, seq_length, total_train_data_cnt)
    # The eval dataset must report the size of the validation split, not the
    # training split.
    eval_dataset = Vetorizer(tokenizer, eval_data, seq_length, total_eval_data_cnt)

    return train_dataset, eval_dataset

In [6]:
def compute_rouge_scores(references, candidate):
    """Score ``candidate`` summaries against ``references`` with ROUGE.

    Loads the "rouge" metric through ``evaluate`` and requests the rouge1,
    rouge2, rougeL and rougeLsum variants with stemming enabled.

    Returns:
        Whatever the metric's ``compute`` call returns.
    """
    metric = evaluate.load("rouge")
    compute_kwargs = {
        "predictions": candidate,
        "references": references,
        "rouge_types": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
        "use_stemmer": True,
    }
    return metric.compute(**compute_kwargs)

def _rouge_fmeasure(score):
    """Extract the F-measure from one ROUGE result, whatever its shape.

    Handles a plain number, an object exposing ``.fmeasure``, and an
    aggregate object exposing ``.mid.fmeasure``.
    """
    mid = getattr(score, "mid", None)
    if mid is not None:
        score = mid
    fmeasure = getattr(score, "fmeasure", None)
    # Compare against None explicitly: a legitimate F-measure of 0.0 is falsy.
    return float(score if fmeasure is None else fmeasure)


def compute_metrics(eval_preds):
    """Decode predictions/labels and return ROUGE F-measures scaled to 0-100.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays;
            ``predictions`` may itself be a tuple whose first item holds the
            token ids.

    Returns:
        Dict mapping each ROUGE key to its F-measure multiplied by 100.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so swap
    # it for the pad id on both arrays before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    raw_scores = compute_rouge_scores(
        references=decoded_labels,
        candidate=decoded_preds
    )

    return {key: _rouge_fmeasure(score) * 100 for key, score in raw_scores.items()}

In [9]:
# Checkpoint used for both the tokenizer and the model weights.
model_ckpt = 'paust/pko-t5-small'
tokenizer = T5TokenizerFast.from_pretrained(model_ckpt)

# One JSONL file per split, all under a directory named after the domain.
domain_name = 'law'
domain_data = {
    split: f'{domain_name}/{split}.jsonl'
    for split in ('train', 'valid', 'test')
}

# Maximum token lengths for the encoder input and the decoder target.
seq_length = dict(encoder=2048, decoder=512)

final_args = Seq2SeqTrainingArguments(
    output_dir="./new_text_summarize_model",
    # schedule
    num_train_epochs=1,
    learning_rate=3e-5,
    warmup_steps=500,
    # batching
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    # evaluation / logging / checkpoint cadence
    evaluation_strategy="steps",
    eval_steps=200,
    logging_steps=100,
    save_steps=500,
    # half precision is switched on exactly when CUDA is available
    fp16=torch.cuda.is_available(),
    # generation settings used during evaluation
    predict_with_generate=True,
    generation_max_length=512,
    generation_num_beams=4,
)

train_dataset, eval_dataset = create_dataset(tokenizer, domain_data, final_args, seq_length)

# Place the model on the GPU when there is one, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
final_model = T5ForConditionalGeneration.from_pretrained(model_ckpt)
final_model = final_model.to(device)

# Collator is built separately so the trainer call stays short.
seq2seq_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=None,
    label_pad_token_id=tokenizer.pad_token_id,
)

final_trainer = Seq2SeqTrainer(
    model=final_model,
    args=final_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=seq2seq_collator,
    compute_metrics=compute_metrics,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [11]:
# Installed PyTorch build, e.g. 2.x.x+cu12x
print(torch.__version__)

# CUDA visibility as seen by torch, followed by the GPU count cached on the
# Trainer's arguments.
for label, value in (
    ("CUDA available:", torch.cuda.is_available()),
    ("GPU count:", torch.cuda.device_count()),
    ("Trainer n_gpu:", final_trainer.args.n_gpu),
):
    print(label, value)

2.5.1
CUDA available: False
GPU count: 0
Trainer n_gpu: 0


In [12]:
import evaluate
# Load the "accuracy" metric through `evaluate`.
# NOTE(review): `accuracy_score` is not referenced by any other cell visible
# here — confirm it is still needed.
accuracy_score = evaluate.load("accuracy")

# Disabled scratch experiment, kept for reference only. If re-enabled it fails:
# `mapvar[optim_type]` holds a str, and str has no `.update` method.
# mapvar = {}
# optim_type = "sahur"
# mapvar[optim_type] = "good"
# mapvar[optim_type].update("better")
# mapvar

AttributeError: 'str' object has no attribute 'update'

In [None]:
# Fine-tune, then store a backup copy of the model and the training arguments.
final_trainer.train()

backup_dir = "./text_summarize_model"
os.makedirs(backup_dir, exist_ok=True)
# The trainer object in this notebook is `final_trainer`; `save_model` is the
# public Trainer method for writing the model to a directory.
final_trainer.save_model(backup_dir)
args_path = os.path.join(backup_dir, "training_args.json")
# TrainingArguments serializes itself with `to_json_string()`; write that out.
with open(args_path, "w", encoding="utf-8") as args_file:
    args_file.write(final_trainer.args.to_json_string())