In [4]:
import pandas as pd
from datasets import Dataset
import torch
import json
import os

# 파일 불러오기
# df = pd.read_csv("Seq2Seq_Training_Data.csv")
df = pd.read_csv("Seq2Seq_Training_Data_CorrectionOnly.csv")
dataset = Dataset.from_pandas(df)

In [5]:
from transformers import AutoTokenizer,  AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def preprocess(example):
    model_inputs = tokenizer(example["input"], max_length=32, truncation=True, padding="max_length")
    labels = tokenizer(example["target"], max_length=32, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess, remove_columns=dataset.column_names)

Map: 100%|██████████| 332/332 [00:00<00:00, 7537.55 examples/s]


In [6]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce GTX 970


In [7]:
from transformers import TrainerCallback, TrainerControl, TrainerState, TrainingArguments

class SaveEveryNEpochsCallback(TrainerCallback):
    def __init__(self, model, tokenizer, n=5):
        self.n = n
        self.model = model
        self.tokenizer = tokenizer

    def on_epoch_end(self, args, state, control, **kwargs):
        if int(state.epoch) % self.n == 0:
            output_dir = os.path.join(args.output_dir, f"checkpoint-epoch-{int(state.epoch)}")
            self.model.save_pretrained(output_dir)
            self.tokenizer.save_pretrained(output_dir)
            print(f"✅ Model saved at {output_dir}")

In [9]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./ocr_seq2seq_model_correctiononly",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=50,
    logging_dir="./logs",
    logging_strategy="epoch",
    save_strategy="no",
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


class PredictionPrinterCallback(TrainerCallback):
    def __init__(self, tokenizer, model):
        self.tokenizer = tokenizer
        self.model = model

    def on_epoch_end(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        examples = ["APO", "KPIC", "102", "L102", "CJ1", "I5S", "G2I"]
        inputs = tokenizer(examples, return_tensors="pt", padding=True).input_ids.to(model.device)
        outputs = model.generate(inputs)
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        for i, pred in enumerate(decoded_preds):
          if pred.strip() == "":
            print(f"❌ Input: {examples[i]} → No output generated.")
          else:
            print(f"✅ Input: {examples[i]} → Output: {pred}")

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    eval_dataset = tokenized_dataset.select(range(100)),
    train_dataset = tokenized_dataset.select(range(100, len(tokenized_dataset))),
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[PredictionPrinterCallback(model=model, tokenizer=tokenizer), 
              SaveEveryNEpochsCallback(model=model, tokenizer = tokenizer, n=5)
              ]
)

trainer.train(resume_from_checkpoint=False)

model.save_pretrained("./ocr_seq2seq_model_correctiononly")
tokenizer.save_pretrained("./ocr_seq2seq_model_correctiononly")

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,0.6438,0.484832
2,0.6113,0.456682
3,0.582,0.429318
4,0.5326,0.405819
5,0.5324,0.388739
6,0.5007,0.378689
7,0.4718,0.370893
8,0.4647,0.362729
9,0.4498,0.355394
10,0.4396,0.350513


✅ Input: APO → Output: AU
❌ Input: KPIC → No output generated.
✅ Input: 102 → Output: 102
✅ Input: L102 → Output: 0
❌ Input: CJ1 → No output generated.
✅ Input: I5S → Output: I5S
❌ Input: G2I → No output generated.
✅ Input: APO → Output: AU
✅ Input: KPIC → Output: KPIC
✅ Input: 102 → Output: 102
✅ Input: L102 → Output: L102
❌ Input: CJ1 → No output generated.
✅ Input: I5S → Output: 0
❌ Input: G2I → No output generated.
✅ Input: APO → Output: AAP
✅ Input: KPIC → Output: KPIC
✅ Input: 102 → Output: 102
❌ Input: L102 → No output generated.
❌ Input: CJ1 → No output generated.
❌ Input: I5S → No output generated.
❌ Input: G2I → No output generated.
✅ Input: APO → Output: APO
✅ Input: KPIC → Output: KPIC
✅ Input: 102 → Output: 102
❌ Input: L102 → No output generated.
✅ Input: CJ1 → Output: CCJ
✅ Input: I5S → Output: I5S
❌ Input: G2I → No output generated.
✅ Input: APO → Output: APO
✅ Input: KPIC → Output: KPIC
✅ Input: 102 → Output: 102
✅ Input: L102 → Output: L102
✅ Input: CJ1 → Output: CC1


('./ocr_seq2seq_model_correctiononly\\tokenizer_config.json',
 './ocr_seq2seq_model_correctiononly\\special_tokens_map.json',
 './ocr_seq2seq_model_correctiononly\\tokenizer.json')