## 데이터 불러오고 나누기

In [1]:
import json
import pandas as pd
from datasets import Dataset

def load_data(file_path):
    """
    JSON 데이터를 불러와 Pandas DataFrame으로 변환
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
        
    data = []
    for item in json_data['data']:
        if 'annotation' in item:
            annotation = item['annotation']
            
            if 'err_sentence' in annotation and 'cor_sentence' in annotation:
                data.append({
                    'err_sentence': annotation['err_sentence'],
                    'cor_sentence': annotation['cor_sentence']
                })
                
    df = pd.DataFrame(data)
    print(df.info())
    # None 값을 빈 문자열로 대체
    df = df.fillna('')
    return df

# JSON 파일 경로
file_path = "/home/yjtech2/Desktop/yurim/LLM/Data/spelling/train/combined_data2.json"
# 데이터 불러오기
df = load_data(file_path)

# 데이터셋 변환 및 분할
data_dict = df.to_dict(orient='list')
train_dataset = Dataset.from_dict(data_dict)
valid_dataset= Dataset.from_dict({'err_sentence': [], 'cor_sentence': []}) 

  from .autonotebook import tqdm as notebook_tqdm


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19653 entries, 0 to 19652
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   err_sentence  19653 non-null  object
 1   cor_sentence  19653 non-null  object
dtypes: object(2)
memory usage: 307.2+ KB
None


In [2]:
# 데이터 길이 확인
for sample in valid_dataset:
    print(f"err_sentence length: {len(sample['err_sentence'])}, cor_sentence length: {len(sample['cor_sentence'])}")


## 모델 학습

In [3]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from transformers import get_scheduler
from evaluate import load
import numpy as np
import torch
from torch.optim import AdamW
from typing import Dict
from tqdm import tqdm
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import DataCollatorForSeq2Seq
from torch.cuda.amp import GradScaler, autocast

import json


2024-11-22 13:55:36.953280: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-22 13:55:36.978603: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
import os
import torch
from tqdm import tqdm
from typing import Dict
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    AdamW,
    DataCollatorForSeq2Seq,
)


class CustomT5Trainer:
    def __init__(self, model_name: str, **kwargs):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.optimizer = AdamW(self.model.parameters(), lr=kwargs.get("learning_rate", 2e-5))
        self.max_length = kwargs.get("max_length", 128)
        self.training_args = kwargs
        self.save_dir = kwargs.get("save_dir", "/home/yjtech2/Desktop/yurim/LLM/Pre_processing/spelling/et5/best_model5")
        
        # 저장 경로 설정
        self.best_model_path = os.path.join(self.save_dir, "pytorch_model.bin")
        self.tokenizer_path = self.save_dir

        # 디렉터리 생성
        os.makedirs(self.save_dir, exist_ok=True)

    def save_model_and_tokenizer(self):
        """모델과 토크나이저 저장"""
        # 모델과 구성 저장
        self.model.save_pretrained(self.save_dir)
        print(f"Model and config saved at {self.save_dir}")

        # 토크나이저 저장
        self.tokenizer.save_pretrained(self.save_dir)
        print(f"Tokenizer saved at {self.save_dir}")


    def train(self, train_dataset, valid_dataset=None):
        # 데이터 전처리
        tqdm.pandas(desc="Processing spacing correction")
        train_dataset = train_dataset.map(
            self.preprocess_data,
            batched=True,
            remove_columns=train_dataset.column_names,
        )

        # DataCollatorForSeq2Seq로 배치 내 샘플 크기 맞춤
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model,
            padding="longest",
            return_tensors="pt"
        )

        # DataLoader 준비
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.training_args["batch_size"],
            shuffle=True,
            collate_fn=data_collator
        )

        # 총 학습 단계 계산
        total_steps = len(train_dataloader) * self.training_args["num_epochs"]

        # 학습률 스케줄러 설정
        scheduler = get_scheduler(
            "linear",  # 선형 감소 스케줄러
            optimizer=self.optimizer,
            num_warmup_steps=0,  # 워밍업 단계
            num_training_steps=total_steps
        )

        self.model.train()
        for epoch in range(self.training_args["num_epochs"]):
            epoch_loss = 0
            for batch in train_dataloader:
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                labels = batch["labels"].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                )
                loss = outputs.loss
                loss.backward()

                self.optimizer.step()
                scheduler.step()  # 학습률 업데이트
                self.optimizer.zero_grad()

                epoch_loss += loss.item()

            print(f"Epoch {epoch + 1} completed. Loss: {epoch_loss:.4f}")


        # 모델과 토크나이저 저장
        self.save_model_and_tokenizer()

    def preprocess_data(self, examples: Dict) -> Dict:
        """데이터 전처리 함수"""
        inputs = [f"맞춤법 교정: {self._normalize_text(text)}" for text in examples["err_sentence"]]
        model_inputs = self.tokenizer(
            inputs,
            max_length=self.max_length,
            padding="longest",
            truncation=True,
            return_tensors="pt",
        ).to(self.device)

        with self.tokenizer.as_target_tokenizer():
            labels = self.tokenizer(
                examples["cor_sentence"],
                max_length=self.max_length,
                padding="longest",
                truncation=True,
                return_tensors="pt",
            ).to(self.device)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    def predict(self, text: str) -> str:
        """모델 추론"""
        inputs = self.tokenizer(
            f"맞춤법 교정: {self._normalize_text(text)}",
            return_tensors="pt",
            max_length=self.max_length,
            truncation=True,
        ).to(self.device)

        outputs = self.model.generate(
            inputs["input_ids"],
            max_length=self.max_length,
            num_beams=5
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


    def _normalize_text(self, text: str) -> str:
        return text.strip()


In [5]:
trainer = CustomT5Trainer(
    model_name="j5ng/et5-typos-corrector",
    max_length=128,
    learning_rate=2e-5,
    batch_size=8,
    num_epochs=5,
    gradient_accumulation_steps=8
)



if __name__ == "__main__":
    torch.cuda.empty_cache()  # GPU 메모리 초기화
    trainer.train(train_dataset, valid_dataset)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 19653/19653 [00:01<00:00, 12717.81 examples/s]
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1 completed. Loss: 258.6538
Epoch 2 completed. Loss: 97.9731
Epoch 3 completed. Loss: 51.1471
Epoch 4 completed. Loss: 29.7159
Epoch 5 completed. Loss: 19.7263
Model and config saved at /home/yjtech2/Desktop/yurim/LLM/Pre_processing/spelling/et5/best_model5
Tokenizer saved at /home/yjtech2/Desktop/yurim/LLM/Pre_processing/spelling/et5/best_model5


## 테스트

In [6]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained("/home/yjtech2/Desktop/yurim/LLM/Pre_processing/spelling/et5/best_model5")
tokenizer = T5Tokenizer.from_pretrained("/home/yjtech2/Desktop/yurim/LLM/Pre_processing/spelling/et5/best_model5")
print("Model and tokenizer loaded successfully.")

Model and tokenizer loaded successfully.


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# 저장된 모델 경로
saved_model_path = "/home/yjtech2/Desktop/yurim/LLM/Pre_processing/spelling/et5/best_model5"

# 모델과 토크나이저 불러오기
tokenizer = T5Tokenizer.from_pretrained(saved_model_path, local_files_only=True)
model = T5ForConditionalGeneration.from_pretrained(saved_model_path, local_files_only=True).to('cpu')

# 예측 테스트
text = "음씩물 쓰래기냄세가 확 난다."
inputs = tokenizer(
    f"맞춤법 교정: {text}",
    return_tensors="pt",
    max_length=128,
    truncation=True
).to("cpu")

# attention_mask를 추가하여 모델이 불필요한 패딩 위치를 무시하도록 설정
outputs = model.generate(
    inputs["input_ids"],
    max_length=128,
    num_beams=5,
    length_penalty=0.6,
    early_stopping=True
)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("원래 문장: ", text)
print("교정 후 문장: ", result)


원래 문장:  음씩물 쓰래기냄세가 확 난다.
교정 후 문장:  조금씩 쓰레기 냄새가 확 난다.


: 