In [2]:
import torch
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
from datasets import Dataset
from transformers import TrainingArguments
from transformers import Trainer, DataCollatorForSeq2Seq
import json

# GPU가 사용 가능한지 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")  # 출력: 'cuda'

2024-11-30 22:17:27.129249: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-30 22:17:27.145669: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-30 22:17:27.145689: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-30 22:17:27.146199: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-30 22:17:27.149209: I tensorflow/core/platform/cpu_feature_guar

Using device: cuda


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [3]:
# 모델 및 토크나이저 설정
MODEL_NAME = "hyunwoongko/kobart"
tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_NAME)
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
  return self.fget.__get__(instance, owner)()


In [3]:
# 2. JSON 데이터 로드 및 전처리

# JSON 파일 경로 설정
json_file = "jeju_data2.json"  # JSON 파일 이름
with open(json_file, "r", encoding="utf-8-sig") as f:
    data = json.load(f)

# Dataset 객체로 변환
dataset = Dataset.from_list(data)

In [4]:
# 전처리 함수
def preprocess_function(examples):
    inputs = tokenizer(examples["source"], max_length=128, truncation=True, padding="max_length")
    targets = tokenizer(examples["target"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

In [5]:
# 데이터셋 전처리
processed_dataset = dataset.map(preprocess_function, batched=False)

# 학습/평가 데이터 분리
train_test_split = processed_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]


Map:   0%|          | 0/196197 [00:00<?, ? examples/s]

In [6]:
training_args = TrainingArguments(
    output_dir="/home/billy/snap/geuneon/kobart-finetuned-jeju3",  # 모델 출력 경로
    evaluation_strategy="epoch",  # 평가 전략 (매 epoch마다 평가)
    save_strategy="epoch",  # 체크포인트 저장 전략 (매 epoch마다 저장)
    learning_rate=5e-5,  # 학습률
    per_device_train_batch_size=8,  # 학습 시 배치 크기
    per_device_eval_batch_size=8,  # 평가 시 배치 크기
    num_train_epochs=3,  # 총 학습 epoch 수
    weight_decay=0.01,  # weight decay
    save_total_limit=2,  # 저장할 체크포인트의 최대 개수
    logging_dir="/home/billy/snap/geuneon/logs3",  # 로그 파일 저장 디렉토리
    logging_steps=10000,  # 로그 작성 빈도 (step 단위)
    report_to="none",  # 외부 로깅 사용 안 함 (TensorBoard 등 제외)
    load_best_model_at_end=True,  # 학습 종료 시 최상의 모델 로드
    metric_for_best_model="eval_loss",  # 최상의 모델 선택 기준
    greater_is_better=False,  # 낮은 eval_loss일수록 좋은 모델
    fp16=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
# 데이터 Collator 정의 (torch 관련 제거)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# 4. Trainer 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,  # DataCollatorForSeq2Seq를 사용
)

# 5. 모델 학습
trainer.train()

# 6. 학습된 모델 저장
trainer.save_model("/home/billy/snap/geuneon/kobart-finetuned-jeju3")
tokenizer.save_pretrained("/home/billy/snap/geuneon/kobart-finetuned-jeju3")


Epoch,Training Loss,Validation Loss
1,0.0325,0.054084
2,0.0211,0.052214
3,0.013,0.053349


Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
Non-default generation parameters: {'forced_eos_token_id': 1}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
Non-default generation parameters: {'forced_eos_token_id': 1}


('/home/billy/snap/geuneon/kobart-finetuned-jeju3/tokenizer_config.json',
 '/home/billy/snap/geuneon/kobart-finetuned-jeju3/special_tokens_map.json',
 '/home/billy/snap/geuneon/kobart-finetuned-jeju3/tokenizer.json')

In [24]:
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast

# 저장된 모델 경로
model_path = "./kobart-finetuned-jeju3"

# 모델과 토크나이저 로드
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)

# 테스트 입력 데이터
input_text = "서껑 허는 건 뭐 , 보리ᄊᆞᆯ에 ᄑᆞᆺ도 서끄곡 , 콩도 서끄곡 ."

# 입력 텍스트를 토크나이즈
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# 모델 추론
output_ids = model.generate(input_ids, max_length=50, num_beams=4, early_stopping=True)

# 생성된 텍스트 디코딩
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# 결과 출력
print("Input Text:", input_text)
print("Generated Text:", output_text)


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Input Text: 서껑 허는 건 뭐 , 보리ᄊᆞᆯ에 ᄑᆞᆺ도 서끄곡 , 콩도 서끄곡 .
Generated Text: 섞어서 하는 건 뭐, 보리쌀에 팥도 섞고, 콩도 섞고.


In [23]:
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast
from nltk.translate.bleu_score import corpus_bleu

# 저장된 모델 경로
model_path = "./kobart-finetuned-jeju3"

# 모델과 토크나이저 로드
model = BartForConditionalGeneration.from_pretrained(model_path)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_path)

# 테스트 입력 데이터 (여러 개의 문장으로 테스트)
input_texts = [
    "ᄋᆞ , ᄋᆞ , 기여 . ᄋᆞ , 경덜 헤세 . 거 막 좋은 거여 . ᄎᆞᆷ 범벅 감제뻿데기 헹은에 ᄀᆞᆯ앙 범벅허민 ᄎᆞᆷ 맛좋나 . 그거 고급이라 .",  # 원문
    "ᄀᆞᆯ앙은 ᄉᆞᆯᄇᆞᆯ ᄀᆞᆯᆞᆯ ᄒᆞᆯᄆᆞᆯ .",  # 원문
]

# 참조 번역 (실제 번역, 각 문장의 번역 결과)
reference_texts = [
    ["Ah, yes, contribution. Yes, it’s very good. The mixture of it is very tasty. That’s expensive."],  # 참조 번역
    ["The mixture is very delicious."],  # 참조 번역
]

# 입력 텍스트를 토크나이즈하고 모델에서 번역 생성
generated_texts = []
for input_text in input_texts:
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    output_ids = model.generate(input_ids, max_length=50, num_beams=4, early_stopping=True)
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    generated_texts.append(output_text)

# BLEU 점수 계산
# nltk의 corpus_bleu는 참조 번역과 생성된 번역을 비교하여 BLEU 점수를 계산
bleu_score = corpus_bleu(reference_texts, generated_texts)

# 결과 출력
print("Generated Texts:")
for text in generated_texts:
    print(text)

print(f"\nBLEU Score: {bleu_score:.4f}")


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Generated Texts:
어, 어, 그래. 어, 그렇게들 했지. 거 아주 좋은 거야. 참 범벅 절간고구마 해서 갈아서 범벅하면 참 맛있나. 그거 고급이라.
갈아서 살ᄇᆞᆯ, ᄒᆞᆯᄆᆞᆯ.

BLEU Score: 0.0000
