In [1]:
import pandas as pd
import numpy as np
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

from torch.utils.data import Dataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score
from konlpy.tag import Okt
import sacrebleu


train = pd.read_csv('/home/apitrain2/hy_agi/phyEn2Ko/0902.tsv', sep='\t', header=0, names=['input', 'relation', 'output', 'sentence', 'vera_score', 'input_ko', 'output_ko', 'relation_ko'])
dev = pd.read_csv('/home/apitrain2/hy_agi/phyEn2Ko/dev_ko.tsv', sep='\t', header=0, names=['input', 'relation', 'output', 'sentence', 'vera_score', 'input_ko', 'output_ko', 'relation_ko'])
test = pd.read_csv('/home/apitrain2/hy_agi/phyEn2Ko/test_ko.tsv', sep='\t', header=0, names=['input', 'relation', 'output', 'input_ko', 'output_ko', 'relation_ko']) 

train = train[["input_ko", "relation_ko", "output_ko"]]
dev = dev[["input_ko", "relation_ko", "output_ko"]]
test = test[["input_ko", "relation_ko", "output_ko"]]

def drop_duplicates_and_report(df, name):
    before = len(df)
    df_dedup = df.drop_duplicates(subset=["input_ko", "relation_ko", "output_ko"])
    after = len(df_dedup)
    print(f"{name}에서 중복 제거: {before - after}개 삭제 (최종 {after}개)")
    return df_dedup

train = drop_duplicates_and_report(train, "train")
dev = drop_duplicates_and_report(dev, "dev")
test = drop_duplicates_and_report(test, "test")


  from .autonotebook import tqdm as notebook_tqdm
2025-09-09 08:07:34.156960: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-09 08:07:34.177224: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757405254.197310 3249787 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757405254.203679 3249787 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757405254.219798 3249787 computation_placer.cc:177] computation placer already r

train에서 중복 제거: 10203개 삭제 (최종 195224개)
dev에서 중복 제거: 205개 삭제 (최종 5031개)
test에서 중복 제거: 1187개 삭제 (최종 22796개)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
import torch
import torch.nn as nn

login("") # huggingface token

# ---------- 모델/토크나이저 로드 ----------
model_name = "Qwen/Qwen2.5-1.5B-Instruct" #"Bllossom/llama-3.2-Korean-Bllossom-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# 모델을 로드할 때는 low_cpu_mem_usage=True 권장
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    trust_remote_code=False
)
model.resize_token_embeddings(len(tokenizer))


Embedding(151665, 1536)

In [3]:
from torch.utils.data import Dataset


def make_messages(head, relation, tail=None):
    """
    tail=None이면 predict용, tail=str이면 학습용
    """
    messages = [
        {"role": "system", "content": "당신은 개체와 관련된 정보를 정확히 생성하는 유용한 어시스턴트입니다."},
        {"role": "user", "content": f"'{head}'의 '{relation}'에 해당하는 결과를 한국어 명사구로 알려줘."}
    ]
    # 학습용일 때만 assistant content 추가
    if tail is not None:
        messages.append({"role": "assistant", "content": tail})
    return messages

class HeadRelationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=64):
        """
        data: pandas DataFrame with columns ['input_ko', 'relation_ko', 'output_ko']
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        head, relation, tail = row['input_ko'], row['relation_ko'], row['output_ko']

        messages = make_messages(head, relation, tail)  # tail 포함

        # prompt 문자열 (assistant 제외)
        prompt = self.tokenizer.apply_chat_template(
            messages[:-1],  # system + user
            tokenize=False,
            add_generation_prompt=True
        )

        input_ids = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        ).input_ids[0]

        # labels: assistant content만 학습
        labels = self.tokenizer(
            messages[-1]['content'],
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt"
        ).input_ids[0]

        # prompt 부분은 학습 제외
        labels[:len(input_ids)] = -100

        return {"input_ids": input_ids, "labels": labels}

train_dataset = HeadRelationDataset(train.iloc[:1000], tokenizer)
dev_dataset = HeadRelationDataset(dev.iloc[:100], tokenizer)
test_dataset = HeadRelationDataset(test, tokenizer)

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [4]:
import numpy as np
import torch
import torch.nn.functional as F
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from bert_score import score as bert_score
from konlpy.tag import Okt
import sacrebleu

okt = Okt()

def compute_metrics(eval_pred):
    """
    BLEU-1~4, sacreBLEU, chrF, chrF++, CIDEr, BERTScore, eval_loss 계산
    """
    try:
        predictions, labels = eval_pred

        # tensor 변환
        predictions = torch.tensor(predictions, dtype=torch.float32) if not isinstance(predictions, torch.Tensor) else predictions
        labels = torch.tensor(labels, dtype=torch.long) if not isinstance(labels, torch.Tensor) else labels

        # Shift for causal LM
        shift_logits = predictions[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        shift_logits = shift_logits.view(-1, shift_logits.size(-1))
        shift_labels = shift_labels.view(-1)
        valid_mask = shift_labels != -100
        valid_logits = shift_logits[valid_mask]
        valid_labels = shift_labels[valid_mask]

        if len(valid_labels) == 0:
            return {
                "bleu1": 0.0, "bleu2": 0.0, "bleu3": 0.0, "bleu4": 0.0,
                "sacrebleu": 0.0, "chrf": 0.0, "chrfpp": 0.0,
                "cider": 0.0, "bertscore": 0.0, "eval_loss": float("inf")
            }

        # eval_loss
        loss = F.cross_entropy(valid_logits, valid_labels)
        eval_loss = loss.item()

        # === Generation for NLG metrics ===
        from transformers import AutoTokenizer
        global tokenizer
        if tokenizer is None:
            raise ValueError("tokenizer must be defined globally for compute_metrics")

        pred_token_ids = torch.argmax(predictions, dim=-1)
        label_token_ids = labels

        pred_texts, label_texts = [], []
        for pred_ids, label_ids in zip(pred_token_ids, label_token_ids):
            label_ids = [i for i in label_ids.tolist() if i != -100 and i != tokenizer.pad_token_id]
            pred_ids = [i for i in pred_ids.tolist() if i != -100 and i != tokenizer.pad_token_id]
            pred_texts.append(tokenizer.decode(pred_ids, skip_special_tokens=True).strip())
            label_texts.append(tokenizer.decode(label_ids, skip_special_tokens=True).strip())

        # BLEU-1~4
        bleu1s, bleu2s, bleu3s, bleu4s = [], [], [], []
        smoothie = SmoothingFunction().method1
        for ref, hyp in zip(label_texts, pred_texts):
            ref_tokens = [ref.split()]
            hyp_tokens = hyp.split()
            bleu1s.append(sentence_bleu(ref_tokens, hyp_tokens, weights=(1, 0, 0, 0), smoothing_function=smoothie))
            bleu2s.append(sentence_bleu(ref_tokens, hyp_tokens, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie))
            bleu3s.append(sentence_bleu(ref_tokens, hyp_tokens, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie))
            bleu4s.append(sentence_bleu(ref_tokens, hyp_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie))

        # sacreBLEU (sentence-level)
        sacrebleu_scores = []
        for ref, hyp in zip(label_texts, pred_texts):
            try:
                hyp_tok = ' '.join(okt.morphs(hyp))
                ref_tok = ' '.join(okt.morphs(ref))
                bleu_score = sacrebleu.sentence_bleu(hyp_tok, [ref_tok])
                sacrebleu_scores.append(bleu_score.score)
            except:
                sacrebleu_scores.append(0.0)

        # chrF / chrF++
        try:
            chrf_score = sacrebleu.corpus_chrf(pred_texts, [label_texts], beta=2.0).score
            chrfpp_score = sacrebleu.corpus_chrf(pred_texts, [label_texts], beta=2.0, word_order=2).score
        except:
            chrf_score = 0.0
            chrfpp_score = 0.0

        # CIDEr
        try:
            from pycocoevalcap.cider.cider import Cider
            gts = {i: [label_texts[i]] for i in range(len(label_texts))}
            res = {i: [pred_texts[i]] for i in range(len(pred_texts))}
            cider_scorer = Cider()
            cider_score, _ = cider_scorer.compute_score(gts, res)
        except:
            cider_score = 0.0

        # BERTScore
        try:
            P, R, F1 = bert_score(pred_texts, label_texts, lang="ko", verbose=False)
            bertscore = float(F1.mean())
        except:
            bertscore = 0.0

        return {
            "bleu1": float(np.mean(bleu1s)),
            "bleu2": float(np.mean(bleu2s)),
            "bleu3": float(np.mean(bleu3s)),
            "bleu4": float(np.mean(bleu4s)),
            "sacrebleu": float(np.mean(sacrebleu_scores)),
            "chrf": float(chrf_score),
            "chrfpp": float(chrfpp_score),
            "cider": float(cider_score),
            "bertscore": float(bertscore),
            "eval_loss": eval_loss,
        }

    except Exception as e:
        print(f"Error in compute_metrics: {e}")
        return {
            "bleu1": 0.0, "bleu2": 0.0, "bleu3": 0.0, "bleu4": 0.0,
            "sacrebleu": 0.0, "chrf": 0.0, "chrfpp": 0.0,
            "cider": 0.0, "bertscore": 0.0, "eval_loss": float("inf")
        }


In [5]:
from transformers import Trainer, TrainingArguments
import torch

torch.cuda.empty_cache()


training_args = TrainingArguments(
    output_dir="./outputs",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    logging_steps=100,
    eval_strategy="steps",
    learning_rate = 5e-5,
    weight_decay = 0.01,
    save_steps=500,
    save_total_limit=2,
    fp16 = True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=collator
)

trainer.train()

  trainer = Trainer(


[2025-09-09 08:07:46,182] [INFO] [real_accelerator.py:260:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/apitrain2/miniconda3/envs/hyenv/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/apitrain2/miniconda3/envs/hyenv/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


[2025-09-09 08:07:47,848] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151645}.


Step,Training Loss,Validation Loss,Bleu1,Bleu2,Bleu3,Bleu4,Sacrebleu,Chrf,Chrfpp,Cider,Bertscore
100,0.0656,0.394956,0.810119,0.723854,0.672577,0.620261,73.130777,82.390199,82.246909,0.031777,0.947924




TrainOutput(global_step=167, training_loss=0.05355669495588291, metrics={'train_runtime': 104.2228, 'train_samples_per_second': 9.595, 'train_steps_per_second': 1.602, 'total_flos': 503170793472000.0, 'train_loss': 0.05355669495588291, 'epoch': 1.0})

In [6]:
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# -----------------------------
# Prompt 생성
# -----------------------------
def make_prompt_str(head, relation):
    system = "당신은 개체와 관련된 정보를 정확히 생성하는 유용한 어시스턴트입니다."
    user = f"'{head}'의 '{relation}'에 해당하는 결과를 한국어 명사구로 알려줘."
    # tail 생성 유도
    prompt = f"{system}\n{user}\nAssistant:"
    return prompt

# -----------------------------
# Dataset 정의
# -----------------------------
class TestDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return row['input_ko'], row['relation_ko']

# -----------------------------
# Batch predict 함수
# -----------------------------
def predict_batch(model, tokenizer, test_df, batch_size=4, max_new_tokens=64):
    dataset = TestDataset(test_df)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    results = []

    for batch in tqdm(dataloader, desc="Predicting"):
        heads, relations = batch
        for head, relation in zip(heads, relations):
            prompt = make_prompt_str(head, relation)
            input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

            with torch.no_grad():
                output_ids = model.generate(
                    input_ids,
                    max_new_tokens=max_new_tokens,  # tail 길이
                    do_sample=False,               # greedy
                    pad_token_id=tokenizer.eos_token_id
                )

            output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

            # Assistant: 뒤의 tail만 추출
            if "Assistant:" in output_text:
                output_text = output_text.split("Assistant:")[-1].strip()

            results.append(output_text)

            # GPU 메모리 정리
            del input_ids, output_ids
            torch.cuda.empty_cache()

    return results

# -----------------------------
# Predict 실행
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# test 데이터 일부만 예시
predictions = predict_batch(model, tokenizer, test.iloc[:10], batch_size=2, max_new_tokens=64)

for i, row in enumerate(test.iloc[:10].itertuples()):
    print(f"HEAD: {row.input_ko}, RELATION: {row.relation_ko}, PREDICT: {predictions[i]}")


Predicting:   0%|          | 0/5 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Predicting: 100%|██████████| 5/5 [00:18<00:00,  3.70s/it]

HEAD: 은행계좌, RELATION: 개체용도, PREDICT: '개체용도'의 '개체용도'에 해당하는 결과를 한국어 명사구로 알려줘. '개체용도'에 해당하는 결과를 한국어 명사구로 알려줘. '개체용도'에 해당하는 결과를 한국어 명사구로
HEAD: 은행계좌, RELATION: 개체용도, PREDICT: '개체용도'의 '개체용도'에 해당하는 결과를 한국어 명사구로 알려줘. '개체용도'에 해당하는 결과를 한국어 명사구로 알려줘. '개체용도'에 해당하는 결과를 한국어 명사구로
HEAD: 은행계좌, RELATION: 개체용도, PREDICT: '개체용도'의 '개체용도'에 해당하는 결과를 한국어 명사구로 알려줘. '개체용도'에 해당하는 결과를 한국어 명사구로 알려줘. '개체용도'에 해당하는 결과를 한국어 명사구로
HEAD: 은행계좌, RELATION: 개체용도, PREDICT: '개체용도'의 '개체용도'에 해당하는 결과를 한국어 명사구로 알려줘. '개체용도'에 해당하는 결과를 한국어 명사구로 알려줘. '개체용도'에 해당하는 결과를 한국어 명사구로
HEAD: 은행계좌, RELATION: 개체용도, PREDICT: '개체용도'의 '개체용도'에 해당하는 결과를 한국어 명사구로 알려줘. '개체용도'에 해당하는 결과를 한국어 명사구로 알려줘. '개체용도'에 해당하는 결과를 한국어 명사구로
HEAD: 은행계좌, RELATION: 개체용도, PREDICT: '개체용도'의 '개체용도'에 해당하는 결과를 한국어 명사구로 알려줘. '개체용도'에 해당하는 결과를 한국어 명사구로 알려줘. '개체용도'에 해당하는 결과를 한국어 명사구로
HEAD: 은행계좌, RELATION: 개체용도, PREDICT: '개체용도'의 '개체용도'에 해당하는 결과를 한국어 명사구로 알려줘. '개체용도'에 해당하는 결과를 한국어 명사구로 알려줘. '개체용도'에 해당하는 결과를 한국어 명사구로
HEAD: 은행계좌, RELATION: 개체용도, PREDICT: '개체용도'의 '개체용도'에 해당하는 결과를 한국어 명사구


