In [None]:
# 1. Mount Google Drive
# The Drive mount is needed later: the training output directory and the final
# model directory both live under /content/drive/MyDrive.
from google.colab import drive
# force_remount=True asks Colab to mount again even if a mount already exists.
drive.mount('/content/drive', force_remount=True)

In [None]:
# 2. Upload files
from google.colab import files
import json

# The return value is stored in `uploaded` but is not referenced again in the
# visible cells.
# NOTE(review): presumably this is how augmented_logic.json and logic.json
# (opened by relative path in cells 6 and 7) reach the working directory — confirm.
uploaded = files.upload()

In [None]:
# 3. 필요한 패키지 설치
!pip install --upgrade transformers datasets accelerate fsspec==2025.3.2 --quiet

In [None]:
# 4. 주요 라이브러리 로딩
import json

import torch
import wandb
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

In [None]:
# 5. Load the model and tokenizer
model_name = "skt/kogpt2-base-v2"
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

# Register the end-of-sequence and padding tokens, then resize the embedding
# matrix to the tokenizer's current vocabulary size.
special_tokens = {'eos_token': '<END>', 'pad_token': '<pad>'}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

In [None]:
# 6. Load the training set
# Read the whole UTF-8 file and parse it as JSON into `train_data`.
with open("augmented_logic.json", mode="r", encoding="utf-8") as train_file:
    train_data = json.loads(train_file.read())

In [None]:
# 7. Load the validation set
# Read the whole UTF-8 file and parse it as JSON into `val_data`.
with open("logic.json", mode="r", encoding="utf-8") as val_file:
    val_data = json.loads(val_file.read())

In [None]:
# 8. Prompt formatting for question generation
def format_data(data):
    """Turn raw records into prompt records for causal-LM training.

    Args:
        data: Iterable of mappings, each providing a 'Question' entry.

    Returns:
        A list with one dict per input record:
        "Text" is the prompt prefix, the question, and the "<END>" marker;
        "Mask" is the prompt prefix alone.
    """
    prompt_prefix = "문제: "
    records = []
    for item in data:
        records.append(
            {
                "Text": f"{prompt_prefix}{item['Question']}<END>",
                "Mask": prompt_prefix,
            }
        )
    return records

# Apply the same prompt formatting to both splits.
formatted_train_data, formatted_val_data = (
    format_data(train_data),
    format_data(val_data),
)

In [None]:
# 9. Tokenization with label masking
def tokenize_with_mask(example, max_length=512):
    """Tokenize one formatted record and build loss-masked labels.

    Args:
        example: Mapping with "Text" (the full training string) and "Mask"
            (the prompt prefix whose tokens must not contribute to the loss).
        max_length: Length every sequence is padded/truncated to. Defaults to
            512, the value previously hard-coded here.

    Returns:
        The tokenizer output for "Text" with an added "labels" list: a copy of
        ``input_ids`` in which the prompt-prefix positions and all padding
        positions are replaced by -100 so they are ignored by the loss.

    Note:
        These labels only take effect if the data collator passes a
        precomputed "labels" field through unchanged.
    """
    full_input = tokenizer(example["Text"], padding="max_length", truncation=True, max_length=max_length)
    prompt_ids = tokenizer(example["Mask"], truncation=True, max_length=max_length)["input_ids"]
    # NOTE(review): assumes the tokens of "Mask" tokenized on its own are a
    # prefix of the tokens of "Text"; subword merges across the boundary could
    # break that — confirm for this tokenizer.
    prompt_len = min(len(prompt_ids), len(full_input["input_ids"]))

    # Padding positions (attention_mask == 0) must not be scored either;
    # copying input_ids verbatim would train the model to predict pad tokens.
    labels = [
        token_id if attended else -100
        for token_id, attended in zip(full_input["input_ids"], full_input["attention_mask"])
    ]
    labels[:prompt_len] = [-100] * prompt_len
    full_input["labels"] = labels

    return full_input

In [None]:
# 10. Convert to Dataset objects and preprocess
train_dataset = Dataset.from_list(formatted_train_data)
val_dataset = Dataset.from_list(formatted_val_data)

# Tokenize each split exactly once. The raw "Text"/"Mask" string columns are
# dropped so only the tokenizer outputs and "labels" remain.
# (Previously each split was mapped twice and the first result was discarded.)
tokenized_train_dataset = train_dataset.map(tokenize_with_mask, batched=False, remove_columns=["Text", "Mask"])
tokenized_val_dataset = val_dataset.map(tokenize_with_mask, batched=False, remove_columns=["Text", "Mask"])

In [None]:
# 11. Training configuration
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/KoGPT/CheckPoint/Logic/Problem",
    num_train_epochs=10,
    per_device_train_batch_size=2,
    # No intermediate checkpoints are written. save_steps / save_total_limit
    # below only matter if save_strategy is switched to "steps".
    save_strategy="no",
    save_steps=50,
    save_total_limit=1,
    logging_steps=10,
    # Evaluate every 50 steps; eval_loss is the metric early stopping watches.
    eval_strategy="steps",
    eval_steps=50,
    # NOTE(review): with load_best_model_at_end=False the weights left in memory
    # after training are those of the last step, not of the best eval_loss step.
    # Since an EarlyStoppingCallback is attached to the Trainer, confirm this is
    # intended (restoring the best weights requires checkpointing to be enabled).
    load_best_model_at_end=False,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
    # Mixed precision needs a CUDA device. The model-loading cell supports a CPU
    # fallback, so only enable fp16 when a GPU is actually available.
    fp16=torch.cuda.is_available(),
    overwrite_output_dir=True,
    learning_rate=1e-5,
    warmup_steps=50,
)

In [None]:
# 12. Build the Trainer
def collate_preserving_labels(features):
    """Batch pre-tokenized features while keeping their precomputed labels.

    DataCollatorForLanguageModeling(mlm=False) rebuilds ``labels`` from
    ``input_ids`` and therefore throws away any prompt masking done during
    tokenization. This collator stacks the features as they are and only
    additionally masks padding positions.

    Args:
        features: List of per-example dicts of equal-length integer lists
            (the tokenized datasets are padded to a fixed length).

    Returns:
        Dict of tensors; ``labels`` is -100 wherever ``attention_mask`` is 0,
        and otherwise whatever the tokenization step stored.
    """
    batch = default_data_collator(features)
    if "labels" in batch and "attention_mask" in batch:
        # Never score padding, regardless of how the labels were produced.
        batch["labels"] = batch["labels"].masked_fill(batch["attention_mask"] == 0, -100)
    return batch


data_collator = collate_preserving_labels

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    # Stop once eval_loss has failed to improve for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
# 13. Run training
# Left as the last expression so the notebook displays the value returned by train().
trainer.train()

In [None]:
# 14. Save the model
# The model weights and the tokenizer files go to the same Drive directory.
save_dir = "/content/drive/MyDrive/Colab Notebooks/KoGPT/Model/Logic/Problem"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)