In [None]:
from huggingface_hub import login

# Log in to the Hugging Face Hub for this session. Called with no arguments,
# so the access token is presumably requested interactively — confirm.
login()

In [None]:
# Location of the base checkpoint; handed to from_pretrained() for both the
# tokenizer and the model, and its last path component names the output dir.
model_path = "model/gemma-3-1b-it"

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the train/validation splits from local JSON files.
datasets = load_dataset(
    "json",
    data_files={
        "train": "train_data.json",
        "validation": "validation_data.json",
    },
)

# Tokenizer and causal-LM weights both come from the same base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

In [None]:
from peft import LoraConfig, get_peft_model
from transformers import DataCollatorForSeq2Seq, Trainer, TrainingArguments

# Last path component of the checkpoint location; used to name the output dir.
model_name = model_path.rsplit("/", 1)[-1]

# LoRA adapter settings: rank-8 updates applied to the q_proj / v_proj modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    bias="none",
)

# NOTE: this rebinds `model` to the wrapped model, so re-running this cell
# would wrap an already-wrapped model; reload the base model before re-running.
model = get_peft_model(model, lora_config)

def preprocess_function(examples):
    """Tokenize a batch of conversations for causal-LM fine-tuning.

    Each conversation is flattened into one string with a "role: content"
    line per message, then tokenized (truncated to 512 tokens and padded to
    the longest sequence in the batch).

    NOTE(review): this plain "role: content" layout differs from the chat
    template applied at inference time in this notebook — confirm that the
    mismatch is intended.

    Args:
        examples: Batch mapping with a "messages" column; every entry is a
            list of dicts carrying "role" and "content" strings.

    Returns:
        The tokenizer output with an added "labels" entry: a copy of
        "input_ids" in which padding positions (attention_mask == 0) are
        replaced by -100.
    """
    texts = [
        "\n".join(msg["role"] + ": " + msg["content"] for msg in conv)
        for conv in examples["messages"]  # each conv is already a list of messages
    ]
    model_inputs = tokenizer(texts, max_length=512, truncation=True, padding=True)
    # Copying input_ids verbatim would also train the model to predict padding
    # tokens; -100 is the index the cross-entropy loss ignores.
    model_inputs["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

# 4. Apply tokenization to every split, dropping the raw columns so that only
# the fields returned by preprocess_function remain.
tokenized_datasets = datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)


# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir=f"{model_name}_trained",
    num_train_epochs=20,
    learning_rate=2e-4,
    weight_decay=0.01,
    eval_strategy="epoch",  # evaluate once per epoch
    save_strategy="no",  # no intermediate checkpoints are written
    push_to_hub=False,
)


In [None]:
# Pad every training batch on the fly. The dataset was padded per map() batch,
# so rows that came from different map() batches can differ in length, and the
# Trainer's default collator cannot stack ragged rows into one tensor.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=DataCollatorForSeq2Seq(
        tokenizer,
        padding=True,
        label_pad_token_id=-100,  # padded label positions are ignored by the loss
    ),
)

trainer.train()

In [None]:
import math
# Report perplexity as exp() of the evaluation loss returned by the Trainer.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# Optional alternative (disabled): upload the result to the Hub.
# trainer.push_to_hub()
# Write the fine-tuned model and its tokenizer to the same local directory.
# NOTE(review): `model` was wrapped by get_peft_model earlier, so this presumably
# stores only the adapter weights rather than a merged model — confirm.
model.save_pretrained("model/trained_model")
tokenizer.save_pretrained("model/trained_model")

In [None]:
# Reload the fine-tuned artifacts from disk for a quick interactive check.
model_path = "model/trained_model"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Read one user turn from stdin and wrap it as a single-message conversation.
text = input()
messages = [{"role": "user", "content": text}]

# Render the chat template with the generation prompt appended, then move the
# resulting tensors to the model's device.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=100)

# The generated sequence starts with the prompt tokens; slice them off and drop
# special tokens (e.g. end-of-turn markers) so only the reply text is printed.
prompt_length = inputs["input_ids"].shape[-1]
print(tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True))