In [None]:
# 1. Install dependencies.
# The leading "!" runs the line as a shell command from the notebook cell;
# --quiet suppresses most of pip's output.
!pip install transformers datasets accelerate --quiet

#2. 라이브러리 로딩
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
import json

# 3. Base checkpoint: KoGPT2 base, a very lightweight Korean model.
model_name = "skt/kogpt2-base-v2"

# Special tokens handed to the tokenizer at load time: "</s>" serves as both
# BOS and EOS, and "<pad>" is the padding token.
_special_tokens = {"bos_token": "</s>", "eos_token": "</s>", "pad_token": "<pad>"}
tokenizer = AutoTokenizer.from_pretrained(model_name, **_special_tokens)
model = AutoModelForCausalLM.from_pretrained(model_name)

# 4. Minimal training set, defined inline in the notebook.
# Each (question, answer) pair becomes one record whose "input" field is empty.
_qa_pairs = [
  ("안녕?", "안녕하세요! 만나서 반가워요."),
  ("이름이 뭐야?", "저는 인공지능 챗봇이에요."),
  ("날씨 어때?", "오늘은 맑고 따뜻한 날씨예요."),
]
examples = [
  {"instruction": question, "input": "", "output": answer}
  for question, answer in _qa_pairs
]

dataset = Dataset.from_list(examples)

# 5. Preprocessing (every sequence is padded/truncated to 32 tokens by default)
def preprocess(example, max_length=32):
  """Turn one instruction record into fixed-length causal-LM training features.

  The prompt and the answer are joined into ONE token sequence
  (prompt + answer + EOS). A causal LM is trained to predict each token of a
  sequence from the tokens before it, so `labels` must line up with
  `input_ids` position by position; tokenizing the answer separately and using
  it as labels for the prompt would pair unrelated tokens.

  Args:
    example: Mapping with the string fields "instruction", "input" and "output".
    max_length: Exact length of every returned list. Longer sequences are
      truncated on the right. If the prompt alone fills `max_length`, no
      supervised answer token remains for that record.

  Returns:
    A dict with "input_ids", "attention_mask" and "labels", each a list of
    `max_length` ints. Labels are -100 (ignored by the loss) on prompt and
    padding positions, so only the answer and the closing EOS are learned.
  """
  # Skip an empty "input" so the prompt has no trailing space and matches the
  # "질문: ...\n답변:" prompt format used at inference time.
  question = " ".join(part for part in (example["instruction"], example["input"]) if part)
  prompt = f"질문: {question}\n답변:"

  # Tokenize the two halves separately so the prompt/answer boundary is known.
  prompt_ids = list(tokenizer(prompt, add_special_tokens=False)["input_ids"])
  answer_ids = list(tokenizer(" " + example["output"], add_special_tokens=False)["input_ids"])
  # Teach the model where an answer ends so generation can stop by itself.
  answer_ids.append(tokenizer.eos_token_id)

  input_ids = (prompt_ids + answer_ids)[:max_length]
  labels = ([-100] * len(prompt_ids) + answer_ids)[:max_length]

  # Right-pad to the fixed length; padding is masked out of attention and loss.
  pad_count = max_length - len(input_ids)
  attention_mask = [1] * len(input_ids) + [0] * pad_count
  input_ids = input_ids + [tokenizer.pad_token_id] * pad_count
  labels = labels + [-100] * pad_count

  return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Run `preprocess` over every record. All original columns are removed, so
# only the fields returned by `preprocess` remain in the resulting dataset.
tokenized_dataset = dataset.map(preprocess, remove_columns=dataset.column_names)

# 6. Training configuration: batch size 1, 30 epochs, one log entry per step.
_training_config = dict(
  # Output and checkpointing: save every 5 steps, keep only the newest one.
  output_dir="./kogpt2-ultralight",
  save_steps=5,
  save_total_limit=1,
  # Optimisation.
  per_device_train_batch_size=1,
  num_train_epochs=30,
  learning_rate=5e-5,
  # Logging: log every step, with external reporting integrations disabled.
  logging_dir="./logs",
  logging_steps=1,
  report_to="none",
)
training_args = TrainingArguments(**_training_config)

# 7. Build the Trainer and run fine-tuning.
import inspect

# Newer transformers releases renamed Trainer's `tokenizer` argument to
# `processing_class` and deprecate the old name (it emits a FutureWarning and
# is scheduled for removal). Use whichever keyword the installed version
# accepts so this cell runs on both old and new releases.
_tokenizer_kwarg = (
  "processing_class"
  if "processing_class" in inspect.signature(Trainer.__init__).parameters
  else "tokenizer"
)

trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_dataset,
  **{_tokenizer_kwarg: tokenizer},
)

trainer.train()

# 8. Save the model and then the tokenizer into the same directory, so the
# directory can later be loaded with from_pretrained().
for _save in (trainer.save_model, tokenizer.save_pretrained):
  _save("./kogpt2-ultralight")
print("초경량 파인튜닝 완료")

# 9. Build the chatbot: reload the saved tokenizer and model from disk and
# wrap them in a text-generation pipeline.
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

model_dir = "kogpt2-ultralight"
# Tokenizer first, then model, both from the same saved directory.
tokenizer, model = (
  loader.from_pretrained(model_dir)
  for loader in (AutoTokenizer, AutoModelForCausalLM)
)

chatbot = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/513M [00:00<?, ?B/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 1}.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,14.6233
2,11.9248
3,7.8686
4,3.1171
5,2.3061
6,1.6661
7,1.3936
8,0.4394
9,2.1321


In [None]:
# Interactive chat loop: read a question, generate an answer, repeat until the
# user types the exit command "종료".
print("초경량 챗봇을 시작합니다. '종료'라고 입력하면 끝납니다.\n")
while True:
  try:
    user_input = input("사용자: ").strip()
  except (EOFError, KeyboardInterrupt):
    # End-of-input or an interrupted prompt ends the chat like the exit
    # command instead of surfacing a traceback.
    break
  if user_input.lower() == "종료":
    break
  if not user_input:
    # Nothing to answer; ask again rather than generating from an empty question.
    continue

  prompt = f"질문: {user_input}\n답변:"
  # return_full_text=False makes the pipeline return only the newly generated
  # continuation, so the prompt does not have to be split off by searching for
  # "답변:" (which picks the wrong segment if the model repeats that marker).
  generated = chatbot(
    prompt,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.8,
    return_full_text=False,
  )[0]["generated_text"]
  # Keep only the first answer: drop any follow-up "질문:" turn the model invents.
  answer = generated.split("질문:", 1)[0].strip()
  print("🤖 챗봇:", answer)