In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

# 모델과 토크나이저 로드
model_name = 'skt/kogpt2-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # eos_token을 pad_token으로 설정
model = AutoModelForCausalLM.from_pretrained(model_name)

# 모델을 mps 장치로 이동
device = torch.device('mps')
model.to(device)

# 데이터셋 준비
train_texts = ["여기에 학습용 텍스트를 추가하세요."]
train_encodings = tokenizer(train_texts, return_tensors='pt', padding=True, truncation=True)

# labels 추가
train_encodings['labels'] = train_encodings.input_ids.clone()

class TextDataset(torch.utils.data.Dataset):
  def __init__(self, encodings):
    self.encodings = encodings

  def __getitem__(self, idx):
    return {key: tensor[idx] for key, tensor in self.encodings.items()}

  def __len__(self):
    return len(self.encodings.input_ids)

train_dataset = TextDataset(train_encodings)

# 학습 설정
training_args = TrainingArguments(
  output_dir='./results',
  num_train_epochs=3,
  per_device_train_batch_size=2,
  save_steps=10_000,
  save_total_limit=2,
  logging_dir='./logs',
)

# Trainer 설정
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=train_dataset,
)

# 학습 시작
trainer.train()

  from .autonotebook import tqdm as notebook_tqdm
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 3/3 [00:03<00:00,  1.06s/it]

{'train_runtime': 3.1915, 'train_samples_per_second': 0.94, 'train_steps_per_second': 0.94, 'train_loss': 1.9230230649312336, 'epoch': 3.0}





TrainOutput(global_step=3, training_loss=1.9230230649312336, metrics={'train_runtime': 3.1915, 'train_samples_per_second': 0.94, 'train_steps_per_second': 0.94, 'total_flos': 13779072000.0, 'train_loss': 1.9230230649312336, 'epoch': 3.0})

In [2]:
# 텍스트 생성 함수 정의
def generate_text(prompt, max_length=50):
  # 입력 텍스트를 토큰화
  inputs = tokenizer(prompt, return_tensors='pt').to(device)
  
  # 모델을 사용하여 텍스트 생성
  outputs = model.generate(inputs.input_ids, max_length=max_length, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
  
  # 생성된 텍스트 디코딩
  generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  
  return generated_text

# 예시 텍스트 생성
prompt = "여기에"
generated_text = generate_text(prompt)
print(generated_text)

여기에 학습용 텍스트를 추가하세요. 학습용 텍스트를 추가하세요..
#학습용 텍스트를 추가하세요..
#학습용 텍스트를 추가하세요..
#학습용 텍스트를 추가하세요..
#
