In [9]:
#====================================================================================================
# SKT Ko-GPT2 Text Generation 예제 
# => https://github.com/SKT-AI/KoGPT2
#====================================================================================================
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, GPT2TokenizerFast

# Local path handed to both the tokenizer and the model `from_pretrained` calls below.
model_path='../model/gpt-2/gpt-2/'
#model_path='skt/kogpt2-base-v2'
# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still runs
# end-to-end on machines where CUDA is not available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [10]:
# bos_token is set to '</s>' because pretrained models commonly use '</s>' as
# both the start-of-sequence and the end-of-sequence token.
special_tokens = {
    'bos_token': '</s>',
    'eos_token': '</s>',
    'unk_token': '<unk>',
    'pad_token': '<pad>',
    'mask_token': '<mask>',
}
tokenizer = GPT2TokenizerFast.from_pretrained(model_path, **special_tokens)

# Show how a sample sentence is split into tokens (displayed as the cell output).
# NOTE(review): '<s>' is not among the special tokens registered above, so it is
# tokenized as ordinary text rather than as one special token - confirm intended.
tokenizer.tokenize("<s>안녕하세요. 한국어 GPT-2 입니다.")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['<',
 's',
 '>',
 'ì',
 'ķ',
 'Ī',
 'ë',
 'ħ',
 'ķ',
 'íķ',
 'ĺ',
 'ì',
 'Ħ',
 '¸',
 'ì',
 'ļ',
 'Ķ',
 '.',
 'Ġ',
 'íķ',
 'ľ',
 'ê',
 'µ',
 'Ń',
 'ì',
 'ĸ',
 '´',
 'ĠG',
 'PT',
 '-',
 '2',
 'Ġì',
 'ŀ',
 'ħ',
 'ëĭ',
 'Ī',
 'ëĭ',
 '¤',
 '.']

In [11]:
# Load the LM-head GPT-2 weights from `model_path`, then move the module to the
# configured device. The cell ends on the `.to()` expression, so the returned
# module (its layer summary) is what the notebook displays.
model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path=model_path)
model.to(device=device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [4]:
# Display the loaded model's parameter count as the cell output.
model.num_parameters()

124439808

In [12]:
# Single-prompt generation demo: encode a short prompt, generate a continuation,
# then print the raw ids and the decoded text.
text = '날씨'
input_ids = tokenizer.encode(text, return_tensors='pt')
print(input_ids)

# Generation options collected in one place; the special-token ids are taken
# from the tokenizer configured earlier in the notebook.
generation_options = dict(
    max_length=128,
    repetition_penalty=2.0,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    bos_token_id=tokenizer.bos_token_id,
    use_cache=True,
)
gen_ids = model.generate(input_ids.to(device), **generation_options)
first_sequence = gen_ids[0]
print(gen_ids.shape)
print(first_sequence)

# skip_special_tokens=True asks decode() to leave the tokenizer's registered
# special tokens out of the returned text.
generated = tokenizer.decode(first_sequence, skip_special_tokens=True)
print(generated)

tensor([[167, 224, 254, 168, 242, 101]])
torch.Size([1, 128])
tensor([  167,   224,   254,   168,   242,   101,   166,   111,   113, 47991,
          246, 46695,    97,    13,   198,   464,   717,   640,   314,  2497,
          262,  2008,    11,   340,   373,   257,  1310,  1643,  1180,   422,
          644,   345,   766,   287,   584,  5861,   286,   428,  1611,    25,
          383,  3516,   318,  5762,   281, 10912, 10147,   290,  2042, 12581,
          351,  2330, 24359,   319,  1353,    26,   339,   338,  4769,   465,
         3072,   510,   284,   905,   502,   326,   612,   389,   645,  9073,
         1088,   683,   357,   258,  1595,   470,   423,   530,   737,   679,
         3073,   588,  2130,   508,   468,   587,  2823,   416,  1644,   393,
         1223,   475,   788,  6451,  9911,  3375,   780,   484,   821, 12008,
          329,   511,  3160,     0,   632,  2331,   355,   611,   356,  1053,
         1775,   617,  3297,   366,    83,   419,   495,     1,   878,   994,
  

In [11]:
# 모델과 tokenizer 파일로 저장
#tokenizer.save_pretrained('kogpt2')
#model.save_pretrained('kogpt2')

In [13]:
# Helper for trying out text generation
def eval_keywords(keywords, prefix="<s>", max_length=50, top_k=30, top_p=0.90,
                  num_return_sequences=2, seed=None):
    """Sample continuations for each keyword and print them.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``device``.

    Args:
        keywords: Iterable of prompt strings; each one is processed separately.
        prefix: Text prepended to every keyword before encoding. Defaults to
            the literal string "<s>". With the tokenizer configured in this
            notebook (bos_token='</s>'), "<s>" is encoded as ordinary text, so
            it still appears in the decoded samples even though
            ``skip_special_tokens=True`` is used.
        max_length: Maximum total length (prompt + continuation) in tokens.
        top_k: Top-k cutoff used while sampling.
        top_p: Nucleus (top-p) cutoff used while sampling.
        num_return_sequences: Number of samples drawn per keyword.
        seed: Optional integer. When given, the torch RNG is seeded once
            before sampling so the printed samples are reproducible.

    Returns:
        None. For every keyword the samples are printed as "<index>: <text>",
        followed by a blank-line separator.
    """
    model.eval()
    if seed is not None:
        torch.manual_seed(seed)

    # generate() falls back to the EOS id for padding when no pad id is
    # configured and logs a warning on every call; resolving the same id here
    # and passing it explicitly keeps the behavior and drops the warning.
    pad_token_id = model.config.pad_token_id
    if pad_token_id is None:
        pad_token_id = model.config.eos_token_id

    for keyword in keywords:
        input_seq = prefix + keyword
        # return_tensors='pt' already yields a (1, seq_len) tensor, matching
        # how prompts are encoded elsewhere in this notebook.
        prompt_ids = tokenizer.encode(input_seq, return_tensors='pt').to(device)
        sample_outputs = model.generate(prompt_ids,
                                        do_sample=True,
                                        top_k=top_k,
                                        max_length=max_length,
                                        top_p=top_p,
                                        num_return_sequences=num_return_sequences,
                                        pad_token_id=pad_token_id)

        for i, sample_output in enumerate(sample_outputs):
            # skip_special_tokens=True leaves the tokenizer's registered
            # special tokens out of the printed text.
            print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
        # Separator after the last sample of each keyword, independent of how
        # many samples were requested.
        print("\n")
                                   

In [14]:
# Feed each sample keyword to the generation helper and print the results.
keywords = [
    "지미 카터",
    "제임스 얼",
    "수학",
]
eval_keywords(keywords)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: <s>지미 카터히 탕그 김입든지드 박한 수�
1: <s>지미 카터로 해게는 공어려 타는 더스




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: <s>제임스 얼몬도 사랑게 좋라 사랑이 알면�
1: <s>제임스 얼라우 해고 식다 미요자 보아다 


0: <s>수학국스고 시국스서 해지 서로 부만고
1: <s>수학다력시마에기 성력시면 가들 안주


