In [1]:
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer


# Prefer the first CUDA GPU, but fall back to the CPU when CUDA is unavailable
# so that any later `.to(device)` call does not fail on a CPU-only machine.
# NOTE(review): no visible cell references `device`; the model and the input
# tensors are never moved explicitly — confirm whether GPU execution is intended.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model_name = 'EleutherAI/gpt-neo-1.3B'

# Load the pretrained causal language model and its matching tokenizer
# from the checkpoint named above.
model = GPTNeoForCausalLM.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [15]:
import pandas as pd

# Encode the prompt into a PyTorch tensor of token ids for the model.
prompt = "Transformers are the"
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]

# 텍스트 생성하기

- 공부를 위해서 직접 구현해보자!
- 급한 사람은 아래 코드를 바로 사용할 수 있다

```
gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.9,
    max_length=100,
)
gen_text = tokenizer.batch_decode(gen_tokens)[0]
```



### 그리디 버전으로 텍스트 생성
- 다양성이 필요한 작업보다는, 결정적이고 사실적으로 정확한 출력이 필요한 작업(예: 수식 계산)에 유리
- 혹은 빔서치를 활용할 수도 있다!

In [16]:
import pandas as pd
# Trace greedy decoding step by step. Each step records the text generated so
# far plus the most likely next-token candidates with their probabilities,
# then extends the sequence with the single most likely token.
iterations = []

n_steps = 8
choices_per_step = 5

# Extend a working copy of the sequence rather than rebinding `input_ids`:
# re-running this cell then always starts from the original prompt instead of
# continuing from tokens appended by a previous run.
generated_ids = input_ids

with torch.no_grad():
    for _ in range(n_steps):
        iteration = {'input': tokenizer.decode(generated_ids[0])}
        output = model(input_ids=generated_ids)

        # Take the logits at the last position of the first batch entry and
        # turn them into a probability distribution over the vocabulary.
        next_token_logits = output.logits[0, -1, :]
        next_token_probs = torch.softmax(next_token_logits, dim=-1)

        # Only the best `choices_per_step` candidates are reported, so a top-k
        # selection (returned in descending order) replaces a full-vocabulary sort.
        top_probs, top_ids = torch.topk(next_token_probs, k=choices_per_step)
        for choice_idx, (token_id, token_prob) in enumerate(zip(top_ids, top_probs), start=1):
            token_choice = f'{tokenizer.decode(token_id)} ({100 * token_prob.item():.2f}%)'
            iteration[f'Choice {choice_idx}'] = token_choice

        # Greedy step: append the most probable token, shaped (1, 1), to the sequence.
        generated_ids = torch.cat([generated_ids, top_ids[None, :1]], dim=-1)
        iterations.append(iteration)

In [17]:
# Show the recorded decoding steps as a table: one row per step, with the
# input text and the ranked next-token choices as columns.
pd.DataFrame(iterations)

Unnamed: 0,input,Choice 1,Choice 2,Choice 3,Choice 4,Choice 5
0,Transformers are the,most (20.53%),best (6.12%),coolest (2.24%),Transformers (2.22%),biggest (2.09%)
1,Transformers are the most,popular (25.45%),successful (7.44%),iconic (3.54%),famous (3.00%),beloved (2.99%)
2,Transformers are the most popular,toys (8.64%),toy (6.75%),and (5.74%),movie (4.42%),franchise (4.18%)
3,Transformers are the most popular toys,in (35.96%),of (13.08%),for (9.05%),ever (4.70%),and (2.87%)
4,Transformers are the most popular toys in,the (69.00%),America (6.14%),Japan (2.12%),children (2.08%),American (1.52%)
5,Transformers are the most popular toys in the,world (68.31%),United (4.99%),Transformers (3.71%),US (2.67%),U (2.46%)
6,Transformers are the most popular toys in the ...,. (42.39%),", (28.99%)",and (8.55%),today (3.27%),right (3.25%)
7,Transformers are the most popular toys in the ...,They (13.98%),The (6.58%),But (3.99%),There (3.94%),And (3.63%)


- 사실 Spell 생성은 다양한 게 중요하니까
- 여기에 좀 더 적합한 샘플링 방법을 사용

### 샘플링 방법으로 텍스트 생성

- 확률이 가장 높은 K개의 토큰에서만 샘플링해서(top-k 샘플링) 확률이 낮은 토큰을 피함

In [28]:
# Start again from the bare prompt and cap the total sequence length
# passed to generate() at eight tokens.
max_length = 8
prompt = "Transformers are the"
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]

In [30]:
# Sample a continuation using temperature-scaled top-k sampling: at each step
# only the 10 most likely tokens are candidates.
# The padding id is taken from the tokenizer's end-of-sequence token instead of
# the hard-coded 5025, which is an arbitrary vocabulary entry (presumably a
# typo of the EOS id).
output_temp = model.generate(
    input_ids,
    max_length=max_length,
    do_sample=True,
    temperature=0.5,
    top_k=10,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output_temp[0]))

Transformers are the most popular series in


## 모델 평가를 위한 BLEU 점수 구하기


In [32]:
from datasets import load_metric

# Load the SacreBLEU metric object used below to score a prediction.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before upgrading.
bleu_metric = load_metric('sacrebleu')

In [35]:
import numpy as np

In [36]:
# Register one prediction with its single reference, then compute BLEU.
# NOTE(review): the reference reads "in on" — possibly a typo for "is on";
# confirm, because it changes the matched n-gram counts and the final score.
bleu_metric.add(
    prediction='the cat is on mat',
    reference=['the cat in on the mat'],
)
results = bleu_metric.compute(smooth_method='floor', smooth_value=0)

# Round every n-gram precision to two decimals so the result is easier to read.
results['precisions'] = [
    np.round(precision, 2) for precision in results['precisions']
]

In [37]:
# Display the BLEU result dictionary computed in the previous cell.
results

{'score': 0.0,
 'counts': [4, 1, 0, 0],
 'totals': [5, 4, 3, 2],
 'precisions': [80.0, 25.0, 0.0, 0.0],
 'bp': 0.8187307530779819,
 'sys_len': 5,
 'ref_len': 6}

### 참고링크

- 트랜스포머를 활용한 자연어 처리
- https://huggingface.co/docs/transformers/model_doc/gpt_neo#overview