### 데이터 입력 확인
- 입력 단어: 안녕하세요
- 입력 정의: '편한 사이에서, 서로 만나거나 헤어질 때 정답게 하는 인사말.', '친한 사이에서 서로 만나거나 헤어질 때 인사로 하는 말.'

In [92]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("klue/roberta-small")
model = AutoModelForMaskedLM.from_pretrained("klue/roberta-small")


In [131]:
mask_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0]
sep_id = tokenizer.convert_tokens_to_ids(['[SEP]'])[0]
cls_id = tokenizer.convert_tokens_to_ids(['[CLS]'])[0]
pad_id = tokenizer.convert_tokens_to_ids(['[PAD]'])[0]
unk_id = tokenizer.convert_tokens_to_ids(['[UNK]'])[0]

In [132]:
k = 5
definition = []
ins = {'definition':['편한 사이에서, 서로 만나거나 헤어질 때 정답게 하는 인사말.', '친한 사이에서 서로 만나거나 헤어질 때 인사로 하는 말.']}

for word in ins['definition']:
    definition.extend(tokenizer.tokenize(word))
definition = tokenizer.convert_tokens_to_ids(definition)

input = [cls_id] + [mask_id] * k + [sep_id] + definition

input = input[:256] 
input.append(sep_id) # input 최대 길이 = 255([CLS], [], [MASK]*k, ) + 1('[SEP]') = 256
ins['input'] = input


In [136]:

word2bpes = []
word2idx = {}
word = '안녕하세요'


if word not in word2idx:
    word2idx[word] = len(word2idx)
    bpes = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word))
    word2bpes.append(bpes)

number_word_in_train = len(word2idx)


if word not in word2idx:
    word2idx[word] = len(word2idx)
    bpes = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word))
    word2bpes.append(bpes)


In [138]:
print(word2idx)
word2bpes

{'안녕하세요': 0}


[[5891, 2205, 5971]]

In [139]:
name = 'train'
max_word_len = 5

for idx in range(1):
    if unk_id in bpes:
        if name == 'train':
            continue # bpes = target word 토큰
        else:
            bpes = [0] * (max_word_len + 1)  # +1이 있는 이유는 모르겠음. 아마 이후 if문에서 train을 거르기 위해서?
    if len(bpes) <= max_word_len:
        ins['target'] = idx
        
    else:
        if name != 'train':
            ins['target'] = -1

In [140]:
ins['target']

0

In [141]:
max_word_len = 5
for i in range(len(word2bpes)):
        bpes = word2bpes[i] # bpes = target word 토큰
        print('전: ', bpes)
        bpes = bpes[:max_word_len] + [mask_id] * max(0, max_word_len - len(bpes))
        print('후: ', bpes)
        word2bpes[i] = bpes

전:  [5891, 2205, 5971]
후:  [5891, 2205, 5971, 4, 4]


In [142]:
word2bpes

[[5891, 2205, 5971, 4, 4]]

### Model 출력 확인

In [153]:
from torch import nn
import torch
from transformers import RobertaForMaskedLM
from transformers import BertForMaskedLM

class RDRobertaForMaskedLM(RobertaForMaskedLM):
    def set_start_end(self, start=1, end=5):
        self.start = start
        self.end = end

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        masked_lm_labels=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        lm_labels=None,
    ):
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output[:, self.start:self.end])

        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here

        return outputs  # (ltr_lm_loss), (masked_lm_loss), prediction_scores, (hidden_states), (attentions)


In [156]:
class ENRobertaReverseDict(nn.Module):
    def __init__(self, pre_name, word2bpes, pad_id, number_word_in_train):
        super().__init__()
        self.roberta_model = RDRobertaForMaskedLM.from_pretrained(pre_name)
        self.roberta_model.set_start_end(1, 1+len(word2bpes[0]))
        self.max_word_len = len(word2bpes[0])
        # 1 x 1 x vocab_size
        word2bpes = torch.LongTensor(word2bpes).transpose(0, 1).unsqueeze(0) 
        self.register_buffer('word2bpes', word2bpes)
        self.number_word_in_train = number_word_in_train
        self.pad_id = pad_id

    def forward(self, input):
        """
        input 형식: cls + mask + sep_id + definition
        """
        attention_mask = input.ne(self.pad_id)

        #  batch_size x max_len x vocab_size
        bpe_reps = self.roberta_model(input_ids=input, token_type_ids=None,
                                                        attention_mask=attention_mask)[0]

        # bsz x max_word_len x word_vocab_size
        word2bpes = self.word2bpes.repeat(bpe_reps.size(0), 1, 1)
        word_scores = bpe_reps.gather(dim=-1, index=word2bpes)   # bsz x max_word_len x word_vocab_size

        word_scores = word_scores.sum(dim=1)
        if self.training and self.number_word_in_train is not None:
            word_scores = word_scores[:, :self.number_word_in_train]

        return {'pred': word_scores}


In [157]:
from transformers import AdamW

model = ENRobertaReverseDict("klue/roberta-small", word2bpes, pad_id=pad_id,
                          number_word_in_train=1)

optimizer = AdamW(model.parameters(), lr=1e-5)


In [166]:
attention_mask = [pad_id]*len(input)
bpe_reps = model(input_ids=input, token_type_ids=None, attention_mask=attention_mask)[0]

TypeError: forward() got an unexpected keyword argument 'input_ids'