In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

문장의 순서를 맞추는 것,
시계열의 문장을 다음 값을 예측하는 것.
앞의 값을 어느정도 활용해서 뒤의 값을 예측
아주 일부분을 원리로 -> 이건 언어모델의 기초 일부분일 뿐

In [7]:
sentence= ("""if you want to build a ship, don't drum up people together to collect wood and don't assign them tasks and work,
but rather teach them to long for the endless immensity of the sea.""").split()
sentence
print(sentence)

['if', 'you', 'want', 'to', 'build', 'a', 'ship,', "don't", 'drum', 'up', 'people', 'together', 'to', 'collect', 'wood', 'and', "don't", 'assign', 'them', 'tasks', 'and', 'work,', 'but', 'rather', 'teach', 'them', 'to', 'long', 'for', 'the', 'endless', 'immensity', 'of', 'the', 'sea.']


같은단어들을 제거하고, 단어가 총 29개 나옴

In [8]:
vocab=sorted(list(set(sentence)))
vocab
print(vocab)
vocab_size=len(vocab)
print(vocab_size)

['a', 'and', 'assign', 'build', 'but', 'collect', "don't", 'drum', 'endless', 'for', 'if', 'immensity', 'long', 'of', 'people', 'rather', 'sea.', 'ship,', 'tasks', 'teach', 'the', 'them', 'to', 'together', 'up', 'want', 'wood', 'work,', 'you']
29


In [9]:
word_to_index={c:i for i,c in enumerate(vocab,1)}
print(word_to_index) 
word_to_index['<unk>']=0 #인덱스 0에 unk 넣음
print(word_to_index)
print(word_to_index['for'])

{'a': 1, 'and': 2, 'assign': 3, 'build': 4, 'but': 5, 'collect': 6, "don't": 7, 'drum': 8, 'endless': 9, 'for': 10, 'if': 11, 'immensity': 12, 'long': 13, 'of': 14, 'people': 15, 'rather': 16, 'sea.': 17, 'ship,': 18, 'tasks': 19, 'teach': 20, 'the': 21, 'them': 22, 'to': 23, 'together': 24, 'up': 25, 'want': 26, 'wood': 27, 'work,': 28, 'you': 29}
{'a': 1, 'and': 2, 'assign': 3, 'build': 4, 'but': 5, 'collect': 6, "don't": 7, 'drum': 8, 'endless': 9, 'for': 10, 'if': 11, 'immensity': 12, 'long': 13, 'of': 14, 'people': 15, 'rather': 16, 'sea.': 17, 'ship,': 18, 'tasks': 19, 'teach': 20, 'the': 21, 'them': 22, 'to': 23, 'together': 24, 'up': 25, 'want': 26, 'wood': 27, 'work,': 28, 'you': 29, '<unk>': 0}
10


하이퍼파라미터를 넣음

In [10]:
# 하이퍼파라미터

hidden_size=vocab_size+1
sequence_length= 3
learning_rate= 0.1

print(hidden_size)

30


In [11]:
#train data(input, label) 생성
def build_data(sentence, word_to_index):
  encoded=[word_to_index[token] for token in sentence]
  print(encoded) # [1, 4, 7, 2, 5, 3, 6]
  input_seq=encoded[:-1] #인풋
  label_seq= encoded[1:] #아웃풋
  print(label_seq) # [4, 7, 2, 5, 3, 6]
  input_seq=torch.LongTensor(input_seq).unsqueeze(0)
  label_seq=torch.LongTensor(label_seq).unsqueeze(0)
  print(label_seq)
  return input_seq, label_seq #차원을 텐서로, 높혀서 리턴

In [12]:
X,Y = build_data(sentence, word_to_index)

print(X)
print(X)
print(X.size(), Y.size())

[11, 29, 26, 23, 4, 1, 18, 7, 8, 25, 15, 24, 23, 6, 27, 2, 7, 3, 22, 19, 2, 28, 5, 16, 20, 22, 23, 13, 10, 21, 9, 12, 14, 21, 17]
[29, 26, 23, 4, 1, 18, 7, 8, 25, 15, 24, 23, 6, 27, 2, 7, 3, 22, 19, 2, 28, 5, 16, 20, 22, 23, 13, 10, 21, 9, 12, 14, 21, 17]
tensor([[29, 26, 23,  4,  1, 18,  7,  8, 25, 15, 24, 23,  6, 27,  2,  7,  3, 22,
         19,  2, 28,  5, 16, 20, 22, 23, 13, 10, 21,  9, 12, 14, 21, 17]])
tensor([[11, 29, 26, 23,  4,  1, 18,  7,  8, 25, 15, 24, 23,  6, 27,  2,  7,  3,
         22, 19,  2, 28,  5, 16, 20, 22, 23, 13, 10, 21,  9, 12, 14, 21]])
tensor([[11, 29, 26, 23,  4,  1, 18,  7,  8, 25, 15, 24, 23,  6, 27,  2,  7,  3,
         22, 19,  2, 28,  5, 16, 20, 22, 23, 13, 10, 21,  9, 12, 14, 21]])
torch.Size([1, 34]) torch.Size([1, 34])


In [13]:
class Net(nn.Module):
    def __init__(self, vocab_size, input_size, hidden_size, batch_first=True):
        super().__init__()
        #embedding layer
        self.embedding_layer=nn.Embedding(num_embeddings=vocab_size, embedding_dim=input_size)
        #RNN층
        self.rnn=nn.RNN(input_size, hidden_size, batch_first=batch_first)
        #출력층
        self.fc=nn.Linear(hidden_size, vocab_size) # one_hot_encoding 결과로 출력
        #1번 단어일 점수, 2번 단어일 점수 -> 아크맥스

    def forward(self, x): 
        # 임베딩층 : 크기변화(배치크기, 시퀀스 길이) =>(배치크기, 시퀀스 길이, 임베딩 차원)
        output=self.embedding_layer(x) #입력받은걸 임베딩에 넣어
        
        # RNN층 : 크기변화(배치크기, 시퀀스 길이, 임베딩 차원) =>
        # output: (배치크기, 시퀀스 길이, 은닉층 크기)
        # hidden : (1, 배치크기, 은닉층 크기)
        output, hidden=self.rnn(output) 

        # 출력층 : 크기변화(배치크기, 시퀀스길이, 은닉층 크기) 
        # => (배치크기, 시퀀스 길이, 단어장 크기)
        output=self.fc(output)

        # 크기변화 : 배치크기, 시퀀스 길이, 단어장 크기)=> (배치크기*시퀀스 길이, 단어장 크기)
        return output.view(-1, output.size(2)) 
    #단어장 크기 = 단어의 전체 크기 , 우리는 시퀀스 길이를 안했으니 배치크기일거야.

In [25]:
# 모델생성, 비용함수, 최적화
model=Net(vocab_size+1, sequence_length, hidden_size, batch_first=True) #+1은 0자리 unk 넣은거
loss_func=nn.CrossEntropyLoss()
optimizer=optim.Adam(model.parameters())

In [26]:
outputs=model(X)
# print(outputs)
print(outputs.size()) #단어가 39개였는데 30개, 행은 34개

torch.Size([34, 30])


k,v 된거를 v:k 를 만들어

인덱스를 넣으면 단어를 얻을 수 있어.

In [27]:
# index:worddictionary 
index_to_word={v:k for k, v in word_to_index.items()}
print(index_to_word)
print(word_to_index)

{1: 'a', 2: 'and', 3: 'assign', 4: 'build', 5: 'but', 6: 'collect', 7: "don't", 8: 'drum', 9: 'endless', 10: 'for', 11: 'if', 12: 'immensity', 13: 'long', 14: 'of', 15: 'people', 16: 'rather', 17: 'sea.', 18: 'ship,', 19: 'tasks', 20: 'teach', 21: 'the', 22: 'them', 23: 'to', 24: 'together', 25: 'up', 26: 'want', 27: 'wood', 28: 'work,', 29: 'you', 0: '<unk>'}
{'a': 1, 'and': 2, 'assign': 3, 'build': 4, 'but': 5, 'collect': 6, "don't": 7, 'drum': 8, 'endless': 9, 'for': 10, 'if': 11, 'immensity': 12, 'long': 13, 'of': 14, 'people': 15, 'rather': 16, 'sea.': 17, 'ship,': 18, 'tasks': 19, 'teach': 20, 'the': 21, 'them': 22, 'to': 23, 'together': 24, 'up': 25, 'want': 26, 'wood': 27, 'work,': 28, 'you': 29, '<unk>': 0}


decode는 숫자를 단어로 뽑아 줄 수 있는 것

In [28]:
decode=lambda y:[index_to_word.get(x) for x in y]
decode(outputs.argmax(-1).tolist())

['sea.',
 'if',
 'endless',
 'people',
 'assign',
 'of',
 '<unk>',
 'sea.',
 'people',
 'endless',
 'endless',
 'endless',
 'assign',
 'endless',
 'and',
 'sea.',
 'them',
 'assign',
 'endless',
 "don't",
 'of',
 'them',
 'people',
 'build',
 '<unk>',
 'endless',
 'people',
 'assign',
 'of',
 'sea.',
 'sea.',
 'sea.',
 '<unk>',
 'endless']

훈련안에 있는거야. 미리 보여주신거.

In [None]:
for step in range(500):
  optimizer.zero_grad()
  output=model(X)
  loss=loss_func(output, Y.view(-1)) #y는 항상 1차원으로 만들어
  loss.backward()
  optimizer.step()

  pred=output.softmax(-1).argmax(-1).tolist() #아웃풋을 소프트맥스, 아그맥스하고 투 리스트한 것을 pred
  if step%10==0:
    # print(pred)
    print(f'{step}, {pred},\n {' '.join(['IF']+decode(pred))}') #단어니까 ' ' 띄워, IF는 정답 포함안되있어서.



[17, 11, 9, 15, 3, 14, 0, 17, 15, 9, 9, 9, 3, 9, 2, 17, 22, 3, 9, 7, 14, 22, 15, 4, 0, 9, 15, 3, 14, 17, 17, 17, 0, 9]
0, [17, 11, 9, 15, 3, 14, 0, 17, 15, 9, 9, 9, 3, 9, 2, 17, 22, 3, 9, 7, 14, 22, 15, 4, 0, 9, 15, 3, 14, 17, 17, 17, 0, 9],
 IF sea. if endless people assign of <unk> sea. people endless endless endless assign endless and sea. them assign endless don't of them people build <unk> endless people assign of sea. sea. sea. <unk> endless
[17, 26, 9, 15, 3, 2, 2, 17, 3, 9, 9, 28, 3, 8, 2, 19, 22, 3, 9, 7, 14, 22, 15, 2, 2, 9, 15, 7, 22, 17, 17, 14, 7, 9]
10, [17, 26, 9, 15, 3, 2, 2, 17, 3, 9, 9, 28, 3, 8, 2, 19, 22, 3, 9, 7, 14, 22, 15, 2, 2, 9, 15, 7, 22, 17, 17, 14, 7, 9],
 IF sea. want endless people assign and and sea. assign endless endless work, assign drum and tasks them assign endless don't of them people and and endless people don't them sea. sea. of don't endless
[2, 2, 2, 7, 3, 2, 7, 17, 16, 19, 2, 2, 28, 8, 2, 7, 22, 3, 2, 7, 7, 22, 16, 2, 2, 9, 7, 7, 22, 17, 17, 1