In [1]:
import numpy
import torch
import torch.nn as nn
import re

In [2]:
def text_preprocess(text : str) :
    """Normalise raw text into lowercase, space-separated alphanumeric tokens.

    Steps, in order:
      1. lowercase the text and expand every "n't" into " not";
      2. turn each character that is not an ASCII letter or digit into a space;
      3. collapse each run of digits into the single placeholder "N";
      4. squeeze each run of whitespace into one space.

    Leading/trailing whitespace is not stripped, so the result may begin or
    end with a single space.
    """
    lowered = text.lower()
    expanded = lowered.replace("n't", " not")
    alnum_only = re.sub(r"[^0-9a-zA-Z]", " ", expanded)
    digits_masked = re.sub(r"[0-9]+", "N", alnum_only)
    return re.sub(r"\s+", " ", digits_masked)

def make_dict(sentences : list, word_dict : dict = None) :
    """Build (or extend) a word -> index vocabulary from a list of sentences.

    All sentences are joined, normalised with ``text_preprocess`` and split on
    whitespace. Every token not yet in ``word_dict`` receives the next free
    index (the current size of the dictionary).

    If ``word_dict`` is None a fresh vocabulary is started with the reserved
    entries "<pad>" -> 0 and "<unk>" -> 1. If a dictionary is passed in, it is
    extended in place and that same object is returned.

    Returns a tuple ``(word_dict, number_dict)`` where ``number_dict`` is the
    inverse mapping index -> word.
    """
    tokens = text_preprocess(" ".join(sentences)).split()
    if word_dict is None :
        word_dict = {
            "<pad>" : 0,  # padding
            "<unk>" : 1,  # word missing from the vocabulary
        }
    for token in tokens :
        # setdefault only inserts when the token is new; the index is the
        # dictionary size before insertion.
        word_dict.setdefault(token, len(word_dict))
    number_dict = dict(zip(word_dict.values(), word_dict.keys()))
    return word_dict, number_dict

def word_num_encoding(sentences : list, word_dict : dict, unk : str = "<unk>") :
    """Encode sentences as a zero-padded 2-D array of vocabulary indices.

    Each sentence is normalised with ``text_preprocess`` and split on
    whitespace. Tokens are replaced by their index in ``word_dict``; tokens
    missing from ``word_dict`` are replaced by ``word_dict[unk]``. Rows shorter
    than the longest sentence are right-padded with 0.

    Parameters:
        sentences: iterable of raw sentence strings.
        word_dict: mapping word -> integer index.
        unk: key in ``word_dict`` used for out-of-vocabulary tokens.

    Returns:
        numpy array with one row per sentence and one column per token
        position (an empty array when ``sentences`` is empty).

    Raises:
        KeyError: if an out-of-vocabulary token is met and ``unk`` itself is
            not a key of ``word_dict``.
    """
    # Tokenise once and reuse the result for both the length scan and the
    # encoding pass (this also makes one-shot iterables work).
    tokenized = [text_preprocess(s).split() for s in sentences]
    max_len = max((len(tokens) for tokens in tokenized), default=0)

    corpus = []
    for tokens in tokenized :
        s_array = []
        for w in tokens :
            # Explicit membership test instead of a bare ``except`` so that
            # unrelated errors (e.g. KeyboardInterrupt) are never swallowed.
            if w in word_dict :
                s_array.append(word_dict[w])
            else :
                s_array.append(word_dict[unk])
        # Right-pad with 0 up to the longest sentence.
        s_array.extend([0] * (max_len - len(tokens)))
        corpus.append(s_array)
    return numpy.array(corpus)

In [3]:
# Toy corpus: six three-word sentences.
sentences = [
    "I like dog",
    "I love coffee",
    "I hate milk",
    "You like cat",
    "You love milk",
    "You hate coffee",
]

# Build the vocabulary, then encode every sentence as a row of word indices.
word_dict, number_dict = make_dict(sentences)
corpus = word_num_encoding(sentences, word_dict)

print(word_dict)
print(corpus)

{'<pad>': 0, '<unk>': 1, 'i': 2, 'like': 3, 'dog': 4, 'love': 5, 'coffee': 6, 'hate': 7, 'milk': 8, 'you': 9, 'cat': 10}
[[ 2  3  4]
 [ 2  5  6]
 [ 2  7  8]
 [ 9  3 10]
 [ 9  5  8]
 [ 9  7  6]]


In [4]:
# Row i of the identity matrix is the one-hot vector for word index i.
eye = numpy.eye(len(word_dict))

# Replace every word index in the corpus by its one-hot row:
# the result has one extra trailing axis of length len(word_dict).
s_array = numpy.array([
    [eye[word_id] for word_id in sentence_ids]
    for sentence_ids in corpus
])
print(s_array.shape)
print(s_array)

(6, 3, 11)
[[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]

 [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]

 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]]


In [6]:
class Network(torch.nn.Module):
    """Single-layer RNN followed by a linear layer that scores every word.

    Parameters:
        vocab_size: length of each one-hot input vector and number of output
            scores (defaults to 11, the vocabulary size used in this notebook).
        hidden_size: size of the RNN hidden state (defaults to 5).
    """

    def __init__(self, vocab_size=11, hidden_size=5):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, sequence, feature).
        self.rnn = nn.RNN(vocab_size, hidden_size, batch_first=True)
        self.seq = nn.Sequential(
            nn.Linear(hidden_size, vocab_size),
        )

    def forward(self, x):
        """Return one score vector of length ``vocab_size`` per batch item.

        ``x`` is a float tensor of shape (batch, sequence, vocab_size). Only
        the RNN output at the last sequence position is passed to the linear
        layer.
        """
        x, _ = self.rnn(x)
        x = self.seq(x[:, -1, :])
        return x

F = Network()
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(F.parameters(), lr=0.3)


# Inputs: one-hot vectors of the first two words of every sentence.
x = torch.tensor(s_array[:, :2, :], dtype=torch.float)
# Targets: one-hot vector of the third word; converted to a class index with
# argmax when the loss is computed.
t = torch.tensor(s_array[:, 2, :], dtype=torch.long)

epoch = 200
for e in range(epoch):
    # Plain Python float: the running total is for reporting only.
    loss_sum = 0.0
    # One optimisation step per sentence (batch size 1).
    for b in range(x.shape[0]):
        y = F(x[b:b+1])

        loss = loss_function(y, t[b:b+1].argmax(dim=1))
        # .item() detaches the value from the autograd graph; accumulating the
        # tensor itself would keep the history of every step alive.
        loss_sum += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if (e + 1) % 10 == 0:
        print("epoch {} | loss {}".format(e + 1, loss_sum))


epoch 10 | loss 5.908846378326416
epoch 20 | loss 3.9566662311553955
epoch 30 | loss 1.4268100261688232
epoch 40 | loss 0.32074546813964844
epoch 50 | loss 0.17494863271713257
epoch 60 | loss 0.1207888051867485
epoch 70 | loss 0.09246902167797089
epoch 80 | loss 0.0750012919306755
epoch 90 | loss 0.0631246417760849
epoch 100 | loss 0.054511457681655884
epoch 110 | loss 0.04797303304076195
epoch 120 | loss 0.04283611476421356
epoch 130 | loss 0.03869238495826721
epoch 140 | loss 0.0352780781686306
epoch 150 | loss 0.03241501376032829
epoch 160 | loss 0.029979674145579338
epoch 170 | loss 0.027882732450962067
epoch 180 | loss 0.026058126240968704
epoch 190 | loss 0.024455921724438667
epoch 200 | loss 0.023038089275360107


In [7]:
# Training sentences, for reference:
#   "I like dog", "I love coffee", "I hate milk",
#   "You like cat", "You love milk", "You hate coffee"

# Score every word for each two-word prefix and keep the best-scoring index.
result = F(x)
result_arg = torch.argmax(result, dim = 1)
# Map each predicted index back to its word and print one word per line.
for word_id in result_arg.tolist() :
    print(number_dict[word_id])


dog
coffee
milk
cat
milk
coffee


In [8]:
import pickle

# Persist the vocabulary to disk so it can be reloaded later.
with open("test.pickle", "wb") as pickle_out :
    pickle.dump(word_dict, pickle_out)

In [9]:
# Read back the vocabulary written by the previous cell and show it.
with open("test.pickle", "rb") as pickle_in :
    A = pickle.load(pickle_in)

print(A)

{'<pad>': 0, '<unk>': 1, 'i': 2, 'like': 3, 'dog': 4, 'love': 5, 'coffee': 6, 'hate': 7, 'milk': 8, 'you': 9, 'cat': 10}
