### seq2seq

### 데이터 뒤집기

In [17]:
import numpy as np

a = np.array([1,2,3,4,5])
a = np.flip(a)
# a = a[::-1]
print(a)

[5 4 3 2 1]


In [18]:
import numpy as np

a = np.array([[1,2,3,4,5],
              [6,7,8,9,10]])
# a = np.flip(a)
a = a[:, ::-1]
print(a)

[[ 5  4  3  2  1]
 [10  9  8  7  6]]


In [None]:
a = np.arange(24).reshape(2,3,4)
b = np.arange(30).reshape(2,3,5)
print(a)
print(b)
out = np.concatenate((a, b), axis=2)
print(out.shape)
print(out)

In [14]:
# coding: utf-8
import sys
sys.path.append('..')
from common.time_layers import *
from common.base_model import BaseModel


class Encoder:
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

        self.params = self.embed.params + self.lstm.params
        self.grads = self.embed.grads + self.lstm.grads
        self.hs = None

# T = 7
# vocab_size = len(char_to_id)
# wordvec_size = 16
# hideen_size = 128
# batch_size = 128        
        
    def forward(self, xs):
#         print("xs.shape=", xs.shape)   # (128,7)
        xs = self.embed.forward(xs)
#         print("xs.shape=", xs.shape)   # (128,7,16)
        hs = self.lstm.forward(xs)
#         print("hs.shape=", hs.shape)   # (128,7,128)
        self.hs = hs
#         print("h.shape=", hs[:, -1, :].shape)   # (128,128)
        return hs[:, -1, :]

    def backward(self, dh):
        dhs = np.zeros_like(self.hs)
        dhs[:, -1, :] = dh

        dout = self.lstm.backward(dhs)
        dout = self.embed.backward(dout)
        return dout


class Decoder:
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs, h):
        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        out = self.lstm.forward(out)
        score = self.affine.forward(out)
        return score

    def backward(self, dscore):
        dout = self.affine.backward(dscore)
        dout = self.lstm.backward(dout)
        dout = self.embed.backward(dout)
        dh = self.lstm.dh
        return dh

    def generate(self, h, start_id, sample_size):
        sampled = []
        sample_id = start_id
        self.lstm.set_state(h)

        for _ in range(sample_size):
            x = np.array(sample_id).reshape((1, 1))
            out = self.embed.forward(x)
            out = self.lstm.forward(out)
            score = self.affine.forward(out)

            sample_id = np.argmax(score.flatten())
            sampled.append(int(sample_id))

        return sampled


class Seq2seq(BaseModel):
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        self.encoder = Encoder(V, D, H)
        self.decoder = Decoder(V, D, H)
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads

    def forward(self, xs, ts):
        decoder_xs, decoder_ts = ts[:, :-1], ts[:, 1:]

        h = self.encoder.forward(xs)
        print("h.shape = ", h.shape)
        score = self.decoder.forward(decoder_xs, h)
        loss = self.softmax.forward(score, decoder_ts)
        return loss

    def backward(self, dout=1):
        dout = self.softmax.backward(dout)
        dh = self.decoder.backward(dout)
        dout = self.encoder.backward(dh)
        return dout

    def generate(self, xs, start_id, sample_size):
        h = self.encoder.forward(xs)
        sampled = self.decoder.generate(h, start_id, sample_size)
        return sampled


In [15]:
# coding: utf-8
import sys
sys.path.append('..')
from common.time_layers import *
# from seq2seq import Seq2seq, Encoder


class PeekyDecoder:
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(H + D, 4 * H) / np.sqrt(H + D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(H + H, V) / np.sqrt(H + H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads
        self.cache = None

    def forward(self, xs, h):
        N, T = xs.shape
        N, H = h.shape

        print("h.shape=", h.shape)     # (128,128)
        self.lstm.set_state(h)

        print("xs.shape=", xs.shape)     # (128,4)
        out = self.embed.forward(xs)
        print("out.shape=", out.shape)   # (128,4,16)
        hs = np.repeat(h, T, axis=0)     # (128,128)
        print("hs.shape=", hs.shape)     # (128*4,128)
        hs = hs.reshape(N, T, H)
        print("hs.shape=", hs.shape)     # (128,4,128)
        out = np.concatenate((hs, out), axis=2)  # (128,4,144)
        print("out.shape=", out.shape)

        out = self.lstm.forward(out)
        print("out.shape=", out.shape)   # (128,4,128)
        out = np.concatenate((hs, out), axis=2)
        print("out.shape=", out.shape)   # (128,4,256)

        score = self.affine.forward(out)
        self.cache = H
        return score

    def backward(self, dscore):
        H = self.cache

        dout = self.affine.backward(dscore)
        dout, dhs0 = dout[:, :, H:], dout[:, :, :H]
        dout = self.lstm.backward(dout)
        dembed, dhs1 = dout[:, :, H:], dout[:, :, :H]
        self.embed.backward(dembed)

        dhs = dhs0 + dhs1
        dh = self.lstm.dh + np.sum(dhs, axis=1)
        return dh

    def generate(self, h, start_id, sample_size):
        sampled = []
        char_id = start_id
        self.lstm.set_state(h)

        H = h.shape[1]
        peeky_h = h.reshape(1, 1, H)
        for _ in range(sample_size):
            x = np.array([char_id]).reshape((1, 1))
            out = self.embed.forward(x)

            out = np.concatenate((peeky_h, out), axis=2)
            out = self.lstm.forward(out)
            out = np.concatenate((peeky_h, out), axis=2)
            score = self.affine.forward(out)

            char_id = np.argmax(score.flatten())
            sampled.append(char_id)

        return sampled


class PeekySeq2seq(Seq2seq):
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        self.encoder = Encoder(V, D, H)
        self.decoder = PeekyDecoder(V, D, H)
        self.softmax = TimeSoftmaxWithLoss()

        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads


In [16]:
# coding: utf-8
import sys
sys.path.append('..')
import numpy as np
import matplotlib.pyplot as plt
from dataset import sequence
from common.optimizer import Adam
from common.trainer import Trainer
from common.util import eval_seq2seq
from seq2seq import Seq2seq
# from peeky_seq2seq import PeekySeq2seq


# 데이터셋 읽기
(x_train, t_train), (x_test, t_test) = sequence.load_data('addition.txt')
char_to_id, id_to_char = sequence.get_vocab()

# 입력 반전 여부 설정 =============================================
is_reverse =  True
if is_reverse:
    x_train, x_test = x_train[:, ::-1], x_test[:, ::-1]
# ================================================================

# 하이퍼파라미터 설정
vocab_size = len(char_to_id)
print(vocab_size)
print(char_to_id)
wordvec_size = 16
hideen_size = 128
batch_size = 128
max_epoch = 25
max_grad = 5.0

# 일반 혹은 엿보기(Peeky) 설정 =====================================
# model = Seq2seq(vocab_size, wordvec_size, hideen_size)
model = PeekySeq2seq(vocab_size, wordvec_size, hideen_size)
# ================================================================
optimizer = Adam()
trainer = Trainer(model, optimizer)

acc_list = []
for epoch in range(max_epoch):
    trainer.fit(x_train, t_train, max_epoch=1,
                batch_size=batch_size, max_grad=max_grad)

    correct_num = 0
    for i in range(len(x_test)):
        question, correct = x_test[[i]], t_test[[i]]
        verbose = i < 10
        correct_num += eval_seq2seq(model, question, correct,
                                    id_to_char, verbose, is_reverse)

    acc = float(correct_num) / len(x_test)
    acc_list.append(acc)
    print('검증 정확도 %.3f%%' % (acc * 100))

# 그래프 그리기
x = np.arange(len(acc_list))
plt.plot(x, acc_list, marker='o')
plt.xlabel('에폭')
plt.ylabel('정확도')
plt.ylim(0, 1.0)
plt.show()



13
{'1': 0, '6': 1, '+': 2, '7': 3, '5': 4, ' ': 5, '_': 6, '9': 7, '2': 8, '0': 9, '3': 10, '8': 11, '4': 12}
h.shape =  (128, 128)
h.shape= (128, 128)
xs.shape= (128, 4)
out.shape= (128, 4, 16)
hs.shape= (512, 128)
hs.shape= (128, 4, 128)
out.shape= (128, 4, 144)
out.shape= (128, 4, 128)
out.shape= (128, 4, 256)
| 에폭 1 |  반복 1 / 351 | 시간 0[s] | 손실 2.57
h.shape =  (128, 128)
h.shape= (128, 128)
xs.shape= (128, 4)
out.shape= (128, 4, 16)
hs.shape= (512, 128)
hs.shape= (128, 4, 128)
out.shape= (128, 4, 144)
out.shape= (128, 4, 128)
out.shape= (128, 4, 256)
h.shape =  (128, 128)
h.shape= (128, 128)
xs.shape= (128, 4)
out.shape= (128, 4, 16)
hs.shape= (512, 128)
hs.shape= (128, 4, 128)
out.shape= (128, 4, 144)
out.shape= (128, 4, 128)
out.shape= (128, 4, 256)
h.shape =  (128, 128)
h.shape= (128, 128)
xs.shape= (128, 4)
out.shape= (128, 4, 16)
hs.shape= (512, 128)
hs.shape= (128, 4, 128)
out.shape= (128, 4, 144)
out.shape= (128, 4, 128)
out.shape= (128, 4, 256)
h.shape =  (128, 128)
h.shap

h.shape =  (128, 128)
h.shape= (128, 128)
xs.shape= (128, 4)
out.shape= (128, 4, 16)
hs.shape= (512, 128)
hs.shape= (128, 4, 128)
out.shape= (128, 4, 144)
out.shape= (128, 4, 128)
out.shape= (128, 4, 256)
h.shape =  (128, 128)
h.shape= (128, 128)
xs.shape= (128, 4)
out.shape= (128, 4, 16)
hs.shape= (512, 128)
hs.shape= (128, 4, 128)
out.shape= (128, 4, 144)
out.shape= (128, 4, 128)
out.shape= (128, 4, 256)
h.shape =  (128, 128)
h.shape= (128, 128)
xs.shape= (128, 4)
out.shape= (128, 4, 16)
hs.shape= (512, 128)
hs.shape= (128, 4, 128)
out.shape= (128, 4, 144)
out.shape= (128, 4, 128)
out.shape= (128, 4, 256)
h.shape =  (128, 128)
h.shape= (128, 128)
xs.shape= (128, 4)
out.shape= (128, 4, 16)
hs.shape= (512, 128)
hs.shape= (128, 4, 128)
out.shape= (128, 4, 144)
out.shape= (128, 4, 128)
out.shape= (128, 4, 256)
h.shape =  (128, 128)
h.shape= (128, 128)
xs.shape= (128, 4)
out.shape= (128, 4, 16)
hs.shape= (512, 128)
hs.shape= (128, 4, 128)
out.shape= (128, 4, 144)
out.shape= (128, 4, 128)


KeyboardInterrupt: 

### np.concatnate 테스트

In [27]:
N=1
T=4
D=5
H=3

out = np.arange(N*T*D).reshape(N,T,D)
print(out.shape)
print(out)
h = np.arange(N*H).reshape(N,H)
print(h.shape)
print(h)
hs = np.repeat(h, T, axis=0)
print(hs.shape)
print(hs)
hs = hs.reshape(N, T, H)
print(hs.shape)
print(hs)
# out = np.concatenate((hs, out), axis=2)
# print(out.shape)

(1, 4, 5)
[[[ 0  1  2  3  4]
  [ 5  6  7  8  9]
  [10 11 12 13 14]
  [15 16 17 18 19]]]
(1, 3)
[[0 1 2]]
(4, 3)
[[0 1 2]
 [0 1 2]
 [0 1 2]
 [0 1 2]]
(1, 4, 3)
[[[0 1 2]
  [0 1 2]
  [0 1 2]
  [0 1 2]]]


In [None]:
N=2
T=4
D=4
H=3
out = np.arange(N*T*D).reshape(N,T,D)
print(out.shape)
h = np.arange(N*H).reshape(N,H)
print(h.shape)
print(h)
hs = np.repeat(h, T, axis=0)
print(hs.shape)
print(hs)
hs = hs.reshape(N, T, H)
print(hs.shape)
print(hs)
# out = np.concatenate((hs, out), axis=2)
# print(out.shape)