In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from mxnet import nd, autograd, gluon
import mxnet as mx
from mxnet.gluon import nn, rnn

In [2]:
def n(digits =3):
    """Draw a random non-negative integer with at most ``digits`` digits.

    The digit count is sampled from 1..digits (inclusive), every digit
    character is sampled from '0'-'9', and the resulting string is converted
    with ``int`` -- so leading zeros collapse (e.g. '007' becomes 7).
    """
    how_many = np.random.randint(1, digits + 1)
    drawn = [np.random.choice(list('0123456789')) for _ in range(how_many)]
    return int(''.join(drawn))

def padding(chars, maxlen):
    """Right-pad ``chars`` with spaces until it is ``maxlen`` characters long.

    A string that is already ``maxlen`` characters or longer is returned
    unchanged (it is never truncated).
    """
    return chars.ljust(maxlen)

# Fixed seed so the generated problems and the train/validation split are
# identical after "Restart & Run All".
SEED = 0
np.random.seed(SEED)

N = 50000
N_train = int(N * .9)
N_validation = N - N_train

digits = 3
# Longest question: 'S' + <digits> + '+' + <digits> + 'E'.
input_digits = digits * 2 + 3
# Longest answer: 'S' + <digits + 1 characters of the sum> + 'E'.
output_digits = digits + 3

added = set()      # operand pairs already used (order-insensitive)
questions = []     # encoder input, e.g. 'S12+345E '
answers = []       # decoder input (teacher forcing), e.g. 'S357E '
answers_y = []     # decoder target, i.e. the answer shifted left by one: '357E  '

while len(questions) < N:
    a, b = n(digits), n(digits)
    # a+b and b+a count as the same problem, so key on the sorted pair.
    pair = tuple(sorted((a, b)))
    if pair in added:
        continue

    question = padding('S{}+{}E'.format(a, b), input_digits)
    answer = padding('S' + str(a + b) + 'E', output_digits)
    answer_y = padding(str(a + b) + 'E', output_digits)

    added.add(pair)
    questions.append(question)
    answers.append(answer)
    answers_y.append(answer_y)

# Vocabulary: digits, '+', start/end markers and the space used for padding.
chars = '0123456789+SE '
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# One-hot tensors of shape (sample, time step, vocabulary).
# np.int_ is the concrete dtype that the abstract np.integer resolved to;
# passing np.integer itself as a dtype is deprecated in NumPy.
X = np.zeros((len(questions), input_digits, len(chars)), dtype=np.int_)
Y = np.zeros((len(questions), output_digits, len(chars)), dtype=np.int_)
Z = np.zeros((len(questions), output_digits, len(chars)), dtype=np.int_)

for i in range(N):
    for t, char in enumerate(questions[i]):
        X[i, t, char_indices[char]] = 1
    for t, char in enumerate(answers[i]):
        Y[i, t, char_indices[char]] = 1
    for t, char in enumerate(answers_y[i]):
        Z[i, t, char_indices[char]] = 1

X_train, X_validation, Y_train, Y_validation, Z_train, Z_validation = \
    train_test_split(X, Y, Z, train_size=N_train, random_state=SEED)



### Testset

In [3]:
def gen_n_test(N):
    """Create ``N`` random addition problems for a quick sanity check.

    Returns a tuple ``(questions, sums)`` of two equally long lists:
    ``questions`` holds strings such as '12+345' (no start/end markers, no
    padding) and ``sums`` holds the matching results as strings.
    """
    problems, sums = [], []
    for _ in range(N):
        left = n()
        right = n()
        problems.append('{}+{}'.format(left, right))
        sums.append(str(left + right))
    return problems, sums

In [4]:
class colors:
    """ANSI escape sequences used to colour the sample-prediction printout."""

    # Bright green: printed before a correct prediction mark.
    ok = '\033[92m'
    # Bright red: printed before a wrong prediction mark.
    fail = '\033[91m'
    # Reset: restores the terminal's default attributes.
    close = '\033[0m'

### LSTM vs LSTMCell

  * LSTM과 LSTMCell은 서로 다른 형태의 parameter를 지님
  * LSTMCell은 1번의 time step을 도는 것을 가정, LSTM은 주어진 timestep 모두를 도는 것으로 가정
  
  * LSTM의 state parameter는 (num_layer, batch_size, n_hidden)/ LSTMCell의 state parameter는 (batch_size, n_hidden)을 지님
  * LSTM의 output은 모든 time step의 hidden state값을 포함하지만, LSTM의 state는 각 layer의 마지막 time step 값만 포함함. LSTMCell의 state는 1 time step만큼의 hidden state값만 포함
  * LSTMCell의 unroll을 쓰면 LSTM처럼 주어진 time step 모두를 한 번에 돌 수 있음
  * Encoder에 LSTM을 쓰고, Decoder에 LSTMCell을 쓰는 경우에는 state parameter에 유의해야 함
      * Encoder에 LSTM을 쓰는 이유는 stacking을 위해서이고, Decoder에 LSTMCell을 쓰는 이유는 generation을 위해서임.
      

In [5]:
class calculator(gluon.Block):
    """Sequence-to-sequence adder: stacked LSTM encoder + LSTMCell decoder.

    The encoder reads the one-hot encoded question; the hidden/cell state of
    its top layer initialises the decoder, which is stepped one character at
    a time. Decoder outputs go through batch normalisation and a dense layer
    that produces one score per vocabulary entry.

    Parameters
    ----------
    n_hidden : int
        Hidden size shared by the encoder and the decoder.
    in_seq_len : int
        Length of the (padded) question sequence. Stored only.
    out_seq_len : int
        Number of decoder steps, i.e. length of the (padded) answer.
    vocab_size : int
        Number of distinct characters; size of the dense output.
    enc_layer : int
        Number of stacked encoder LSTM layers.
    dec_layer : int
        Accepted for interface compatibility; not used (the decoder is a
        single LSTMCell).
    """

    def __init__(self, n_hidden, in_seq_len, out_seq_len, vocab_size, enc_layer, dec_layer = 1, **kwargs):
        super(calculator, self).__init__(**kwargs)
        self.in_seq_len = in_seq_len
        self.out_seq_len = out_seq_len
        self.n_hidden = n_hidden
        self.vocab_size = vocab_size
        self.enc_layer = enc_layer

        with self.name_scope():
            self.encoder = rnn.LSTM(hidden_size = n_hidden, num_layers = enc_layer, layout = 'NTC')
            self.decoder = rnn.LSTMCell(hidden_size = n_hidden)
            self.batchnorm = nn.BatchNorm(axis = 2)
            self.dense = nn.Dense(self.vocab_size, flatten = False)

    def _encode(self, inputs):
        """Run the encoder and return (h, c) of its top layer, each batch_size * n_hidden."""
        # The initial state is created on the same device as the input rather
        # than on a module-level ``ctx`` that may not exist or may differ.
        begin_state = self.encoder.begin_state(batch_size = inputs.shape[0], ctx = inputs.context)
        _, (h, c) = self.encoder(inputs, begin_state)  # h, c: n_layer * batch_size * n_hidden
        # Index -1 selects the top layer for any encoder depth (equals the
        # former hard-coded index 1 when enc_layer == 2).
        return h[-1], c[-1]

    def forward(self, inputs, outputs):
        """Teacher-forced pass used for training and loss evaluation.

        ``inputs`` is the one-hot question batch and ``outputs`` the one-hot
        decoder-input batch (the answer starting with 'S'). Returns the
        per-step vocabulary scores with shape
        (batch_size, out_seq_len, vocab_size).
        """
        self.batch_size = inputs.shape[0]
        next_h, next_c = self._encode(inputs)

        step_outputs = []
        for i in range(self.out_seq_len):
            # Feed the ground-truth character of step i to the decoder.
            deout, (next_h, next_c) = self.decoder(outputs[:, i, :], [next_h, next_c])
            step_outputs.append(deout)

        # (batch, T * n_hidden) -> (batch, T, n_hidden)
        deouts = nd.concat(*step_outputs, dim = 1)
        deouts = nd.reshape(deouts, (-1, self.out_seq_len, self.n_hidden))
        deouts = self.batchnorm(deouts)
        deouts_fc = self.dense(deouts)
        return deouts_fc

    def calculation(self, input_str, char_indices, indices_char, input_digits = 9, lchars = 14, ctx = mx.gpu()):
        """Greedily decode the answer to a single question such as '12+345'.

        Parameters
        ----------
        input_str : str
            Question without the 'S'/'E' markers.
        char_indices, indices_char : dict
            Character -> index and index -> character vocabularies.
        input_digits : int
            Padded question length the encoder input is built with.
        lchars : int
            Vocabulary size used for the one-hot input vectors.
        ctx : mxnet Context
            Device on which the input tensors are created.

        Returns
        -------
        str
            The predicted characters with the end marker and surrounding
            whitespace stripped.

        Raises
        ------
        ValueError
            If the question (including markers) is longer than ``input_digits``.
        """
        input_str = 'S' + input_str + 'E'
        if len(input_str) > input_digits:
            raise ValueError('question {!r} is longer than input_digits={}'.format(input_str, input_digits))
        # The training data pads questions with ' ' and one-hot encodes those
        # pad characters; do the same here so the encoder sees the same kind
        # of input at inference time instead of all-zero vectors.
        if ' ' in char_indices:
            input_str = input_str.ljust(input_digits)

        X = nd.zeros((1, input_digits, lchars), ctx = ctx)
        for t, char in enumerate(input_str):
            X[0, t, char_indices[char]] = 1

        # The decoder starts from the one-hot start marker 'S'.
        Y_init = nd.zeros((1, lchars), ctx = ctx)
        Y_init[0, char_indices['S']] = 1

        next_h, next_c = self._encode(X)
        deout = Y_init
        ret_seq = ''

        for i in range(self.out_seq_len):
            deout, (next_h, next_c) = self.decoder(deout, [next_h, next_c])
            # BatchNorm normalises axis 2, so add and then drop a time axis.
            deout = nd.expand_dims(deout, axis = 1)
            deout = self.batchnorm(deout)
            deout = deout[:, 0, :]

            deout_sm = self.dense(deout)
            # argmax over the raw scores equals argmax over their softmax.
            best = nd.argmax(deout_sm, axis = 1)
            # The predicted character becomes the next decoder input.
            deout = nd.one_hot(best, depth = self.vocab_size)
            ret_seq += indices_char[int(best.asnumpy()[0])]

            # Stop at the end marker or at the first padding character.
            if ret_seq[-1] == ' ' or ret_seq[-1] == 'E':
                break
        return ret_seq.strip('E').strip()
        

In [6]:
# Training split: (question, decoder input, decoder target) triples served in
# shuffled mini-batches of 256.
tr_set = gluon.data.ArrayDataset(X_train, Y_train, Z_train)
tr_data_iterator = gluon.data.DataLoader(
    tr_set,
    batch_size=256,
    shuffle=True,
)

# Validation split, batched the same way.
te_set = gluon.data.ArrayDataset(X_validation, Y_validation, Z_validation)
te_data_iterator = gluon.data.DataLoader(
    te_set,
    batch_size=256,
    shuffle=True,
)

In [7]:
# Device used for the parameters and for every mini-batch.
ctx = mx.gpu()

# Sequence lengths and vocabulary size come from the data-preparation cell
# instead of being repeated as literals, so the network always matches the
# encoded tensors.
model = calculator(
    n_hidden = 300,
    in_seq_len = input_digits,
    out_seq_len = output_digits,
    vocab_size = len(chars),
    enc_layer = 2,
)
model.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

trainer = gluon.Trainer(model.collect_params(), 'rmsprop')
# Targets are one-hot (sparse_label = False); the class axis is the last one.
loss = gluon.loss.SoftmaxCrossEntropyLoss(axis = 2, sparse_label = False)

In [8]:
# Print the block's child layers as a quick structural check.
print(model)

calculator(
  (encoder): LSTM(None -> 300, NTC, num_layers=2)
  (decoder): LSTMCell(None -> 1200)
  (batchnorm): BatchNorm(axis=2, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=None)
  (dense): Dense(None -> 14, linear)
)


In [9]:
def calculate_loss(model, data_iter, loss_obj, ctx = ctx):
    """Average the per-batch mean loss of ``model`` over ``data_iter``.

    Each item of ``data_iter`` is an (x, y, z) triple: ``model(x, y)`` is
    compared against ``z`` with ``loss_obj``. Batches are moved to ``ctx``
    and cast to float32, and the model runs in prediction mode. The result
    is the mean of the batch means.
    """
    batch_means = []
    for x_batch, y_batch, z_batch in data_iter:
        x_batch, y_batch, z_batch = (
            arr.as_in_context(ctx).astype('float32')
            for arr in (x_batch, y_batch, z_batch)
        )
        with autograd.predict_mode():
            scores = model(x_batch, y_batch)
            per_sample = loss_obj(scores, z_batch)
        batch_means.append(nd.mean(per_sample).asscalar())
    return np.mean(batch_means)

In [10]:
epochs = 201

tot_test_loss = []    # validation loss per epoch
tot_train_loss = []   # mean training loss per epoch
for e in range(epochs):
    train_loss = []
    for x_data, y_data, z_data in tr_data_iterator:
        x_data = x_data.as_in_context(ctx).astype('float32')
        y_data = y_data.as_in_context(ctx).astype('float32')
        z_data = z_data.as_in_context(ctx).astype('float32')

        with autograd.record():
            z_output = model(x_data, y_data)
            loss_ = loss(z_output, z_data)
        loss_.backward()
        # Normalise the gradient by the actual size of this batch.
        trainer.step(x_data.shape[0])
        train_loss.append(nd.mean(loss_).asscalar())

    # Every 10th epoch, decode a few freshly generated problems as a spot check.
    if e % 10 == 0:
        q, y = gen_n_test(10)
        with autograd.predict_mode():
            for question, expected in zip(q, y):
                # Pass the notebook's own settings explicitly instead of relying
                # on calculation()'s hard-coded defaults (9, 14, mx.gpu()).
                p = model.calculation(question, char_indices, indices_char,
                                      input_digits = input_digits,
                                      lchars = len(chars),
                                      ctx = ctx)
                iscorr = 1 if p == expected else 0
                if iscorr == 1:
                    print(colors.ok + '☑' + colors.close, end=' ')
                else:
                    print(colors.fail + '☒' + colors.close, end=' ')
                print("{} = {}({}) 1/0 {}".format(question, p, expected, str(iscorr) ))

    # Calculate the validation loss and log both losses for this epoch.
    test_loss = calculate_loss(model, te_data_iterator, loss_obj = loss, ctx=ctx)
    mean_train_loss = np.mean(train_loss)

    print("Epoch %s. Train Loss: %s, Test Loss : %s" % (e, mean_train_loss, test_loss))
    tot_test_loss.append(test_loss)
    tot_train_loss.append(mean_train_loss)


[91m☒[0m 87+8 = 102(95) 1/0 0
[91m☒[0m 45+0 = 524(45) 1/0 0
[91m☒[0m 28+3 = 138(31) 1/0 0
[91m☒[0m 0+50 = 102(50) 1/0 0
[91m☒[0m 265+84 = 663(349) 1/0 0
[91m☒[0m 40+948 = 1020(988) 1/0 0
[91m☒[0m 27+1 = 124(28) 1/0 0
[91m☒[0m 37+623 = 738(660) 1/0 0
[91m☒[0m 8+2 = 102(10) 1/0 0
[91m☒[0m 7+0 = 102(7) 1/0 0
Epoch 0. Train Loss: 1.1953808, Test Loss : 1.1294714
Epoch 1. Train Loss: 1.12513, Test Loss : 1.1277721
Epoch 2. Train Loss: 1.104025, Test Loss : 1.0783864
Epoch 3. Train Loss: 1.0531863, Test Loss : 0.99584687
Epoch 4. Train Loss: 0.93822825, Test Loss : 0.8749555
Epoch 5. Train Loss: 0.8466254, Test Loss : 0.815833
Epoch 6. Train Loss: 0.76491624, Test Loss : 0.7248025
Epoch 7. Train Loss: 0.67228866, Test Loss : 0.6715063
Epoch 8. Train Loss: 0.5836759, Test Loss : 0.5467197
Epoch 9. Train Loss: 0.48728725, Test Loss : 0.4945925
[91m☒[0m 6+57 = 64(63) 1/0 0
[91m☒[0m 745+656 = 1412(1401) 1/0 0
[91m☒[0m 84+0 = 96(84) 1/0 0
[91m☒[0m 5+15 = 199(20) 1/0 0


Epoch 88. Train Loss: 7.532723e-06, Test Loss : 0.0018578371
Epoch 89. Train Loss: 7.2334424e-06, Test Loss : 0.001878264
[92m☑[0m 28+39 = 67(67) 1/0 1
[92m☑[0m 90+51 = 141(141) 1/0 1
[92m☑[0m 799+0 = 799(799) 1/0 1
[92m☑[0m 12+28 = 40(40) 1/0 1
[92m☑[0m 11+7 = 18(18) 1/0 1
[92m☑[0m 55+88 = 143(143) 1/0 1
[92m☑[0m 10+5 = 15(15) 1/0 1
[92m☑[0m 9+369 = 378(378) 1/0 1
[92m☑[0m 167+1 = 168(168) 1/0 1
[92m☑[0m 500+26 = 526(526) 1/0 1
Epoch 90. Train Loss: 7.1406625e-06, Test Loss : 0.0018877139
Epoch 91. Train Loss: 6.967133e-06, Test Loss : 0.0018468521
Epoch 92. Train Loss: 6.725182e-06, Test Loss : 0.0018626777
Epoch 93. Train Loss: 6.4540463e-06, Test Loss : 0.0018544985
Epoch 94. Train Loss: 6.3536236e-06, Test Loss : 0.0018264944
Epoch 95. Train Loss: 6.224529e-06, Test Loss : 0.0018367938
Epoch 96. Train Loss: 6.0399834e-06, Test Loss : 0.0018573438
Epoch 97. Train Loss: 5.9027107e-06, Test Loss : 0.0018553918
Epoch 98. Train Loss: 5.714528e-06, Test Loss : 0.0018

Epoch 172. Train Loss: 2.1420558e-06, Test Loss : 0.0018470923
Epoch 173. Train Loss: 2.0954724e-06, Test Loss : 0.001825155
Epoch 174. Train Loss: 2.1623584e-06, Test Loss : 0.001800246
Epoch 175. Train Loss: 2.1547653e-06, Test Loss : 0.0018029095
Epoch 176. Train Loss: 2.1017363e-06, Test Loss : 0.0018171541
Epoch 177. Train Loss: 2.0645996e-06, Test Loss : 0.0018283458
Epoch 178. Train Loss: 2.0615996e-06, Test Loss : 0.001770859
Epoch 179. Train Loss: 2.0395282e-06, Test Loss : 0.0018300212
[92m☑[0m 454+27 = 481(481) 1/0 1
[91m☒[0m 0+7 = 8(7) 1/0 0
[92m☑[0m 6+8 = 14(14) 1/0 1
[92m☑[0m 69+100 = 169(169) 1/0 1
[92m☑[0m 58+3 = 61(61) 1/0 1
[92m☑[0m 2+197 = 199(199) 1/0 1
[92m☑[0m 3+72 = 75(75) 1/0 1
[91m☒[0m 589+2 = 691(591) 1/0 0
[92m☑[0m 785+32 = 817(817) 1/0 1
[92m☑[0m 2+981 = 983(983) 1/0 1
Epoch 180. Train Loss: 2.0728671e-06, Test Loss : 0.0018110892
Epoch 181. Train Loss: 2.0122313e-06, Test Loss : 0.0021244537
Epoch 182. Train Loss: 2.0436748e-06, Test Los