<a href="https://colab.research.google.com/github/junanda/machine-learning/blob/main/Sequence_to_Sequence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Sequence to Sequence for performing number addition**


### **Setup**

In [1]:
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

# Dataset and model parameters.
TRAINING_SIZE = 50000  # number of distinct addition questions to generate
DIGITS = 3  # each operand is built from at most this many digits
REVERSE = True  # when True, query strings are reversed before encoding

# A query has the form 'int+int' (e.g., '345+678'): two operands of at most
# DIGITS characters each, joined by a single '+' sign.
MAX_LENGTH = 2 * DIGITS + 1

## **Prepare data before training**

In [2]:
class CharacterTable:
    """Bidirectional mapping between a set of characters and one-hot rows.

    Given a set of characters, this class can:
    + Encode a string into a one-hot matrix (one row per character).
    + Decode a one-hot matrix or a sequence of integer indices back to a string.
    + Decode a matrix of per-character probabilities to the most likely string.
    """

    def __init__(self, chars):
        """Build the lookup tables from the unique, sorted characters in `chars`."""
        self.charts = sorted(set(chars))
        self.chart_indices = {symbol: pos for pos, symbol in enumerate(self.charts)}
        self.indices_char = {pos: symbol for pos, symbol in enumerate(self.charts)}

    def encode(self, C, num_rows):
        """One-hot encode the string `C` into a (num_rows, vocabulary size) array.

        Row i has a single 1 in the column of character C[i]; rows beyond
        len(C) stay all zeros.
        """
        one_hot = np.zeros((num_rows, len(self.charts)))

        for row, symbol in enumerate(C):
            column = self.chart_indices[symbol]
            one_hot[row, column] = 1

        return one_hot

    def decode(self, x, calc_argmax=True):
        """Turn `x` back into a string.

        With calc_argmax=True, `x` holds one row of scores per character and
        the highest-scoring column of each row is used. With
        calc_argmax=False, `x` is already a sequence of integer indices.
        """
        indices = x.argmax(axis=-1) if calc_argmax else x
        return "".join(self.indices_char[index] for index in indices)

## **Generating Data**

In [3]:
# Vocabulary: the ten digits, the plus sign, and a space used for padding.
chars = "0123456789+ "
ctable = CharacterTable(chars)

question = []
expected = []
seen = set()
print("Generating data")

# Operands range over 0 .. 10**DIGITS - 1 and 'a+b' / 'b+a' count as the same
# question, so only a finite number of distinct questions exists. Without this
# check the loop below would spin forever once they are exhausted
# (e.g. DIGITS = 2 yields only 5050 distinct questions).
num_values = 10 ** DIGITS
max_unique = num_values * (num_values + 1) // 2
if TRAINING_SIZE > max_unique:
    raise ValueError(
        "TRAINING_SIZE={} exceeds the {} distinct questions available "
        "for DIGITS={}".format(TRAINING_SIZE, max_unique, DIGITS)
    )


def f():
    """Return a random integer built from 1 to DIGITS random decimal digits.

    Leading zeros collapse during the int() conversion, so e.g. '007' yields 7.
    """
    return int(
        "".join(
            np.random.choice(list("0123456789"))
            for i in range(np.random.randint(1, DIGITS + 1))
        )
    )


while len(question) < TRAINING_SIZE:
    a, b = f(), f()
    # Skip any addition question we've already seen.
    # Also skip any such that x+y == y+x (hence the sorting).
    key = tuple(sorted((a, b)))
    if key in seen:
        continue
    seen.add(key)
    # Pad the query with spaces such that it is always MAX_LENGTH long.
    q = "{}+{}".format(a, b)
    query = q + " " * (MAX_LENGTH - len(q))
    ans = str(a + b)

    # The answer can be at most DIGITS + 1 characters long.
    ans += " " * (DIGITS + 1 - len(ans))
    if REVERSE:
        # Reverse the query, e.g., '12+345 ' becomes ' 543+21'. (Note the
        # space used for padding.)
        query = query[::-1]

    question.append(query)
    expected.append(ans)

print("Total Questions: ", len(question))

Generating data
Total Questions:  50000


In [4]:
# Peek at the first ten generated queries. Each is space-padded to MAX_LENGTH
# characters and, when REVERSE is True, stored back-to-front.
question[:10]

['   4+94',
 '  138+6',
 '  1+055',
 ' 24+492',
 '  606+4',
 '   53+8',
 '  428+9',
 '318+288',
 '  497+0',
 '  165+5']

In [5]:
print("Vectorize")
# One-hot tensors: (sample, character position, vocabulary index).
# The builtin `bool` is used as dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
x = np.zeros((len(question), MAX_LENGTH, len(chars)), dtype=bool)
y = np.zeros((len(question), DIGITS + 1, len(chars)), dtype=bool)

for i, sentence in enumerate(question):
    x[i] = ctable.encode(sentence, MAX_LENGTH)

for i, sentence in enumerate(expected):
    y[i] = ctable.encode(sentence, DIGITS + 1)

# Shuffle (x, y) in unison with a shared index permutation so every query
# stays aligned with its answer.
indices = np.arange(len(y))
np.random.shuffle(indices)

x = x[indices]
y = y[indices]

# Explicitly set apart the last 10% as validation data that we never train on.
split_at = len(x) - len(x) // 10
(x_train, x_val) = x[:split_at], x[split_at:]
(y_train, y_val) = y[:split_at], y[split_at:]

print("Training Data:")
print(x_train.shape)
print(y_train.shape)

print("Validation Data:")
print(x_val.shape)
print(y_val.shape)

Vectorize
Training Data:
(45000, 7, 12)
(45000, 4, 12)
Validation Data:
(5000, 7, 12)
(5000, 4, 12)


## **Build Model**

In [6]:
print("build Model")
# Number of stacked LSTM layers in the decoder.
num_layers = 1

model = keras.Sequential()
# Encode the input sequence using an LSTM, producing an output of size 128.
# Note: in a situation where your input sequences have a variable length,
# use input_shape=(None, num_feature).
model.add(layers.LSTM(128, input_shape=(MAX_LENGTH, len(chars))))
# As the decoder RNN's input, repeatedly provide the last output of the encoder for each time step.
# Repeat 'DIGITS + 1' times as that's the maximum length of output, e.g., when DIGITS = 3, max output is 999+999=1998.
model.add(layers.RepeatVector(DIGITS + 1))
# The decoder RNN could be multiple layers stacked or a single layer.
for _ in range(num_layers):
    # By setting return_sequences to True, return not only the last output but
    # all the outputs so far in the form of (num_samples, timesteps,
    # output_dim). This is necessary because the Dense layer below has to
    # produce one prediction per output timestep.
    model.add(layers.LSTM(128, return_sequences=True))

# For every output timestep, emit a softmax distribution over the len(chars)
# characters of the vocabulary.
model.add(layers.Dense(len(chars), activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

build Model
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 128)               72192     
_________________________________________________________________
repeat_vector (RepeatVector) (None, 4, 128)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 4, 128)            131584    
_________________________________________________________________
dense (Dense)                (None, 4, 12)             1548      
Total params: 205,324
Trainable params: 205,324
Non-trainable params: 0
_________________________________________________________________


## **Train Model**

In [9]:
epochs = 30
batch_size = 32

# Train one epoch at a time so sample predictions can be inspected after each
# pass. range(1, epochs + 1) runs exactly `epochs` iterations numbered from 1;
# range(1, epochs) would stop one iteration short.
for epoch in range(1, epochs + 1):
    print()
    print("Iteration, ", epoch)
    model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=1,
        validation_data=(x_val, y_val),
    )
    # Select 10 samples from the validation set at random so we can visualize errors.
    for i in range(10):
        ind = np.random.randint(0, len(x_val))
        # Index with a length-1 array so the leading batch dimension is kept.
        rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
        preds = np.argmax(model.predict(rowx), axis=-1)
        q = ctable.decode(rowx[0])

        correct = ctable.decode(rowy[0])
        # preds already holds integer indices, so no second argmax is needed.
        guess = ctable.decode(preds[0], calc_argmax=False)
        # Undo the input reversal so the question prints in natural order.
        print("Q", q[::-1] if REVERSE else q, end=" ")
        print("T", correct, end=" ")

        # "Benar" / "Salah" are Indonesian for "correct" / "wrong".
        if correct == guess:
            print("Benar " + guess)
        else:
            print("Salah " + guess)


Iteration,  1
Q 57+533  T 590  Benar 590 
Q 934+84  T 1018 Benar 1018
Q 871+3   T 874  Benar 874 
Q 10+725  T 735  Benar 735 
Q 643+551 T 1194 Benar 1194
Q 39+125  T 164  Benar 164 
Q 12+592  T 604  Benar 604 
Q 60+90   T 150  Benar 150 
Q 580+3   T 583  Benar 583 
Q 385+61  T 446  Benar 446 

Iteration,  2
Q 83+5    T 88   Benar 88  
Q 929+488 T 1417 Benar 1417
Q 257+11  T 268  Benar 268 
Q 73+235  T 308  Benar 308 
Q 688+38  T 726  Benar 726 
Q 446+42  T 488  Benar 488 
Q 285+168 T 453  Benar 453 
Q 182+3   T 185  Benar 185 
Q 5+413   T 418  Benar 418 
Q 34+850  T 884  Benar 884 

Iteration,  3
Q 58+31   T 89   Benar 89  
Q 846+376 T 1222 Benar 1222
Q 607+5   T 612  Benar 612 
Q 648+46  T 694  Benar 694 
Q 199+400 T 599  Benar 599 
Q 338+9   T 347  Benar 347 
Q 93+491  T 584  Benar 584 
Q 60+301  T 361  Benar 361 
Q 8+953   T 961  Benar 961 
Q 38+589  T 627  Benar 627 

Iteration,  4
Q 147+78  T 225  Benar 225 
Q 831+770 T 1601 Benar 1601
Q 623+797 T 1420 Benar 1420
Q 110+722 T 832 