# Train a decoder-only transformer (GPT-like) to do addition

I learned a lot from https://github.com/karpathy/minGPT, but I have rewritten all the code based on my own understanding.


In [2]:
import math
from dataclasses import dataclass

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from tqdm import tqdm

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Data processing
### Generate an addition dataset

In [3]:
PLUS_SIGN = 10
MUL_SIGN  = 11
MINUS_SIGN = 12
EQUAL_SIGN = 13
EOS = 14
BOS = 15
PAD = 16
UNK = 17

symbol_to_int_dict = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4,
                      "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
                      "+": PLUS_SIGN, "*": MUL_SIGN, "-": MINUS_SIGN,
                      "=": EQUAL_SIGN,  "<EOS>": EOS, "<BOS>": BOS,
                      "<pad>": PAD, "??": UNK
                      }

int_to_symbol_dict = {y:x for (x,y) in symbol_to_int_dict.items()}
vocab_size = len(symbol_to_int_dict)

def decode_equation(equation):
    '''convert an equation in list format to string format '''
    res = "".join([str(int_to_symbol_dict.get(x, UNK)) for x in equation.tolist()])
    return res.replace("<BOS>", "").replace("<EOS>", "")

def encode_equation(equation, max_ndigits, padQ=True):
    '''convert an equation(up to the equal sign in it) in string format to a list'''
    equal_size_loc = equation.index('=')
    plus_size_loc = equation.index('+')
    num1 = pad_number(equation[0:plus_size_loc], max_ndigits)
    num2 = pad_number(equation[plus_size_loc+1:equal_size_loc], max_ndigits)
    new_equation = num1 + "+" + num2 + "="
    return torch.tensor([BOS]+[symbol_to_int_dict.get(n, UNK) for n in new_equation]).to(DEVICE)


def pad_number(num, max_ndigits)->str:
    s = str(num)
    while len(s)<max_ndigits:
      s = "0"+s
    return s

def create_add_dataset(max_ndigits, dataset_size, padQ=True):
    ''' Function for creating an addition dataset.
    if padQ=True, pre-padding of 0s will be added on the numbers such that all the 
    numbers has the same length max_ndigits, for example, with max_ndigits=3,  
    32 will be represented 032.
    '''
    dataset_str = []
    for i in range(dataset_size):
        num1, num2 = np.random.randint(0, 10**max_ndigits, 2)
        ans = num1 + num2
        if padQ:
            equation = pad_number(num1, max_ndigits) + '+' + pad_number(num2, max_ndigits) + "=" + pad_number(ans, max_ndigits)
        else:
            equation = str(num1) + '+' + str(num2) + "=" + str(ans)
        dataset_str.append(equation)

    dataset = [torch.tensor([BOS]+[symbol_to_int_dict.get(n, UNK) for n in x]+[EOS])
               for x in dataset_str]
    return dataset, dataset_str

print(create_add_dataset(2,20, padQ=False))
print(create_add_dataset(2, 20, padQ=True))

([tensor([15,  4,  9, 10,  7,  8, 13,  1,  2,  7, 14]), tensor([15,  9,  7, 10,  4,  5, 13,  1,  4,  2, 14]), tensor([15,  7, 10,  2,  1, 13,  2,  8, 14]), tensor([15,  6,  6, 10,  7,  4, 13,  1,  4,  0, 14]), tensor([15,  5,  4, 10,  9,  8, 13,  1,  5,  2, 14]), tensor([15,  7,  7, 10,  3,  3, 13,  1,  1,  0, 14]), tensor([15,  4,  0, 10,  2,  5, 13,  6,  5, 14]), tensor([15,  1,  5, 10,  7,  6, 13,  9,  1, 14]), tensor([15,  8,  3, 10,  4,  7, 13,  1,  3,  0, 14]), tensor([15,  2,  0, 10,  4,  9, 13,  6,  9, 14]), tensor([15,  9,  3, 10,  4, 13,  9,  7, 14]), tensor([15,  5, 10,  5, 13,  1,  0, 14]), tensor([15,  4,  2, 10,  6,  0, 13,  1,  0,  2, 14]), tensor([15,  7,  9, 10,  2,  9, 13,  1,  0,  8, 14]), tensor([15,  9, 10,  7,  3, 13,  8,  2, 14]), tensor([15,  9, 10,  8, 13,  1,  7, 14]), tensor([15,  1,  8, 10,  9,  8, 13,  1,  1,  6, 14]), tensor([15,  3,  9, 10,  9,  6, 13,  1,  3,  5, 14]), tensor([15,  1,  5, 10,  6,  9, 13,  8,  4, 14]), tensor([15,  6,  1, 10,  7,  3, 13, 

### Create dataloders for the train, validation and test sets

In [4]:
class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

batch_size = 256
def pad_sequence(batch):
    input_padded = torch.nn.utils.rnn.pad_sequence(batch,
                                batch_first=True, padding_value = PAD)
    return input_padded

@dataclass
class DataLoaders:
    max_ndigits: int
    dataset_size: int
    padQ: bool = True
    val_loader = None
    test_loader = None
    train_loader = None

    def split_data(self, split=[0.7, 0.1, 0.2]):
        if isinstance(split[0], float):
            train_size  = round(self.dataset_size*split[0])
            val_size = round(self.dataset_size*split[1])
            test_size = self.dataset_size - train_size - val_size
        elif isinstance(split[0], int):
            val_size = split[0]
            test_size = split[1]
            train_size  = dataset_size - test_size - val_size


        dataset, _ = create_add_dataset(self.max_ndigits, self.dataset_size, padQ=self.padQ)
        train_set, val_set, test_set = torch.utils.data.random_split(dataset,
                                                             [train_size, val_size, test_size],
                                                    generator=torch.Generator().manual_seed(42) )

        self.train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                           shuffle=True, collate_fn = pad_sequence)
        self.test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size,
                                           shuffle=True, collate_fn=pad_sequence)
        self.val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size,
                                           shuffle=True, collate_fn=pad_sequence)


## GPT model
Here is my implementation of the GPT model, including the multi-headed self-attention module.

In [5]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0 # check the h number
        self.d_k = d_model//h
        self.d_model = d_model
        self.h = h
        self.WQ = nn.Linear(d_model, d_model)
        self.WK = nn.Linear(d_model, d_model)
        self.WV = nn.Linear(d_model, d_model)
        self.linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x_query, x_key, x_value, mask=None):
        nbatch = x_query.size(0) # get batch size
        # 1) Linear projections to get the multi-head query, key and value
        # x_query, x_key, x_value dimension: nbatch * seq_len * d_model
        # LHS query, key, value dimensions: nbatch * h * seq_len * d_k
        query = self.WQ(x_query).view(nbatch, -1, self.h, self.d_k).transpose(1,2)
        key   = self.WK(x_key).view(nbatch, -1, self.h, self.d_k).transpose(1,2)
        value = self.WV(x_value).view(nbatch, -1, self.h, self.d_k).transpose(1,2)
        # 2) Attention
        # scores has dimensions: nbatch * h * seq_len * seq_len
        scores = torch.matmul(query, key.transpose(-2, -1))/math.sqrt(self.d_model)
        # 3) Mask out padding tokens and future tokens
        if mask is not None:
            scores = scores.masked_fill(mask, float('-inf'))
        # p_atten dimensions: nbatch * h * seq_len * seq_len
        p_atten = torch.nn.functional.softmax(scores, dim=-1)
        # x dimensions: nbatch * h * seq_len * d_k
        x = torch.matmul(p_atten, value)
        # x now has dimensions:nbtach * seq_len * d_model
        x = x.transpose(1, 2).contiguous().view(nbatch, -1, self.d_model)

        return self.linear(x) # final linear layer


class ResidualConnection(nn.Module):
  '''residual connection: x + dropout(sublayer(layernorm(x))) '''
  def __init__(self, dim, dropout):
      super().__init__()
      self.drop = nn.Dropout(dropout)
      self.norm = nn.LayerNorm(dim)

  def forward(self, x, sublayer):
      return x + self.drop(sublayer(self.norm(x)))


class Decoder(nn.Module):

    def __init__(self, vocab_size, h, d_embed, max_len, N=4, drop_rate=0.1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_embed)
        self.pos_embed = nn.Embedding(max_len, d_embed)
        self.dropout = nn.Dropout(drop_rate)
        self.decoder_blocks = nn.Sequential(*[DecoderBlock(h, d_embed) for _ in range(N)])
        self.norm = nn.LayerNorm(d_embed)
        self.linear = nn.Linear(d_embed, vocab_size)

    def forward(self, trg, trg_pad_mask):
        pos_embedding = self.pos_embed(torch.tensor(range(trg.size(-1))).to(DEVICE))
        x = self.embed(trg) + pos_embedding
        x = self.dropout(x)
        for layer in self.decoder_blocks:
            x = layer( x, trg_pad_mask)
        x = self.norm(x)
        logits = self.linear(x)
        return logits


class DecoderBlock(nn.Module):
    def __init__(self, h, d_embed, dropout=0.1):
        super().__init__()
        self.atten1 = MultiHeadedAttention(h, d_embed)
        self.atten2 = MultiHeadedAttention(h, d_embed)
        self.ffn = nn.Sequential(
            nn.Linear(d_embed, 4*d_embed),
            nn.GELU(),
            nn.Linear(4*d_embed, d_embed),
            nn.Dropout(dropout)
        )
        self.residual1 = ResidualConnection(d_embed, dropout)
        self.residual2 = ResidualConnection(d_embed, dropout)
        self.residual3 = ResidualConnection(d_embed, dropout)

    def future_mask(self, seq_len):
        '''mask for masking out tokens at future positions'''
        mask = (torch.triu(torch.ones(seq_len, seq_len, requires_grad=False), diagonal=1)!=0).to(DEVICE)
        return mask.view(1, 1, seq_len, seq_len)

    def forward(self,  decoder_layer_input, decoder_pad_mask):
        y = decoder_layer_input
        seq_len = y.size(-2)
        decoder_mask = torch.logical_or(decoder_pad_mask, self.future_mask(seq_len))
        y = self.residual1(y, lambda y: self.atten1(y, y, y, mask=decoder_mask))

        return self.residual3(y, self.ffn)

class GPT(nn.Module):
    def __init__(self, decoder):
        super().__init__()
        self.decoder = decoder

    def forward(self, input, pad_mask):
        return self.decoder(input, pad_mask)

### Let's creat a GPT!

In [6]:
@dataclass
class ModelConfig:
  d_embed: int
  # d_ff is the dimension of the fully-connected layer
  d_ff: int
  # h is the number of attention head
  h: int
  N_decoder: int
  max_len: int
  dropout: float


def make_GPT(config):
    model = GPT(Decoder(vocab_size, config.h, config.d_embed, config.max_len,
                        config.N_decoder)).to(DEVICE)
    # initialize model parameters
    # it seems that this initialization is very important!
    for p in model.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
    return model

## Functions for training and input/output processing

In [7]:
def make_batch_input(x):
        'function for generating model input, target and pad_mask from raw input x'
        input = x[:, :-1].to(DEVICE)
        equal_sign_loc = [(equation==EQUAL_SIGN).nonzero().item() for equation in x]
        target = [torch.cat((torch.tensor([PAD]*equal_sign_loc[i]), x[i][equal_sign_loc[i]+1:])) for i in range(len(x))]
        target = torch.cat(target, 0).contiguous().view(-1).to(DEVICE)
        pad_mask = (input == PAD).view(input.size(0), 1, 1, input.size(-1))
        return input, target, pad_mask

In [8]:
def train_epoch(model, dataloader):
    model.train()
    grad_norm_clip = 1.0
    losses, acc, count = [], 0, 0
    num_batches = len(dataloader)
    pbar = tqdm(enumerate(dataloader), total=num_batches)
    for idx, x  in  pbar:
        optimizer.zero_grad()
        input, target, pad_mask = make_batch_input(x)
        pred = model(input, pad_mask).to(DEVICE)
        pred = pred.view(-1, pred.size(-1))
        loss = loss_fn(pred, target).to(DEVICE)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_norm_clip)
        optimizer.step()
        scheduler.step()
        losses.append(loss.item())
        # report progress
        if idx>0 and idx%50 == 0:
            pbar.set_description(f"ep: {scheduler.last_epoch//num_batches}, train loss={loss.item():.3f},lr={scheduler.get_last_lr()[0]:.5f}")
    return np.mean(losses)

def train(model, dataloaders, epochs):
    best_val_loss = float('inf')
    global early_stop_count
    train_size = len(dataloaders.train_loader)*batch_size
    for ep in range(epochs):
        train_loss = train_epoch(model, dataloaders.train_loader)
        val_loss = validate(model, dataloaders.val_loader)
        print(f'ep {ep}: train_loss: {train_loss:.5f}, val_loss: {val_loss:.5f}')
        if val_loss < best_val_loss:
            best_val_loss = val_loss
        else:
            if scheduler.last_epoch>2*warmup_steps:
                early_stop_count -= 1
                if early_stop_count<=0:
                    #torch.save(model, f'saved_models/{SRC}_to_{TRG}_train_size_{train_size}_model_size_{model_size}.pt')
            #f = open("save_model/{SRC}_to_{TRG}_dataset_size_{dataset_size}.txt", 'w'):
            #f.write()
                    return train_loss, val_loss
    return train_loss, val_loss


def validate(model, dataloder):
    'function for computing the loss on the validation set'
    model.eval()
    losses = []
    with torch.no_grad():
        for i, x in enumerate(dataloder):
            input, target, pad_mask = make_batch_input(x)
            pred = model(input, pad_mask).to(DEVICE)
            pred = pred.view(-1, pred.size(-1))
            losses.append(loss_fn(pred, target).item())
    return np.mean(losses)

In [9]:
@torch.no_grad()
def compute_sum(model, x):
    'Function for computing the sum of two numbers.'
    for i in range(max_ndigits+2):
        pad_mask = (x == PAD).view(1, 1, 1, x.size(-1)).to(DEVICE)
        logits = model(x, pad_mask)
        last_output = logits.argmax(-1)[:,-1].view(1,1)
        x = torch.cat((x, last_output), 1).to(DEVICE)
        if last_output.item() == EOS:
            break
    return x[0]

def evaluate(model, dataloader, num_batch=None):
    '''Function for evaluation the model.
    This function take equations, and truncate them up to the equal-sign, and feed them to the
    model to get the predictions, compare them with the correct answers, and output the accuracy.
    '''
    model.eval()
    acc, count = 0, 0
    num_wrong_to_display = 5
    for idx, x in enumerate(dataloader):
        for equation in x:
            loc_equal_sign = equation.tolist().index(EQUAL_SIGN)
            loc_EOS = equation.tolist().index(EOS)
            input = equation[0:loc_equal_sign+1].view(1, -1).to(DEVICE)
            ans = equation[:loc_EOS+1].tolist()
            ans_pred = compute_sum(model, input)
            count += 1

            if ans == ans_pred.tolist():
                acc +=1
            else:
                if num_wrong_to_display > 0:
                    print(f'correct equation: {decode_equation(equation).replace("<pad>","")}')
                    print(f'predicted:        {decode_equation(ans_pred)}')
                    num_wrong_to_display -= 1
        if num_batch and idx>num_batch:
            break
    return acc/count

def what_is(question:str)->str:
    'function for computing the sum of two numbers with input in literal string format'
    pred = compute_sum(model, encode_equation(question, max_ndigits).view(1,-1))
    pred = decode_equation(pred)
    pred = pred[pred.index("=")+1:]
    return question+pred


## 2-digit addition

In [10]:
max_ndigits = 2
# max_len is determined by 1+ max_ndigits + 1 + max_ndigits + 1 + max_ndigits +1 +1
max_len = 3*max_ndigits + 6
config = ModelConfig(d_embed=128, d_ff=256, h=4, N_decoder=2, max_len= max_len,
                           dropout=0.1)
dataset_size = 10000
data_loaders = DataLoaders(max_ndigits, dataset_size, padQ=True)
data_loaders.split_data(split=[1000, 2000])
train_size = len(data_loaders.train_loader)*batch_size
model = make_GPT(config)
model_size = sum([p.numel() for p in model.parameters()])
print(f'model_size: {model_size}, train_set_size: {train_size}')
warmup_steps = 3*len(data_loaders.train_loader)
# lr first increases in the warmup steps, and then descreases
lr_fn = lambda step: config.d_embed**(-0.5) * min([(step+1)**(-0.5), (step+1)*warmup_steps**(-1.5)])
optimizer = torch.optim.Adam(model.parameters(), lr=0.2, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_fn)
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD)
early_stop_count = 10 # Setting early_stop_count to a large number, that is, I'm not implementing early_stop here

train_loss, val_loss = train(model, data_loaders, epochs=30)


model_size: 535570, train_set_size: 7168


100%|██████████| 28/28 [00:00<00:00, 38.40it/s]


ep 0: train_loss: 2.08677, val_loss: 1.45922


100%|██████████| 28/28 [00:00<00:00, 49.18it/s]


ep 1: train_loss: 1.37419, val_loss: 1.21162


100%|██████████| 28/28 [00:00<00:00, 50.86it/s]


ep 2: train_loss: 1.19457, val_loss: 1.10961


100%|██████████| 28/28 [00:00<00:00, 49.60it/s]


ep 3: train_loss: 1.06580, val_loss: 0.90669


100%|██████████| 28/28 [00:00<00:00, 51.10it/s]


ep 4: train_loss: 0.93853, val_loss: 0.84697


100%|██████████| 28/28 [00:00<00:00, 48.83it/s]


ep 5: train_loss: 0.88191, val_loss: 0.80651


100%|██████████| 28/28 [00:00<00:00, 50.16it/s]


ep 6: train_loss: 0.83542, val_loss: 0.76162


100%|██████████| 28/28 [00:00<00:00, 49.32it/s]


ep 7: train_loss: 0.71996, val_loss: 0.49947


100%|██████████| 28/28 [00:00<00:00, 48.98it/s]


ep 8: train_loss: 0.51262, val_loss: 0.28390


100%|██████████| 28/28 [00:00<00:00, 49.56it/s]


ep 9: train_loss: 0.28996, val_loss: 0.09254


100%|██████████| 28/28 [00:00<00:00, 48.65it/s]


ep 10: train_loss: 0.14584, val_loss: 0.04050


100%|██████████| 28/28 [00:00<00:00, 49.15it/s]


ep 11: train_loss: 0.08972, val_loss: 0.02177


100%|██████████| 28/28 [00:00<00:00, 48.09it/s]


ep 12: train_loss: 0.05284, val_loss: 0.01480


100%|██████████| 28/28 [00:00<00:00, 49.77it/s]


ep 13: train_loss: 0.03300, val_loss: 0.01189


100%|██████████| 28/28 [00:00<00:00, 50.15it/s]


ep 14: train_loss: 0.02405, val_loss: 0.00869


100%|██████████| 28/28 [00:00<00:00, 50.22it/s]


ep 15: train_loss: 0.01790, val_loss: 0.00767


100%|██████████| 28/28 [00:00<00:00, 50.21it/s]


ep 16: train_loss: 0.01319, val_loss: 0.00147


100%|██████████| 28/28 [00:00<00:00, 51.15it/s]


ep 17: train_loss: 0.01326, val_loss: 0.00359


100%|██████████| 28/28 [00:00<00:00, 48.09it/s]


ep 18: train_loss: 0.01185, val_loss: 0.00208


100%|██████████| 28/28 [00:00<00:00, 50.71it/s]


ep 19: train_loss: 0.00792, val_loss: 0.00277


100%|██████████| 28/28 [00:00<00:00, 50.44it/s]


ep 20: train_loss: 0.00556, val_loss: 0.00295


100%|██████████| 28/28 [00:00<00:00, 52.44it/s]


ep 21: train_loss: 0.00670, val_loss: 0.00063


100%|██████████| 28/28 [00:00<00:00, 49.56it/s]


ep 22: train_loss: 0.00831, val_loss: 0.00057


100%|██████████| 28/28 [00:00<00:00, 49.95it/s]


ep 23: train_loss: 0.00627, val_loss: 0.00050


100%|██████████| 28/28 [00:00<00:00, 47.47it/s]


ep 24: train_loss: 0.00443, val_loss: 0.00095


100%|██████████| 28/28 [00:00<00:00, 50.45it/s]


ep 25: train_loss: 0.00398, val_loss: 0.00024


100%|██████████| 28/28 [00:00<00:00, 48.30it/s]


ep 26: train_loss: 0.00273, val_loss: 0.00004


100%|██████████| 28/28 [00:00<00:00, 50.61it/s]


ep 27: train_loss: 0.00409, val_loss: 0.00176


100%|██████████| 28/28 [00:00<00:00, 48.10it/s]


ep 28: train_loss: 0.00368, val_loss: 0.00007


100%|██████████| 28/28 [00:00<00:00, 48.43it/s]

ep 29: train_loss: 0.00388, val_loss: 0.00079





In [11]:
test_loss = validate(model, data_loaders.test_loader)
print('training set examples the model gives an incorrect result:')
train_acc = evaluate(model, data_loaders.train_loader, 20)
print('validataion set examples the model gives an incorrect result:')
val_acc = evaluate(model, data_loaders.test_loader)
print('test set examples the model gives an incorrect result:')
test_acc = evaluate(model, data_loaders.test_loader)
current_result = f'''train_size: {train_size}, train_loss: {train_loss},
                val_loss: {val_loss}, test_loss: {test_loss},
                test_acc: {test_acc}, val_acc: {val_acc}, train_acc: {train_acc}
                '''
print(current_result)

training set examples the model gives an incorrect result:
correct equation: 09+09=18
predicted:        09+09=16
correct equation: 19+00=19
predicted:        19+00=10
correct equation: 09+10=19
predicted:        09+10=109
validataion set examples the model gives an incorrect result:
correct equation: 19+00=19
predicted:        19+00=10
test set examples the model gives an incorrect result:
correct equation: 19+00=19
predicted:        19+00=10
train_size: 7168, train_loss: 0.003877029294796687,
                val_loss: 0.0007883256266723038, test_loss: 0.0004133981838094769,
                test_acc: 0.9995, val_acc: 0.9995, train_acc: 0.9994673295454546
                


## 5-digit addition 
<!-- and scaling laws
For 5-digit addition, there are 10<sup>10</sup> possible data points, so we will have enough data to study the scaling laws. For example, we can study how the performance of the model (with fixed number of parameters) improves as we increase the training set size.   -->

In [12]:
max_ndigits = 5
# max_len is determined by 1+ max_ndigits + 1 + max_ndigits + 1 + max_ndigits +1 +1
max_len = 3*max_ndigits + 6
config = ModelConfig(d_embed=128, d_ff=256, h=4, N_decoder=2, max_len= max_len,
                           dropout=0.1)

dataset_size = 100000
data_loaders = DataLoaders(max_ndigits, dataset_size, padQ=True)
data_loaders.split_data(split=[10000, 20000])
train_size = len(data_loaders.train_loader)*batch_size
model = make_GPT(config)
model_size = sum([p.numel() for p in model.parameters()])
print(f'model_size: {model_size}, train_set_size: {train_size}')
warmup_steps = 3*len(data_loaders.train_loader)
# lr first increases in the warmup steps, and then descreases
lr_fn = lambda step: config.d_embed**(-0.5) * min([(step+1)**(-0.5), (step+1)*warmup_steps**(-1.5)])
optimizer = torch.optim.Adam(model.parameters(), lr=0.2, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_fn)
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD)
early_stop_count = 10 # Setting early_stop_count to a large number, that is, I'm not implementing early_stop here

train_loss, val_loss = train(model, data_loaders, epochs=30)


model_size: 536722, train_set_size: 70144


ep: 0, train loss=1.814,lr=0.00019: 100%|██████████| 274/274 [00:06<00:00, 42.44it/s]


ep 0: train_loss: 2.18202, val_loss: 1.75309


ep: 1, train loss=1.580,lr=0.00039: 100%|██████████| 274/274 [00:06<00:00, 42.34it/s]


ep 1: train_loss: 1.67661, val_loss: 1.46803


ep: 2, train loss=0.869,lr=0.00060: 100%|██████████| 274/274 [00:06<00:00, 42.45it/s]


ep 2: train_loss: 1.11583, val_loss: 0.77282


ep: 3, train loss=0.425,lr=0.00054: 100%|██████████| 274/274 [00:06<00:00, 42.48it/s]


ep 3: train_loss: 0.62263, val_loss: 0.28733


ep: 4, train loss=0.265,lr=0.00048: 100%|██████████| 274/274 [00:06<00:00, 42.32it/s]


ep 4: train_loss: 0.31493, val_loss: 0.21190


ep: 5, train loss=0.228,lr=0.00044: 100%|██████████| 274/274 [00:06<00:00, 42.87it/s]


ep 5: train_loss: 0.24592, val_loss: 0.18460


ep: 6, train loss=0.214,lr=0.00041: 100%|██████████| 274/274 [00:06<00:00, 42.74it/s]


ep 6: train_loss: 0.20722, val_loss: 0.14593


ep: 7, train loss=0.141,lr=0.00038: 100%|██████████| 274/274 [00:06<00:00, 42.69it/s]


ep 7: train_loss: 0.17520, val_loss: 0.11357


ep: 8, train loss=0.121,lr=0.00036: 100%|██████████| 274/274 [00:06<00:00, 42.38it/s]


ep 8: train_loss: 0.14217, val_loss: 0.08053


ep: 9, train loss=0.097,lr=0.00034: 100%|██████████| 274/274 [00:06<00:00, 41.67it/s]


ep 9: train_loss: 0.11092, val_loss: 0.04565


ep: 10, train loss=0.074,lr=0.00032: 100%|██████████| 274/274 [00:06<00:00, 42.15it/s]


ep 10: train_loss: 0.08129, val_loss: 0.02878


ep: 11, train loss=0.055,lr=0.00031: 100%|██████████| 274/274 [00:06<00:00, 41.67it/s]


ep 11: train_loss: 0.06424, val_loss: 0.02101


ep: 12, train loss=0.043,lr=0.00030: 100%|██████████| 274/274 [00:06<00:00, 41.44it/s]


ep 12: train_loss: 0.05352, val_loss: 0.01927


ep: 13, train loss=0.039,lr=0.00029: 100%|██████████| 274/274 [00:06<00:00, 41.41it/s]


ep 13: train_loss: 0.04568, val_loss: 0.01477


ep: 14, train loss=0.030,lr=0.00028: 100%|██████████| 274/274 [00:06<00:00, 42.11it/s]


ep 14: train_loss: 0.03856, val_loss: 0.01376


ep: 15, train loss=0.047,lr=0.00027: 100%|██████████| 274/274 [00:06<00:00, 42.46it/s]


ep 15: train_loss: 0.03354, val_loss: 0.01147


ep: 16, train loss=0.043,lr=0.00026: 100%|██████████| 274/274 [00:06<00:00, 42.37it/s]


ep 16: train_loss: 0.02981, val_loss: 0.01097


ep: 17, train loss=0.034,lr=0.00025: 100%|██████████| 274/274 [00:06<00:00, 42.57it/s]


ep 17: train_loss: 0.02661, val_loss: 0.00887


ep: 18, train loss=0.030,lr=0.00025: 100%|██████████| 274/274 [00:06<00:00, 42.51it/s]


ep 18: train_loss: 0.02475, val_loss: 0.00821


ep: 19, train loss=0.018,lr=0.00024: 100%|██████████| 274/274 [00:06<00:00, 42.41it/s]


ep 19: train_loss: 0.02194, val_loss: 0.00830


ep: 20, train loss=0.021,lr=0.00023: 100%|██████████| 274/274 [00:06<00:00, 42.41it/s]


ep 20: train_loss: 0.02112, val_loss: 0.00789


ep: 21, train loss=0.024,lr=0.00023: 100%|██████████| 274/274 [00:06<00:00, 42.76it/s]


ep 21: train_loss: 0.01965, val_loss: 0.00774


ep: 22, train loss=0.016,lr=0.00022: 100%|██████████| 274/274 [00:06<00:00, 42.65it/s]


ep 22: train_loss: 0.01800, val_loss: 0.00703


ep: 23, train loss=0.015,lr=0.00022: 100%|██████████| 274/274 [00:06<00:00, 42.89it/s]


ep 23: train_loss: 0.01712, val_loss: 0.00635


ep: 24, train loss=0.015,lr=0.00021: 100%|██████████| 274/274 [00:06<00:00, 42.70it/s]


ep 24: train_loss: 0.01637, val_loss: 0.00637


ep: 25, train loss=0.018,lr=0.00021: 100%|██████████| 274/274 [00:06<00:00, 42.74it/s]


ep 25: train_loss: 0.01552, val_loss: 0.00598


ep: 26, train loss=0.020,lr=0.00021: 100%|██████████| 274/274 [00:06<00:00, 41.38it/s]


ep 26: train_loss: 0.01490, val_loss: 0.00619


ep: 27, train loss=0.018,lr=0.00020: 100%|██████████| 274/274 [00:06<00:00, 42.23it/s]


ep 27: train_loss: 0.01415, val_loss: 0.00571


ep: 28, train loss=0.018,lr=0.00020: 100%|██████████| 274/274 [00:06<00:00, 41.40it/s]


ep 28: train_loss: 0.01363, val_loss: 0.00689


ep: 29, train loss=0.024,lr=0.00020: 100%|██████████| 274/274 [00:06<00:00, 42.16it/s]


ep 29: train_loss: 0.01355, val_loss: 0.00567


In [13]:
test_loss = validate(model, data_loaders.test_loader)
print('training set examples the model gives an incorrect result:')
train_acc = evaluate(model, data_loaders.train_loader, 20)
print('validataion set examples the model gives an incorrect result:')
val_acc = evaluate(model, data_loaders.test_loader)
print('test set examples the model gives an incorrect result:')
test_acc = evaluate(model, data_loaders.test_loader)
current_result = f'''train_size: {train_size}, train_loss: {train_loss},
                val_loss: {val_loss}, test_loss: {test_loss},
                test_acc: {test_acc}, val_acc: {val_acc}, train_acc: {train_acc}
                '''
print(current_result)

training set examples the model gives an incorrect result:
correct equation: 36244+68755=104999
predicted:        36244+68755=105999
correct equation: 13653+00497=14150
predicted:        13653+00497=14050
correct equation: 50263+13737=64000
predicted:        50263+13737=64900
correct equation: 33117+04884=38001
predicted:        33117+04884=38901
correct equation: 04469+06649=11118
predicted:        04469+06649=11018
validataion set examples the model gives an incorrect result:
correct equation: 22566+52033=74599
predicted:        22566+52033=74699
correct equation: 17208+42391=59599
predicted:        17208+42391=59699
correct equation: 11594+01964=13558
predicted:        11594+01964=13458
correct equation: 64352+20748=85100
predicted:        64352+20748=85000
correct equation: 05543+10259=15802
predicted:        05543+10259=15702
test set examples the model gives an incorrect result:
correct equation: 42114+48986=91100
predicted:        42114+48986=91000
correct equation: 07537+10087=

# 10-digit addition

In [14]:
max_ndigits = 10
# max_len is determined by 1+ max_ndigits + 1 + max_ndigits + 1 + max_ndigits +1 +1
max_len = 3*max_ndigits + 6
config = ModelConfig(d_embed=128, d_ff=256, h=4, N_decoder=2, max_len= max_len,
                           dropout=0.1)

dataset_size = 100000
data_loaders = DataLoaders(max_ndigits, dataset_size, padQ=True)
data_loaders.split_data(split=[10000, 20000])
train_size = len(data_loaders.train_loader)*batch_size
model = make_GPT(config)
model_size = sum([p.numel() for p in model.parameters()])
print(f'model_size: {model_size}, train_set_size: {train_size}')
warmup_steps = 3*len(data_loaders.train_loader)
# lr first increases in the warmup steps, and then descreases
lr_fn = lambda step: config.d_embed**(-0.5) * min([(step+1)**(-0.5), (step+1)*warmup_steps**(-1.5)])
optimizer = torch.optim.Adam(model.parameters(), lr=0.2, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_fn)
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD)
early_stop_count = 10 # Setting early_stop_count to a large number, that is, I'm not implementing early_stop here

train_loss, val_loss = train(model, data_loaders, epochs=30)


model_size: 538642, train_set_size: 70144


ep: 0, train loss=2.071,lr=0.00019: 100%|██████████| 274/274 [00:06<00:00, 40.83it/s]


ep 0: train_loss: 2.39638, val_loss: 2.03156


ep: 1, train loss=1.918,lr=0.00039: 100%|██████████| 274/274 [00:06<00:00, 40.83it/s]


ep 1: train_loss: 1.98073, val_loss: 1.87417


ep: 2, train loss=1.859,lr=0.00060: 100%|██████████| 274/274 [00:06<00:00, 40.97it/s]


ep 2: train_loss: 1.86913, val_loss: 1.82903


ep: 3, train loss=1.295,lr=0.00054: 100%|██████████| 274/274 [00:06<00:00, 40.29it/s]


ep 3: train_loss: 1.55404, val_loss: 1.20452


ep: 4, train loss=0.885,lr=0.00048: 100%|██████████| 274/274 [00:06<00:00, 40.44it/s]


ep 4: train_loss: 1.00704, val_loss: 0.76884


ep: 5, train loss=0.599,lr=0.00044: 100%|██████████| 274/274 [00:06<00:00, 39.96it/s]


ep 5: train_loss: 0.71838, val_loss: 0.45213


ep: 6, train loss=0.347,lr=0.00041: 100%|██████████| 274/274 [00:06<00:00, 40.99it/s]


ep 6: train_loss: 0.43668, val_loss: 0.22178


ep: 7, train loss=0.256,lr=0.00038: 100%|██████████| 274/274 [00:06<00:00, 40.19it/s]


ep 7: train_loss: 0.28587, val_loss: 0.14779


ep: 8, train loss=0.186,lr=0.00036: 100%|██████████| 274/274 [00:06<00:00, 40.49it/s]


ep 8: train_loss: 0.21654, val_loss: 0.11472


ep: 9, train loss=0.172,lr=0.00034: 100%|██████████| 274/274 [00:06<00:00, 40.80it/s]


ep 9: train_loss: 0.17819, val_loss: 0.09592


ep: 10, train loss=0.135,lr=0.00032: 100%|██████████| 274/274 [00:06<00:00, 40.34it/s]


ep 10: train_loss: 0.15577, val_loss: 0.07628


ep: 11, train loss=0.127,lr=0.00031: 100%|██████████| 274/274 [00:06<00:00, 40.61it/s]


ep 11: train_loss: 0.13799, val_loss: 0.06478


ep: 12, train loss=0.104,lr=0.00030: 100%|██████████| 274/274 [00:06<00:00, 40.78it/s]


ep 12: train_loss: 0.12080, val_loss: 0.05944


ep: 13, train loss=0.094,lr=0.00029: 100%|██████████| 274/274 [00:06<00:00, 40.75it/s]


ep 13: train_loss: 0.10662, val_loss: 0.04405


ep: 14, train loss=0.079,lr=0.00028: 100%|██████████| 274/274 [00:06<00:00, 40.43it/s]


ep 14: train_loss: 0.09135, val_loss: 0.02853


ep: 15, train loss=0.068,lr=0.00027: 100%|██████████| 274/274 [00:06<00:00, 40.58it/s]


ep 15: train_loss: 0.07558, val_loss: 0.02282


ep: 16, train loss=0.055,lr=0.00026: 100%|██████████| 274/274 [00:06<00:00, 40.53it/s]


ep 16: train_loss: 0.06361, val_loss: 0.01646


ep: 17, train loss=0.061,lr=0.00025: 100%|██████████| 274/274 [00:06<00:00, 40.73it/s]


ep 17: train_loss: 0.05339, val_loss: 0.01214


ep: 18, train loss=0.033,lr=0.00025: 100%|██████████| 274/274 [00:06<00:00, 40.60it/s]


ep 18: train_loss: 0.04383, val_loss: 0.00939


ep: 19, train loss=0.034,lr=0.00024: 100%|██████████| 274/274 [00:06<00:00, 40.87it/s]


ep 19: train_loss: 0.03760, val_loss: 0.00838


ep: 20, train loss=0.029,lr=0.00023: 100%|██████████| 274/274 [00:06<00:00, 40.71it/s]


ep 20: train_loss: 0.03244, val_loss: 0.00670


ep: 21, train loss=0.024,lr=0.00023: 100%|██████████| 274/274 [00:06<00:00, 40.80it/s]


ep 21: train_loss: 0.02837, val_loss: 0.00551


ep: 22, train loss=0.020,lr=0.00022: 100%|██████████| 274/274 [00:06<00:00, 40.87it/s]


ep 22: train_loss: 0.02580, val_loss: 0.00462


ep: 23, train loss=0.020,lr=0.00022: 100%|██████████| 274/274 [00:06<00:00, 40.48it/s]


ep 23: train_loss: 0.02276, val_loss: 0.00408


ep: 24, train loss=0.012,lr=0.00021: 100%|██████████| 274/274 [00:06<00:00, 40.91it/s]


ep 24: train_loss: 0.02092, val_loss: 0.00346


ep: 25, train loss=0.020,lr=0.00021: 100%|██████████| 274/274 [00:06<00:00, 40.69it/s]


ep 25: train_loss: 0.01939, val_loss: 0.00361


ep: 26, train loss=0.017,lr=0.00021: 100%|██████████| 274/274 [00:06<00:00, 40.73it/s]


ep 26: train_loss: 0.01773, val_loss: 0.00292


ep: 27, train loss=0.020,lr=0.00020: 100%|██████████| 274/274 [00:06<00:00, 40.68it/s]


ep 27: train_loss: 0.01665, val_loss: 0.00288


ep: 28, train loss=0.018,lr=0.00020: 100%|██████████| 274/274 [00:06<00:00, 40.79it/s]


ep 28: train_loss: 0.01523, val_loss: 0.00296


ep: 29, train loss=0.013,lr=0.00020: 100%|██████████| 274/274 [00:06<00:00, 40.70it/s]


ep 29: train_loss: 0.01442, val_loss: 0.00251


In [15]:
test_loss = validate(model, data_loaders.test_loader)
print('training set examples the model gives an incorrect result:')
train_acc = evaluate(model, data_loaders.train_loader, 20)
print('validataion set examples the model gives an incorrect result:')
val_acc = evaluate(model, data_loaders.test_loader)
print('test set examples the model gives an incorrect result:')
test_acc = evaluate(model, data_loaders.test_loader)
current_result = f'''train_size: {train_size}, train_loss: {train_loss},
                val_loss: {val_loss}, test_loss: {test_loss},
                test_acc: {test_acc}, val_acc: {val_acc}, train_acc: {train_acc}
                '''
print(current_result)

training set examples the model gives an incorrect result:
correct equation: 7468304691+0498295053=7966599744
predicted:        7468304691+0498295053=7966699744
correct equation: 1590708383+0049128964=1639837347
predicted:        1590708383+0049128964=1649837347
correct equation: 5194315098+0451784052=5646099150
predicted:        5194315098+0451784052=5646199150
correct equation: 2108145275+5358904716=7467049991
predicted:        2108145275+5358904716=7467050991
correct equation: 3729454603+4687952393=8417406996
predicted:        3729454603+4687952393=8417407996
validataion set examples the model gives an incorrect result:
correct equation: 5312000192+0444699376=5756699568
predicted:        5312000192+0444699376=5756799568
correct equation: 6904736627+4050463343=10955199970
predicted:        6904736627+4050463343=10955299970
correct equation: 0334270341+7341929505=7676199846
predicted:        0334270341+7341929505=7676299846
correct equation: 3115063951+2261932871=5376996822
predicted:

# 30-digit addition

In [17]:
max_ndigits = 18
# max_len is determined by 1+ max_ndigits + 1 + max_ndigits + 1 + max_ndigits +1 +1
max_len = 3*max_ndigits + 6
config = ModelConfig(d_embed=128, d_ff=256, h=4, N_decoder=2, max_len= max_len,
                           dropout=0.1)

dataset_size = 100000
data_loaders = DataLoaders(max_ndigits, dataset_size, padQ=True)
data_loaders.split_data(split=[10000, 20000])
train_size = len(data_loaders.train_loader)*batch_size
model = make_GPT(config)
model_size = sum([p.numel() for p in model.parameters()])
print(f'model_size: {model_size}, train_set_size: {train_size}')
warmup_steps = 3*len(data_loaders.train_loader)
# lr first increases in the warmup steps, and then descreases
lr_fn = lambda step: config.d_embed**(-0.5) * min([(step+1)**(-0.5), (step+1)*warmup_steps**(-1.5)])
optimizer = torch.optim.Adam(model.parameters(), lr=0.2, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_fn)
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD)
early_stop_count = 10 # Setting early_stop_count to a large number, that is, I'm not implementing early_stop here

train_loss, val_loss = train(model, data_loaders, epochs=30)


model_size: 541714, train_set_size: 70144


ep: 0, train loss=2.207,lr=0.00019: 100%|██████████| 274/274 [00:09<00:00, 30.42it/s]


ep 0: train_loss: 2.47354, val_loss: 2.17260


ep: 1, train loss=2.100,lr=0.00039: 100%|██████████| 274/274 [00:09<00:00, 30.43it/s]


ep 1: train_loss: 2.13935, val_loss: 2.08373


ep: 2, train loss=2.047,lr=0.00060: 100%|██████████| 274/274 [00:08<00:00, 30.46it/s]


ep 2: train_loss: 2.06390, val_loss: 2.03688


ep: 3, train loss=1.969,lr=0.00054: 100%|██████████| 274/274 [00:09<00:00, 30.43it/s]


ep 3: train_loss: 2.01141, val_loss: 1.94423


ep: 4, train loss=1.929,lr=0.00048: 100%|██████████| 274/274 [00:09<00:00, 30.43it/s]


ep 4: train_loss: 1.94031, val_loss: 1.90746


ep: 5, train loss=1.771,lr=0.00044: 100%|██████████| 274/274 [00:09<00:00, 30.33it/s]


ep 5: train_loss: 1.86322, val_loss: 1.62660


ep: 6, train loss=1.424,lr=0.00041: 100%|██████████| 274/274 [00:09<00:00, 30.30it/s]


ep 6: train_loss: 1.52318, val_loss: 1.35436


ep: 7, train loss=0.992,lr=0.00038: 100%|██████████| 274/274 [00:08<00:00, 30.45it/s]


ep 7: train_loss: 1.16957, val_loss: 0.90167


ep: 8, train loss=0.813,lr=0.00036: 100%|██████████| 274/274 [00:08<00:00, 30.48it/s]


ep 8: train_loss: 0.89238, val_loss: 0.65972


ep: 9, train loss=0.709,lr=0.00034: 100%|██████████| 274/274 [00:09<00:00, 30.33it/s]


ep 9: train_loss: 0.73878, val_loss: 0.59970


ep: 10, train loss=0.663,lr=0.00032: 100%|██████████| 274/274 [00:08<00:00, 30.49it/s]


ep 10: train_loss: 0.67888, val_loss: 0.56699


ep: 11, train loss=0.641,lr=0.00031: 100%|██████████| 274/274 [00:08<00:00, 30.46it/s]


ep 11: train_loss: 0.64554, val_loss: 0.55776


ep: 12, train loss=0.615,lr=0.00030: 100%|██████████| 274/274 [00:09<00:00, 30.40it/s]


ep 12: train_loss: 0.62287, val_loss: 0.55029


ep: 13, train loss=0.613,lr=0.00029: 100%|██████████| 274/274 [00:09<00:00, 30.39it/s]


ep 13: train_loss: 0.60661, val_loss: 0.54972


ep: 14, train loss=0.579,lr=0.00028: 100%|██████████| 274/274 [00:09<00:00, 30.37it/s]


ep 14: train_loss: 0.59472, val_loss: 0.52087


ep: 15, train loss=0.519,lr=0.00027: 100%|██████████| 274/274 [00:09<00:00, 30.41it/s]


ep 15: train_loss: 0.56482, val_loss: 0.45223


ep: 16, train loss=0.487,lr=0.00026: 100%|██████████| 274/274 [00:09<00:00, 30.27it/s]


ep 16: train_loss: 0.49745, val_loss: 0.42000


ep: 17, train loss=0.453,lr=0.00025: 100%|██████████| 274/274 [00:09<00:00, 30.38it/s]


ep 17: train_loss: 0.47427, val_loss: 0.40193


ep: 18, train loss=0.453,lr=0.00025: 100%|██████████| 274/274 [00:08<00:00, 30.45it/s]


ep 18: train_loss: 0.45598, val_loss: 0.39469


ep: 19, train loss=0.427,lr=0.00024: 100%|██████████| 274/274 [00:09<00:00, 30.24it/s]


ep 19: train_loss: 0.44702, val_loss: 0.38294


ep: 20, train loss=0.366,lr=0.00023: 100%|██████████| 274/274 [00:08<00:00, 30.46it/s]


ep 20: train_loss: 0.38175, val_loss: 0.30720


ep: 21, train loss=0.337,lr=0.00023: 100%|██████████| 274/274 [00:08<00:00, 30.45it/s]


ep 21: train_loss: 0.34885, val_loss: 0.27962


ep: 22, train loss=0.309,lr=0.00022: 100%|██████████| 274/274 [00:09<00:00, 30.32it/s]


ep 22: train_loss: 0.32068, val_loss: 0.26575


ep: 23, train loss=0.311,lr=0.00022: 100%|██████████| 274/274 [00:09<00:00, 30.25it/s]


ep 23: train_loss: 0.30826, val_loss: 0.25557


ep: 24, train loss=0.294,lr=0.00021: 100%|██████████| 274/274 [00:08<00:00, 30.51it/s]


ep 24: train_loss: 0.30131, val_loss: 0.25781


ep: 25, train loss=0.288,lr=0.00021: 100%|██████████| 274/274 [00:08<00:00, 30.45it/s]


ep 25: train_loss: 0.29493, val_loss: 0.24835


ep: 26, train loss=0.299,lr=0.00021: 100%|██████████| 274/274 [00:08<00:00, 30.52it/s]


ep 26: train_loss: 0.28961, val_loss: 0.24591


ep: 27, train loss=0.286,lr=0.00020: 100%|██████████| 274/274 [00:09<00:00, 30.39it/s]


ep 27: train_loss: 0.28261, val_loss: 0.23941


ep: 28, train loss=0.271,lr=0.00020: 100%|██████████| 274/274 [00:08<00:00, 30.50it/s]


ep 28: train_loss: 0.27736, val_loss: 0.23904


ep: 29, train loss=0.269,lr=0.00020: 100%|██████████| 274/274 [00:09<00:00, 30.34it/s]


ep 29: train_loss: 0.27249, val_loss: 0.22960


In [18]:
test_loss = validate(model, data_loaders.test_loader)
print('training set examples the model gives an incorrect result:')
train_acc = evaluate(model, data_loaders.train_loader, 20)
print('validataion set examples the model gives an incorrect result:')
val_acc = evaluate(model, data_loaders.test_loader)
print('test set examples the model gives an incorrect result:')
test_acc = evaluate(model, data_loaders.test_loader)
current_result = f'''train_size: {train_size}, train_loss: {train_loss},
                val_loss: {val_loss}, test_loss: {test_loss},
                test_acc: {test_acc}, val_acc: {val_acc}, train_acc: {train_acc}
                '''
print(current_result)

training set examples the model gives an incorrect result:
correct equation: 263630793199099197+351592269863556326=615223063062655523
predicted:        263630793199099197+351592269863556326=615222063062655523
correct equation: 042020288389593022+134950025848508040=176970314238101062
predicted:        042020288389593022+134950025848508040=176960314238101062
correct equation: 462120657002111403+236865303396658690=698985960398770093
predicted:        462120657002111403+236865303396658690=698985950398770093
correct equation: 672081886656355429+704570885523208255=1376652772179563684
predicted:        672081886656355429+704570885523208255=1376657722179563584
correct equation: 100762936102229398+485330265878666867=586093201980896265
predicted:        100762936102229398+485330265878666867=586093201971896265
validataion set examples the model gives an incorrect result:
correct equation: 468935121562377068+621520603948631902=1090455725511008970
predicted:        468935121562377068+62152060394863