## Arithmetic as a language

### Load Module

In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

Using device: cuda


### Load data

In [3]:
df_train = pd.read_csv('./dataset/arithmetic_train.csv')
df_eval = pd.read_csv('./dataset/arithmetic_eval.csv')

df_train = df_train.drop('hash', axis=1)
df_eval = df_eval.drop('hash', axis=1)
df_train.head(5)

Unnamed: 0,src,tgt
0,14*(43+20)=,882
1,(6+1)*5=,35
2,13+32+29=,74
3,31*(3-11)=,-248
4,24*49+1=,1177


In [4]:
# transform the input data to string
df_train['tgt'] = df_train['tgt'].apply(lambda x: str(x))
df_train['src'] = df_train['src'].add(df_train['tgt'])
df_train['len'] = df_train['src'].apply(lambda x: len(x))

df_eval['tgt'] = df_eval['tgt'].apply(lambda x: str(x))
df_eval['src'] = df_eval['src'].add(df_eval['tgt'])
df_eval['len'] = df_eval['src'].apply(lambda x: len(x))

df_train.head(5)

Unnamed: 0,src,tgt,len
0,14*(43+20)=882,882,14
1,(6+1)*5=35,35,10
2,13+32+29=74,74,11
3,31*(3-11)=-248,-248,14
4,24*49+1=1177,1177,12


### TODO 1

### Build Dictionary

- The model cannot perform calculations directly with plain text.
- Convert all text(numbers/symbols) into numerical representations.
- Special tokens
  - '\<pad>'
    - Each sentence within a batch may have different lengths.
    - The length is padded with '<pad>' to match the longest sentence in the batch
  - '\<eos>'
     -  Specifies the end of the generated sequence.
     -  Without '<eos>', the model will not know when to stop generating.

In [5]:
# Build a dictionary and give every token in the train dataset an id
# The dictionary should contain <eos> and <pad>
# char_to_id is to convert charactors to ids, while id_to_char is the opposite
char_to_id = {}
id_to_char = {}
tokens = ['<pad>', '<eos>', '+', '-', '*', '=', '(', ')']

for idx, token in enumerate(tokens):
    char_to_id[token] = idx
    id_to_char[idx] = token

for i in range(10):
    char_to_id[str(i)] = i + len(tokens)
    id_to_char[i + len(tokens)] = str(i)

vocab_size = len(char_to_id)
print('Vocab size: {}'.format(vocab_size))
print("char_to_id:", char_to_id)
print("id_to_char", id_to_char)

Vocab size: 18
char_to_id: {'<pad>': 0, '<eos>': 1, '+': 2, '-': 3, '*': 4, '=': 5, '(': 6, ')': 7, '0': 8, '1': 9, '2': 10, '3': 11, '4': 12, '5': 13, '6': 14, '7': 15, '8': 16, '9': 17}
id_to_char {0: '<pad>', 1: '<eos>', 2: '+', 3: '-', 4: '*', 5: '=', 6: '(', 7: ')', 8: '0', 9: '1', 10: '2', 11: '3', 12: '4', 13: '5', 14: '6', 15: '7', 16: '8', 17: '9'}


### TODO 2

### Data Preprocessing

- The data is processed into the format required for the model's input and output.
- Example" 1+2+3=0
  - Model input: 1 + 2 - 3 = 0
  - Model output: / / / / / 0 \<eos> (the "/" can be replaced with \<pad>)
  - The key for the model's output is that the model does not need to predict the next character of the previous part.
  - What matters is that once the model sees '=', it should start generating the answer, which is '0'.
  - After generating the answer, it should also generate \<eos>.

In [6]:
# Data preprocessing
char_id_list = []
label_id_list = []

for src in df_train['src']:
    id_list = []
    label_list = []
    answer_flag = 0 # record whether '=' appeared
    for token in list(src):
        id_list.append(char_to_id[token])
        if(token == '='): 
            answer_flag = 1
            continue
        if(answer_flag): label_list.append(char_to_id[token]) # append answer if '=' appeard
        else: label_list.append(char_to_id['<pad>']) # append '<pad>' before '=' 
    # id_list.append(char_to_id['<eos>'])
    label_list.append(char_to_id['<eos>'])
    
    char_id_list.append(id_list)
    label_id_list.append(label_list)
    
print("src:", df_train['src'][0])
print('char_id_list:', char_id_list[0], "length:", len(char_id_list[0]))
print('label_id_list:', label_id_list[0], "length:", len(label_id_list[0]))

src: 14*(43+20)=882
char_id_list: [9, 12, 4, 6, 12, 11, 2, 10, 8, 7, 5, 16, 16, 10] length: 14
label_id_list: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 10, 1] length: 14


In [7]:
df_train['char_id_list'] = char_id_list
df_train['label_id_list'] = label_id_list

df_train.head(5)

Unnamed: 0,src,tgt,len,char_id_list,label_id_list
0,14*(43+20)=882,882,14,"[9, 12, 4, 6, 12, 11, 2, 10, 8, 7, 5, 16, 16, 10]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 10, 1]"
1,(6+1)*5=35,35,10,"[6, 14, 2, 9, 7, 4, 13, 5, 11, 13]","[0, 0, 0, 0, 0, 0, 0, 11, 13, 1]"
2,13+32+29=74,74,11,"[9, 11, 2, 11, 10, 2, 10, 17, 5, 15, 12]","[0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 1]"
3,31*(3-11)=-248,-248,14,"[11, 9, 4, 6, 11, 3, 9, 9, 7, 5, 3, 10, 12, 16]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 10, 12, 16, 1]"
4,24*49+1=1177,1177,12,"[10, 12, 4, 12, 17, 2, 9, 5, 9, 9, 15, 15]","[0, 0, 0, 0, 0, 0, 0, 9, 9, 15, 15, 1]"


### TODO 3

### Data Batching

- Use `torch.utils.data.Dataset` to create a data generation tool called dataset.
- Then, use `torch.utils.data.DataLoader` to randomly sample from the dataset and group the samples into batches.

In [8]:
import torch.utils

class Dataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        # return how much data in the Dataset object
        return len(self.data)

    def __getitem__(self, index):
        # Extract the input data x and the ground truth y from the data
        x = torch.tensor(self.data['char_id_list'][index], dtype=torch.long)
        y = torch.tensor(self.data['label_id_list'][index], dtype=torch.long)
        return x, y

In [9]:
# padding function
# this cell is referenced to Chat-GPT
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    x, y = zip(*batch)
    x = pad_sequence(x, batch_first=True, padding_value=char_to_id['<pad>'])
    y = pad_sequence(y, batch_first=True, padding_value=char_to_id['<pad>'])
    return x, y

### TODO 4

### Generator

- The `start_char` is fed into the model.
- Each time a sequence is input into the model, it generates a prediction for the next token.
- The prediction for the next token corresponds to the last element in the model's output sequence.
- When the output is '\<eos>', the generation should be stopped.

### Model

In [54]:
class MathRNN(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(MathRNN, self).__init__()
        
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embed_dim,
                                            padding_idx=char_to_id['<pad>'])
        self.lstm = torch.nn.LSTM(input_size=embed_dim,
                                  hidden_size=hidden_dim,
                                  batch_first=True)
        self.linear = torch.nn.Sequential(
            torch.nn.Linear(in_features=hidden_dim, out_features= hidden_dim // 2),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=hidden_dim // 2, out_features=vocab_size),
            # torch.nn.Softmax(dim=-1) # cross entropy loss will do this by itself
        )
    
    def forward(self, x):
        embed = self.embedding(x)
        output, hidden = self.lstm(embed) # (batch_size, sequence_length, hidden_dim)
        logits = self.linear(output) # (batch_size, sequence_length, vocab_size)
        
        return logits
    
    def generator(self, start_char, max_len=200):
        char_list = [char_to_id[c] for c in start_char]
        
        next_char = None
        
        while len(char_list) < max_len:
            # Pack the char_list to tensor
            # Input the tensor to the embedding layer, LSTM layers, linear respectively
            # write your code here
            in_tensor = torch.tensor(char_list).to(device)
            
            y = self.forward(in_tensor) # Obtain the next token prediction y
            
            next_char = torch.argmax(y, dim=-1)[-1].item() # Use argmax function to get the next token prediction
                    
            if next_char == char_to_id['<eos>']: break
            
            char_list.append(next_char)
                
        return [id_to_char[ch_id] for ch_id in char_list]

### Hyper-parameters

In [66]:
from torch.utils.data import DataLoader

embed_dim = 32
hidden_dim = 128
lr = 1e-4
batch_size = 4096

dataset = Dataset(df_train)
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

model = MathRNN(vocab_size=vocab_size, embed_dim=embed_dim, hidden_dim=hidden_dim)
optimier = torch.optim.Adam(model.parameters(), lr=lr)
cross_entropy_loss = torch.nn.CrossEntropyLoss(ignore_index=char_to_id['<pad>'])

### TODO 5

### Training

In [67]:
epochs = 10
store_epoch = 5

# load checkpoint
model = torch.load('./checkpoints/model_10.pt')

model.to(device)

MathRNN(
  (embedding): Embedding(18, 32, padding_idx=0)
  (lstm): LSTM(32, 128, batch_first=True)
  (linear): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=18, bias=True)
  )
)

In [68]:
# This cell is referenced to Chat-GPT
for epoch in range(epochs):
    for x_batch, y_batch in tqdm(data_loader):
        # get data
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        
        # Forward
        pred = model(x_batch)

        # Backward
        loss = cross_entropy_loss(pred.view(-1, vocab_size), y_batch.view(-1))
        optimier.zero_grad()
        loss.backward()
        optimier.step()
        
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")
    if((epoch + 1) % store_epoch == 0):
        torch.save(model, f'./checkpoints/model_{epoch+1}.pt')

100%|██████████| 579/579 [03:07<00:00,  3.08it/s]


Epoch 1, Loss: 0.5692586302757263


100%|██████████| 579/579 [03:09<00:00,  3.06it/s]


Epoch 2, Loss: 0.5741397142410278


100%|██████████| 579/579 [03:05<00:00,  3.12it/s]


Epoch 3, Loss: 0.5830414295196533


100%|██████████| 579/579 [03:05<00:00,  3.13it/s]


Epoch 4, Loss: 0.5845175981521606


100%|██████████| 579/579 [03:03<00:00,  3.15it/s]


Epoch 5, Loss: 0.5705891251564026


100%|██████████| 579/579 [03:05<00:00,  3.12it/s]


Epoch 6, Loss: 0.5755837559700012


100%|██████████| 579/579 [03:03<00:00,  3.15it/s]


Epoch 7, Loss: 0.5891082286834717


100%|██████████| 579/579 [03:06<00:00,  3.11it/s]


Epoch 8, Loss: 0.5744043588638306


100%|██████████| 579/579 [03:04<00:00,  3.14it/s]


Epoch 9, Loss: 0.5860238671302795


100%|██████████| 579/579 [03:06<00:00,  3.10it/s]

Epoch 10, Loss: 0.5737103223800659





In [59]:
# store model checkpoint
torch.save(model, './checkpoints/model_10.pt')

### TODO 6

### Evaluation

In [60]:
# Data preprocessing
char_eval_list = []
label_id_eval_list = []

for src in df_eval['src']:
    char_list = []
    label_list = []
    answer_flag = 0 # record whether '=' appeared
    for token in list(src):
        if(answer_flag == 0):
            char_list.append(token) # append input before '=' (include)
        if(token == '='): 
            answer_flag = 1
            continue
        if(answer_flag): label_list.append(char_to_id[token]) # append answer if '=' appeard
        else: label_list.append(char_to_id['<pad>']) # append '<pad>' before '=' 
    
    char_eval_list.append(char_list)
    label_id_eval_list.append(label_list)
    
print("src:", df_eval['src'][0])
print('char_id_list:', char_eval_list[0], "length:", len(char_eval_list[0]))
print('label_id_list:', label_id_eval_list[0], "length:", len(label_id_eval_list[0]))

src: 48+43+34=125
char_id_list: ['4', '8', '+', '4', '3', '+', '3', '4', '='] length: 9
label_id_list: [0, 0, 0, 0, 0, 0, 0, 0, 9, 10, 13] length: 11


In [62]:
# Evaluation
ac_count = 0
total_count = len(label_id_eval_list)
model.eval()

for x, y in tqdm(zip(char_eval_list, label_id_eval_list)):
    pred = model.generator(x)
    if(len(pred) != len(y)): continue
    ac_flag = 1
    for idx, label_char in enumerate(y):
        if(label_char == char_to_id['<pad>']): continue
        else:
            if(pred[idx] != label_char):
                ac_flag = 0
                break
    if(ac_flag == 1): ac_count += 1
    
acc = round(ac_count / total_count, 2)
print(f"Accurate count: {ac_count} / {total_count}")
print(f"Evaluate Acc: {acc}%")

263250it [16:15, 269.82it/s]

Accurate count: 0 / 263250
Evaluate Acc: 0.0%



