## Arithmetic as a language

### Load Module

In [1]:
import pandas as pd
import numpy as np
import torch
import tqdm

### Load data

In [2]:
# Read the train / eval splits of the arithmetic dataset in one pass.
df_train, df_eval = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/arithmetic_train.csv', './dataset/arithmetic_eval.csv')
)

# Preview the first rows of the training split.
df_train.head(n=5)

Unnamed: 0.1,Unnamed: 0,src,tgt
0,2285313,14*(43+20)=,882
1,317061,(6+1)*5=,35
2,718770,13+32+29=,74
3,170195,31*(3-11)=,-248
4,2581417,24*49+1=,1177


In [4]:
# Transform the input data to strings: every (src, tgt) pair becomes one full
# equation, e.g. '(6+1)*5=' + '35' -> '(6+1)*5=35'.
def _append_answer(df):
    """Append the answer text to the question text of ``df`` (in place).

    After the call:
      - ``tgt`` holds the answer as a string,
      - ``src`` holds the original ``src`` followed by ``tgt``,
      - ``len`` holds the number of characters of the new ``src``.

    The function is safe to call more than once on the same frame: an existing
    ``len`` column is treated as the marker that the frame was already
    transformed, so re-running the cell does not append the answer a second
    time (which would give e.g. '(6+1)*5=3535').

    Returns the same ``df`` object for convenience.
    """
    if 'len' in df.columns:
        return df

    df['tgt'] = df['tgt'].astype(str)
    df['src'] = df['src'] + df['tgt']
    df['len'] = df['src'].str.len()
    return df


# Same transformation for both splits, so it is written once and applied twice.
for _df in (df_train, df_eval):
    _append_answer(_df)

df_train.head(5)

Unnamed: 0.1,Unnamed: 0,src,tgt,len
0,2285313,14*(43+20)=882,882,14
1,317061,(6+1)*5=35,35,10
2,718770,13+32+29=74,74,11
3,170195,31*(3-11)=-248,-248,14
4,2581417,24*49+1=1177,1177,12


### TODO 1

### Build Dictionary

- The model cannot perform calculations directly with plain text.
- Convert all text(numbers/symbols) into numerical representations.
- Special tokens
  - '\<pad>'
    - Each sentence within a batch may have different lengths.
    - Shorter sentences are padded with '\<pad>' to match the length of the longest sentence in the batch.
  - '\<eos>'
     -  Specifies the end of the generated sequence.
     -  Without '\<eos>', the model will not know when to stop generating.

In [None]:
# Lookup tables between tokens (single characters and special tokens) and integer ids.
char_to_id = {}
id_to_char = {}

# Build a dictionary and give every token in the train dataset an id
# The dictionary should contain <eos> and <pad>
# char_to_id converts characters to ids, while id_to_char does the opposite
# write your code here

# Number of distinct tokens; stays 0 until the dictionary above is filled in.
vocab_size = len(char_to_id)
print('Vocab size: {}'.format(vocab_size))

### TODO 2

### Data Preprocessing

- The data is processed into the format required for the model's input and output.
- Example: 1+2-3=0
  - Model input: 1 + 2 - 3 = 0
  - Model output: / / / / / 0 \<eos> (the "/" can be replaced with \<pad>)
  - The key point about the model's output is that the model does not need to predict the characters of the expression itself (everything up to and including '=').
  - What matters is that once the model sees '=', it should start generating the answer, which is '0'.
  - After generating the answer, it should also generate \<eos>.

In [None]:
# Data preprocessing
# write your code here

### TODO 3

### Data Batching

- Use `torch.utils.data.Dataset` to create a data generation tool called dataset.
- Then, use `torch.utils.data.DataLoader` to randomly sample from the dataset and group the samples into batches.

In [None]:
import torch.utils

class Dataset(torch.utils.data.Dataset):
    """Dataset skeleton for TODO 3, wrapping the preprocessed sequences.

    ``__len__`` and ``__getitem__`` are still to be written. Until they are,
    they raise ``NotImplementedError`` so that this cell can be executed and an
    unfinished method fails with a clear message instead of a syntax error.
    """

    def __init__(self, sequences):
        # Presumably the output of the data preprocessing step (TODO 2);
        # its exact layout is decided there.
        self.sequences = sequences

    def __len__(self):
        # return how much data in the Dataset object
        # write your code here
        raise NotImplementedError('TODO 3: return the number of samples in the Dataset')

    def __getitem__(self, index):
        # Extract the input data x and the ground truth y from the data
        # write your code here (the method is expected to end with `return x, y`)
        raise NotImplementedError('TODO 3: return the (x, y) pair for the given index')

### TODO 4

### Generator

- The `start_char` is fed into the model.
- Each time a sequence is input into the model, it generates a prediction for the next token.
- The prediction for the next token corresponds to the last element in the model's output sequence.
- When the output is '\<eos>', the generation should be stopped.

In [None]:
def generator(self, start_char, max_len=200):
    """Generate a token sequence that continues ``start_char`` (TODO 4 skeleton).

    Args:
        self: written as a method parameter; presumably the model instance that
            owns the embedding / LSTM / linear layers named in the TODO
            comments below (to be confirmed when the model class is written).
        start_char: iterable of tokens; each one is looked up in ``char_to_id``.
        max_len: generation stops once the sequence holds this many token ids.

    Returns:
        The tokens of the sequence (prompt included, '<eos>' excluded), mapped
        back through ``id_to_char``.

    Raises:
        NotImplementedError: while the prediction step has not been written.
    """
    char_list = [char_to_id[c] for c in start_char]

    while len(char_list) < max_len:
        # Pack the char_list to tensor
        # Input the tensor to the embedding layer, LSTM layers, linear respectively
        # write your code here
        y = None  # Obtain the next token prediction y

        next_char = None  # Use argmax function to get the next token prediction

        # Fail loudly until the two placeholders above are replaced with real code.
        if next_char is None:
            raise NotImplementedError('TODO 4: compute y and next_char inside generator()')

        if next_char == char_to_id['<eos>']:
            break

        char_list.append(next_char)

    return [id_to_char[ch_id] for ch_id in char_list]

### TODO 5

### Training

### TODO 6

### Evaluation