In [9]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data
from torch.utils.data import Dataset, DataLoader
import os
import torch.nn.utils.rnn

df_train = pd.read_csv('arithmetic_train.csv')
df_eval = pd.read_csv('arithmetic_eval.csv')

df_train["tgt"] = df_train["tgt"].apply(lambda x:str(x))
df_train["src"] = df_train["src"].add(df_train["tgt"])
df_train["len"] = df_train["src"].apply(lambda x: len(x))

df_eval["tgt"] = df_eval["tgt"].apply(lambda x:str(x))
df_eval["src"] = df_eval["src"].add(df_eval["tgt"])
df_eval["len"] = df_eval["src"].apply(lambda x: len(x))


#TODO1
char_to_id = {'<pad>': 0, '<eos>': 1}
id_to_char = {0: '<pad>', 1: '<eos>'}

# 添加数字0到9的映射
for i in range(10):
    char_to_id[str(i)] = i + 2
    id_to_char[i + 2] = str(i)

# 添加符号的映射
symbols = ['+', '-', '*', '(', ')', '=']
for idx, symbol in enumerate(symbols, start=len(char_to_id)):
    char_to_id[symbol] = idx
    id_to_char[idx] = symbol

vocab_size = len(char_to_id)

df_train = df_train.drop(df_train.columns[0], axis=1)
df_eval = df_eval.drop(df_eval.columns[0], axis=1)

In [11]:
#TODO2
def process_dataframe(df):
    # 新增兩個欄位
    df['char_id_list'] = None
    df['label_id_list'] = None
    # 逐行處理 df_train
    for i in range(len(df)):
        char = []
        e_id = 0
        for j in range(len(df["src"][i])):
            # 將 src 中的字元轉換為對應的 id
            char.append(char_to_id[df["src"][i][j]])
            # 如果遇到對應 id 為 17，記錄其位置
            if char_to_id[df["src"][i][j]] == 17:
                e_id = j
        # 將 1 添加到 char 列表末尾
        char = char + [1]
        # 更新 label_id_list 和 char_id_list 欄位
        df.at[i, 'char_id_list'] = char
        df.at[i, 'label_id_list'] = [0]*(e_id+1) + char[e_id+1:]
        
    df['tgt'] = 0

    return df

# 呼叫函式
df_train = process_dataframe(df_train)
df_eval = process_dataframe(df_eval)

df_train.head()

Unnamed: 0,src,tgt,len,char_id_list,label_id_list
0,14*(43+20)=882,0,14,"[3, 6, 14, 15, 6, 5, 12, 4, 2, 16, 17, 10, 10,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 10, 4, 1]"
1,(6+1)*5=35,0,10,"[15, 8, 12, 3, 16, 14, 7, 17, 5, 7, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 5, 7, 1]"
2,13+32+29=74,0,11,"[3, 5, 12, 5, 4, 12, 4, 11, 17, 9, 6, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 6, 1]"
3,31*(3-11)=-248,0,14,"[5, 3, 14, 15, 5, 13, 3, 3, 16, 17, 13, 4, 6, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 4, 6, 10, 1]"
4,24*49+1=1177,0,12,"[4, 6, 14, 6, 11, 12, 3, 17, 3, 3, 9, 9, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 9, 9, 1]"


In [27]:
batch_size = 64
epochs = 2
embed_dim = 128
hidden_dim = 128
lr = 0.001
grad_clip = 1

In [29]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, sequences):
       
        self.sequences = sequences
    
    def __len__(self):
        
        return len(self.sequences)
    
    def __getitem__(self, index):
        
        seq = self.sequences.loc[index, "char_id_list"]
        end_idx = seq.index(17)  # 找到字符 `=` 的索引位置
        start_idx = len(seq) - end_idx -2
        x =  seq[:end_idx + 1] + [0]*start_idx
        y = [0]* (end_idx) + seq[end_idx+1:]

        return x, y

def collate_fn(batch):
    batch_x = [torch.tensor(data[0]) for data in batch]
    batch_y = [torch.tensor(data[1]) for data in batch]
    batch_x_lens = torch.LongTensor([len(x) for x in batch_x])
    batch_y_lens = torch.LongTensor([len(y) for y in batch_y])
    
    # Pad the input sequence
    pad_batch_x = torch.nn.utils.rnn.pad_sequence(batch_x,
                                                  batch_first=True,
                                                  padding_value=char_to_id['<pad>'])
    
    pad_batch_y = torch.nn.utils.rnn.pad_sequence(batch_y,
                                                  batch_first=True,
                                                  padding_value=char_to_id['<pad>'])
    
    return pad_batch_x, pad_batch_y, batch_x_lens, batch_y_lens



ds_train = Dataset(df_train[['char_id_list', 'label_id_list']])
ds_eval = Dataset(df_eval[['char_id_list', 'label_id_list']])


# Build dataloader of train set and eval set, collate_fn is the collate function
dl_train = DataLoader(Dataset(df_train), batch_size, shuffle=True, collate_fn=collate_fn)
dl_eval = DataLoader(Dataset(df_eval), batch_size, shuffle=False, collate_fn=collate_fn)

In [30]:
class CharRNN(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(CharRNN, self).__init__()
        
        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embed_dim,
                                            padding_idx=char_to_id['<pad>'])
        
        self.rnn_layer1 = torch.nn.LSTM(input_size=embed_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)
        
        self.rnn_layer2 = torch.nn.LSTM(input_size=hidden_dim,
                                        hidden_size=hidden_dim,
                                        batch_first=True)
        
        self.linear = torch.nn.Sequential(torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=hidden_dim),
                                          torch.nn.ReLU(),
                                          torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=vocab_size))
        
    def forward(self, batch_x, batch_x_lens):
        return self.encoder(batch_x, batch_x_lens)
    
    # The forward pass of the model
    def encoder(self, batch_x, batch_x_lens):
        batch_x = self.embedding(batch_x)
        
        batch_x = torch.nn.utils.rnn.pack_padded_sequence(batch_x,
                                                          batch_x_lens,
                                                          batch_first=True,
                                                          enforce_sorted=False)
        
        batch_x, _ = self.rnn_layer1(batch_x)
        batch_x, _ = self.rnn_layer2(batch_x)
        
        batch_x, _ = torch.nn.utils.rnn.pad_packed_sequence(batch_x,
                                                            batch_first=True)
        
        batch_x = self.linear(batch_x)
        
        return batch_x
    
    def generator(self, start_char, max_len=200):
        
        char_list = [char_to_id[c] for c in start_char]
        
        next_char = None
        hidden1 = (torch.zeros(1, 1, hidden_dim).to("cpu"), 
               torch.zeros(1, 1, hidden_dim).to("cpu"))
        hidden2 = (torch.zeros(1, 1, hidden_dim).to("cpu"), 
               torch.zeros(1, 1, hidden_dim).to("cpu"))
    
        while len(char_list) < max_len: 
            # Write your code here 
            # Pack the char_list to tensor
            input_tensor = torch.tensor(char_list).unsqueeze(0).to("cpu")
            # Input the tensor to the embedding layer, LSTM layers, linear respectively
            embedded = self.embedding(input_tensor)
            out, hidden1 = self.rnn_layer1(embedded, hidden1)
            out, hidden2 = self.rnn_layer2(out, hidden2)
            
            y = self.linear(out[:, -1, :]) 
            # Get the predicted character ID by taking the argmax of the output probabilities
            next_char = torch.argmax(y, dim=-1).item()
         
            if next_char == char_to_id['<eos>']:
                break
            
            char_list.append(next_char)
            
        return [id_to_char[ch_id] for ch_id in char_list]




In [33]:
import torch.cuda
import torch.optim as optim

torch.manual_seed(2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CharRNN(vocab_size,
                embed_dim,
                hidden_dim)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr)

In [35]:
from tqdm import tqdm
from copy import deepcopy

model = model.to(device)
model.train()
i = 0
for epoch in range(1, epochs+1):
    # The process bar
    bar = tqdm(dl_train, desc=f"Train epoch {epoch}")
    for batch_x, batch_y, batch_x_lens, batch_y_lens in bar:
        # Clear the gradient
        optimizer.zero_grad()
        batch_pred_y = model(batch_x.to(device), batch_x_lens)
        batch_pred_y = batch_pred_y.view(-1, batch_pred_y.size(-1))  
        batch_y = batch_y.view(-1)
       
        # Input the prediction and ground truths to loss function
        loss = criterion(batch_pred_y, batch_y.to(device))
        
        # Back propagation
        loss.backward()
        torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip) # gradient clipping

        # Optimize parameters in the model
        optimizer.step()

        i+=1
        if i%50==0:
            bar.set_postfix(loss = loss.item())
    
    # Evaluate your model
    bar = tqdm(dl_eval, desc=f"Validation epoch {epoch}")
    matched = 0
    total = 0
    for batch_x, batch_y, batch_x_lens, batch_y_lens in bar:
        
        predictions = model(batch_x.to(device), batch_x_lens)
        
        # Convert predictions to the predicted class labels
        pred_labels = torch.argmax(predictions, dim=-1)
        
        batch_y = batch_y.to(device)
        # Check whether the prediction matches the ground truths
        for i in range(batch_y.size(0)):
            total += 1        
            mask = batch_y[i] != 0  
            # Check if the entire sequence matches (exact match)
            if torch.equal(pred_labels[i, mask], batch_y[i, mask]):
                matched += 1

    print(matched/total)

Train epoch 1: 100%|██████████| 37020/37020 [1:30:31<00:00,  6.82it/s, loss=0.49] 
Validation epoch 1: 100%|██████████| 4114/4114 [05:14<00:00, 13.08it/s]


0.35661918328584996


Train epoch 2: 100%|██████████| 37020/37020 [1:30:25<00:00,  6.82it/s, loss=0.466]
Validation epoch 2: 100%|██████████| 4114/4114 [04:20<00:00, 15.77it/s]

0.4694548907882241





In [37]:
model = model.to("cpu")
print("".join(model.generator("1+2=")))

1+2=3134566791119911111991119191119191919191919191919191919191919191919191919911919191919191919191919191919191918181818181818181989919119991119181819191818181891919191991191919199119911991191991199119
