In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn
import torch.nn.utils.rnn
import torch.utils.data
from tqdm import tqdm
torch.cuda.manual_seed_all(48758)
torch.manual_seed(48758)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [2]:
df_train = pd.read_csv('arithmetic_NLP/arithmetic_train.csv')
df_eval = pd.read_csv('arithmetic_NLP/arithmetic_eval.csv')
df_train.head()

Unnamed: 0.1,Unnamed: 0,src,tgt
0,2285313,14*(43+20)=,882
1,317061,(6+1)*5=,35
2,718770,13+32+29=,74
3,170195,31*(3-11)=,-248
4,2581417,24*49+1=,1177


In [3]:
df_train['tgt'] = df_train['tgt'].apply(lambda x:str(x))
df_train['src'] = df_train['src'].add(df_train['tgt'])

df_eval['tgt'] = df_eval['tgt'].apply(lambda x:str(x))

In [4]:
#TODO1:build dictionary
char_to_id = {
    '<pad>':0,
    '1':1,
    '2':2,
    '3':3,
    '4':4,
    '5':5,
    '6':6,
    '7':7,
    '8':8,
    '9':9,
    '0':10,
    '<eos>':11,
    '+':12,
    '-':13,
    '*':14,
    '(':15,
    ')':16,
    '=':17,
    '<bos>':18
}
id_to_char = {
    0:'<pad>',
    1:'1',
    2:'2',
    3:'3',
    4:'4',
    5:'5',
    6:'6',
    7:'7',
    8:'8',
    9:'9',
    10:'0',
    11:'<eos>',
    12:'+',
    13:'-',
    14:'*',
    15:'(',
    16:')',
    17:'=',
    18:'<bos>'
}

vocab_size = len(char_to_id)
print(f'vocab_size:{vocab_size}')

vocab_size:19


In [5]:
#TODO2:data preprocessing
def get_label(char_id_list):
  ret = []
  flag = False
  for id in char_id_list:
    if id == char_to_id['<bos>']:
      continue
    if id == char_to_id['=']:
      flag = True
      ret.append(char_to_id['<pad>'])
      continue
    if flag:
      ret.append(id)
    else:
      ret.append(char_to_id['<pad>'])
  return ret

df_train['char_id_list'] = df_train['src'].apply(lambda x:[char_to_id['<bos>']]+[char_to_id[ch] for ch in x])#<bos>a+b=c
df_train['label_id_list'] = df_train['char_id_list'].apply(lambda x:get_label(x)+[char_to_id['<eos>']])#<pad>*n c<eos>

df_eval['char_id_list'] = df_eval['src'].apply(lambda x:[char_to_id['<bos>']]+[char_to_id[ch] for ch in x])#<bos>a+b=
df_train.head()

Unnamed: 0.1,Unnamed: 0,src,tgt,char_id_list,label_id_list
0,2285313,14*(43+20)=882,882,"[18, 1, 4, 14, 15, 4, 3, 12, 2, 10, 16, 17, 8,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 2, 11]"
1,317061,(6+1)*5=35,35,"[18, 15, 6, 12, 1, 16, 14, 5, 17, 3, 5]","[0, 0, 0, 0, 0, 0, 0, 0, 3, 5, 11]"
2,718770,13+32+29=74,74,"[18, 1, 3, 12, 3, 2, 12, 2, 9, 17, 7, 4]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 4, 11]"
3,170195,31*(3-11)=-248,-248,"[18, 3, 1, 14, 15, 3, 13, 1, 1, 16, 17, 13, 2,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 2, 4, 8, 11]"
4,2581417,24*49+1=1177,1177,"[18, 2, 4, 14, 4, 9, 12, 1, 17, 1, 1, 7, 7]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 7, 7, 11]"


In [11]:
filter_train1 = df_train[~df_train['src'].str.contains('9')].reset_index(drop=True)

In [6]:
#TODO3: Data Batching
class Dataset(torch.utils.data.Dataset):
  def __init__(self,sequences):
    self.sequences = sequences
  def __len__(self):
    return len(self.sequences)
  def __getitem__(self, index):
    x=self.sequences['char_id_list'][index]
    y=self.sequences['label_id_list'][index]
    return x,y

In [7]:
class CharRNN(torch.nn.Module):
  def __init__(self,vocab_size,embed_dim,hidden_dim,Type='LSTM'):
    super(CharRNN,self).__init__()
    self.hidden_dim = hidden_dim
    self.Type = Type
    self.embedding=torch.nn.Embedding(num_embeddings=vocab_size,
                      embedding_dim=embed_dim,
                      padding_idx=char_to_id['<pad>'])
    if Type=='LSTM':
      self.rnn_layer1=torch.nn.LSTM(input_size=embed_dim,
                      hidden_size=hidden_dim,
                      batch_first=True)
      self.rnn_layer2=torch.nn.LSTM(input_size=hidden_dim,
                      hidden_size=hidden_dim,
                      batch_first=True)
    elif Type=='RNN':
      self.rnn_layer1=torch.nn.RNN(input_size=embed_dim,
                      hidden_size=hidden_dim,
                      batch_first=True)
      self.rnn_layer2=torch.nn.RNN(input_size=hidden_dim,
                      hidden_size=hidden_dim,
                      batch_first=True)
    else:#Type==GRU
      self.rnn_layer1=torch.nn.GRU(input_size=embed_dim,
                      hidden_size=hidden_dim,
                      batch_first=True)
      self.rnn_layer2=torch.nn.GRU(input_size=hidden_dim,
                      hidden_size=hidden_dim,
                      batch_first=True)
    self.linear_layer=torch.nn.Sequential(torch.nn.Linear(in_features=hidden_dim,
                              out_features=hidden_dim),
                        torch.nn.ReLU(),
                        torch.nn.Linear(in_features=hidden_dim,
                              out_features=vocab_size))

  def forward(self,x,hidden1=None,hidden2=None):
    ret = self.embedding(x)
    if self.Type=='LSTM':
      if hidden1 is None:
        hidden1 = (torch.zeros(1,x.shape[0],self.hidden_dim).to(x.device),torch.zeros(1,x.shape[0],self.hidden_dim).to(x.device))
      if hidden2 is None:
        hidden2 = (torch.zeros(1,x.shape[0],self.hidden_dim).to(x.device),torch.zeros(1,x.shape[0],self.hidden_dim).to(x.device))
    else:
      if hidden1 is None:
        hidden1 = torch.zeros(1,x.shape[0],self.hidden_dim).to(x.device)
      if hidden2 is None:
        hidden2 = torch.zeros(1,x.shape[0],self.hidden_dim).to(x.device)

    ret,hidden1 = self.rnn_layer1(ret,hidden1)
    ret,hidden2 = self.rnn_layer2(ret,hidden2)
    ret = self.linear_layer(ret)
    return ret, hidden1, hidden2

  #TODO4: Generation
  def generator(self,start_char,max_len=200,device='cpu'):
    char_list = [c for c in start_char]
    next_char = None
    if self.Type=='LSTM':
      hidden1 = (torch.zeros(1,1,self.hidden_dim).to(device),torch.zeros(1,1,self.hidden_dim).to(device))
      hidden2 = (torch.zeros(1,1,self.hidden_dim).to(device),torch.zeros(1,1,self.hidden_dim).to(device))
    else:
      hidden1 = torch.zeros(1,1,self.hidden_dim).to(device)
      hidden2 = torch.zeros(1,1,self.hidden_dim).to(device)
    start = False
    with torch.no_grad():
      while len(char_list) < max_len:
        if not start:
          y = torch.tensor([char_list]).to(device)
          start = True
        else:
          y = torch.tensor([[char_list[-1]]]).to(device)
        y,hidden1,hidden2 = self.forward(y,hidden1,hidden2)
        next_char = torch.argmax(y[:,-1,:],dim=-1)
        if next_char == char_to_id['<eos>']:
          break
        char_list.append(next_char.detach().cpu().item())
    return [id_to_char[ch_id] for ch_id in char_list]

In [8]:
#setting
def collate_fn(batch):
  maxlen = max(len(seq[0]) for seq in batch)
  x = [list(seq[0]) + [char_to_id['<pad>']]*(maxlen-len(seq[0])) for seq in batch]
  y = [list(seq[1]) + [char_to_id['<pad>']]*(maxlen-len(seq[1])) for seq in batch]
  x = torch.stack([torch.tensor(seq) for seq in x])
  y = torch.stack([torch.tensor(seq) for seq in y])
  return x,y

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'device:{device}')
epochs = 10
learning_rate = 5e-3
embed_dim = 100
hidden_dim = 200
batch_size = 5000
Type = 'LSTM'


model = CharRNN(vocab_size,embed_dim=embed_dim,hidden_dim=hidden_dim,Type=Type).to(device)
loss_func = torch.nn.CrossEntropyLoss(ignore_index=char_to_id['<pad>'])
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
dataset_train = Dataset(df_train)
total_num = len(dataset_train)
dataloader_train = torch.utils.data.DataLoader(dataset_train,batch_size=batch_size,collate_fn=collate_fn,shuffle=True)

device:cuda


In [9]:
#TODO5:train
model.train()
for epoch in range(epochs):
  for x,y in tqdm(dataloader_train):
    x,y = x.to(device),y.to(device)
    output,_,_ = model(x)
    loss = loss_func(output.view(-1,vocab_size),y.view(-1))
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()

100%|██████████| 474/474 [00:46<00:00, 10.27it/s]
100%|██████████| 474/474 [00:46<00:00, 10.26it/s]
100%|██████████| 474/474 [00:45<00:00, 10.38it/s]
100%|██████████| 474/474 [00:46<00:00, 10.30it/s]
100%|██████████| 474/474 [00:45<00:00, 10.37it/s]
100%|██████████| 474/474 [00:46<00:00, 10.29it/s]
100%|██████████| 474/474 [00:45<00:00, 10.42it/s]
100%|██████████| 474/474 [00:45<00:00, 10.34it/s]
100%|██████████| 474/474 [00:45<00:00, 10.40it/s]
100%|██████████| 474/474 [00:45<00:00, 10.37it/s]


In [10]:
#TODO6:eval
def get_ans(s):
  ans = ''
  flag = False
  for ch in s:
    if ch == '=' and not flag:
      flag = True
      continue
    if flag:
      if ch == '<eos>':
        break
      ans+=ch
  return ans

model.eval()
data_num = len(df_eval['char_id_list'])
true_num = 0

for id,input_data in enumerate(df_eval['char_id_list']):
  result = get_ans(model.generator(input_data,device=device,max_len=25))
  if result == df_eval['tgt'][id]:
    true_num+=1
print(f'accuracy:{true_num/data_num}')

accuracy:0.633261158594492


In [18]:
def ques_to_id(x):
  return [char_to_id['<bos>']]+[char_to_id[ch] for ch in x]

Q_arr = ['100+375=',
         '(300-78)*2=',
         '498-486+9=',
         '(57-759)-67=',
         '0*(758-9)=',
         '467*2-89=',
         "526-(67*11)=",
         "301*(687-686)=",
         "90-349=",
         "-974+(758-8)=",
         "100*0-896=",
         "67-987+89=",
         "83*12-900="]

for input_data in Q_arr:
  print(''.join(model.generator(ques_to_id(input_data),device=device)))

<bos>100+375=60
<bos>(300-78)*2=16
<bos>498-486+9=31
<bos>(57-759)-67=1
<bos>0*(758-9)=0
<bos>467*2-89=85
<bos>526-(67*11)=-321
<bos>301*(687-686)=780
<bos>90-349=-35
<bos>-974+(758-8)=44
<bos>100*0-896=-38
<bos>67-987+89=44
<bos>83*12-900=284
