In [1]:
import torch
import random
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm import tqdm

In [2]:
class arithmeticDataset(Dataset):
    def __init__(self, split="train") -> None:
        super().__init__()
        # read the data from arithmetic.csv
        self.data = pd.read_csv('drive/MyDrive/Colab Notebooks/arithmetic.csv')
        # split the data into train, validation (usually 9:1)
        if split == "train":
            self.data = self.data.iloc[:int(0.9 * len(self.data))]
        else:
            self.data = self.data.iloc[int(0.9 * len(self.data)):]

    def __getitem__(self, index):
        input_seq = str(self.data.iloc[index]['src'])  # Convert to string
        output_seq = str(self.data.iloc[index]['tgt'])  # Convert to string
        return input_seq, output_seq

    def __len__(self):
        return len(self.data)


In [9]:
class arithmeticModel(torch.nn.Module):
    def __init__(self, num_embeddings, embedding_dim, input_size, hidden_size, num_layers, in_features, out_features) -> None:
        super().__init__()
        # Define the embedding layer
        self.embedding = torch.nn.Embedding(num_embeddings, embedding_dim)
        # Define the LSTM layer
        self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # Define the feedforward neural network layer
        self.ffn = torch.nn.Linear(in_features, out_features)

    def forward(self, input_logits):
        # Input the data into the embedding layer
        embedded_input = self.embedding(input_logits)
        # Input the embeddings into LSTM layer
        lstm_output, (hidden_state, cell_state) = self.lstm(embedded_input)
        # Input the output of LSTM layer to ffn and get output logits
        output_logits = self.ffn(lstm_output)

        return output_logits

In [4]:
class arithmeticTokenizer:
    def __init__(self) -> None:
        super().__init__()
        # define tokens in the dictionary (e.g. +,-,1,2,3,4,5,6...)
        self.tokens = ['+', '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' , '(', ')', '*', '=', ' ']

    def encode(self, seqs):
        # Get the maximum length of the sequences
        max_length = max(len(seq) for seq in seqs)
        # Pad sequences to the maximum length
        padded_seqs = [seq.ljust(max_length) for seq in seqs]
        # transform the input sequence to a tensor of ids
        encoded_seqs = []
        for seq in padded_seqs:
            encoded_seq = [self.tokens.index(token) for token in seq]
            encoded_seqs.append(encoded_seq)
        return torch.tensor(encoded_seqs)

    def decode(self, ids):
        # transform the output ids to sequences
        decoded_seqs = []
        for id_seq in ids:
            decoded_seq = [self.tokens[id] for id in id_seq]
            decoded_seqs.append(decoded_seq)
        return decoded_seqs


In [5]:
def collate_fn(sample):
    # sample 是一个由数据批次组成的元组
    src_seqs, tgt_seqs = zip(*sample)

    # 使用 arithmeticTokenizer 对源序列和目标序列进行编码
    tokenizer = arithmeticTokenizer()
    src_seqs = tokenizer.encode(src_seqs)

    # Pad target sequences to the same length as source sequences
    max_length = max(len(seq) for seq in src_seqs)
    tgt_seqs = [seq.ljust(max_length) for seq in tgt_seqs]
    tgt_seqs = tokenizer.encode(tgt_seqs)

    # 将编码后的序列转换为张量
    src_seqs_tensor = torch.tensor(src_seqs)
    tgt_seqs_tensor = torch.tensor(tgt_seqs)

    return src_seqs_tensor, tgt_seqs_tensor


In [10]:
# Define the batch size
batch_size = 64

# Instantiate the arithmeticDataset for training set and validation set
train_dataset = arithmeticDataset(split="train")
validate_dataset = arithmeticDataset(split="validate")

train_dl = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn
)
validate_dl = DataLoader(
    dataset=validate_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

# Define the parameters for the model
num_embeddings = len(arithmeticTokenizer().tokens)
embedding_dim = 50
input_size = embedding_dim  # The size of input to the RNN layer is the same as the dimension of embeddings
hidden_size = 64
num_layers = 1
in_features = hidden_size  # The number of input features to the feedforward neural network layer
out_features = num_embeddings  # The number of output features from the feedforward neural network layer

model = arithmeticModel(num_embeddings, embedding_dim, input_size, hidden_size, num_layers, in_features, out_features)# Create model.

# transmit the model to GPU memory
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create optimizer.
optim = torch.optim.Adam(
    params=model.parameters(),
    lr=1e-4,
)

objtv = torch.nn.CrossEntropyLoss() # Create objective function.

epochs = 3

for cur_epoch in range(epochs):
    # activate the batch normalization and dropout layers in the model
    model.train()
    # process bar
    tqdm_train_dl = tqdm(
        train_dl,
        desc=f'Train: epoch {cur_epoch}'
    )
    num_correct = 0
    num_samples = 0
    for batch in tqdm_train_dl:
        # Clean up gradient.
        optim.zero_grad()
        # Forward pass.
        src_seqs, tgt_seqs = batch
        src_seqs, tgt_seqs = src_seqs.to(device), tgt_seqs.to(device)
        output_logits = model(src_seqs)

        # Calculate loss.
        loss = objtv(output_logits.transpose(1, 2), tgt_seqs)
        # Backward pass.
        loss.backward()
        # Gradient descent. model optimization
        optim.step()

        # Calculate accuracy
        predicted_labels = torch.argmax(output_logits, dim=2)
        num_correct += (predicted_labels == tgt_seqs).sum().item()
        num_samples += tgt_seqs.size(0) * tgt_seqs.size(1)  # Multiply by batch size and sequence length

    train_accuracy = num_correct / num_samples
    print(f'Train Accuracy: {train_accuracy * 100:.2f}%')

    # Save checkpoints.
    # deactivate the batch normalization and dropout layers in the model
    model.eval()
    tqdm_validate_dl = tqdm(
        validate_dl,
        desc=f'validation: epoch {cur_epoch}'
    )
    num_correct = 0
    num_samples = 0
    for batch in tqdm_validate_dl:
        # Input the data batch into the model and obtain the predictions
        src_seqs, tgt_seqs = batch
        src_seqs, tgt_seqs = src_seqs.to(device), tgt_seqs.to(device)
        output_logits = model(src_seqs)

        # Calculate accuracy
        predicted_labels = torch.argmax(output_logits, dim=2)
        num_correct += (predicted_labels == tgt_seqs).sum().item()
        num_samples += tgt_seqs.size(0) * tgt_seqs.size(1)  # Multiply by batch size and sequence length

    validation_accuracy = num_correct / num_samples
    print(f'Validation Accuracy: {validation_accuracy * 100:.2f}%')


  src_seqs_tensor = torch.tensor(src_seqs)
  tgt_seqs_tensor = torch.tensor(tgt_seqs)
Train: epoch 0: 100%|██████████| 37020/37020 [14:55<00:00, 41.32it/s]


Train Accuracy: 77.27%


validation: epoch 0: 100%|██████████| 4114/4114 [01:16<00:00, 53.62it/s]


Validation Accuracy: 75.41%


Train: epoch 1: 100%|██████████| 37020/37020 [14:54<00:00, 41.40it/s]


Train Accuracy: 77.47%


validation: epoch 1: 100%|██████████| 4114/4114 [01:13<00:00, 55.79it/s]


Validation Accuracy: 75.29%


Train: epoch 2: 100%|██████████| 37020/37020 [14:56<00:00, 41.32it/s]


Train Accuracy: 77.48%


validation: epoch 2: 100%|██████████| 4114/4114 [01:13<00:00, 56.35it/s]

Validation Accuracy: 74.66%



