# Select An Architecture

In [None]:
# Selects which encoder/decoder pair the "Choose Architecture" cell builds:
# "best" -> EncoderLSTM + DecoderLSTM, "task" -> EncoderRNN + AttnDecoderRNN.
architecture = "best" # choose this for the overall best architecture
# architecture = "task" # choose this for the task specific best architecture

# Prepare Data
Import libraries, download the text data from GitHub, set seeds for reproducibility, and create the vocabulary, dataset and dataloader.

In [None]:
%matplotlib inline
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils as torch_utils
import numpy as np

import random
import math
import time

In [None]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
from concurrent.futures import ThreadPoolExecutor

def download_file(file_url, timeout=30):
    """Download ``file_url`` and return the response body as text.

    Args:
        file_url: URL of the text file to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None``. A message describing the failure is printed in that case.
    """
    try:
        response = requests.get(file_url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures take the same "return None" path as a bad
        # status code, so the caller's all-downloads-succeeded check sees them.
        print(f"Failed to fetch the file. Error: {exc}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Failed to fetch the file. Status code: {response.status_code}")
    return None
    
# Raw text files of the SCAN length split: training file first, test file second.
file_urls = [
    'https://raw.githubusercontent.com/brendenlake/SCAN/master/length_split/tasks_train_length.txt',
    'https://raw.githubusercontent.com/brendenlake/SCAN/master/length_split/tasks_test_length.txt',
]

# One worker thread per URL; executor.map returns the results in URL order.
with ThreadPoolExecutor(max_workers=len(file_urls)) as executor:
    results = list(executor.map(download_file, file_urls))

if None in results:
    # download_file signals a failed download by returning None.
    print("One or more downloads failed.")
else:
    # Every line of a downloaded file is treated as one sentence.
    sentences_list = [text.split('\n') for text in results]

    # One single-column DataFrame per downloaded file, in URL order.
    dfs = [pd.DataFrame({'Sentences': lines}) for lines in sentences_list]

    # Named frames for the two splits used by the rest of the notebook.
    train_df_exp2 = pd.DataFrame({'Sentences': sentences_list[0]})
    eval_df_exp2 = pd.DataFrame({'Sentences': sentences_list[1]})
    

In [None]:
SEED = 3

# Seed every random number generator the notebook draws from.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seeds all visible GPUs (not only the current one); ignored when CUDA is absent.
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and switch off its autotuner, which can
# otherwise select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
from torch.utils.data import Dataset

class SentenceDataset(Dataset):
    """Dataset over raw lines of the form ``IN: <command> OUT: <actions>``.

    Blank lines are dropped on construction (for example the empty string
    left over when a text ending in a newline is split on newlines). This
    keeps ``len()`` equal to the number of real examples. The sentences are
    stored in a plain list, so indexing is positional and going past the end
    raises ``IndexError``, which is what ends a ``for`` loop over the dataset.
    """

    def __init__(self, sentences):
        # Any iterable of strings is accepted (list, pandas Series, ...).
        self.sentences = [sentence for sentence in sentences if sentence.strip()]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``[command, actions]`` for the line at position ``idx``.

        Raises:
            IndexError: if ``idx`` is out of range.
            ValueError: if the line lacks the ``IN:`` or ``OUT:`` marker.
        """
        sentence = self.sentences[idx]
        _, in_marker, after_in = sentence.partition('IN:')
        in_sentence, out_marker, out_sentence = after_in.partition('OUT:')
        if not in_marker or not out_marker:
            # Fail loudly: an IndexError here would silently end iteration.
            raise ValueError(f"Malformed line (expected 'IN: ... OUT: ...'): {sentence!r}")
        return [in_sentence.strip(), out_sentence.strip()]

In [None]:
# Reserved vocabulary indices for the padding, start and end markers.
PAD_token, SOS_token, EOS_token = 0, 1, 2


class Vocab:
    """Word/index mapping with occurrence counts for one vocabulary."""

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        # Real words are numbered after the three reserved tokens.
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        """Register every single-space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``, giving it the next free index the first time it is seen."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [None]:
def indexesFromSentence(vocab, sentence):
    """Map each single-space-separated token of ``sentence`` to its index.

    Indices are looked up in ``vocab.word2index``; a token that is missing
    from that mapping raises ``KeyError``.
    """
    tokens = sentence.split(' ')
    return list(map(vocab.word2index.__getitem__, tokens))

def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a ``(1, n_tokens + 1)`` long tensor on ``device``.

    The token indices come from ``lang`` and are followed by ``EOS_token``.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    # Wrapping the ids in an outer list yields the leading dimension of size 1.
    return torch.tensor([token_ids], dtype=torch.long, device=device)


In [None]:
# Wrap the raw 'Sentences' column of each split; items are returned as
# [command, actions] string pairs by SentenceDataset.__getitem__.
train_set_length = SentenceDataset(train_df_exp2['Sentences'])
eval_set_length = SentenceDataset(eval_df_exp2['Sentences'])

In [None]:
# Separate vocabularies for the command (IN) side and the action (OUT) side.
input_vocab = Vocab('IN')
output_vocab = Vocab('OUT')

# Only the training split populates the vocabularies.
for line in train_set_length:
    input_vocab.addSentence(line[0])  # line[0] is the command
    output_vocab.addSentence(line[1])  # line[1] is the action sequence

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Index-encoded view over a dataset of (input, target) sentence pairs."""

    def __init__(self, dataset, in_voacab, out_vocab):
        self.dataset = dataset
        self.in_vocab = in_voacab
        self.out_vocab = out_vocab

        # Walk the wrapped dataset once, then split every pair into its sides.
        pairs = [pair for pair in self.dataset]
        self.input_sentences = [pair[0] for pair in pairs]
        self.target_sentences = [pair[1] for pair in pairs]

    def __len__(self):
        return len(self.input_sentences)

    def __getitem__(self, idx):
        """Return ``(input ids, target ids)`` as EOS-terminated long tensors on ``device``."""
        # Words are converted to indices through each vocabulary's word2index.
        source_ids = indexesFromSentence(self.in_vocab, self.input_sentences[idx]) + [EOS_token]
        target_ids = indexesFromSentence(self.out_vocab, self.target_sentences[idx]) + [EOS_token]

        source_tensor = torch.LongTensor(source_ids).to(device)
        target_tensor = torch.LongTensor(target_ids).to(device)
        return source_tensor, target_tensor


# Model Definitions
Includes both the overall best and the task-specific best architectures.

In [None]:
import torch.nn as nn
class EncoderLSTM(nn.Module):
    """Two-layer, batch-first LSTM encoder over embedded input tokens."""

    def __init__(self, input_size, hidden_size, dropout_p=0.5):
        super().__init__()
        self.hidden_size = hidden_size

        # The embedding width equals the LSTM hidden width.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.LSTM = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size,
                            num_layers=2, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Embed ``input``, apply dropout and return the LSTM's ``(output, hidden)``."""
        token_vectors = self.embedding(input)
        output, hidden = self.LSTM(self.dropout(token_vectors))
        return output, hidden

class EncoderRNN(nn.Module):
    """Batch-first GRU encoder over embedded input tokens."""

    def __init__(self, input_size, hidden_size, dropout_p=0.5):
        super().__init__()
        self.hidden_size = hidden_size

        # The embedding width equals the GRU hidden width.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Embed ``input``, apply dropout and return the GRU's ``(output, hidden)``."""
        token_vectors = self.embedding(input)
        output, hidden = self.gru(self.dropout(token_vectors))
        return output, hidden

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch

class DecoderLSTM(nn.Module):
    """Two-layer LSTM decoder without attention.

    Decoding starts from an SOS token and the encoder's final state and
    produces one distribution over the output vocabulary per step.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.5):
        super(DecoderLSTM, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.LSTM = nn.LSTM(hidden_size, hidden_size, num_layers=2, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, epoch=0, target_tensor=None,
                max_length=48, teacher_forcing_epochs=50000):
        """Decode a whole sequence.

        Args:
            encoder_outputs: Encoder outputs; only the batch size (dim 0) and
                the device are read here.
            encoder_hidden: Encoder state used as the initial decoder state.
            epoch: Current training iteration, compared with
                ``teacher_forcing_epochs``.
            target_tensor: Optional ``(batch, target_len)`` tensor of target
                ids. When given, it fixes the number of decoding steps.
            max_length: Number of decoding steps when no target is given.
            teacher_forcing_epochs: Teacher forcing is used only while
                ``epoch`` is below this value (and a target is given).

        Returns:
            ``(log_probs, decoder_hidden, None)``, where ``log_probs`` is the
            log-softmax of the per-step outputs concatenated along dim 1.
        """
        batch_size = encoder_outputs.size(0)
        # Allocate on the encoder output's device so the module does not
        # depend on a module-level ``device`` variable.
        decoder_input = torch.empty(
            batch_size, 1, dtype=torch.long, device=encoder_outputs.device
        ).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        has_target = target_tensor is not None
        target_len = target_tensor.size(1) if has_target else max_length
        teacher_forcing = has_target and epoch < teacher_forcing_epochs

        for i in range(target_len):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)
            if teacher_forcing:
                # Teacher forcing: feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        # `None` stands in for attention weights, for consistency in the training loop
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, input, hidden):
        """Run one decoding step and return ``(output scores, new hidden state)``."""
        output = self.dropout(self.embedding(input))
        output = F.relu(output)
        output, hidden = self.LSTM(output, hidden)
        output = self.out(output)
        return output, hidden

class BahdanauAttention(nn.Module):
    """Additive attention: scores are ``Va(tanh(Wa(query) + Ua(keys)))``."""

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Return ``(context, weights)`` for ``query`` attending over ``keys``."""
        projected_query = self.Wa(query)
        projected_keys = self.Ua(keys)
        energies = self.Va(torch.tanh(projected_query + projected_keys))
        # Move the size-1 score dimension in front of the key dimension.
        energies = energies.transpose(1, 2)

        # Normalise over the keys, then take the weighted sum of the keys.
        weights = energies.softmax(dim=-1)
        context = weights.bmm(keys)

        return context, weights


class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step."""

    def __init__(self, hidden_size, output_size, dropout_p=0.5):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        # The GRU input is the token embedding concatenated with the context vector.
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, epoch=0, target_tensor=None, oracle=None,
                max_length=48, teacher_forcing_epochs=50000):
        """Decode a whole sequence.

        Args:
            encoder_outputs: Encoder outputs, used as attention keys.
            encoder_hidden: Encoder state used as the initial decoder state.
            epoch: Current training iteration, compared with
                ``teacher_forcing_epochs``.
            target_tensor: Optional ``(batch, target_len)`` tensor of target
                ids. When given, it fixes the number of decoding steps.
            oracle: Optional minimum output length (debugging aid). While the
                step index is below it and the top prediction is EOS, the
                runner-up token is fed back as the next input instead. The
                returned distributions themselves are not modified.
            max_length: Number of decoding steps when no target is given.
            teacher_forcing_epochs: Teacher forcing is used only while
                ``epoch`` is below this value (and a target is given).

        Returns:
            ``(log_probs, decoder_hidden, attentions)``. ``log_probs`` and
            ``attentions`` are the per-step results concatenated along dim 1.
        """
        batch_size = encoder_outputs.size(0)
        # Allocate on the encoder output's device so the module does not
        # depend on a module-level ``device`` variable.
        decoder_input = torch.empty(
            batch_size, 1, dtype=torch.long, device=encoder_outputs.device
        ).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        has_target = target_tensor is not None
        target_len = target_tensor.size(1) if has_target else max_length
        teacher_forcing = has_target and epoch < teacher_forcing_epochs

        for i in range(target_len):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if teacher_forcing:
                # Teacher forcing: feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own predictions as the next input.
                # The choice is made per batch row, so any batch size works and
                # no host synchronisation through .item() is needed.
                _, topi = decoder_output.topk(2)
                next_token = topi[..., 0]
                if oracle is not None and i < oracle:
                    # Length oracle: do not feed EOS back before `oracle` steps.
                    runner_up = topi[..., 1]
                    next_token = torch.where(next_token == EOS_token, runner_up, next_token)
                decoder_input = next_token.detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        """Run one attention + GRU step.

        Returns:
            ``(output scores, new hidden state, attention weights)``.
        """
        embedded = self.dropout(self.embedding(input))

        # Swap the first two dimensions of the hidden state so that it can be
        # combined with the batch-first encoder outputs inside the attention.
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

# Train encoder decoder

In [None]:
import torch.nn.utils as torch_utils

def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, epoch, max_norm=5.0):
    """Run one optimisation step on a single batch and return its loss.

    Only the first batch of a fresh iterator over ``dataloader`` is used.
    ``epoch`` is passed through to the decoder. Encoder and decoder gradients
    are clipped separately, each to a total norm of ``max_norm``.
    """
    source_batch, target_batch = next(iter(dataloader))

    optimizers = (encoder_optimizer, decoder_optimizer)
    for optimizer in optimizers:
        optimizer.zero_grad()

    encoder_outputs, encoder_hidden = encoder(source_batch)
    predictions, _hidden, _attention = decoder(encoder_outputs, encoder_hidden, epoch, target_batch)

    # Flatten the predictions to (n_tokens, n_classes) and the targets to (n_tokens,).
    n_classes = predictions.size(-1)
    loss = criterion(predictions.view(-1, n_classes), target_batch.view(-1))
    loss.backward()

    # Gradient clipping for both encoder and decoder
    for module in (encoder, decoder):
        torch_utils.clip_grad_norm_(module.parameters(), max_norm)

    for optimizer in optimizers:
        optimizer.step()

    return loss.item()

In [None]:
import time
import math

def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a job started at ``since``.

    ``percent`` is the completed fraction of the job. The remaining time is
    extrapolated linearly from the time elapsed so far.
    """
    elapsed = time.time() - since
    # Estimated total duration minus what has already passed.
    remaining = elapsed / (percent) - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    """Plot ``points`` as a line chart with y-axis ticks every 0.2."""
    # plt.subplots() creates the figure itself; a preceding plt.figure() call
    # would leave an additional empty figure open on every invocation.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [None]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    """Train ``encoder`` and ``decoder`` for ``n_epochs`` calls to ``train_epoch``.

    Each model gets its own Adam optimiser with ``learning_rate``. The mean
    loss is printed every ``print_every`` epochs together with timing
    information, recorded every ``plot_every`` epochs, and the recorded
    means are plotted once training has finished.
    """
    started_at = time.time()
    averaged_losses = []
    loss_since_print = 0  # running sum, cleared after each report
    loss_since_plot = 0  # running sum, cleared after each recorded point

    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(1, n_epochs + 1):
        step_loss = train_epoch(train_dataloader, encoder, decoder,
                                encoder_optimizer, decoder_optimizer, criterion, epoch)
        loss_since_print += step_loss
        loss_since_plot += step_loss

        if not epoch % print_every:
            mean_loss = loss_since_print / print_every
            loss_since_print = 0
            progress = epoch / n_epochs
            print('%s (%d %d%%) %.4f' % (timeSince(started_at, progress),
                                        epoch, progress * 100, mean_loss))

        if not epoch % plot_every:
            averaged_losses.append(loss_since_plot / plot_every)
            loss_since_plot = 0

    showPlot(averaged_losses)

# Choose Architecture

In [None]:
# CustomDataset returns unpadded, variable-length tensors, so batches hold one example.
batch_size = 1

train_dataset = CustomDataset(train_set_length, input_vocab, output_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

if architecture == "best":
    # Overall best architecture: LSTM encoder/decoder without attention.
    hidden_size = 200
    encoder = EncoderLSTM(input_vocab.n_words, hidden_size).to(device)
    decoder = DecoderLSTM(hidden_size, output_vocab.n_words).to(device)
elif architecture == "task":
    # Task-specific best architecture: GRU encoder with an attention decoder.
    hidden_size = 50
    encoder = EncoderRNN(input_vocab.n_words, hidden_size).to(device)
    decoder = AttnDecoderRNN(hidden_size, output_vocab.n_words).to(device)
else:
    # Fail here instead of with a NameError on `encoder` further down.
    raise ValueError(f'Unknown architecture {architecture!r}; expected "best" or "task".')

train(train_dataloader, encoder, decoder, 100000, print_every=100, plot_every=500)

In [None]:
# torch.save(encoder, f"rnnencoder{SEED}")
# torch.save(decoder, f"rnndecoder{SEED}")

# Evaluation

In [None]:
# Command lengths (in words) reported in the per-length accuracy breakdown.
cmd_lengths_compact = [4, 6, 7, 8, 9]
# Action-sequence lengths (in tokens) reported in the per-length accuracy breakdown.
act_lengths_compact = [24, 25, 26, 27, 28, 30, 32, 33, 36, 40, 48]

def evaluate(encoder, decoder, test_sentences, input_lang, output_lang):
    """Decode every test pair without teacher forcing and report accuracy.

    Args:
        encoder: Encoder module, called with a ``(1, n_tokens)`` id tensor.
        decoder: Decoder module, called without a target so that it feeds
            back its own predictions.
        test_sentences: Indexable collection of ``[command, actions]``
            string pairs.
        input_lang: Vocabulary used to index the command.
        output_lang: Vocabulary whose ``index2word`` maps predicted ids back
            to action tokens.

    Returns:
        ``(exact_match_percent, cmd_acc, act_acc)``. ``cmd_acc`` and
        ``act_acc`` hold the exact-match percentage for each length listed in
        ``cmd_lengths_compact`` / ``act_lengths_compact``. A length with no
        test examples yields NaN.

    Raises:
        ValueError: if ``test_sentences`` is empty.
    """
    n_examples = len(test_sentences)
    if n_examples == 0:
        # Otherwise the overall accuracy below would divide by zero.
        raise ValueError("test_sentences must contain at least one example")

    success = 0
    success_partial = 0

    MAX_LENGTH = 49
    # Per-length example counts and exact-match counts, indexed by length.
    cmd_lengths = np.zeros(10)
    act_lengths = np.zeros(MAX_LENGTH)
    cmd_acc = np.zeros(10)
    act_acc = np.zeros(MAX_LENGTH)

    with torch.no_grad():
        for i in range(n_examples):
            print(i)
            input_sentence = test_sentences[i][0]
            target_sentence = test_sentences[i][1]
            target_words = target_sentence.split()
            input_tensor = tensorFromSentence(input_lang, input_sentence)

            cmd_len = len(input_sentence.split())
            act_len = len(target_words)

            encoder_outputs, encoder_hidden = encoder(input_tensor)

            decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden, epoch=1)
            _, topi = decoder_outputs.topk(1)
            decoded_ids = topi.squeeze()

            # -------------------- LENGTH ORACLE DEBUGGING --------------------------
            # min_len = act_len
            # decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, oracle=act_len)
            # _, topi = decoder_outputs.topk(2)
            # decoded_ids = topi.squeeze()
            # decoded_ids = torch.tensor([top2 if idx < min_len and top1 == EOS_token else top1 for idx, [top1, top2] in enumerate(decoded_ids.to('cpu').numpy())])

            # try:
            #     index = (decoded_ids == EOS_token).nonzero()[0].item()
            #     decoded_ids = decoded_ids[:index + 1]
            # except:
            #     # pass
            #     print(decoded_ids, i)

            # Translate ids back to words, stopping at the first EOS.
            decoded_words = []
            for idx in decoded_ids:
                if idx.item() == EOS_token:
                    break
                decoded_words.append(output_lang.index2word[idx.item()])

            cmd_lengths[cmd_len] += 1
            act_lengths[act_len] += 1

            if decoded_words == target_words:
                success = success + 1
                cmd_acc[cmd_len] += 1
                act_acc[act_len] += 1
                print("succes ", success, "out of", i+1)
            # check partial match: the target occurs somewhere inside the decoded string
            if target_sentence in " ".join(decoded_words):
                success_partial = success_partial + 1
                print("partial succes ", success_partial, "out of", i+1)

    # Keep only the lengths that are reported.
    cmd_lengths, act_lengths = cmd_lengths[cmd_lengths_compact], act_lengths[act_lengths_compact]
    cmd_acc, act_acc = cmd_acc[cmd_lengths_compact], act_acc[act_lengths_compact]

    # A length without examples gives 0/0 -> NaN; that is expected, so the
    # floating-point warning is silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        cmd_acc = cmd_acc/cmd_lengths * 100
        act_acc = act_acc/act_lengths * 100

    exact_match_percent = success / n_examples * 100
    print(f"Exact match accuracy: {exact_match_percent:.2f}%")
    print(f"Partial match accuracy: {success_partial / n_examples * 100:.2f}%")
    return exact_match_percent, cmd_acc, act_acc

# Named `eval_pairs` so the builtin `eval` is not shadowed for the rest of the session.
# Only the first 3920 items of the evaluation split are used.
eval_pairs = [eval_set_length[i] for i in range(3920)]
# Switch both modules to evaluation mode before decoding.
encoder.eval()
decoder.eval()
avg, cmd, act = evaluate(encoder, decoder, eval_pairs, input_vocab, output_vocab)

In [None]:
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
# Left panel: accuracy per command length. Right panel: accuracy per action-sequence length.
ax1.bar([str(length) for length in cmd_lengths_compact], cmd, color='lightsteelblue')
ax2.bar([str(length) for length in act_lengths_compact], act, color='lightsteelblue')

ax1.set_ylim(top = 100)
ax2.set_ylim(top = 100)

ax1.set_ylabel("Accuracy on new commands (%)")
ax2.set_ylabel("Accuracy on new commands (%)")

# Each x-axis label names the quantity its panel is actually grouped by.
ax1.set_xlabel("Command length")
ax2.set_xlabel("Ground-truth action sequence length")

print("cmd lenghts in order: ", cmd_lengths_compact)
print("act lenghts in order: ", act_lengths_compact)
print()
print("exact match accuracy:               ", avg)
print("accuracy by command length:         ", cmd)
print("accuracy by action sequence lenght: ", act)