# Select an Architecture


In [1]:
# Selects which hyper-parameter preset the rest of the notebook uses.
architecture = "best" # choose this for the overall best architecture
# architecture = "task" # choose this for the task specific best architecture

# Fail early on a typo: any value other than "best" would otherwise be
# silently treated as the task-specific setting when dropout is chosen.
if architecture not in ("best", "task"):
    raise ValueError(f'architecture must be "best" or "task", got {architecture!r}')

# Dataset: Simple Split

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils as torch_utils
import numpy as np

import random
import math
import time

In [None]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
from concurrent.futures import ThreadPoolExecutor

def download_file(file_url, timeout=30):
    """Fetch ``file_url`` over HTTP and return the response body as text.

    Args:
        file_url: URL of the text file to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text on HTTP 200, otherwise ``None`` (the reason is
        printed).
    """
    try:
        # Without a timeout a stalled connection would block the worker forever.
        response = requests.get(file_url, timeout=timeout)
    except requests.RequestException as error:
        # Keep the "None on failure" contract for network-level errors too.
        print(f"Failed to fetch the file. Error: {error}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Failed to fetch the file. Status code: {response.status_code}")
    return None

# URLs of the SCAN simple-split files: training set first, test set second
file_urls = [
    'https://raw.githubusercontent.com/brendenlake/SCAN/master/simple_split/tasks_train_simple.txt',
    'https://raw.githubusercontent.com/brendenlake/SCAN/master/simple_split/tasks_test_simple.txt',
]

# Use ThreadPoolExecutor to download files concurrently
with ThreadPoolExecutor(max_workers=len(file_urls)) as executor:
    results = list(executor.map(download_file, file_urls))

# download_file returns None on failure. Stop right here with a clear message
# instead of continuing without train_df / eval_df being defined.
if any(result is None for result in results):
    raise RuntimeError("One or more downloads failed.")

# Assuming each line in the text file is a sentence.
# NOTE: split('\n') keeps a trailing empty string when the text ends with a
# newline, so the last entry of each list may be empty.
sentences_list = [result.split('\n') for result in results]

# One single-column DataFrame per file
train_df = pd.DataFrame({'Sentences': sentences_list[0]})
eval_df = pd.DataFrame({'Sentences': sentences_list[1]})

In [None]:
# Sanity check: show the first rows of the training DataFrame
print(train_df.head())

                                           Sentences
0  IN: jump opposite right twice and turn opposit...
1  IN: run opposite left after walk right OUT: I_...
2  IN: walk after run around right twice OUT: I_T...
3  IN: look around right thrice and turn left OUT...
4  IN: walk opposite left twice and walk opposite...


In [None]:
# Fixed seed so sampling and weight initialisation are repeatable across runs
SEED = 4389

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# Auto-tuning may pick a different cuDNN algorithm on each run; disable it so
# the deterministic flag above is not undermined.
torch.backends.cudnn.benchmark = False

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from torch.utils.data import Dataset

class SentenceDataset(Dataset):
    """Dataset over raw lines of the form ``IN: <command> OUT: <actions>``.

    Blank lines (e.g. the empty entry left by splitting a text that ends with
    a newline) are dropped on construction, so every index below ``len()`` is
    a parsable example and out-of-range access raises ``IndexError``.
    """

    def __init__(self, sentences):
        # Copy into a plain list: positional indexing, and IndexError past the
        # end so that plain ``for`` iteration terminates cleanly.
        self.sentences = [sentence for sentence in sentences if sentence.strip()]

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``[input_sentence, output_sentence]`` for the line at ``idx``."""
        sentence = self.sentences[idx]
        in_sentence = sentence.split('IN:')[1].split('OUT:')[0].strip()
        out_sentence = sentence.split('OUT:')[1].strip()
        return [in_sentence, out_sentence]

In [None]:
# Wrap the raw sentence columns so that indexing yields [input, output] pairs
train_set = SentenceDataset(train_df['Sentences'])
eval_set = SentenceDataset(eval_df['Sentences'])

In [None]:
# Reserved indices for the special tokens
PAD_token, SOS_token, EOS_token = 0, 1, 2


class Vocab:
    """Word <-> index mapping with per-word occurrence counts."""

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # The three special tokens occupy the first indices
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.n_words = len(self.index2word)  # PAD, SOS and EOS are counted

    def addSentence(self, sentence):
        """Register every space-separated token of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``; on first sight assign it the next free index."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1

In [None]:
def indexesFromSentence(vocab, sentence):
    """Map each space-separated word of ``sentence`` to its index in ``vocab``.

    Raises ``KeyError`` for a word that is missing from ``vocab.word2index``.
    """
    lookup = vocab.word2index
    indexes = []
    for token in sentence.split(' '):
        indexes.append(lookup[token])
    return indexes

def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` with ``lang`` as a ``(1, n_words + 1)`` long tensor.

    The EOS index is appended after the word indexes and the tensor is created
    on the module-level ``device``.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    encoded = torch.tensor(token_ids, dtype=torch.long, device=device)
    return encoded.view(1, -1)


In [None]:
# Build both vocabularies from the training pairs: element 0 of each pair
# feeds the input vocabulary, element 1 the output vocabulary.
input_vocab, output_vocab = Vocab('IN'), Vocab('OUT')
for command, actions in train_set:
    input_vocab.addSentence(command)
    output_vocab.addSentence(actions)

In [None]:
# Inspect the input vocabulary in both directions (word -> index, index -> word)
print(input_vocab.word2index)
print(input_vocab.index2word)


{'jump': 3, 'opposite': 4, 'right': 5, 'twice': 6, 'and': 7, 'turn': 8, 'thrice': 9, 'run': 10, 'left': 11, 'after': 12, 'walk': 13, 'around': 14, 'look': 15}
{0: 'PAD', 1: 'SOS', 2: 'EOS', 3: 'jump', 4: 'opposite', 5: 'right', 6: 'twice', 7: 'and', 8: 'turn', 9: 'thrice', 10: 'run', 11: 'left', 12: 'after', 13: 'walk', 14: 'around', 15: 'look'}


# DATALOADER

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Serves (input, target) sentence pairs as index tensors ending in EOS."""

    def __init__(self, dataset, in_voacab, out_vocab):
        self.dataset = dataset
        self.in_vocab = in_voacab
        self.out_vocab = out_vocab
        # Read every pair once, then split into two parallel lists
        pairs = [(line[0], line[1]) for line in self.dataset]
        self.input_sentences = [source for source, _ in pairs]
        self.target_sentences = [target for _, target in pairs]

    def __len__(self):
        return len(self.input_sentences)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two LongTensors on ``device``."""
        source = self.input_sentences[idx]
        target = self.target_sentences[idx]

        # Look the words up in the respective vocabulary and terminate with EOS
        source_ids = indexesFromSentence(self.in_vocab, source) + [EOS_token]
        target_ids = indexesFromSentence(self.out_vocab, target) + [EOS_token]

        source_tensor = torch.LongTensor(source_ids).to(device)
        target_tensor = torch.LongTensor(target_ids).to(device)
        return source_tensor, target_tensor


# LSTM encoder decoder

In [4]:
import torch.nn as nn


class EncoderLSTM(nn.Module):
    """Embeds a batch of token indices and encodes it with a 2-layer LSTM."""

    def __init__(self, input_size, hidden_size, dropout_p=0.0):
        super().__init__()
        self.hidden_size = hidden_size

        # Sub-module names and creation order are unchanged, so state_dict keys
        # and seeded parameter initialisation stay the same.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.LSTM = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
        )
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input):
        """Return the LSTM's ``(output, hidden)`` for the embedded input.

        Dropout is applied to the embeddings before they enter the LSTM.
        """
        token_vectors = self.dropout(self.embedding(input))
        lstm_output, lstm_state = self.LSTM(token_vectors)
        return lstm_output, lstm_state


In [3]:
import torch.nn as nn
import torch.nn.functional as F
import torch

class DecoderLSTM(nn.Module):
    """2-layer LSTM decoder that emits per-step log-probabilities.

    Args:
        hidden_size: Size of the embeddings and of the LSTM hidden state.
        output_size: Number of entries in the output vocabulary.
        max_length: Number of decoding steps when no target is supplied.
            The default of 48 is presumably the longest action sequence in
            the data (TODO confirm).
    """

    def __init__(self, hidden_size, output_size, max_length=48):
        super(DecoderLSTM, self).__init__()
        self.max_length = max_length
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.LSTM = nn.LSTM(hidden_size, hidden_size, num_layers=2, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode step by step, starting from SOS and the encoder's final state.

        Args:
            encoder_outputs: Encoder outputs; only the batch size (dim 0) and
                the device are read from them.
            encoder_hidden: Initial LSTM state for the decoder.
            target_tensor: Optional target indices. When given, decoding runs
                for ``target_tensor.size(1)`` steps with teacher forcing;
                otherwise it runs for ``max_length`` steps on its own
                predictions (it does not stop at EOS).

        Returns:
            ``(log_probs, final_hidden, None)``; the trailing ``None`` keeps
            the return shape consistent for the training loop.
        """
        batch_size = encoder_outputs.size(0)
        # Create the start token on the same device as the inputs rather than
        # relying on a module-level ``device`` global.
        decoder_input = torch.empty(
            batch_size, 1, dtype=torch.long, device=encoder_outputs.device
        ).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        if target_tensor is not None:
            target_len = target_tensor.size(1)
        else:
            target_len = self.max_length

        for i in range(target_len):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)
            if target_tensor is not None:
                # Teacher forcing: feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own prediction as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, input, hidden):
        """Run one decoding step: embed, ReLU, LSTM, project to vocabulary scores."""
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.LSTM(output, hidden)
        output = self.out(output)
        return output, hidden

# Train encoder decoder

In [None]:
import torch.nn.utils as torch_utils

def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, max_norm=5.0):
    """Run ONE optimisation step on the first batch drawn from ``dataloader``.

    Despite the name, this does not sweep the whole dataset: a fresh iterator
    is created on every call and only its first batch is used.

    Args:
        dataloader: Iterable yielding ``(input_tensor, target_tensor)`` batches.
        encoder: Module called as ``encoder(input_tensor)``; must return
            ``(outputs, hidden)``.
        decoder: Module called as ``decoder(outputs, hidden, target_tensor)``;
            the first element of its return value is scored.
        encoder_optimizer: Optimizer over the encoder parameters.
        decoder_optimizer: Optimizer over the decoder parameters.
        criterion: Loss applied to the flattened decoder outputs and targets.
        max_norm: Gradient-norm clipping threshold, applied to encoder and
            decoder separately.

    Returns:
        The loss of this step as a Python float.
    """
    # Evaluation switches the models to eval mode; make sure training-only
    # layers such as dropout are active again for the update.
    encoder.train()
    decoder.train()

    input_tensor, target_tensor = next(iter(dataloader))
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_outputs, encoder_hidden = encoder(input_tensor)
    decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

    # Flatten to (batch * steps, vocab) against (batch * steps,)
    loss = criterion(
        decoder_outputs.view(-1, decoder_outputs.size(-1)),
        target_tensor.view(-1)
    )
    loss.backward()

    # Gradient clipping for both encoder and decoder
    torch_utils.clip_grad_norm_(encoder.parameters(), max_norm)
    torch_utils.clip_grad_norm_(decoder.parameters(), max_norm)

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item()

In [None]:
import time
import math

def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Return elapsed and estimated remaining time as ``'<elapsed> (- <remaining>)'``.

    ``since`` is a ``time.time()`` timestamp and ``percent`` the completed
    fraction of the work; the total duration is extrapolated as
    ``elapsed / percent``.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    """Plot ``points`` as a line chart with y-axis ticks every 0.2."""
    # plt.subplots() already creates the figure; a separate plt.figure() call
    # beforehand would only leave an empty extra figure open.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [None]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    """Train the encoder/decoder pair with Adam and plot the averaged loss.

    Args:
        train_dataloader: Passed unchanged to ``train_epoch`` on every iteration.
        encoder: Encoder module to optimise.
        decoder: Decoder module to optimise.
        n_epochs: Number of ``train_epoch`` calls to make.
        learning_rate: Learning rate for both Adam optimisers.
        print_every: Print the running average loss every this many iterations.
        plot_every: Record one plot point every this many iterations.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    # The decoder returns log-probabilities (it applies log_softmax itself),
    # so NLLLoss is the matching criterion. CrossEntropyLoss would apply
    # log_softmax a second time.
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
def evaluate(encoder, decoder, test_sentences, input_lang, output_lang, verbose=True):
    """Decode every test input greedily and report match accuracies.

    Args:
        encoder: Encoder module, called with a ``(1, length)`` index tensor.
        decoder: Decoder module, called without a target (free-running).
        test_sentences: Sequence of ``[input_sentence, expected_output]`` pairs.
        input_lang: Vocabulary used to encode the input sentences.
        output_lang: Vocabulary whose ``index2word`` decodes the predictions.
        verbose: When true (the default), print the running index and every
            success as it happens.

    Returns:
        The exact-match accuracy in percent (``0.0`` for an empty test set).
    """
    total = len(test_sentences)
    if total == 0:
        # Nothing to score; avoid dividing by zero below.
        print("No test sentences to evaluate.")
        return 0.0

    success = 0
    success_partial = 0
    with torch.no_grad():
        for i, sample in enumerate(test_sentences):
            if verbose:
                print(i)
            input_sentence, expected_output = sample[0], sample[1]
            input_tensor = tensorFromSentence(input_lang, input_sentence)

            encoder_outputs, encoder_hidden = encoder(input_tensor)
            decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

            # Greedy choice at every step
            _, topi = decoder_outputs.topk(1)
            decoded_ids = topi.squeeze()

            # Keep the words up to (not including) the first EOS
            decoded_words = []
            for idx in decoded_ids:
                if idx.item() == EOS_token:
                    break
                decoded_words.append(output_lang.index2word[idx.item()])

            # Exact match: the prediction equals the expected word sequence
            if decoded_words == expected_output.split():
                success = success + 1
                if verbose:
                    print("succes ", success, "out of", i+1)
            # Partial match: the expected output appears inside the prediction
            if expected_output in " ".join(decoded_words):
                success_partial = success_partial + 1
                if verbose:
                    print("partial succes ", success_partial, "out of", i+1)

    exact_accuracy = success / total * 100
    print(f"Exact match accuracy: {exact_accuracy:.2f}%")
    print(f"Partial match accuracy: {success_partial / total * 100:.2f}%")
    return exact_accuracy

# Experiment 1

In [None]:
def _sample_subset(dataset, fraction):
    """Draw ``int(len(dataset) * fraction)`` distinct random examples.

    Returns the sampled indexes and the corresponding examples.
    """
    # The final index is never sampled, presumably because the raw file ends
    # with a newline and leaves an unparsable empty last entry (TODO confirm).
    # Every subset uses the same range; the 1% one previously used the full
    # range and could draw that entry.
    candidates = range(len(dataset) - 1)
    indexes = random.sample(candidates, int(len(dataset) * fraction))
    return indexes, [dataset[i] for i in indexes]


# Independent random subsets of the training set, one per experiment size
indexes_001, train_set_001 = _sample_subset(train_set, 0.01)
indexes_002, train_set_002 = _sample_subset(train_set, 0.02)
indexes_004, train_set_004 = _sample_subset(train_set, 0.04)
indexes_008, train_set_008 = _sample_subset(train_set, 0.08)
indexes_016, train_set_016 = _sample_subset(train_set, 0.16)
indexes_032, train_set_032 = _sample_subset(train_set, 0.32)
indexes_64, train_set_64 = _sample_subset(train_set, 0.64)

In [None]:
# Show the first sampled indexes of the 1% subset
print(indexes_001[:10])

[5476, 15543, 14420, 623, 10390, 15643, 1756, 8810, 3513, 1757]


In [2]:
# Dropout depends on the chosen architecture: 0.5 for "best", none otherwise
dropout = 0.5 if architecture == "best" else 0


1%

In [None]:
hidden_size = 200
# Keep at 1: the DataLoader has no padding collate_fn, and the per-example
# tensors have one entry per word, so their lengths differ.
batch_size = 1

train_dataset = CustomDataset(train_set_001, input_vocab, output_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

encoder = EncoderLSTM(input_vocab.n_words, hidden_size, dropout).to(device)
decoder = DecoderLSTM(hidden_size, output_vocab.n_words).to(device)

# NOTE(review): only 15 iterations here versus 100000 in the other
# experiments, so neither the print nor the plot interval is ever reached.
# Confirm this is intentional.
train(train_dataloader, encoder, decoder, 15, print_every=100, plot_every=500)

In [None]:
# Collects the accuracy returned by evaluate() for the experiments below.
# NOTE: this rebinds `results`, which earlier held the downloaded file texts.
results = []

In [None]:
# Materialise the first 4182 evaluation pairs. Named eval_samples so that the
# built-in eval() is not shadowed.
eval_samples = [eval_set[i] for i in range(4182)]
# Switch both models to evaluation mode (disables dropout)
encoder.eval()
decoder.eval()
result001 = evaluate(encoder, decoder, eval_samples, input_vocab, output_vocab)
results.append(result001)

2%

In [None]:
hidden_size = 200
# Keep at 1: the DataLoader has no padding collate_fn, and the per-example
# tensors have one entry per word, so their lengths differ.
batch_size = 1

train_dataset = CustomDataset(train_set_002, input_vocab, output_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

encoder = EncoderLSTM(input_vocab.n_words, hidden_size, dropout).to(device)
decoder = DecoderLSTM(hidden_size, output_vocab.n_words).to(device)


train(train_dataloader, encoder, decoder, 100000, print_every=1000, plot_every=500)

In [None]:
# Materialise the first 4182 evaluation pairs. Named eval_samples so that the
# built-in eval() is not shadowed.
eval_samples = [eval_set[i] for i in range(4182)]
# Switch both models to evaluation mode (disables dropout)
encoder.eval()
decoder.eval()
result002 = evaluate(encoder, decoder, eval_samples, input_vocab, output_vocab)
# NOTE(review): the 2% result is not added to `results`, unlike the 1%, 4%
# and 8% results. Confirm whether that is intended.
#results.append(result002)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
950
951
952
succes  211 out of 953
partial succes  335 out of 953
953
succes  212 out of 954
partial succes  336 out of 954
954
955
956
957
958
succes  213 out of 959
partial succes  337 out of 959
959
960
961
succes  214 out of 962
partial succes  338 out of 962
962
963
succes  215 out of 964
partial succes  339 out of 964
964
965
966
succes  216 out of 967
partial succes  340 out of 967
967
968
partial succes  341 out of 969
969
970
partial succes  342 out of 971
971
partial succes  343 out of 972
972
973
succes  217 out of 974
partial succes  344 out of 974
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
partial succes  345 out of 997
997
succes  218 out of 998
partial succes  346 out of 998
998
999
1000
1001
1002
1003
succes  219 out of 1004
partial succes  347 out of 1004
1004
succes  220 out of 1005
partial succes  348 out of 1005
1005
1006
1007
1008
1009
1010
1011
succes 

4%

In [None]:
hidden_size = 200
# Keep at 1: the DataLoader has no padding collate_fn, and the per-example
# tensors have one entry per word, so their lengths differ.
batch_size = 1

train_dataset = CustomDataset(train_set_004, input_vocab, output_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

encoder = EncoderLSTM(input_vocab.n_words, hidden_size, dropout).to(device)
decoder = DecoderLSTM(hidden_size, output_vocab.n_words).to(device)

train(train_dataloader, encoder, decoder, 100000, print_every=1000, plot_every=500)

In [None]:
# Materialise the first 4182 evaluation pairs. Named eval_samples so that the
# built-in eval() is not shadowed.
eval_samples = [eval_set[i] for i in range(4182)]
# Switch both models to evaluation mode (disables dropout)
encoder.eval()
decoder.eval()
result004 = evaluate(encoder, decoder, eval_samples, input_vocab, output_vocab)
results.append(result004)

8%

In [None]:
hidden_size = 200
# Keep at 1: the DataLoader has no padding collate_fn, and the per-example
# tensors have one entry per word, so their lengths differ.
batch_size = 1

train_dataset = CustomDataset(train_set_008, input_vocab, output_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

encoder = EncoderLSTM(input_vocab.n_words, hidden_size, dropout).to(device)
decoder = DecoderLSTM(hidden_size, output_vocab.n_words).to(device)

train(train_dataloader, encoder, decoder, 100000, print_every=1000, plot_every=500)

In [None]:
# Materialise the first 4182 evaluation pairs. Named eval_samples so that the
# built-in eval() is not shadowed.
eval_samples = [eval_set[i] for i in range(4182)]
# Switch both models to evaluation mode (disables dropout)
encoder.eval()
decoder.eval()
result008 = evaluate(encoder, decoder, eval_samples, input_vocab, output_vocab)
results.append(result008)

16%

In [None]:
hidden_size = 200
# Keep at 1: the DataLoader has no padding collate_fn, and the per-example
# tensors have one entry per word, so their lengths differ.
batch_size = 1

train_dataset = CustomDataset(train_set_016, input_vocab, output_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

encoder = EncoderLSTM(input_vocab.n_words, hidden_size, dropout).to(device)
decoder = DecoderLSTM(hidden_size, output_vocab.n_words).to(device)

train(train_dataloader, encoder, decoder, 100000, print_every=1000, plot_every=500)

0m 20s (- 34m 10s) (1000 1%) 0.9555
0m 38s (- 31m 37s) (2000 2%) 0.5306
0m 55s (- 29m 55s) (3000 3%) 0.3527
1m 12s (- 29m 3s) (4000 4%) 0.2420
1m 30s (- 28m 36s) (5000 5%) 0.1777
1m 47s (- 28m 6s) (6000 6%) 0.1358
2m 5s (- 27m 46s) (7000 7%) 0.1041
2m 22s (- 27m 23s) (8000 8%) 0.0907
2m 40s (- 26m 58s) (9000 9%) 0.0739
2m 56s (- 26m 32s) (10000 10%) 0.0650
3m 13s (- 26m 8s) (11000 11%) 0.0608
3m 31s (- 25m 50s) (12000 12%) 0.0515
3m 48s (- 25m 27s) (13000 13%) 0.0450
4m 5s (- 25m 6s) (14000 14%) 0.0423
4m 22s (- 24m 49s) (15000 15%) 0.0349
4m 39s (- 24m 29s) (16000 16%) 0.0343
4m 57s (- 24m 10s) (17000 17%) 0.0267
5m 13s (- 23m 50s) (18000 18%) 0.0327
5m 31s (- 23m 34s) (19000 19%) 0.0193
5m 49s (- 23m 18s) (20000 20%) 0.0253
6m 6s (- 22m 59s) (21000 21%) 0.0276
6m 24s (- 22m 44s) (22000 22%) 0.0219
6m 41s (- 22m 25s) (23000 23%) 0.0183
6m 59s (- 22m 7s) (24000 24%) 0.0189
7m 17s (- 21m 52s) (25000 25%) 0.0208
7m 34s (- 21m 33s) (26000 26%) 0.0106
7m 51s (- 21m 14s) (27000 27%) 0.0124


KeyboardInterrupt: ignored

In [None]:
# Materialise the first 4182 evaluation pairs. Named eval_samples so that the
# built-in eval() is not shadowed.
eval_samples = [eval_set[i] for i in range(4182)]
# Switch both models to evaluation mode (disables dropout)
encoder.eval()
decoder.eval()
result016 = evaluate(encoder, decoder, eval_samples, input_vocab, output_vocab)
# Record the 16% result itself; appending result008 here would store the 8%
# accuracy a second time.
results.append(result016)

100% (full training set)

In [None]:
hidden_size = 200
# Keep at 1: the DataLoader has no padding collate_fn, and the per-example
# tensors have one entry per word, so their lengths differ.
batch_size = 1

train_dataset = CustomDataset(train_set, input_vocab, output_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

encoder = EncoderLSTM(input_vocab.n_words, hidden_size, dropout).to(device)
decoder = DecoderLSTM(hidden_size, output_vocab.n_words).to(device)

train(train_dataloader, encoder, decoder, 100000, print_every=1000, plot_every=500)

0m 21s (- 34m 55s) (1000 1%) 0.9421


KeyboardInterrupt: ignored

In [None]:
# Materialise the first 4182 evaluation pairs. Named eval_samples so that the
# built-in eval() is not shadowed.
eval_samples = [eval_set[i] for i in range(4182)]
# Switch both models to evaluation mode (disables dropout)
encoder.eval()
decoder.eval()
result= evaluate(encoder, decoder, eval_samples, input_vocab, output_vocab)
