# Dataset: Simple Split

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.utils as torch_utils
import numpy as np

import random
import math
import time

In [2]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
from concurrent.futures import ThreadPoolExecutor

def download_file(file_url):
    response = requests.get(file_url)
    if response.status_code == 200:
        return response.text
    else:
        print(f"Failed to fetch the file. Status code: {response.status_code}")
        return None

# URLs of the files you want to download
file_urls = [
    'https://raw.githubusercontent.com/brendenlake/SCAN/master/simple_split/tasks_train_simple.txt',
    'https://raw.githubusercontent.com/brendenlake/SCAN/master/simple_split/tasks_test_simple.txt'  # Replace with the actual URL of the second file
]

# Use ThreadPoolExecutor to download files concurrently
with ThreadPoolExecutor(max_workers=len(file_urls)) as executor:
    results = list(executor.map(download_file, file_urls))

# Check if all downloads were successful
if all(result is not None for result in results):
    # Assuming each line in the text file is a sentence
    sentences_list = [result.split('\n') for result in results]

    # Create DataFrames for each file
    dfs = [pd.DataFrame({'Sentences': sentences}) for sentences in sentences_list]

    # Create DataFrames for each file
    train_df = pd.DataFrame({'Sentences': sentences_list[0]})
    eval_df = pd.DataFrame({'Sentences': sentences_list[1]})
else:
    print("One or more downloads failed.")

In [3]:
print(train_df.head())

                                           Sentences
0  IN: jump opposite right twice and turn opposit...
1  IN: run opposite left after walk right OUT: I_...
2  IN: walk after run around right twice OUT: I_T...
3  IN: look around right thrice and turn left OUT...
4  IN: walk opposite left twice and walk opposite...


In [4]:
SEED = 5392

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
from torch.utils.data import Dataset

class SentenceDataset(Dataset):
   def __init__(self, sentences):
       self.sentences = sentences

   def __len__(self):
       return len(self.sentences)

   def __getitem__(self, idx):
       sentence = self.sentences[idx]
       in_sentence = sentence.split('IN:')[1].split('OUT:')[0].strip()
       out_sentence = sentence.split('OUT:')[1].strip()
       return [in_sentence, out_sentence]

In [6]:
train_set = SentenceDataset(train_df['Sentences'])
eval_set = SentenceDataset(eval_df['Sentences'])

In [7]:
PAD_token = 0
SOS_token = 1
EOS_token = 2

class Vocab:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "PAD", 1: "SOS", 2: "EOS"}
        self.n_words = 3  # Count PAD SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [8]:
def indexesFromSentence(vocab, sentence):
    return [vocab.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)


In [9]:
input_vocab = Vocab('IN')
output_vocab = Vocab('OUT')
for line in train_set:
    input_vocab.addSentence(line[0])
    output_vocab.addSentence(line[1])

In [10]:
print(input_vocab.word2index)
print(input_vocab.index2word)


{'jump': 3, 'opposite': 4, 'right': 5, 'twice': 6, 'and': 7, 'turn': 8, 'thrice': 9, 'run': 10, 'left': 11, 'after': 12, 'walk': 13, 'around': 14, 'look': 15}
{0: 'PAD', 1: 'SOS', 2: 'EOS', 3: 'jump', 4: 'opposite', 5: 'right', 6: 'twice', 7: 'and', 8: 'turn', 9: 'thrice', 10: 'run', 11: 'left', 12: 'after', 13: 'walk', 14: 'around', 15: 'look'}


# DATALOADER

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, dataset, in_voacab, out_vocab):
        self.dataset = dataset
        self.in_vocab = in_voacab
        self.out_vocab = out_vocab
        input_sentences = []
        target_sentences = []
        for line in self.dataset:
            input_sentences.append(line[0])
            target_sentences.append(line[1])
        self.input_sentences = input_sentences
        self.target_sentences = target_sentences


    def __len__(self):
        return len(self.input_sentences)

    def __getitem__(self, idx):
        input_sentence = self.input_sentences[idx]
        target_sentence = self.target_sentences[idx]

        # Convert words to indices using word2index dictionary
        input_indices = indexesFromSentence(self.in_vocab, input_sentence)
        input_indices.append(EOS_token)
        target_indices = indexesFromSentence(self.out_vocab, target_sentence)
        target_indices.append(EOS_token)

        return torch.LongTensor(input_indices).to(device), torch.LongTensor(target_indices).to(device)


# LSTM encoder decoder

In [12]:
import torch.nn as nn


class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.5):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.LSTM = nn.LSTM(hidden_size, hidden_size, num_layers=2, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))

        output, hidden = self.LSTM(embedded)
        return output, hidden


In [33]:
import torch.nn as nn
import torch.nn.functional as F
import torch

class DecoderLSTM(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.5):
        super(DecoderLSTM, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.LSTM = nn.LSTM(hidden_size, hidden_size, num_layers=2, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, epoch=0, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        #print("decoder input", decoder_input)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        if target_tensor is not None:
            target_len = target_tensor.size(1)
        else:
            target_len = 48
        #print("target len:", trg_len)
        #print("decoder hidden", decoder_hidden)

        for i in range(target_len):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)
            #print("decoder single output", decoder_output)
            if target_tensor is not None and epoch < 50000:
                # Teacher forcing: Feed the target as the next input
                #print("target tensor", target_tensor)
                #print("epoch:", epoch)
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
                #print("Decoder input case 1 (target)", decoder_input)
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input
                #print("Decoder input case 2 (topi)", decoder_input)

        #print("decoder outputs", decoder_outputs)
        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.dropout(self.embedding(input))
        output = F.relu(output)
        output, hidden = self.LSTM(output, hidden)
        output = self.out(output)
        return output, hidden

# Train encoder decoder

In [22]:
import torch.nn.utils as torch_utils

def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion, epoch, max_norm=5.0):

    total_loss = 0

    input_tensor, target_tensor = next(iter(dataloader))
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_outputs, encoder_hidden = encoder(input_tensor)
    decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, epoch,target_tensor)

    #print("Decoder that get passed =", decoder_outputs.view(-1, decoder_outputs.size(-1)))
    #print("target_tensor that get passed =", target_tensor.view(-1))
    loss = criterion(
        decoder_outputs.view(-1, decoder_outputs.size(-1)),
        target_tensor.view(-1)
    )

    #print("decoder_outputs", decoder_outputs)
    #print("target_tensor", target_tensor)
    loss.backward()

    # Gradient clipping for both encoder and decoder
    torch_utils.clip_grad_norm_(encoder.parameters(), max_norm)
    torch_utils.clip_grad_norm_(decoder.parameters(), max_norm)

    encoder_optimizer.step()
    decoder_optimizer.step()

    total_loss += loss.item()

    return total_loss
    #return total_loss / len(dataloader)

In [15]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [16]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [17]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, epoch)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [31]:
def evaluate(encoder, decoder, test_sentences, input_lang, output_lang):
    success = 0
    success_partial = 0
    with torch.no_grad():
      for i in range(len(test_sentences)):
          print(i)
          input_sentence = test_sentences[i][0]
          input_tensor = tensorFromSentence(input_lang, input_sentence)

          encoder_outputs, encoder_hidden = encoder(input_tensor)
          decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden, epoch=1)

          _, topi = decoder_outputs.topk(1)
          decoded_ids = topi.squeeze()

          decoded_words = []
          for idx in decoded_ids:
              if idx.item() == EOS_token:
                  break
              decoded_words.append(output_lang.index2word[idx.item()])
          #check exact match between decoded_words and test_dataset[i][1]
          #print("Expected output: " + test_sentences[i][1])
          #print("Predicted output: " + " ".join(decoded_words))

          #check exact match
          if (decoded_words == test_sentences[i][1].split()):
              success = success + 1
              print("succes ", success, "out of", i+1)
          #check partial match
          if (test_sentences[i][1] in " ".join(decoded_words)):
              success_partial = success_partial + 1
              print("partial succes ", success_partial, "out of", i+1)


    print(f"Exact match accuracy: {success / len(test_sentences) * 100:.2f}%")
    print(f"Partial match accuracy: {success_partial / len(test_sentences) * 100:.2f}%")
    return (success / len(test_sentences) * 100)

# Experiment 1

In [35]:
indexes_001 = random.sample(range(len(train_set)), 210)
train_set_001= [train_set[i] for i in indexes_001]

indexes_002 = random.sample(range(len(train_set)-1), 420)
train_set_002= [train_set[i] for i in indexes_002]
print(len(indexes_001))
print(len(set(indexes_001)))

indexes_004 = random.sample(range(len(train_set)-1), 840)
train_set_004= [train_set[i] for i in indexes_004]

indexes_008 = random.sample(range(len(train_set)-1), int(len(train_set)*0.08))
train_set_008= [train_set[i] for i in indexes_008]

indexes_016 = random.sample(range(len(train_set)-1), int(len(train_set)*0.16))
train_set_016= [train_set[i] for i in indexes_016]

indexes_032 = random.sample(range(len(train_set)-1), int(len(train_set)*0.32))
train_set_032= [train_set[i] for i in indexes_032]

indexes_64 = random.sample(range(len(train_set)-1), int(len(train_set)*0.64))
train_set_64= [train_set[i] for i in indexes_64]

210
210


1%

In [None]:
hidden_size = 200
batch_size = 1

train_dataset = CustomDataset(train_set_001, input_vocab, output_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

encoder = EncoderLSTM(input_vocab.n_words, hidden_size).to(device)
decoder = DecoderLSTM(hidden_size, output_vocab.n_words).to(device)

train(train_dataloader, encoder, decoder, 100000, print_every=100, plot_every=500)

In [20]:
results = []

In [None]:
eval = [eval_set[i] for i in range(4182)]
encoder.eval()
decoder.eval()
result001 = evaluate(encoder, decoder, eval, input_vocab, output_vocab)
results.append(result001)

2%

In [36]:
hidden_size = 200
batch_size = 1

train_dataset = CustomDataset(train_set_002, input_vocab, output_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

encoder = EncoderLSTM(input_vocab.n_words, hidden_size).to(device)
decoder = DecoderLSTM(hidden_size, output_vocab.n_words).to(device)


train(train_dataloader, encoder, decoder, 100000, print_every=1000, plot_every=500)

0m 18s (- 29m 43s) (1000 1%) 0.9912
0m 35s (- 28m 49s) (2000 2%) 0.5190
0m 52s (- 28m 21s) (3000 3%) 0.3514
1m 12s (- 28m 52s) (4000 4%) 0.2424
1m 29s (- 28m 15s) (5000 5%) 0.1906
1m 46s (- 27m 48s) (6000 6%) 0.1538
2m 4s (- 27m 40s) (7000 7%) 0.1147
2m 22s (- 27m 19s) (8000 8%) 0.1071
2m 39s (- 26m 56s) (9000 9%) 0.0842
2m 58s (- 26m 45s) (10000 10%) 0.0720
3m 15s (- 26m 22s) (11000 11%) 0.0741
3m 33s (- 26m 5s) (12000 12%) 0.0624
3m 51s (- 25m 49s) (13000 13%) 0.0431
4m 9s (- 25m 30s) (14000 14%) 0.0586
4m 26s (- 25m 8s) (15000 15%) 0.0391
4m 44s (- 24m 55s) (16000 16%) 0.0402
5m 2s (- 24m 35s) (17000 17%) 0.0414
5m 19s (- 24m 13s) (18000 18%) 0.0346
5m 36s (- 23m 56s) (19000 19%) 0.0427
5m 54s (- 23m 36s) (20000 20%) 0.0271
6m 12s (- 23m 19s) (21000 21%) 0.0295
6m 30s (- 23m 3s) (22000 22%) 0.0280
6m 47s (- 22m 45s) (23000 23%) 0.0229
7m 5s (- 22m 26s) (24000 24%) 0.0337
7m 23s (- 22m 10s) (25000 25%) 0.0211
7m 41s (- 21m 52s) (26000 26%) 0.0235
7m 58s (- 21m 33s) (27000 27%) 0.0201

In [32]:
eval = [eval_set[i] for i in range(4182)]
encoder.eval()
decoder.eval()
result002 = evaluate(encoder, decoder, eval, input_vocab, output_vocab)
results.append(result002)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
partial succes  628 out of 1374
1374
1375
1376
succes  459 out of 1377
partial succes  629 out of 1377
1377
succes  460 out of 1378
partial succes  630 out of 1378
1378
succes  461 out of 1379
partial succes  631 out of 1379
1379
succes  462 out of 1380
partial succes  632 out of 1380
1380
1381
1382
1383
partial succes  633 out of 1384
1384
succes  463 out of 1385
partial succes  634 out of 1385
1385
succes  464 out of 1386
partial succes  635 out of 1386
1386
partial succes  636 out of 1387
1387
1388
succes  465 out of 1389
partial succes  637 out of 1389
1389
1390
succes  466 out of 1391
partial succes  638 out of 1391
1391
partial succes  639 out of 1392
1392
1393
partial succes  640 out of 1394
1394
1395
succes  467 out of 1396
partial succes  641 out of 1396
1396
1397
1398
1399
succes  468 out of 1400
partial succes  642 out of 1400
1400
1401
succes  469 out of 1402
partial succes  643 out of 1402
1402
partial succes

4%

In [None]:
hidden_size = 200
batch_size = 1

train_dataset = CustomDataset(train_set_004, input_vocab, output_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

encoder = EncoderLSTM(input_vocab.n_words, hidden_size).to(device)
decoder = DecoderLSTM(hidden_size, output_vocab.n_words).to(device)

train(train_dataloader, encoder, decoder, 100000, print_every=1000, plot_every=500)

In [None]:
eval = [eval_set[i] for i in range(4182)]
encoder.eval()
decoder.eval()
result004 = evaluate(encoder, decoder, eval, input_vocab, output_vocab)
results.append(result004)

In [None]:
hidden_size = 200
batch_size = 1

train_dataset = CustomDataset(train_set_008, input_vocab, output_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

encoder = EncoderLSTM(input_vocab.n_words, hidden_size).to(device)
decoder = DecoderLSTM(hidden_size, output_vocab.n_words).to(device)

train(train_dataloader, encoder, decoder, 100000, print_every=1000, plot_every=500)

In [None]:
eval = [eval_set[i] for i in range(4182)]
encoder.eval()
decoder.eval()
result008 = evaluate(encoder, decoder, eval, input_vocab, output_vocab)
results.append(result008)

# Experiment 2

In [None]:
# URLs of the files you want to download
file_urls = [
    'https://raw.githubusercontent.com/brendenlake/SCAN/master/length_split/tasks_train_length.txt',
    'https://raw.githubusercontent.com/brendenlake/SCAN/master/length_split/tasks_test_length.txt'  # Replace with the actual URL of the second file
]

# Use ThreadPoolExecutor to download files concurrently
with ThreadPoolExecutor(max_workers=len(file_urls)) as executor:
    results = list(executor.map(download_file, file_urls))

# Check if all downloads were successful
if all(result is not None for result in results):
    # Assuming each line in the text file is a sentence
    sentences_list = [result.split('\n') for result in results]

    # Create DataFrames for each file
    dfs = [pd.DataFrame({'Sentences': sentences}) for sentences in sentences_list]

    # Create DataFrames for each file
    train_df_exp2 = pd.DataFrame({'Sentences': sentences_list[0]})
    eval_df_exp2 = pd.DataFrame({'Sentences': sentences_list[1]})
else:
    print("One or more downloads failed.")

In [None]:
train_set_lenght = SentenceDataset(train_df_exp2['Sentences'])
eval_set_lenght = SentenceDataset(eval_df_exp2['Sentences'])

In [None]:
hidden_size = 200
batch_size = 1

train_dataset = CustomDataset(train_set_lenght, input_vocab, output_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

encoder = EncoderLSTM(input_vocab.n_words, hidden_size).to(device)
decoder = DecoderLSTM(hidden_size, output_vocab.n_words).to(device)

train(train_dataloader, encoder, decoder, 100000, print_every=100, plot_every=500)

0m 4s (- 67m 22s) (100 0%) 1.7762
0m 5s (- 45m 21s) (200 0%) 1.4891
0m 6s (- 37m 39s) (300 0%) 1.3509
0m 8s (- 34m 1s) (400 0%) 1.1690
0m 10s (- 33m 56s) (500 0%) 1.0978
0m 12s (- 33m 19s) (600 0%) 0.9706
0m 13s (- 31m 45s) (700 0%) 0.9071
0m 14s (- 30m 44s) (800 0%) 0.8429
0m 16s (- 29m 47s) (900 0%) 0.8230
0m 17s (- 29m 18s) (1000 1%) 0.6910
0m 19s (- 28m 45s) (1100 1%) 0.6813
0m 20s (- 28m 23s) (1200 1%) 0.6559
0m 22s (- 28m 23s) (1300 1%) 0.6641
0m 24s (- 28m 43s) (1400 1%) 0.6171
0m 26s (- 28m 31s) (1500 1%) 0.5435
0m 27s (- 28m 11s) (1600 1%) 0.5534
0m 28s (- 27m 54s) (1700 1%) 0.5292
0m 30s (- 27m 35s) (1800 1%) 0.5306
0m 31s (- 27m 20s) (1900 1%) 0.4989
0m 33s (- 27m 17s) (2000 2%) 0.4452
0m 35s (- 27m 46s) (2100 2%) 0.4532
0m 37s (- 27m 58s) (2200 2%) 0.4294
0m 39s (- 27m 54s) (2300 2%) 0.3946
0m 40s (- 27m 41s) (2400 2%) 0.4039
0m 42s (- 27m 28s) (2500 2%) 0.3730
0m 44s (- 27m 43s) (2600 2%) 0.3749
0m 46s (- 28m 5s) (2700 2%) 0.3872
0m 48s (- 27m 53s) (2800 2%) 0.3320
0m 49s 

In [None]:
len(eval_set_lenght)

3921

In [None]:
eval = [eval_set_lenght[i] for i in range(3920)]
encoder.eval()
decoder.eval()
evaluate(encoder, decoder, eval, input_vocab, output_vocab)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
28
succes  27 out of 29
partial succes  27 out of 29
29
succes  28 out of 30
partial succes  28 out of 30
30
succes  29 out of 31
partial succes  29 out of 31
31
succes  30 out of 32
partial succes  30 out of 32
32
33
succes  31 out of 34
partial succes  31 out of 34
34
succes  32 out of 35
partial succes  32 out of 35
35
succes  33 out of 36
partial succes  33 out of 36
36
succes  34 out of 37
partial succes  34 out of 37
37
38
39
succes  35 out of 40
partial succes  35 out of 40
40
succes  36 out of 41
partial succes  36 out of 41
41
succes  37 out of 42
partial succes  37 out of 42
42
succes  38 out of 43
partial succes  38 out of 43
43
succes  39 out of 44
partial succes  39 out of 44
44
45
46
succes  40 out of 47
partial succes  40 out of 47
47
48
succes  41 out of 49
partial succes  41 out of 49
49
succes  42 out of 50
partial succes  42 out of 50
50
succes  43 out of 51
partial succes  43 out of 51
51
succes  44 ou

# Experiment 3

In [3]:
# URLs of the files you want to download
file_urls = [
    'https://raw.githubusercontent.com/brendenlake/SCAN/master/add_prim_split/tasks_train_addprim_jump.txt',
    'https://raw.githubusercontent.com/brendenlake/SCAN/master/add_prim_split/tasks_test_addprim_jump.txt'  # Replace with the actual URL of the second file
]

# Use ThreadPoolExecutor to download files concurrently
with ThreadPoolExecutor(max_workers=len(file_urls)) as executor:
    results = list(executor.map(download_file, file_urls))

# Check if all downloads were successful
if all(result is not None for result in results):
    # Assuming each line in the text file is a sentence
    sentences_list = [result.split('\n') for result in results]

    # Create DataFrames for each file
    dfs = [pd.DataFrame({'Sentences': sentences}) for sentences in sentences_list]

    # Create DataFrames for each file
    train_df_jump = pd.DataFrame({'Sentences': sentences_list[0]})
    eval_df_jump = pd.DataFrame({'Sentences': sentences_list[1]})
else:
    print("One or more downloads failed.")

rewrite vocabulary to include "JUMP"

In [15]:
train_set_jump = SentenceDataset(train_df_jump['Sentences'])
eval_set_jump = SentenceDataset(eval_df_jump['Sentences'])

In [None]:
for i in range(len(train_set_jump)-1):
  if "jump" in train_set_jump[i][0]:
    print(train_set_jump[i])

In [None]:
input_vocab = Vocab('IN')
output_vocab = Vocab('OUT')
for line in train_set_jump:
    input_vocab.addSentence(line[0])
    output_vocab.addSentence(line[1])

train and evaluate

In [None]:
hidden_size = 200
batch_size = 1

train_dataset = CustomDataset(train_set_jump, input_vocab, output_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

encoder = EncoderLSTM(input_vocab.n_words, hidden_size).to(device)
decoder = DecoderLSTM(hidden_size, output_vocab.n_words).to(device)

train(train_dataloader, encoder, decoder, 100000, print_every=100, plot_every=500)

In [None]:
eval = [eval_set_jump[i] for i in range(len(eval_set_jump)-1)]
encoder.eval()
decoder.eval()
result = evaluate(encoder, decoder, eval, input_vocab, output_vocab)

now the with "TURN LEFT"

In [None]:
# URLs of the files you want to download
file_urls = [
    'https://raw.githubusercontent.com/brendenlake/SCAN/master/add_prim_split/tasks_train_addprim_turn_left.txt',
    'https://raw.githubusercontent.com/brendenlake/SCAN/master/add_prim_split/tasks_test_addprim_turn_left.txt'  # Replace with the actual URL of the second file
]

# Use ThreadPoolExecutor to download files concurrently
with ThreadPoolExecutor(max_workers=len(file_urls)) as executor:
    results = list(executor.map(download_file, file_urls))

# Check if all downloads were successful
if all(result is not None for result in results):
    # Assuming each line in the text file is a sentence
    sentences_list = [result.split('\n') for result in results]

    # Create DataFrames for each file
    dfs = [pd.DataFrame({'Sentences': sentences}) for sentences in sentences_list]

    # Create DataFrames for each file
    train_df_tleft = pd.DataFrame({'Sentences': sentences_list[0]})
    eval_df_tleft = pd.DataFrame({'Sentences': sentences_list[1]})
else:
    print("One or more downloads failed.")

rewrite vocabulary to include "JUMP"

In [None]:
train_set_tleft = SentenceDataset(train_df_tleft['Sentences'])
eval_set_tleft = SentenceDataset(eval_df_tleft['Sentences'])

In [None]:
input_vocab = Vocab('IN')
output_vocab = Vocab('OUT')
for line in train_set_tleft:
    input_vocab.addSentence(line[0])
    output_vocab.addSentence(line[1])

train and evaluate

In [None]:
hidden_size = 200
batch_size = 1

train_dataset = CustomDataset(train_set_tleft, input_vocab, output_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

encoder = EncoderLSTM(input_vocab.n_words, hidden_size).to(device)
decoder = DecoderLSTM(hidden_size, output_vocab.n_words).to(device)

train(train_dataloader, encoder, decoder, 100000, print_every=100, plot_every=500)

In [None]:
eval = [eval_set_tleft[i] for i in range(len(eval_set_tleft)-1)]
encoder.eval()
decoder.eval()
result = evaluate(encoder, decoder, eval, input_vocab, output_vocab)