## Accessing the project directory on my Google Drive

In [1]:
import os
from google.colab import drive

# Mount Google Drive at /drive so the project folder is reachable from this runtime.
# NOTE(review): force_remount=False presumably leaves an already-mounted Drive as is — confirm against the Colab docs.
drive.mount('/drive', force_remount=False)

Mounted at /drive


In [2]:
# Project root inside the mounted Drive.
project_dir = "/drive/My Drive/RNN_seq2seq"

# Make the project root the working directory so that relative paths used
# later in the notebook (e.g. "./data") resolve inside the project.
# NOTE(review): the `from scripts.data import ...` cell presumably relies on this too — confirm.
os.chdir(project_dir)

## Dependencies

In [3]:
import random
from string import ascii_lowercase
from scripts.data import get_total_red_pair, n_words_of_length, generate_datasets

## Creating a new set of datasets

In [4]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Alphabet the words are drawn from: the 26 lowercase English letters.
alphabet = ascii_lowercase

# Folder under which the generated dataset files are stored.
data_folder = "./data"

# Inclusive (shortest, longest) word-length ranges.
# In-distribution lengths feed train/dev/test; out-of-distribution lengths feed gen.
in_distribution_ranges = [(6, 15)]
out_distribution_ranges = [(1, 5), (16, 30)]

# Number of words requested *per word length* for each split.
train_size, dev_size, test_size, gen_size = 3000, 3000, 5000, 5000

# Seed Python's `random` module before any sampling happens.
# NOTE(review): whether this makes the output reproducible depends on how
# `n_words_of_length` draws its samples, which is not visible here.
random.seed(8741983)

# ---------------------------------------------------------------------------
# Sampling
# ---------------------------------------------------------------------------

train_X, dev_X, test_X, gen_X = [], [], [], []

# One batch of `total` words is requested per length and then cut at
# `split_1` / `split_2` into its train / dev / test portions.
total = train_size + dev_size + test_size
split_1 = train_size
split_2 = split_1 + dev_size

# Flatten the inclusive ranges into the exact sequence of lengths to visit.
in_lengths = [
    length
    for (lo, hi) in in_distribution_ranges
    for length in range(lo, hi + 1)
]
out_lengths = [
    length
    for (lo, hi) in out_distribution_ranges
    for length in range(lo, hi + 1)
]

# In-distribution lengths first (same call order as before, so the random
# stream is consumed identically).
for length in in_lengths:
    words = n_words_of_length(total, length, alphabet)
    train_X += words[:split_1]
    dev_X += words[split_1:split_2]
    test_X += words[split_2:]

# Out-of-distribution lengths go entirely into the generalization set.
for length in out_lengths:
    gen_X += n_words_of_length(gen_size, length, alphabet)

# Build the pairs with `get_total_red_pair` and write the four splits.
generate_datasets(
    train_X,
    dev_X,
    test_X,
    gen_X,
    get_total_red_pair,
    "total_red_follow_up",
    data_folder=data_folder,
)

./data/total_red_follow_up/train.txt saved!
./data/total_red_follow_up/dev.txt saved!
./data/total_red_follow_up/test.txt saved!
./data/total_red_follow_up/gen.txt saved!


## Automatically disconnect and delete the runtime 

In [None]:
# Disconnect and delete the Colab runtime now that the datasets have been saved.
# Keep this as the last cell: nothing after it can run in this session.
from google.colab import runtime
runtime.unassign()