In [2]:
import numpy as np
import pickle as pkl
from tqdm import tqdm
import torch

def generate_sequences(seq_length, padding, vocabulary,
                       delimiter, unknown, output_len):
    """Build one (input, target) pair for the copy task.

    The input is ``seq_length`` tokens drawn uniformly from ``vocabulary``,
    followed by ``padding`` copies of ``delimiter`` and then ``output_len``
    copies of ``unknown``. The target holds ``unknown`` at every position
    covered by the random tokens and the delimiters, followed by the first
    ``output_len`` random tokens.

    Args:
        seq_length: Number of random tokens to draw.
        padding: Number of delimiter tokens placed after the random tokens.
        vocabulary: Sequence of tokens to sample from.
        delimiter: Token used for the delay section of the input.
        unknown: Placeholder token used for the blank positions.
        output_len: Number of trailing positions in which the copy is expected.

    Returns:
        tuple[list, list]: ``(input, target)``. Both lists have the same
        length; if ``output_len`` exceeds ``seq_length`` the target is
        topped up with ``unknown`` so it stays aligned with the input.
    """
    # One np.random.choice call per token, so the sequence of RNG calls (and
    # therefore the data produced under a fixed seed) is unchanged.
    tokens = [np.random.choice(vocabulary) for _ in range(seq_length)]

    # Everything the model sees before it is asked to start copying.
    prompt = tokens + [delimiter] * padding
    source = prompt + [unknown] * output_len

    copied = tokens[:output_len]
    # Non-zero only when more outputs are requested than tokens exist.
    shortfall = output_len - len(copied)
    target = [unknown] * len(prompt) + copied + [unknown] * shortfall
    return source, target


In [3]:
# Candidate settings for the copy task. Only the first entry of each list is
# used to generate the training set below.
seq_length = [100, 200, 500, 1000]
vocabulary = ['a','x','c','r','y','w','b','t','o']
delimiter = '$'
unknown = ' '
vocabulary.extend([delimiter, unknown])
print(f"vocabulary: {vocabulary}")

# Character <-> integer id lookups; ids follow the order of `vocabulary`.
char2idx = {char: idx for idx, char in enumerate(vocabulary)}
idx2char = {idx: char for idx, char in enumerate(vocabulary)}
idx_vocab = [char2idx[char] for char in vocabulary]
print(f"idx_vocab: {idx_vocab}")
# The print above shows every id; the two special symbols are removed
# afterwards so that only the letters are sampled as sequence content.
idx_vocab.remove(char2idx[delimiter])
idx_vocab.remove(char2idx[unknown])

padding = [10, 20, 50]  # repeat delimiter for how many time steps
output_len = [50, 100, 200] # how many time steps to predict
batch_size = 32
input_size = len(vocabulary)
hidden_size = 128
n_epochs = 10
lr = 0.01
n_samples = 10000

# Fix the NumPy global RNG so the generated dataset is reproducible.
np.random.seed(42)

X_train = []
Y_train = []
tqdm.write(f"Generating {n_samples} train samples...")
for index in tqdm(range(n_samples)):
    # NOTE(review): `input` shadows the builtin for the rest of the kernel
    # session; the names are kept in case later cells rely on them.
    input, output = generate_sequences(
        seq_length[0],
        padding=padding[0],
        vocabulary=idx_vocab,
        delimiter=char2idx[delimiter],
        unknown=char2idx[unknown],
        output_len=output_len[0],
    )
    X_train.append(input)
    Y_train.append(output)
X_train = np.array(X_train)
Y_train = np.array(Y_train)

# Derive the file name from the settings actually used so the two cannot
# drift apart; with the values above this is
# 'copyTask_data_N10000_T100_P10_O50.pkl'.
data_path = (
    f"copyTask_data_N{n_samples}_T{seq_length[0]}"
    f"_P{padding[0]}_O{output_len[0]}.pkl"
)
# Use a context manager so the file handle is flushed and closed.
with open(data_path, 'wb') as data_file:
    pkl.dump((X_train, Y_train), data_file)

vocabulary: ['a', 'x', 'c', 'r', 'y', 'w', 'b', 't', 'o', '$', ' ']
idx_vocab: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Generating 10000 train samples...


100%|██████████| 10000/10000 [00:06<00:00, 1640.23it/s]


In [4]:
# Inputs and targets are expected to share one (n_samples, time_steps) shape.
X_train.shape, Y_train.shape

((10000, 160), (10000, 160))

In [5]:
# Inspect the first encoded input sequence (integer token ids).
X_train[0]

array([ 6,  3,  7,  4,  6,  2,  6,  7,  4,  3,  7,  7,  2,  5,  4,  1,  7,
        5,  1,  4,  0,  5,  8,  0,  2,  6,  3,  8,  2,  4,  2,  6,  4,  8,
        6,  1,  3,  8,  1,  8,  4,  1,  3,  6,  7,  2,  0,  3,  1,  7,  3,
        1,  5,  5,  3,  5,  1,  1,  3,  7,  6,  8,  7,  4,  1,  4,  7,  8,
        8,  0,  8,  6,  8,  7,  0,  7,  7,  2,  0,  7,  2,  2,  0,  4,  6,
        8,  6,  8,  7,  1,  0,  6,  6,  7,  4,  2,  7,  5,  2,  0,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10])

In [9]:
# Map the first input sequence from token ids back to characters so the
# random-tokens / delimiter / blank layout can be checked by eye.
temp = np.array(
    [idx2char[token_id] for token_id in X_train[0]]
)
temp

array(['b', 'r', 't', 'y', 'b', 'c', 'b', 't', 'y', 'r', 't', 't', 'c',
       'w', 'y', 'x', 't', 'w', 'x', 'y', 'a', 'w', 'o', 'a', 'c', 'b',
       'r', 'o', 'c', 'y', 'c', 'b', 'y', 'o', 'b', 'x', 'r', 'o', 'x',
       'o', 'y', 'x', 'r', 'b', 't', 'c', 'a', 'r', 'x', 't', 'r', 'x',
       'w', 'w', 'r', 'w', 'x', 'x', 'r', 't', 'b', 'o', 't', 'y', 'x',
       'y', 't', 'o', 'o', 'a', 'o', 'b', 'o', 't', 'a', 't', 't', 'c',
       'a', 't', 'c', 'c', 'a', 'y', 'b', 'o', 'b', 'o', 't', 'x', 'a',
       'b', 'b', 't', 'y', 'c', 't', 'w', 'c', 'a', '$', '$', '$', '$',
       '$', '$', '$', '$', '$', '$', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
       ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
       ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
       ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
       ' ', ' ', ' ', ' '], dtype='<U1')

In [10]:
# Inspect the first encoded target sequence (integer token ids).
Y_train[0]

array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10,  6,  3,  7,  4,  6,  2,  6,  7,  4,
        3,  7,  7,  2,  5,  4,  1,  7,  5,  1,  4,  0,  5,  8,  0,  2,  6,
        3,  8,  2,  4,  2,  6,  4,  8,  6,  1,  3,  8,  1,  8,  4,  1,  3,
        6,  7,  2,  0,  3,  1,  7])

In [11]:
# Map the first target sequence from token ids back to characters so the
# blank prefix and the copied tokens at the end can be checked by eye.
temp = np.array(
    [idx2char[token_id] for token_id in Y_train[0]]
)
temp

array([' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
       ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
       ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
       ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
       ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
       ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
       ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
       ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
       ' ', ' ', ' ', ' ', ' ', ' ', 'b', 'r', 't', 'y', 'b', 'c', 'b',
       't', 'y', 'r', 't', 't', 'c', 'w', 'y', 'x', 't', 'w', 'x', 'y',
       'a', 'w', 'o', 'a', 'c', 'b', 'r', 'o', 'c', 'y', 'c', 'b', 'y',
       'o', 'b', 'x', 'r', 'o', 'x', 'o', 'y', 'x', 'r', 'b', 't', 'c',
       'a', 'r', 'x', 't'], dtype='<U1')

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Vocabulary size, counting the delimiter and blank symbols appended to the
# letter list when `vocabulary` was built.
print(f"len vocab: {len(vocabulary)}")


len vocab: 11


TypeError: one_hot(): argument 'input' (position 1) must be Tensor, not numpy.ndarray

In [8]:
# One-hot encode the targets: one class per vocabulary symbol.
# F.one_hot only accepts int64 index tensors, and the integer dtype NumPy
# infers for Y_train may be 32-bit on some platforms/NumPy versions, so the
# dtype is requested explicitly instead of being inherited from the array.
y_indices = torch.as_tensor(Y_train, dtype=torch.long)
y_one_hot = F.one_hot(y_indices, num_classes=len(vocabulary))
print(y_one_hot.shape)

torch.Size([10000, 160, 11])


In [9]:
# Scratch check: a length-10 float vector whose last five entries are 1.
a = np.concatenate((np.zeros(5), np.ones(5)))
a

array([0., 0., 0., 0., 0., 1., 1., 1., 1., 1.])