In [1]:
import random
import torch
from d2l import torch as d2l

In [2]:
tokens = d2l.tokenize(d2l.read_time_machine())
corpus = [token for line in tokens for token in line]
vocab = d2l.Vocab(corpus)
vocab.token_freqs[:10]

[('the', 2261),
 ('i', 1267),
 ('and', 1245),
 ('of', 1155),
 ('a', 816),
 ('to', 695),
 ('was', 552),
 ('in', 541),
 ('that', 443),
 ('my', 440)]

In [3]:
bigram_tokens = [pair for pair in zip(corpus[:-1], corpus[1:])]
bigram_vocab = d2l.Vocab(bigram_tokens)
bigram_vocab.token_freqs[:10]

[(('of', 'the'), 309),
 (('in', 'the'), 169),
 (('i', 'had'), 130),
 (('i', 'was'), 112),
 (('and', 'the'), 109),
 (('the', 'time'), 102),
 (('it', 'was'), 99),
 (('to', 'the'), 85),
 (('as', 'i'), 78),
 (('of', 'a'), 73)]

In [4]:
def seq_data_iter_random(corpus, batch_size, num_steps): #@save
    corpus = corpus[random.randint(0, num_steps-1):]
    num_subseqs = (len(corpus)-1)//num_steps
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    random.shuffle(initial_indices)

    def data(pos):
        return corpus[pos:pos+num_steps]

    num_batches = num_subseqs // batch_size
    for i in range(0, batch_size*num_batches, batch_size):
        initial_indices_per_batch = initial_indices[i:i+batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j+1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)

In [5]:
my_seq = list(range(35))
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY: ', Y)

X:  tensor([[ 6,  7,  8,  9, 10],
        [11, 12, 13, 14, 15]]) 
Y:  tensor([[ 7,  8,  9, 10, 11],
        [12, 13, 14, 15, 16]])
X:  tensor([[21, 22, 23, 24, 25],
        [16, 17, 18, 19, 20]]) 
Y:  tensor([[22, 23, 24, 25, 26],
        [17, 18, 19, 20, 21]])
X:  tensor([[ 1,  2,  3,  4,  5],
        [26, 27, 28, 29, 30]]) 
Y:  tensor([[ 2,  3,  4,  5,  6],
        [27, 28, 29, 30, 31]])


In [8]:
def seq_data_iter_sequential(corpus, batch_size, num_steps): #@save
    offset = random.randint(0, num_steps)
    num_tokens = ((len(corpus)-offset-1)//batch_size)*batch_size
    Xs = torch.tensor(corpus[offset: offset+num_tokens])
    Ys = torch.tensor(corpus[offset+1:offset+1+num_tokens])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1]//num_steps
    for i in range(0, num_steps*num_batches, num_batches):
        X = Xs[:, i:i+num_steps]
        Y = Ys[:, i:i+num_steps]
        yield X, Y

for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X: ', X, '\nY: ', Y)

X:  tensor([[ 1,  2,  3,  4,  5],
        [17, 18, 19, 20, 21]]) 
Y:  tensor([[ 2,  3,  4,  5,  6],
        [18, 19, 20, 21, 22]])
X:  tensor([[ 4,  5,  6,  7,  8],
        [20, 21, 22, 23, 24]]) 
Y:  tensor([[ 5,  6,  7,  8,  9],
        [21, 22, 23, 24, 25]])
X:  tensor([[ 7,  8,  9, 10, 11],
        [23, 24, 25, 26, 27]]) 
Y:  tensor([[ 8,  9, 10, 11, 12],
        [24, 25, 26, 27, 28]])
X:  tensor([[10, 11, 12, 13, 14],
        [26, 27, 28, 29, 30]]) 
Y:  tensor([[11, 12, 13, 14, 15],
        [27, 28, 29, 30, 31]])
X:  tensor([[13, 14, 15, 16],
        [29, 30, 31, 32]]) 
Y:  tensor([[14, 15, 16, 17],
        [30, 31, 32, 33]])


In [7]:
class SeqDataLoad: #@save
    def __init__(self, batch_size, num_steps, user_random_iter, max_tokens):
        if user_random_iter:
            self.data_iter_fn = d2l.seq_data_iter_random
        else:
            self.data_iter_fn = d2l.seq_data_iter_sequential

        self.corpus, self.vocab = d2l.load_corpus_time_machine(max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)

def load_data_time_machine(batch_size, num_steps, user_random_iter = False, max_tokens = 10000): #@save
    data_iter = SeqDataLoad(
        batch_size, num_steps, user_random_iter, max_tokens
    )
    return data_iter, data_iter.vocab