In [7]:
import d2l.torch as d2l
import random
import torch

随机采样：生成小批量数据的特征和标签

In [11]:
# Load the Time Machine corpus and its vocabulary through the d2l helper,
# called here without arguments.
# NOTE(review): these two globals are not referenced by the cells visible
# below (the demos use `my_seq`) — confirm whether a later cell needs them.
corpus, vocab = d2l.load_corpus_time_machine()


def seq_data_iter_random(corpus, batch_size, num_step):
    """Yield minibatches of subsequences drawn in random order.

    Args:
        corpus: Sliceable sequence of values accepted by ``torch.tensor``
            (for example a list of ints).
        batch_size: Number of subsequences per minibatch.
        num_step: Length of every subsequence.

    Yields:
        Pairs ``(X, Y)`` of tensors, each built from ``batch_size`` windows
        of ``num_step`` items. Every row of ``Y`` is the corresponding row of
        ``X`` moved one position to the right in the corpus.
    """
    # Drop a random prefix of 0 .. num_step - 1 items so that the window
    # boundaries are not the same on every pass.
    start = random.randint(0, num_step - 1)
    corpus = corpus[start:]
    # One item is held back because each label window sits one step later.
    window_count = (len(corpus) - 1) // num_step
    # Start index of every non-overlapping window, then visit them shuffled.
    window_starts = [k * num_step for k in range(window_count)]
    random.shuffle(window_starts)

    # Windows that cannot fill a complete batch are discarded.
    batch_count = window_count // batch_size
    for b in range(batch_count):
        picked = window_starts[b * batch_size:(b + 1) * batch_size]
        features = [corpus[s:s + num_step] for s in picked]
        labels = [corpus[s + 1:s + 1 + num_step] for s in picked]
        yield torch.tensor(features), torch.tensor(labels)



In [12]:
# Toy sequence 0..34: with consecutive integers it is easy to see that each
# printed Y is the matching X shifted by one position.
my_seq = list(range(35))
# The iterator draws a random offset and shuffles the window order, so the
# printed batches differ from run to run.
for X, Y in seq_data_iter_random(my_seq, batch_size=2, num_step=5):
    print('X',X)
    print('Y',Y)

X tensor([[29, 30, 31, 32, 33],
        [14, 15, 16, 17, 18]])
Y tensor([[30, 31, 32, 33, 34],
        [15, 16, 17, 18, 19]])
X tensor([[ 9, 10, 11, 12, 13],
        [ 4,  5,  6,  7,  8]])
Y tensor([[10, 11, 12, 13, 14],
        [ 5,  6,  7,  8,  9]])
X tensor([[19, 20, 21, 22, 23],
        [24, 25, 26, 27, 28]])
Y tensor([[20, 21, 22, 23, 24],
        [25, 26, 27, 28, 29]])


顺序分区

In [16]:
def seq_data_iter_sequential(corpus, batch_size, num_steps):  #@save
    """Generate minibatches of subsequences using sequential partitioning.

    The corpus (after a random offset) is cut into ``batch_size`` equally
    long rows; successive minibatches take successive column slices of those
    rows, so row ``r`` of one minibatch continues exactly where row ``r`` of
    the previous minibatch ended.

    Args:
        corpus: Sliceable sequence of values accepted by ``torch.tensor``
            (for example a list of ints).
        batch_size: Number of rows (parallel subsequences) per minibatch.
        num_steps: Number of columns (items per subsequence) per minibatch.

    Yields:
        Pairs ``(X, Y)`` of tensors of shape ``(batch_size, num_steps)``;
        ``Y`` holds the items one position after those in ``X``.
        Nothing is yielded when the corpus is too short to fill one batch.
    """
    # Start partitioning from a random offset (randint is inclusive on both
    # ends, so the offset lies in 0 .. num_steps).
    offset = random.randint(0, num_steps)
    # Largest multiple of batch_size that still leaves one item for the
    # shifted label sequence.
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    if num_tokens <= 0:
        # Too few items after the offset: reshaping an empty tensor with -1
        # would raise, so finish with no batches instead.
        return
    Xs = torch.tensor(corpus[offset: offset + num_tokens])
    Ys = torch.tensor(corpus[offset + 1: offset + 1 + num_tokens])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    # Trailing columns that do not fill a whole window are dropped.
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:, i: i + num_steps]
        Y = Ys[:, i: i + num_steps]
        yield X, Y

In [18]:
# Same toy sequence (`my_seq` from the earlier demo cell), now with sequential
# partitioning: each row of a batch continues the same row of the previous
# batch. The starting offset is random, so the output varies between runs.
for X, Y in seq_data_iter_sequential(my_seq, batch_size=2, num_steps=5):
    print('X',X)
    print('Y',Y)

X tensor([[ 4,  5,  6,  7,  8],
        [19, 20, 21, 22, 23]])
Y tensor([[ 5,  6,  7,  8,  9],
        [20, 21, 22, 23, 24]])
X tensor([[ 9, 10, 11, 12, 13],
        [24, 25, 26, 27, 28]])
Y tensor([[10, 11, 12, 13, 14],
        [25, 26, 27, 28, 29]])
X tensor([[14, 15, 16, 17, 18],
        [29, 30, 31, 32, 33]])
Y tensor([[15, 16, 17, 18, 19],
        [30, 31, 32, 33, 34]])


In [19]:
class SeqDataLoader:  #@save
    """Iterable wrapper that produces sequence minibatches from the corpus.

    Attributes set by the constructor:
        data_iter_fn: Generator function used to produce the minibatches.
        corpus, vocab: Values returned by ``d2l.load_corpus_time_machine``.
        batch_size, num_steps: Stored unchanged and forwarded on iteration.
    """

    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        """Pick the sampling strategy and load the corpus.

        Args:
            batch_size: Forwarded to the iterator function on each iteration.
            num_steps: Forwarded to the iterator function on each iteration.
            use_random_iter: Truthy selects ``d2l.seq_data_iter_random``,
                otherwise ``d2l.seq_data_iter_sequential`` is used.
            max_tokens: Passed straight to ``d2l.load_corpus_time_machine``.
        """
        self.data_iter_fn = (
            d2l.seq_data_iter_random
            if use_random_iter
            else d2l.seq_data_iter_sequential
        )
        loaded_corpus, loaded_vocab = d2l.load_corpus_time_machine(max_tokens)
        self.corpus = loaded_corpus
        self.vocab = loaded_vocab
        self.batch_size = batch_size
        self.num_steps = num_steps

    def __iter__(self):
        """Return a fresh iterator from the selected function for each pass."""
        make_iter = self.data_iter_fn
        return make_iter(self.corpus, self.batch_size, self.num_steps)

In [20]:
def load_data_time_machine(batch_size, num_steps,  #@save
                           use_random_iter=False, max_tokens=10000):
    """Build the Time Machine data iterator and return it with its vocabulary.

    Args:
        batch_size: Forwarded to ``SeqDataLoader``.
        num_steps: Forwarded to ``SeqDataLoader``.
        use_random_iter: Forwarded to ``SeqDataLoader``; defaults to False.
        max_tokens: Forwarded to ``SeqDataLoader``; defaults to 10000.

    Returns:
        A tuple ``(loader, vocab)`` where ``loader`` is the ``SeqDataLoader``
        instance and ``vocab`` is that instance's ``vocab`` attribute.
    """
    loader = SeqDataLoader(
        batch_size=batch_size,
        num_steps=num_steps,
        use_random_iter=use_random_iter,
        max_tokens=max_tokens,
    )
    vocab = loader.vocab
    return loader, vocab