In [1]:
# Display the value of every top-level expression in a cell, not only the last one.
%config ZMQInteractiveShell.ast_node_interactivity = "all"
# %pprint toggles IPython's pretty printing; in this run it was switched OFF (see the cell output).
%pprint

Pretty printing has been turned OFF


In [2]:
import torch
import zipfile
import numpy as np

## RNN 

相对于多层感知机和能够有效处理空间信息的CNN，RNN主要用于处理时序数据，它引入状态变量来存储过去的信息，并用其与当前的输入共同决定当前的输出

$$h_{t} = f(W_{hx}x_t + W_{hh}h_{t-1} + b_h)$$
$$y_t = g(W_{yh}h_t + b_y)$$

### 矩阵验证

其实$W_{hx}, x_t, W_{hh}, h_{t-1}$不需要分开处理：只需将$x_t$和$h_{t-1}$按列拼接，将$W_{hx}$和$W_{hh}$按行拼接，再做一次矩阵相乘，就能得到相同的结果

In [3]:
# Shapes used below (rows are samples): xt -> (3, 1), w_hx -> (1, 4), w_hh -> (4, 4), h_t_1 -> (3, 4).
# xt, w_hx and w_hh are drawn with rand (uniform), h_t_1 with randn (standard normal).
xt = np.random.rand(3, 1)
w_hx = np.random.rand(1, 4)
w_hh = np.random.rand(4, 4)
h_t_1 = np.random.randn(3, 4)

# Two separate products summed: (3, 1) @ (1, 4) + (3, 4) @ (4, 4) -> (3, 4).
h_t = xt@w_hx + h_t_1@w_hh
h_t 

array([[ 0.36117668,  0.84170464,  0.51634436,  1.0192241 ],
       [ 1.98568241,  1.65775323, -0.14316661,  0.47196442],
       [-0.75610492, -0.22715356, -0.76128796, -0.84124357]])

In [4]:
# Same computation as a single matrix product: concatenate [xt, h_t_1] along
# columns -> (3, 5) and stack [w_hx; w_hh] along rows -> (5, 4).
h_t_merged = np.hstack((xt, h_t_1)) @ np.vstack((w_hx, w_hh))
# Verify the equivalence numerically against the h_t computed in the previous
# cell instead of comparing the two printed arrays by eye.
np.testing.assert_allclose(h_t_merged, h_t)
h_t = h_t_merged
h_t

array([[ 0.36117668,  0.84170464,  0.51634436,  1.0192241 ],
       [ 1.98568241,  1.65775323, -0.14316661,  0.47196442],
       [-0.75610492, -0.22715356, -0.76128796, -0.84124357]])

### 创建token映射

使用周杰伦歌词来建模，并对周杰伦歌词进行预处理

In [5]:
# Pull the lyrics out of the zip archive as one UTF-8 string.
# (read() returns the whole content, readline() a single line, readlines() a list of lines.)
with zipfile.ZipFile("../data/jaychou_lyrics.txt.zip") as zfile, \
        zfile.open("jaychou_lyrics.txt", "r") as f:
    corpus_chars = f.read().decode("utf-8")

# Peek at the first 100 characters and at the total number of characters.
corpus_chars[:100]
len(corpus_chars)

'想要有直升机\n想要和你飞到宇宙去\n想要和你融化在一起\n融化在宇宙里\n我每天每天每天在想想想想著你\n这样的甜蜜\n让我开始乡相信命运\n感谢地心引力\n让我碰到你\n漂亮的让我面红的可爱女人\n温柔的让我心疼的可'

63282

整个数据集共有63282个字符，为了打印方便，将换行符替换为空格

In [6]:
# Show how many line breaks the raw text contains ("\n" and "\r" are counted separately).
# These counts must be displayed before the rebinding below removes the newlines.
corpus_chars.count("\n")
corpus_chars.count("\r")
# Put the whole corpus on one line by turning every "\n" into a space.
corpus_chars = corpus_chars.replace("\n", " ")

5819

0

In [7]:
def load_data_jay_song(corpus_chars):
    """
    function: build a character-level vocabulary and encode the corpus as ids
    params corpus_chars: the corpus text; it is iterated character by character
    return: tuple (corpus_index, chars_to_idx, vocab_set, vocab_size) where
        corpus_index is the list of ids, one per character of the corpus,
        chars_to_idx is the dict mapping each character to its id,
        vocab_set is the list of distinct characters (the list position is the id),
        vocab_size is the number of distinct characters
    """
    # id --> char. Sort the distinct characters so that the id assignment is the
    # same in every interpreter session: iterating a bare set of strings follows
    # hash order, which may differ from one run to the next.
    vocab_set = sorted(set(corpus_chars))
    # char --> id
    chars_to_idx = {char: idx for idx, char in enumerate(vocab_set)}
    vocab_size = len(vocab_set)
    # the corpus rewritten as a sequence of ids
    corpus_index = [chars_to_idx[char] for char in corpus_chars]

    return corpus_index, chars_to_idx, vocab_set, vocab_size

In [8]:
# Encode the flattened lyrics: id sequence, char -> id dict, id -> char list, vocabulary size.
corpus_index, chars_to_idx, vocab_set, vocab_size = load_data_jay_song(corpus_chars)

### 时序数据采样

在训练中我们每次需要随机读取小批量样本和标签，时序数据的一个样本通常包含连续字符
- 如果time-step=5的时候，就是输入的样本序列为5个字符（你-今-天-吃-了），标签中的每一个字符就是对应训练集中的下一个字符（今-天-吃-了-吗）
- 对时序数据的采样方式有两种，一个是随机采样，另外一种就是相邻采样

#### 随机采样

在随机采样中，每个样本是原始序列中任意截取的一段序列，相邻的两个随机小批量在原始序列中的位置不一定相毗邻
- 因此，我们不能用前一个小批量最后time_step的隐藏状态来初始化下一个小批量的隐藏状态
- 在训练模型的时候，如果使用随机采样，那么在每次随机采样之前都要重新初始化隐藏状态

In [7]:
def data_iter_random(corpus_index, batch_size, num_steps):
    """
    function: yield mini-batches of randomly ordered (input, label) windows
    params corpus_index: the corpus encoded as a list of ids
    params batch_size: the number of windows in each batch (the last batch may be smaller)
    params num_steps: the number of time steps, i.e. the length of each window
    yield: (x, y) float32 tensors; every row of y is the matching row of x
        shifted one position to the right in the corpus
    """
    total = len(corpus_index)
    # A label window starts one position after its input window, so only
    # total - 1 positions are usable: `window_count` whole windows fit and
    # `leftover` positions remain as slack.
    window_count, leftover = divmod(total - 1, num_steps)
    # Use the slack to shift the whole grid of window starts by a random offset.
    offset = np.random.randint(leftover) if leftover != 0 else 0
    starts = np.arange(offset, total, num_steps)[:window_count]
    np.random.shuffle(starts)

    # Walk over the shuffled start positions one batch at a time.
    for lo in np.arange(0, len(starts), batch_size):
        x, y = [], []
        for pos in starts[lo:(lo + batch_size)]:
            x.append(corpus_index[pos:(pos + num_steps)])
            y.append(corpus_index[(pos + 1):(pos + 1 + num_steps)])
        yield torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

In [8]:
# Toy corpus 0..29 so that every printed window is easy to read. It is kept
# under its own name so the `corpus_index` built from the lyrics is not overwritten.
demo_corpus_index = list(range(30))
for x, y in data_iter_random(demo_corpus_index, 2, 6):
    print(f"x: {x}")
    print(f"y: {y}")
    print("\n")

x: tensor([[21., 22., 23., 24., 25., 26.],
        [15., 16., 17., 18., 19., 20.]])
y: tensor([[22., 23., 24., 25., 26., 27.],
        [16., 17., 18., 19., 20., 21.]])


x: tensor([[ 9., 10., 11., 12., 13., 14.],
        [ 3.,  4.,  5.,  6.,  7.,  8.]])
y: tensor([[10., 11., 12., 13., 14., 15.],
        [ 4.,  5.,  6.,  7.,  8.,  9.]])




#### 相邻采样

我们还可以让相邻的两个随机小批量在原始序列上的位置相邻，这时我们就可以用一个小批量最终时间步的隐藏状态来初始化下一个小批量的隐藏状态，那么下一个小批量的输出也取决于当前小批量的输入
- 这样做，在训练模型的时候，我们只需要一个迭代周期开始的时候初始化隐藏状态
- 当多个相邻小批量通过传递隐藏状态串联起来的时候，模型参数的梯度计算将依赖所有串联起来的小批量序列。随着迭代次数的增加，梯度计算开销会越来越大（此时可以在每次小批量前将隐藏状态从计算图中分离出来，让模型参数的梯度计算只依赖一次迭代读取的小批量序列）

In [11]:
def data_iter_consecutive(corpus_index, batch_size, num_step):
    """
    function: realize consecutive sample; row r of one batch continues in
        row r of the next batch
    params corpus_index: the corpus encoded as a list of ids
    params batch_size: the number of rows (parallel sequences) in each batch
    params num_step: the number of time steps, i.e. the length of each window
    yield: (x, y) float32 tensors of shape (batch_size, num_step); y holds, for
        every position of x, the id that follows it in the corpus
    """
    total = len(corpus_index)
    # A label needs one id after its input, so only total - 1 positions are usable.
    example_num = (total - 1) // num_step
    remainder = (total - 1) % num_step
    # np.random.randint(0) raises ValueError, so only draw an offset when there is slack.
    sample_start = np.random.randint(remainder) if remainder > 0 else 0

    # Length of each of the batch_size rows. Positions that do not fill a whole
    # row are dropped so the reshape below cannot fail on an uneven length.
    row_len = (example_num * num_step) // batch_size
    if row_len <= 0:
        # corpus too short to fill even one position per row: nothing to yield
        return
    usable = row_len * batch_size

    # Read one extra id so the labels can be taken from the corpus itself
    # (shifted by one) rather than computed as "id + 1".
    flat = torch.tensor(corpus_index[sample_start:sample_start + usable + 1],
                        dtype=torch.float32)
    inputs = flat[:-1].reshape(batch_size, row_len)
    labels = flat[1:].reshape(batch_size, row_len)

    batch_num = row_len // num_step
    # Columns left over after cutting whole windows; use them as a random column offset.
    slack = row_len % num_step
    batch_start = np.random.randint(slack) if slack > 0 else 0
    # yield consecutive windows of num_step columns
    for i in range(batch_start, batch_start + batch_num * num_step, num_step):
        yield inputs[:, i:i + num_step], labels[:, i:i + num_step]

In [16]:
# Toy corpus 0..29 so that every printed window is easy to read. It is kept
# under its own name so the `corpus_index` built from the lyrics is not overwritten.
demo_corpus_index = list(range(30))
for x, y in data_iter_consecutive(demo_corpus_index, 2, 6):
    print(f"x: {x}")
    print(f"y: {y}")
    print("\n")

x: tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [12., 13., 14., 15., 16., 17.]])
y: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [13., 14., 15., 16., 17., 18.]])


x: tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [18., 19., 20., 21., 22., 23.]])
y: tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [19., 20., 21., 22., 23., 24.]])


