# 基础层介绍

In [48]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
conda env list

# conda environments:
#
TimeGPT                  D:\Python\envs\TimeGPT
baichuan                 D:\Python\envs\baichuan
py2                      D:\Python\envs\py2
py310                    D:\Python\envs\py310
so-vits-svc              D:\Python\envs\so-vits-svc
torch                 *  D:\Python\envs\torch
root                     D:\Python


Note: you may need to restart the kernel to use updated packages.


## RNN

https://pytorch.org/docs/stable/generated/torch.nn.RNN.html#torch.nn.RNN

In [10]:
# Shape conventions for nn.RNN (batch_first is left at its default, False)
# input: 3-D tensor [seq_len, batch_size, input_dim]
#     seq_len:    maximum sentence length, or the number of time steps in each training sample
#     batch_size: how many sequences are processed at once
#     input_dim:  input dimensionality, i.e. the number of features per time step
# h0: 3-D tensor (num_layers * num_directions, batch_size, hidden_size)
#
# Two stacked bidirectional layers: 10 input features -> 20 hidden units per direction.
rnn = nn.RNN(10, 20, num_layers=2, bidirectional=True)


In [11]:
# Sequence-first input: sequence length 5, batch size 3, input size 10.
input = torch.randn((5, 3, 10))
print(input.size())

# No h0 is passed; per the PyTorch docs the initial hidden state then defaults to zeros.
output, hn = rnn(input)

torch.Size([5, 3, 10])


### 输入顺序

注意，RNN输入的是序列，一次把批次的所有句子都输入到RNN中，因此hidden和output也是三维的

每个时间步输入到RNN模块的维度是[batch_size, input_dim]

把seq_len放在第一位的好处是，可以同时处理每个句子：
1. 第一个时间步，输入所有句子的第一个单词 或者 第一个时间步上的所有特征
2. 第二个时间步，输入所有句子的第二个单词 或者 第二个时间步上的所有特征
...
3. 以此类推，直到seq_len个时间步全部输入完成

### RNN的输出变量

#### output
RNN模型在每个时间步的输出的张量，形状取决于输入数据的形状、RNN的参数设置以及输入序列的长度。

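具体来说，在默认的`batch_first=False`下，`output`张量的形状为`(sequence_length, batch_size, num_directions * hidden_size)`（若设置`batch_first=True`，则为`(batch_size, sequence_length, num_directions * hidden_size)`），其中：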

- `sequence_length`是输入序列的长度，即时间步的数量。
- `batch_size`是输入数据的批次大小，即一次处理多少个序列。
- `num_directions`是一个可选的参数，如果你的RNN是双向的（`bidirectional=True`），则`num_directions`为2；如果是单向的，则为1。
- `hidden_size`是RNN的隐藏状态大小，即每个时间步的隐藏状态向量的维度。

这个`output`张量包含了RNN最后一层在每个时间步的输出。通常，你可以在训练后对这些输出进行进一步处理，例如用于分类、回归或序列到序列的任务，或者用于获得序列中的某些信息。

#### h_n 最后一个时间步的隐藏状态

此外，`torch.nn.RNN`还返回一个包含最后一个时间步的隐藏状态的张量，通常称为`h_n`。这个张量的形状为`(num_layers * num_directions, batch_size, hidden_size)`，其中：

- `num_layers`是RNN模型的层数。
- `num_directions`是一个可选的参数，如果你的RNN是双向的，则`num_directions`为2；如果是单向的，则为1。
- `hidden_size`是RNN的隐藏状态大小，即每个时间步的隐藏状态向量的维度。

`h_n`张量包含了每一层（每个方向）的最终隐藏状态，可以用于进行额外的处理，或者作为处理后续序列时的初始隐藏状态`h_0`。

综上所述，`output`和`h_n`是`torch.nn.RNN`模块的两个主要输出，它们提供了RNN在输入序列上的输出信息和最终的隐藏状态信息。


## LSTM

https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html#torch.nn.LSTM

In [15]:

# Two stacked unidirectional LSTM layers: 10 input features -> 20 hidden units.
rnn = nn.LSTM(10, 20, num_layers=2)

# Sequence-first input: (seq_len=5, batch_size=3, input_size=10).
input = torch.randn((5, 3, 10))
# Initial hidden and cell states: (num_layers * num_directions=2, batch_size=3, hidden_size=20).
h0, c0 = torch.randn((2, 3, 20)), torch.randn((2, 3, 20))

# Unlike nn.RNN, the LSTM takes and returns a (hidden state, cell state) pair.
output, (hn, cn) = rnn(input, (h0, c0))

## GRU

https://pytorch.org/docs/stable/generated/torch.nn.GRU.html#torch.nn.GRU

In [16]:

# Two stacked unidirectional GRU layers: 10 input features -> 20 hidden units.
rnn = nn.GRU(10, 20, num_layers=2)

# Sequence-first input: (seq_len=5, batch_size=3, input_size=10).
input = torch.randn((5, 3, 10))
# Initial hidden state: (num_layers * num_directions=2, batch_size=3, hidden_size=20).
h0 = torch.randn((2, 3, 20))

# A GRU carries only a hidden state, so a single tensor is passed and returned.
output, hn = rnn(input, h0)

## Embedding
https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html#torch.nn.Embedding

In [18]:
# An Embedding table holding 10 vectors of size 3.
# num_embeddings: size of the vocabulary; with 5000 distinct words pass 5000, and valid indices are 0-4999.
# embedding_dim: length of each embedding vector, i.e. how many dimensions represent one symbol.
embedding = nn.Embedding(10, 3)

# A batch of 2 samples with 4 indices each.
input = torch.tensor([[1, 2, 4, 5], [4, 3, 2, 9]], dtype=torch.long)
# Output shape is [2, 4, 3]: each row is the embedding vector of one index.
embedding(input)

tensor([[[-0.8517,  0.3146, -0.4182],
         [-1.2109,  0.3488,  0.6808],
         [ 0.8797,  0.6647,  0.4363],
         [-1.1474, -1.0755,  0.6982]],

        [[ 0.8797,  0.6647,  0.4363],
         [-0.7848,  0.6703, -0.5447],
         [-1.2109,  0.3488,  0.6808],
         [-0.6671,  0.7456,  1.0401]]], grad_fn=<EmbeddingBackward0>)

In [28]:
# Example with padding_idx.
# padding_idx is the index reserved for padding (default: None, i.e. no padding row).
# The embedding row at that index starts as all zeros, so every occurrence of the
# index in the input is looked up as a zero vector.
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3, padding_idx=0)
input = torch.tensor([[0, 2, 0, 5]], dtype=torch.long)
embedding(input)

tensor([[[ 0.0000,  0.0000,  0.0000],
         [-0.2247, -0.4837,  0.4901],
         [ 0.0000,  0.0000,  0.0000],
         [ 0.3420,  2.0672, -0.0243]]], grad_fn=<EmbeddingBackward0>)

In [29]:
# Example of overriding the `pad` vector after construction.
padding_idx = 0
embedding = nn.Embedding(num_embeddings=3, embedding_dim=3, padding_idx=padding_idx)

# The row is overwritten in place inside no_grad, i.e. without autograd tracking the write.
with torch.no_grad():
    embedding.weight[padding_idx] = torch.ones(3)

# Display the table; row `padding_idx` is now all ones.
embedding.weight

Parameter containing:
tensor([[ 1.0000,  1.0000,  1.0000],
        [-1.3636, -0.6373, -0.3953],
        [ 0.8767, -1.5321, -1.7318]], requires_grad=True)

## EmbeddingBag
https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html#torch.nn.EmbeddingBag

Computes sums or means of ‘bags’ of embeddings, without instantiating the intermediate embeddings. For bags of constant length, no per_sample_weights, no indices equal to padding_idx, and with 2D inputs, this class

- with mode="sum" is equivalent to Embedding followed by torch.sum(dim=1),
- with mode="mean" is equivalent to Embedding followed by torch.mean(dim=1),
- with mode="max" is equivalent to Embedding followed by torch.max(dim=1).

input (Tensor) – Tensor containing bags of indices into the embedding matrix.

offsets (Tensor, optional) – Only used when input is 1D. offsets determines the starting index position of each bag (sequence) in input.

In [30]:
# An EmbeddingBag table holding 10 vectors of size 3; each bag is reduced by summation.
embedding_sum = nn.EmbeddingBag(num_embeddings=10, embedding_dim=3, mode='sum')

# Two bags of 4 indices each, flattened into one 1-D tensor.
input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.int64)
# Start position of each bag inside `input`: bag 0 = input[0:4], bag 1 = input[4:].
offsets = torch.tensor([0, 4], dtype=torch.int64)

embedding_sum(input, offsets)

tensor([[ 0.4147,  0.4959, -2.3814],
        [-1.2412,  0.9358,  0.6643]], grad_fn=<EmbeddingBagBackward0>)

观察输出：`offsets`给出每个bag在`input`中的起始位置。这里`offsets=[0, 4]`，即第1个bag是`input[0:4]`，第2个bag是`input[4:]`；输出的每一行就是对应bag内各个词的emb向量之和。

In [36]:
# Example with padding_idx: index 2 is treated as padding.
embedding_sum = nn.EmbeddingBag(num_embeddings=10, embedding_dim=3, mode='sum', padding_idx=2)

input = torch.tensor([2, 2, 2, 2, 4, 3, 2, 9], dtype=torch.int64)
# Three bags: input[0:3] = [2, 2, 2], input[3:6] = [2, 4, 3], input[6:] = [2, 9].
# The first bag holds only the padding index, so its sum is the zero vector.
offsets = torch.tensor([0, 3, 6], dtype=torch.int64)

embedding_sum(input, offsets)

tensor([[ 0.0000,  0.0000,  0.0000],
        [-0.8568, -0.8295, -0.3004],
        [-0.0642,  0.2780,  0.9792]], grad_fn=<EmbeddingBagBackward0>)

# Tagging案例

In [41]:
def prepare_sequence(seq, to_ix) -> torch.Tensor:
    """Convert a sequence of tokens into a 1-D tensor of integer ids.

    Args:
        seq: Iterable of tokens (words or tags).
        to_ix: Mapping from token to integer id; every token in ``seq`` is
            looked up with ``to_ix[token]``, so an unknown token raises ``KeyError``.

    Returns:
        A ``torch.long`` tensor with one id per token, in the order of ``seq``.
    """
    return torch.tensor([to_ix[token] for token in seq], dtype=torch.long)


# Toy corpus: each entry pairs a tokenised sentence with one tag per word.
training_data = [
    (["The", "dog", "ate", "the", "apple"], "DET NN V DET NN".split()),
    (["Everybody", "read", "that", "book"], "NN V DET NN".split()),
]

training_data

[(['The', 'dog', 'ate', 'the', 'apple'], ['DET', 'NN', 'V', 'DET', 'NN']),
 (['Everybody', 'read', 'that', 'book'], ['NN', 'V', 'DET', 'NN'])]

In [46]:

# Map every distinct word to an integer id, numbered in order of first appearance.
word_to_ix = {}

for sent, tags in training_data:
    for word in sent:
        # setdefault keeps an existing id and otherwise assigns the next free one.
        word_to_ix.setdefault(word, len(word_to_ix))

print(word_to_ix)

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}


In [62]:
# Give every tag a unique index, numbered in the order listed.
tag_to_ix = {tag: ix for ix, tag in enumerate(("DET", "NN", "V"))}

# Real models typically use something like 32 or 64 dimensions; the sizes are
# kept tiny here so the weight changes during training are easy to inspect.
EMBEDDING_DIM, HIDDEN_DIM = 6, 8

In [65]:
# 定义LSTM模型

class LSTMTagger(nn.Module):
    """Sequence tagger: embedding -> single-layer LSTM -> linear -> log-softmax.

    Args:
        embedding_dim: Size of each word embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of rows in the embedding table.
        tagset_size: Number of output tags.
    """

    def __init__(self, embedding_dim: int, hidden_dim: int, vocab_size: int, tagset_size: int) -> None:
        super().__init__()
        self.hidden_dim = hidden_dim
        # Lookup table that turns word indices into dense vectors.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

        # The LSTM takes word embeddings as inputs and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Linear layer mapping from hidden-state space to tag space.
        self.hidden2tag = nn.Linear(in_features=hidden_dim, out_features=tagset_size)

    def forward(self, sentence: torch.Tensor) -> torch.Tensor:
        """Score every word of one sentence against every tag.

        Args:
            sentence: Word indices of a single sentence (assumed to be a 1-D
                index tensor; the embeddings are reshaped to a batch of one).

        Returns:
            Tensor with one row per word holding log-softmax scores over the tags.
        """
        seq_len = len(sentence)

        # nn.LSTM expects [seq_len, batch_size, input_dim], so a batch axis of size 1 is inserted:
        # (seq_len, embedding_dim) -> (seq_len, 1, embedding_dim)
        lstm_in = self.embedding(sentence).view(seq_len, 1, -1)
        lstm_out, _ = self.lstm(lstm_in)

        # Drop the batch axis again: (seq_len, 1, hidden_dim) -> (seq_len, hidden_dim)
        tag_space = self.hidden2tag(lstm_out.view(seq_len, -1))

        # Normalise over the tag axis so each row is a log-probability distribution.
        return F.log_softmax(tag_space, dim=1)

In [66]:
model = LSTMTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    tagset_size=len(tag_to_ix),
)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Inspect the scores before any training has happened.
# Element [i, j] of the output is the score of tag j for word i.
# Nothing is trained here, so the forward pass runs inside torch.no_grad().
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    print(inputs)
    tag_scores = model(inputs)
    print(tag_scores)


tensor([0, 1, 2, 3, 4])
tensor([[-1.3716, -0.8594, -1.1304],
        [-1.3318, -0.8797, -1.1360],
        [-1.4187, -0.8491, -1.1082],
        [-1.4662, -0.8074, -1.1296],
        [-1.4931, -0.7918, -1.1322]])


In [67]:

# 300 epochs is far more than you would normally run; this is toy data.
for epoch in range(300):
    for sentence, tags in training_data:
        # Step 1. PyTorch accumulates gradients, so clear them before each instance.
        # The optimizer was built from model.parameters(), so this resets the same gradients.
        optimizer.zero_grad()

        # Step 2. Turn the words and the tags into tensors of indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, back-propagate, and update the parameters.
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# Inspect the scores after training.
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple". Element [i, j] is the score of
    # tag j for word i, and the predicted tag is the highest-scoring one.
    # The predicted sequence below is 0 1 2 0 1: 0 is the index of the maximum
    # of the first row, 1 the index of the maximum of the second row, and so on.
    # That is DET NOUN VERB DET NOUN, the correct sequence!
    print(tag_scores)

tensor([[-0.0352, -4.2743, -3.8784],
        [-4.0797, -0.0243, -4.9451],
        [-3.4386, -4.9927, -0.0397],
        [-0.0471, -4.0156, -3.5777],
        [-5.0312, -0.0081, -6.4522]])
