## 使用GPT2示例


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Load the pretrained GPT-2 tokenizer and the GPT-2 language-model-head model.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Text to be tokenized.
text = "LLM with me"
# Tokenize the text and convert it to vocabulary indices (PyTorch tensors).
# The result is bound to `inputs` rather than `input` so that the builtin
# input() is not shadowed for the rest of the kernel session.
inputs = tokenizer(text, return_tensors="pt")
print(inputs)


In [None]:
from transformers import BertTokenizer, BertModel
# Load the pretrained `bert-base-uncased` tokenizer and model; from here on
# `tokenizer` and `model` refer to these objects.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
# Same sample text as in the GPT-2 cell, so the two encodings can be compared.
text = "LLM with me"
# Bound to `inputs` rather than `input` so that the builtin input() is not
# shadowed for the rest of the kernel session.
inputs = tokenizer(text, return_tensors="pt")
print(inputs)

In [None]:
# Encode a single short word with whichever tokenizer is currently in scope.
text = "LLM"
inputs = tokenizer(text, return_tensors="pt")
print(inputs)

# Map the ids of the first (and only) sequence back to their token strings.
first_sequence_ids = inputs["input_ids"][0]
print(tokenizer.convert_ids_to_tokens(first_sequence_ids))

In [None]:
# Reload the GPT-2 tokenizer and report its length as given by len().
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
vocab_length_message = f"分词器的词汇表长度为：{len(tokenizer)}"
print(vocab_length_message)

In [None]:
from transformers import GPT2Tokenizer, GPT2Model

# Text to process.
text = "LLM with me"

# Load the pretrained GPT-2 tokenizer and base model.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

# Tokenize the text; `input_ids` holds the resulting vocabulary indices.
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# Fetch the model's input embedding layer and use it to turn the indices
# into embedding vectors.
embeddings = model.get_input_embeddings()
input_embeddings = embeddings(input_ids)

# Show the vectors first, then their shape (one printed item per line).
print(input_embeddings, input_embeddings.shape, sep="\n")


In [26]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from   tokenizers.pre_tokenizers import Whitespace

# Start from an empty BPE model whose unknown token is "[UNK]", and
# pre-tokenize with the Whitespace pre-tokenizer.
bpe_model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(bpe_model)
tokenizer.pre_tokenizer = Whitespace()

# The trainer registers two special tokens: one placeholder for unknown
# pieces and one end-of-sequence marker.
special_tokens = ['[UNK]', "<EOS>"]
trainer = BpeTrainer(special_tokens=special_tokens)

# A tiny training corpus.
train_data = [
    "LLM with me",
    "I love learning",
    "I want to drink water",
]

# Train the tokenizer on the corpus, then persist it to disk.
tokenizer.train_from_iterator(train_data, trainer)
tokenizer.save("custom_tokenizer.json")

# Try the tokenizer out.
# NOTE: str(train_data) is the Python repr of the list, so the brackets,
# quotes and commas are encoded as well (they appear as [UNK] in the
# printed tokens).
corpus_repr = str(train_data)
output = tokenizer.encode(corpus_repr)
print(output.tokens)

['[UNK]', '[UNK]', 'LLM', 'with', 'me', '[UNK]', '[UNK]', '[UNK]', 'I', 'love', 'learning', '[UNK]', '[UNK]', '[UNK]', 'I', 'want', 'to', 'drink', 'water', '[UNK]', '[UNK]']


In [27]:
from transformers import PreTrainedTokenizerFast
# Token ids of the encoding produced in the tokenizer-training cell
# (`output` is defined there).
print(output.ids)

# Define the text in this cell instead of relying on a `text` variable that
# happens to be left over in the kernel from some other cell.
text = "LLM with me"

# Load the custom tokenizer from disk, encode the text and get PyTorch tensors.
tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]
print(input_ids)


[0, 0, 40, 44, 30, 0, 0, 0, 2, 42, 45, 0, 0, 0, 2, 38, 33, 41, 39, 0, 0]
tensor([[40, 44, 30]])


In [29]:
from transformers import PreTrainedTokenizerFast, GPT2Config,GPT2Model
# Load the custom tokenizer saved as custom_tokenizer.json.
tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")

# Text to process, tokenized into vocabulary indices.
text = "LLM with me"
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# Build a GPT-2 configuration sized to the custom vocabulary and construct
# a model from it (no pretrained weights are loaded here).
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_embd=768,
    n_layer=12,
    n_head=12,
)
model = GPT2Model(config=config)

# Fetch the model's input embedding layer and convert the indices to
# embedding vectors.
embeddings = model.get_input_embeddings()
input_embeddings = embeddings(input_ids)

# Show the vectors first, then their shape (one printed item per line).
print(input_embeddings, input_embeddings.shape, sep="\n")

tensor([[[-0.0059,  0.0014,  0.0045,  ...,  0.0042,  0.0058,  0.0277],
         [-0.0076, -0.0321,  0.0156,  ...,  0.0027, -0.0430, -0.0204],
         [-0.0022,  0.0157,  0.0055,  ...,  0.0270,  0.0320, -0.0338]]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([1, 3, 768])


In [30]:
import torch
from transformers import PreTrainedTokenizerFast

# Load the custom tokenizer and turn the sample text into vocabulary indices.
tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")
text = "LLM with me"
inputs = tokenizer(text, return_tensors="pt")
input_ids = inputs["input_ids"]

# The custom tokenizer is assumed to have a very small vocabulary.
vocab_size = tokenizer.vocab_size  # vocabulary size taken from the tokenizer
n_embd = 10  # embedding dimension

# Hand-made embedding table: one row per vocabulary entry, filled with
# samples from a standard normal distribution.
embedding_matrix = torch.randn(vocab_size, n_embd)

# Use the indices of the first sequence to pick the matching rows.
token_indices = input_ids[0]
token_embeddings = embedding_matrix[token_indices]

# Show the selected rows first, then their shape (one printed item per line).
print(token_embeddings, token_embeddings.shape, sep="\n")


tensor([[ 1.5592, -0.1788,  1.1371,  1.9707, -0.8604,  1.8147,  1.2781, -0.2853,
         -1.2177,  1.0596],
        [ 1.3030,  0.8386, -0.9091, -0.5357,  1.6616,  0.9948,  0.6228,  0.3903,
         -0.7626, -1.4527],
        [ 1.7207,  1.8723,  0.7415,  0.2367, -1.9379, -0.2421,  0.8787,  0.8633,
          0.2912, -0.4687]])
torch.Size([3, 10])


In [31]:
# Print three individual rows of the embedding matrix, in this order.
for row_index in (13, 14, 11):
    print(embedding_matrix[row_index])

tensor([ 3.0015, -1.3700, -0.7345, -1.2543,  0.3699,  0.4769,  0.2865, -1.6468,
         1.1635,  0.4964])
tensor([ 1.2094,  1.0582, -1.9204, -0.0973, -0.5215,  1.7699, -0.5906,  1.5935,
         1.0168,  0.5269])
tensor([ 0.1870, -1.9807, -0.0842,  1.0848,  0.9552, -0.2962,  1.0483,  0.3700,
        -0.8440,  0.3670])


In [33]:
import torch
import torch.nn as nn
import torch.optim as optim

# Vocabulary size comes from the tokenizer currently in scope; the
# embedding dimension is fixed at 10.
vocab_size = tokenizer.vocab_size
n_embd = 10
# Standalone (vocab_size, n_embd) matrix, filled in place with
# Xavier-uniform values (xavier_uniform_ returns the tensor it was given).
embedding_matrix = nn.init.xavier_uniform_(torch.empty(vocab_size, n_embd))
# A stripped-down GPT-style model.
class SimpleGPT(nn.Module):
    """Minimal model: embedding lookup -> linear layer -> vocabulary logits.

    Args:
        vocab_size: Number of rows in the embedding table and width of the
            output logits.
        n_embd: Size of each embedding vector.
    """

    def __init__(self, vocab_size, n_embd):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, n_embd)
        self.ffn = nn.Linear(n_embd, n_embd)
        self.logits = nn.Linear(n_embd, vocab_size)
        # Overwrite the embedding layer's default initialisation with
        # Xavier-uniform values.
        nn.init.xavier_uniform_(self.embeddings.weight)

    def forward(self, input_ids):
        """Return logits over the vocabulary for every position in `input_ids`."""
        token_vectors = self.embeddings(input_ids)  # embedding layer
        hidden = self.ffn(token_vectors)  # feed-forward layer
        return self.logits(hidden)  # output layer

# Instantiate the model, then set up the loss function and the optimizer.
model = SimpleGPT(vocab_size, n_embd)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Toy training data: each label row is the input row shifted by one.
input_ids = torch.tensor([
    [1, 2, 3, 4],
    [2, 3, 4, 5],
])  # example inputs
labels = torch.tensor([
    [2, 3, 4, 5],
    [3, 4, 5, 6],
])  # example targets

# Training loop: 100 epochs over the same two sequences.
for epoch in range(100):
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()
    # Forward pass.
    logits = model(input_ids)
    # Flatten to (positions, vocab) logits and (positions,) targets for the loss.
    flat_logits = logits.view(-1, vocab_size)
    flat_labels = labels.view(-1)
    loss = loss_fn(flat_logits, flat_labels)
    # Backward pass, then update every model parameter (embeddings included)
    # according to the loss gradient.
    loss.backward()
    optimizer.step()
    # Report the loss on every tenth epoch.
    if not (epoch + 1) % 10:
        print(f'Epoch {epoch + 1}, Loss: {loss.item()}')


Epoch 10, Loss: 3.9529201984405518
Epoch 20, Loss: 3.8673152923583984
Epoch 30, Loss: 3.779238700866699
Epoch 40, Loss: 3.686739206314087
Epoch 50, Loss: 3.58780837059021
Epoch 60, Loss: 3.480212926864624
Epoch 70, Loss: 3.3615834712982178
Epoch 80, Loss: 3.2297677993774414
Epoch 90, Loss: 3.083065986633301
Epoch 100, Loss: 2.9204678535461426


In [34]:
# Pass the first row of `input_ids` through the model's embedding layer.
token_indices = input_ids[0]
token_embeddings = model.embeddings(token_indices)
# Show the vectors first, then their shape (one printed item per line).
print(token_embeddings, token_embeddings.shape, sep="\n")

tensor([[-0.0508,  0.3242, -0.3024,  0.0752, -0.4042,  0.4311,  0.1327,  0.1756,
         -0.1218,  0.2336],
        [-0.3098,  0.1813, -0.0902,  0.2466, -0.1132,  0.3045, -0.0063, -0.1865,
         -0.0918,  0.2803],
        [-0.1231,  0.2970,  0.2392, -0.1095,  0.4116,  0.1114, -0.1603,  0.2977,
         -0.0465,  0.4548],
        [-0.4237, -0.2317, -0.4082,  0.0113, -0.1408,  0.4488, -0.1965, -0.1343,
         -0.0252,  0.2996]], grad_fn=<EmbeddingBackward0>)
torch.Size([4, 10])


In [37]:
import torch.nn.functional as F
# Assumes `model` is the trained model instance and `tokenizer` is the
# already-loaded tokenizer.
model.eval()  # put the model in evaluation mode
input_text = "LLM with me"  # prompt text
input_ids = tokenizer.encode(input_text, return_tensors="pt")  # prompt as token indices
temperature = 0.7  # sampling temperature, usually set between 0 and 1
eos_token = '<EOS>'  # end-of-sequence marker
max_new_tokens = 50  # upper bound on the number of sampled tokens
generated_text = input_text + " A:"

with torch.no_grad():  # inference only, so no gradients are needed
    for _ in range(max_new_tokens):
        # Only the last position decides the next token, so slice it out
        # before applying the temperature.
        logits = model(input_ids)[:, -1, :] / temperature
        # Turn the logits into a probability distribution and sample the
        # next token from it.
        probabilities = F.softmax(logits, dim=-1)
        predicted_id = torch.multinomial(probabilities, num_samples=1)
        # Append the sampled token to the input sequence and its decoded
        # text to the output. Pieces are concatenated with no separator.
        input_ids = torch.cat((input_ids, predicted_id), dim=1)
        piece = tokenizer.decode(predicted_id[0])
        generated_text += piece
        # Stop as soon as the end-of-sequence marker has been produced,
        # instead of sampling further tokens that would be cut off anyway.
        if eos_token in piece:
            break

print(generated_text)
# Keep everything up to and including the first <EOS>, if one was generated.
generated_text_parts = generated_text.split(eos_token)
final_text = generated_text_parts[0] + eos_token if len(generated_text_parts) > 1 else generated_text_parts[0]
print(final_text)

LLM with me A:waiLwantitLLMlovewantlearninerLarkgkinhitMterIwithlearningwantloMLninwldvklirnarLwaMveklokitlodi
LLM with me A:waiLwantitLLMlovewantlearninerLarkgkinhitMterIwithlearningwantloMLninwldvklirnarLwaMveklokitlodi
