<a href="https://colab.research.google.com/github/hust-lizewei/MetaFormer/blob/master/tutorial_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
! nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [16]:
import torch
import torch.nn as nn
import math

# Tokenizer


In [13]:
from transformers import AutoTokenizer

# Load the tokenizer published with the gte-Qwen2 checkpoint; padding goes on the left.
# NOTE(review): trust_remote_code=True runs Python code shipped in the model
# repository - keep it only for sources you trust.
tokenizer = AutoTokenizer.from_pretrained(
    'Alibaba-NLP/gte-Qwen2-1.5B-instruct',
    trust_remote_code=True,
    padding_side='left',
)
max_token_len = 100
input_text = '明月几时有,把酒问青天，不知天上宫阙'
output = tokenizer.encode_plus(
    input_text,
    max_length=max_token_len,
    padding=True,
    return_tensors='pt',
    truncation=True,
)

print(output['input_ids'].shape, output['attention_mask'].shape)
print("******" * 10)

# Decode input_ids while keeping special tokens and without cleaning up
# tokenization spaces, so the round trip shows exactly what was encoded.
full_tokens = tokenizer.batch_decode(
    output['input_ids'],
    skip_special_tokens=False,
    clean_up_tokenization_spaces=False,
)

print(f"输入Text: {input_text}")
print(f"分词后结果: {output['input_ids']}")
print(f"解码后结果: {full_tokens}")

# Print every token id of the (single) encoded sequence together with its
# vocabulary entry and its decoded text.
for token in output['input_ids'][0].tolist():
    token_text = tokenizer.convert_ids_to_tokens(token)
    print(f"Token ID: {token}, Token Text: {token_text}, Decoded: {tokenizer.decode([token], skip_special_tokens=False)}")

torch.Size([1, 17]) torch.Size([1, 17])
************************************************************
输入Text: 明月几时有,把酒问青天，不知天上宫阙
分词后结果: tensor([[ 30858,   9754,  99195,  13343,  18830,     11,  99360,  99525,  56007,
          99467,  35727,   3837, 102085, 110154,  99921, 119082, 151643]])
解码后结果: ['明月几时有,把酒问青天，不知天上宫阙<|endoftext|>']
Token ID: 30858, Token Text: æĺİ, Decoded: 明
Token ID: 9754, Token Text: æľĪ, Decoded: 月
Token ID: 99195, Token Text: åĩł, Decoded: 几
Token ID: 13343, Token Text: æĹ¶, Decoded: 时
Token ID: 18830, Token Text: æľī, Decoded: 有
Token ID: 11, Token Text: ,, Decoded: ,
Token ID: 99360, Token Text: æĬĬ, Decoded: 把
Token ID: 99525, Token Text: éħĴ, Decoded: 酒
Token ID: 56007, Token Text: éĹ®, Decoded: 问
Token ID: 99467, Token Text: éĿĴ, Decoded: 青
Token ID: 35727, Token Text: å¤©, Decoded: 天
Token ID: 3837, Token Text: ï¼Į, Decoded: ，
Token ID: 102085, Token Text: ä¸įçŁ¥, Decoded: 不知
Token ID: 110154, Token Text: å¤©ä¸Ĭ, Decoded: 天上
Token ID: 99921, Token Text: å®«,

In [18]:
class CausalAttention(nn.Module):
  """Multi-head causal self-attention.

  A single linear layer projects the input into queries, keys and values,
  which are split into ``heads`` heads of size ``head_dim``. Each position
  attends only to itself and to earlier positions. The concatenated head
  outputs go through a final linear projection (``self.ffn``).
  """
  def __init__(
    self,
    heads,  # number of attention heads
    head_dim,  # feature size of a single head
    embed_dim,  # model hidden size; must equal heads * head_dim
    dropout=0.0,  # dropout probability for attention weights and for the output
    ):
    super(CausalAttention, self).__init__()
    self.heads = heads
    self.head_dim = head_dim
    self.emb_dim = embed_dim
    assert embed_dim == head_dim * heads, "embed_dim must equal heads * head_dim"
    self.dropout = dropout
    # One fused projection producing Q, K and V:
    # (N, seqLen, embed_dim) -> (N, seqLen, 3 * embed_dim)
    self.qkv_projection = nn.Linear(self.emb_dim, 3*self.emb_dim)
    self.dropout1 = nn.Dropout(self.dropout)
    self.dropout2 = nn.Dropout(self.dropout)
    # Output projection applied to the concatenated heads.
    self.ffn = nn.Linear(self.emb_dim, self.emb_dim)

  def forward(self, hidden_states, attention_mask):
    """Apply causal self-attention.

    :param hidden_states: (N, seqLen, embed_dim) input hidden states
    :param attention_mask: accepted for interface compatibility; it is NOT
        applied here, only the causal mask is used, so padded positions are
        not masked out
    :return: (N, seqLen, embed_dim) hidden states after attention
    """
    bs, seq_len, _ = hidden_states.size()
    qkv = self.qkv_projection(hidden_states)
    # Each of q, k, v has shape (N, seqLen, embed_dim = heads * head_dim).
    q, k, v = torch.split(qkv, self.emb_dim, dim=-1)
    # Split into heads:
    # (N, seqLen, embed_dim) -> (N, seqLen, heads, head_dim) -> (N, heads, seqLen, head_dim)
    q = q.view(bs, seq_len, self.heads, self.head_dim).transpose(1, 2).contiguous()
    k = k.view(bs, seq_len, self.heads, self.head_dim).transpose(1, 2).contiguous()
    v = v.view(bs, seq_len, self.heads, self.head_dim).transpose(1, 2).contiguous()
    # Scaled dot-product attention: softmax(Q K^T / sqrt(d)) V, where d is the
    # per-head dimension (not the number of heads).
    # logits[n, h, i, :] holds the scores of position i against every position.
    logits = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(self.head_dim)
    # Run the masking and softmax in float32.
    logits = logits.to(torch.float32)
    # Lower-triangular mask built directly on the target device: position i
    # may only look at positions 0..i; everything later gets a very negative
    # score so its softmax weight becomes zero.
    causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=logits.device))
    logits = logits.masked_fill(~causal, -1e19)
    weights = torch.nn.functional.softmax(logits, dim=-1)
    # Cast back to the value dtype so the matmul also works for half-precision
    # weights (a no-op for float32 models).
    weights = self.dropout1(weights).to(v.dtype)
    # (N, heads, seqLen, seqLen) @ (N, heads, seqLen, head_dim)
    #   -> (N, heads, seqLen, head_dim) -> (N, seqLen, embed_dim)
    weighted_v = torch.matmul(weights, v).transpose(1, 2).contiguous().view(bs, seq_len, -1).contiguous()
    weighted_v = self.ffn(weighted_v)
    weighted_v = self.dropout2(weighted_v)
    return weighted_v

In [20]:
class DecoderLayer(nn.Module):
  """
  One decoder layer: for the token at position i it encodes the context in
  positions 0..i, which is what autoregressive generation needs.
  Note: only the last layer of the stack feeds the prediction head; the
  intermediate layers just produce representations via causal attention.
  Structure: Input_X -> LN -> CausalAttention -> (+ residual)
             -> LN -> FFN -> (+ residual) -> Activation
  """
  def __init__(
    self,
    heads,  # number of attention heads
    head_dim,  # feature size of a single head
    embed_dim,  # hidden size
    ffn_expand=3,  # expansion ratio of the FFN hidden layer
    ffn_dropout=0.0,  # dropout probability at the end of the FFN
    attn_dropout=0.0,  # dropout probability passed to the attention module
    max_seq_len=512  # accepted for interface compatibility; not used by this layer
  ):
    super(DecoderLayer, self).__init__()
    self.embed_dim = embed_dim
    self.ffn_expand = ffn_expand
    self.ffn_dropout = ffn_dropout
    self.ln1 = nn.LayerNorm(self.embed_dim)
    self.ln2 = nn.LayerNorm(self.embed_dim)
    # The attribute name ("casual") is kept as-is: it is part of the
    # state_dict keys of saved checkpoints.
    self.casual_attention = CausalAttention(
        heads,
        head_dim,
        embed_dim,
        attn_dropout
    )
    self.act = nn.GELU()
    self.ffn = nn.Sequential(
        nn.Linear(embed_dim, self.ffn_expand * embed_dim),
        nn.GELU(),
        nn.Linear(self.ffn_expand * embed_dim, embed_dim),
        nn.Dropout(self.ffn_dropout))


  def forward(self, x, attention_mask):
    """
    :param x: HiddenState (N, seqLen, emb_dim)
    :param attention_mask: forwarded unchanged to the attention module
    :return: HiddenState (N, seqLen, emb_dim)
    """
    # Attention sub-block: normalize, attend, add the residual.
    attn_out = self.casual_attention(self.ln1(x), attention_mask)
    x = attn_out + x
    # Feed-forward sub-block: the second LayerNorm is applied before the FFN,
    # as described in the class docstring (it used to be created but skipped).
    ffn_out = self.ffn(self.ln2(x))
    x = ffn_out + x
    # Final activation on the layer output.
    x = self.act(x)
    return x



In [21]:

class ChatLLM(nn.Module):
  """
  Decoder-only language model: token + learned position embeddings, a stack
  of DecoderLayer modules, and a linear LM head whose output is divided by a
  learnable temperature.

  A matching tokenizer can be created with, e.g.:
  AutoTokenizer.from_pretrained('Alibaba-NLP/gte-Qwen2-1.5B-instruct', trust_remote_code=True)
  """
  def __init__(
    self,
    heads,  # number of attention heads
    head_dim,  # feature size of a single head
    embed_dim,  # hidden size
    ffn_expand=4,  # expansion ratio of the FFN hidden layer
    ffn_dropout=0.0,
    attn_dropout=0.0,
    decoder_layers=4,  # number of stacked decoder layers
    max_seq_len=512,  # size of the position-embedding table
    vocab_size=-1  # vocabulary size; must be set to a positive value
    ):
    super(ChatLLM, self).__init__()
    if vocab_size <= 0:
      # The default is only a placeholder; the embedding and the LM head
      # cannot be built without a real vocabulary size.
      raise ValueError(f"vocab_size must be a positive integer, got {vocab_size}")
    self.heads = heads
    self.head_dim = head_dim
    self.embed_dim = embed_dim
    self.ffn_expand = ffn_expand
    self.ffn_dropout = ffn_dropout
    self.attn_dropout = attn_dropout
    self.decoder_layers = decoder_layers
    self.max_seq_len = max_seq_len
    self.vocab_size = vocab_size
    self.token_embedding = torch.nn.Embedding(vocab_size, embedding_dim=embed_dim)
    self.position_embedding = torch.nn.Embedding(max_seq_len, embedding_dim=embed_dim)
    self.decoders = nn.Sequential(
        *[DecoderLayer(heads, head_dim, embed_dim, ffn_expand, ffn_dropout, attn_dropout, max_seq_len) for _ in range(decoder_layers)]
        )
    # Autoregressive generation head.
    self.lm_head = nn.Linear(embed_dim, vocab_size)
    # Learnable temperature that divides the output logits.
    self.temp = nn.Parameter(torch.Tensor([0.1]))
    # NOTE(review): init_weights is not defined in this class; it is expected
    # to exist at module level - confirm it is defined before instantiation.
    self.apply(init_weights)

  # The methods below used to be indented inside __init__, which made them
  # local functions: the class had no usable forward/from_pretrain/device.

  def from_pretrain(self, pretrain):
    """Load a state dict from ``pretrain`` and move the model to ``self.device``.

    :param pretrain: path (or file-like object) accepted by ``torch.load``
    :return: self, to allow chaining
    """
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # trusted sources.
    state_dict = torch.load(pretrain, map_location='cpu')
    self.load_state_dict(state_dict, strict=True)
    self.to(self.device)
    return self

  @property
  def device(self):
    """Preferred device: CUDA when available, otherwise CPU.

    This reflects machine capabilities, not where the parameters currently
    live.
    """
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')

  def forward(self, input_ids, attention_mask, labels=None):
    """
    :param input_ids: (N, seqLen)
    :param attention_mask: (N, seqLen), forwarded to every decoder layer
    :param labels: (N, seqLen) target ids that are NOT yet shifted; the
        one-position shift for next-token prediction is done here
    :return: (logits, loss) with logits of shape (N, seqLen, vocab_size);
        loss is None when labels is None
    """
    seq_len = input_ids.size(1)
    if seq_len > self.max_seq_len:
      raise ValueError(
          f"sequence length {seq_len} exceeds max_seq_len {self.max_seq_len}")
    # (N, seqLen, embed_dim)
    token_embedding = self.token_embedding(input_ids)
    # Build the position indices on the same device as the inputs, so the
    # model also works on CPU when CUDA happens to be available.
    position_idx = torch.arange(0, seq_len, dtype=torch.long, device=input_ids.device).unsqueeze(0)
    position_embedding = self.position_embedding(position_idx)
    x = token_embedding + position_embedding
    for decoder in self.decoders:
        x = decoder(x, attention_mask)
    logits = self.lm_head(x) / self.temp  # (N, seqLen, vocab_size)
    loss = None
    if labels is not None:
        assert labels.size(-1) == input_ids.size(-1)
        labels = labels.to(logits.device)
        # Position t predicts token t + 1: drop the last logit and the first label.
        shifted_logits = logits[:, :-1, :].contiguous()  # positions [0, ..., n-2]
        shifted_labels = labels[:, 1:].contiguous()  # tokens [1, ..., n-1]
        shifted_logits = shifted_logits.view(-1, shifted_logits.size(-1))
        shifted_labels = shifted_labels.view(-1)
        loss = torch.nn.functional.cross_entropy(shifted_logits, shifted_labels)
    return logits, loss


In [22]:
@torch.inference_mode()
def generate(self, input_ids, attention_mask, max_generate_tokens=100, temperature=1.0, **kwargs):
  """
  Autoregressively sample tokens one at a time with top-k sampling.

  Generation runs for ``max_generate_tokens`` steps, or stops earlier once
  every sequence in the batch has produced ``eos_token_id`` (when given).

  :param self: model exposing ``max_seq_len`` and
      ``forward(input_ids, attention_mask, labels)`` returning ``(logits, loss)``
  :param input_ids: (N, seqLen) prompt token ids; only the first
      ``max_seq_len`` tokens are kept
  :param attention_mask: (N, seqLen) mask matching ``input_ids``; ``None`` is
      treated as all ones
  :param max_generate_tokens: maximum number of tokens to sample
  :param temperature: divisor applied to the next-token logits before sampling
  :param kwargs: optional ``eos_token_id`` (int) enabling early stopping;
      other keys are ignored
  :return: (N, prompt_len + generated_token_nums) prompt followed by the
      sampled tokens
  """
  top_k = 8  # size of the final candidate pool
  prefilter_k = 500  # coarse pre-filter applied before the softmax
  eos_token_id = kwargs.get('eos_token_id')
  max_len = self.max_seq_len
  full_token_id = input_ids[..., :max_len]
  if attention_mask is None:
    attention_mask = torch.ones_like(full_token_id)
  else:
    attention_mask = attention_mask[..., :max_len]
  finished = torch.zeros(full_token_id.size(0), dtype=torch.bool, device=full_token_id.device)
  for _ in range(max_generate_tokens):
    # Feed at most the last max_seq_len tokens so the position embedding is
    # never indexed out of range once the sequence grows.
    logits, _ = self.forward(full_token_id[:, -max_len:], attention_mask[:, -max_len:], None)
    next_token_logits = logits[:, -1, :] / temperature  # (N, vocab_size)
    vocab_size = next_token_logits.size(-1)
    # Clamp k to the vocabulary size: topk raises when k exceeds the dimension.
    tops, _ = torch.topk(next_token_logits, min(prefilter_k, vocab_size), dim=-1)
    # -inf (rather than a large finite value) stays valid in half precision.
    next_token_logits = next_token_logits.masked_fill(next_token_logits < tops[:, [-1]], float('-inf'))
    # (N, vocab_size)
    next_token_probs = torch.nn.functional.softmax(next_token_logits, dim=-1)
    # Keep the top-k candidates (ties at the threshold are kept as well) and
    # sample among them; multinomial accepts unnormalized weights.
    tops, _ = torch.topk(next_token_probs, min(top_k, vocab_size), dim=-1)
    next_token_probs = next_token_probs.masked_fill(next_token_probs < tops[:, [-1]], 0)
    # (N, 1)
    next_token_id = torch.multinomial(next_token_probs, 1)
    full_token_id = torch.concat([full_token_id, next_token_id], dim=-1)
    # Keep the mask aligned with the grown sequence: new tokens are valid.
    attention_mask = torch.concat(
        [attention_mask, attention_mask.new_ones((attention_mask.size(0), 1))], dim=-1)
    if eos_token_id is not None:
      finished |= next_token_id.squeeze(-1) == eos_token_id
      if bool(finished.all()):
        break
  return full_token_id

In [23]:
!pwd

/content


In [None]:
import os
import torch
import argparse
from omegaconf import OmegaConf
from trainer import MyTrainer
from models.chat import ChatLLM
from models.tokenizer import tokenizer
# Report whether a GPU is visible to PyTorch.
print(torch.cuda.is_available())
# Fall back to CPU instead of crashing on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the model from the keyword arguments stored in the YAML config.
cfg_path = 'models/config.yaml'
conf = OmegaConf.load(cfg_path)
model_config = OmegaConf.to_container(conf.trainer_config.model_config)
model = ChatLLM(**model_config).to(device)
# NOTE(review): no checkpoint is loaded in this cell; unless the imported
# ChatLLM restores weights itself, the text below comes from untrained
# parameters - confirm.
model.eval()
input_text = '诗词题目：独坐窗台'
full_tokens = model.chat(input_text, tokenizer, max_generate_tokens=50, temperature=0.8)
print(full_tokens)
