In [3]:
import torch
from modelscope.hub.snapshot_download import snapshot_download
from transformers import BertModel, BertTokenizer
import onnx
import netron

# Download the BERT backbone from ModelScope
model_name = "iic/nlp_bert_backbone_base_std"
# cache_dir = "./modelscope_bert"  (optional custom cache location; currently disabled)

# Download the model snapshot locally; the returned value is kept in
# model_dir and handed to from_pretrained() in the next cell
model_dir = snapshot_download(model_name)


Downloading: 100%|██████████| 232k/232k [00:00<00:00, 2.16MB/s]
Downloading: 100%|██████████| 520/520 [00:00<00:00, 1.02MB/s]
Downloading: 100%|██████████| 1.13k/1.13k [00:00<00:00, 2.56MB/s]
Downloading: 100%|██████████| 390M/390M [01:25<00:00, 4.76MB/s] 
Downloading: 100%|██████████| 3.57k/3.57k [00:00<00:00, 7.39MB/s]
Downloading: 100%|██████████| 107k/107k [00:00<00:00, 3.38MB/s]


In [4]:
# Load the model and tokenizer from the local snapshot directory
model = BertModel.from_pretrained(model_dir)
tokenizer = BertTokenizer.from_pretrained(model_dir)

# Prepare the sample input, tokenised into PyTorch tensors ("pt")
text = "Hello, my dog is cute"
inputs = tokenizer(text, return_tensors="pt")

# Put the model into evaluation mode. Because this is the last expression
# of the cell, the returned module is also rendered as the cell output.
model.eval()


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [5]:
# Export the BERT model to ONNX.
#
# The sample `inputs` are only used to trace the graph. Both the batch and the
# sequence dimension are declared dynamic: with only the batch axis marked, the
# exported graph would be pinned to the token count of the sample sentence and
# reject inputs of any other length.
onnx_model_path = "bert_model.onnx"
# Tracing does not need autograd state, so disable gradient tracking.
with torch.no_grad():
    torch.onnx.export(
        model,
        (inputs['input_ids'], inputs['attention_mask']),
        onnx_model_path,
        input_names=['input_ids', 'attention_mask'],
        # BertModel returns the per-token hidden states followed by the pooled
        # output (assumes the default pooling layer is enabled). The first name
        # stays 'output' so existing consumers keep working; the second output
        # is named explicitly instead of receiving an auto-generated name.
        output_names=['output', 'pooler_output'],
        dynamic_axes={
            'input_ids': {0: 'batch_size', 1: 'sequence_length'},
            'attention_mask': {0: 'batch_size', 1: 'sequence_length'},
            'output': {0: 'batch_size', 1: 'sequence_length'},
            'pooler_output': {0: 'batch_size'},
        },
    )
print(f"Model saved to {onnx_model_path}")


Model saved to bert_model.onnx


In [6]:
# Visualise the exported ONNX model with Netron. The call serves the file over
# a local HTTP address, and its return value (the address) is shown as the
# cell output.
netron.start(onnx_model_path)

Serving 'bert_model.onnx' at http://localhost:8080


('localhost', 8080)

In [7]:


import torch
from modelscope import AutoModel, AutoTokenizer
import onnx
import netron
# Download and load the pretrained model and tokenizer.
# Note: the checkpoint named below is Qwen1.5-0.5B-Chat, not a LLaMA model.
model_name = "qwen/Qwen1.5-0.5B-Chat"  # make sure you have permission to access this model
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading: 100%|██████████| 661/661 [00:00<00:00, 1.59MB/s]
Downloading: 100%|██████████| 51.0/51.0 [00:00<00:00, 125kB/s]
Downloading: 100%|██████████| 206/206 [00:00<00:00, 414kB/s]
Downloading: 100%|██████████| 7.11k/7.11k [00:00<00:00, 6.87MB/s]
Downloading: 100%|██████████| 1.59M/1.59M [00:00<00:00, 4.43MB/s]
Downloading: 100%|█████████▉| 1.15G/1.15G [04:18<00:00, 4.79MB/s]
Downloading: 100%|██████████| 4.15k/4.15k [00:00<00:00, 6.42MB/s]
Downloading: 100%|██████████| 6.70M/6.70M [00:01<00:00, 5.29MB/s]
Downloading: 100%|██████████| 1.26k/1.26k [00:00<00:00, 3.80MB/s]
Downloading: 100%|██████████| 2.65M/2.65M [00:00<00:00, 7.33MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Prepare a sample input; it is used only to trace the model during export
text = "Hello, my dog is cute"
inputs = tokenizer(text, return_tensors="pt")

# Put the model into evaluation mode before tracing
model.eval()

# Export to ONNX.
# NOTE(review): the file name says "llama" although the checkpoint loaded in
# the preceding cell is a Qwen model; the path is kept unchanged so anything
# that already refers to this file keeps working.
onnx_model_path = "llama_model.onnx"
# Tracing does not need autograd state, so disable gradient tracking.
with torch.no_grad():
    torch.onnx.export(
        model,
        (inputs['input_ids'], inputs['attention_mask']),
        onnx_model_path,
        input_names=['input_ids', 'attention_mask'],
        output_names=['output'],
        # Declare the sequence dimension dynamic as well as the batch
        # dimension; otherwise the exported graph only accepts inputs with
        # exactly as many tokens as the sample sentence.
        # Assumes 'output' is the (batch, sequence, hidden) hidden-state
        # tensor returned first by the base model - TODO confirm.
        dynamic_axes={
            'input_ids': {0: 'batch_size', 1: 'sequence_length'},
            'attention_mask': {0: 'batch_size', 1: 'sequence_length'},
            'output': {0: 'batch_size', 1: 'sequence_length'},
        },
    )
print(f"Model saved to {onnx_model_path}")

# Visualise the exported ONNX model; kept as the last expression so the
# address returned by Netron is displayed as the cell output
netron.start(onnx_model_path)


  elif sliding_window is None or key_value_length < sliding_window:
  if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
  if past_key_values_length > 0:
  if seq_len > self.max_seq_len_cached:
  if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-usi

Model saved to llama_model.onnx
Serving 'llama_model.onnx' at http://localhost:8081


('localhost', 8081)

In [9]:
from __future__ import print_function
import torch

In [None]:
# Allocate a 5x3 tensor without initialising its contents; torch.empty leaves
# the values as whatever was already in the allocated memory.
x = torch.empty(5, 3)
