In [None]:
! pip install accelerate
! pip show onnxruntime-gpu

In [1]:
import torch
from transformers import pipeline

# Build the Dolly v2 (3B) text-generation pipeline with float16 weights.
generate_text = pipeline(
    model="databricks/dolly-v2-3b",
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,  # allows code shipped with the model repository to run
)

# Smoke test: run one prompt and print the first returned completion.
res = generate_text("Explain to me the difference between nuclear fission and fusion.")
print(res[0]["generated_text"])

Nuclear fission is the splitting of a light atom or molecule into two smaller atoms or molecules. This happens when subatomic particles called neutrons collide with the nucleus of the atom or molecule. The process releases more neutrons which collide with other atoms or molecules and release more neutrons which collide with still other atoms or molecules until the entire atom or molecule is broken down.

Nuclear fusion is when two or more atomic nuclei, each containing a nucleus of one or more atoms, join together to form one or more atomic nuclei, containing two or more atoms. This occurs when high energy particles, called protons, bombard two or more nuclei, and the particles bouncing back and forth form nuclear heat. It can also occur naturally as hydrogen in the sun's core fuses to release helium which powers the earth.

The process in a stars core is called nuclear fusion, and is responsible for the stars power.


In [4]:
from transformers import AutoModelForCausalLM
# Load the checkpoint without a torch_dtype override; this instance is the one
# exported to "model/fp32/" further down.
dolly_v2 = AutoModelForCausalLM.from_pretrained(
    "databricks/dolly-v2-3b",
)

## Hugging Face Conversion

In [None]:
# Attempted export through the `transformers.onnx` command-line entry point,
# writing into the local `onnx/` directory.
# Result: failed, because the gpt-neox architecture is not supported by this exporter yet.
!python -m transformers.onnx --model=databricks/dolly-v2-3b onnx/

### Optimum Conversion

In [None]:
optimum-cli export onnx --model databricks/dolly-v2-3b model/optimum_onnx/
# from optimum.onnxruntime import ORTModelForCausalLM
# ort_dolly_v2 = ORTModelForCausalLM.from_pretrained("databricks/dolly-v2-3b")

## Torch Conversion

In [27]:
def create_input_tensors(batch_size=1, seq_length=1024):
    """Build dummy ``(input_ids, attention_mask)`` tensors for tracing the model.

    Both tensors are filled with ones and use ``torch.int64``. The defaults
    reproduce the original fixed ``(1, 1024)`` shape, so existing calls without
    arguments behave exactly as before.

    Args:
        batch_size: Size of the first (batch) dimension.
        seq_length: Size of the second (sequence) dimension.

    Returns:
        A tuple of two independent tensors, each of shape
        ``(batch_size, seq_length)``.
    """
    shape = (batch_size, seq_length)
    input_ids = torch.ones(shape, dtype=torch.int64)
    attention_mask = torch.ones(shape, dtype=torch.int64)
    return input_ids, attention_mask

import os

# Make sure the target directory exists so the export also works on a fresh checkout.
os.makedirs("model/fp32", exist_ok=True)

# Trace the fp32 model with the dummy inputs and write it as ONNX (opset 17).
torch.onnx.export(
    dolly_v2,
    create_input_tensors(),
    "model/fp32/dolly_v2.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
    opset_version=17,
    # Axis indices must be ints. The earlier attempt used string keys ("0", "1"),
    # which is what produced
    # "_jit_pass_onnx_set_dynamic_input_shape(): incompatible function arguments".
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "seq_length"},
        "attention_mask": {0: "batch_size", 1: "seq_length"},
    },
    do_constant_folding=True,
)


In [31]:
# Reload the checkpoint with float16 weights; `device_map="auto"` leaves device
# placement to the loader.
dolly_v2_fp16 = AutoModelForCausalLM.from_pretrained(
    "databricks/dolly-v2-3b",
    device_map="auto",
    torch_dtype=torch.float16,
)

# Export the half-precision model to ONNX from the shared dummy inputs.
torch.onnx.export(
    dolly_v2_fp16,
    create_input_tensors(),
    "model/fp16/dolly_v2.onnx",
    opset_version=17,
    do_constant_folding=True,
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
)

In [32]:
# Reload the checkpoint with bfloat16 weights; `device_map="auto"` leaves device
# placement to the loader.
dolly_v2_bf16 = AutoModelForCausalLM.from_pretrained(
    "databricks/dolly-v2-3b",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Export the bfloat16 model to ONNX from the shared dummy inputs.
torch.onnx.export(
    dolly_v2_bf16,
    create_input_tensors(),
    "model/bf16/dolly_v2.onnx",
    opset_version=17,
    do_constant_folding=True,
    input_names=["input_ids", "attention_mask"],
    output_names=["output"],
)

## Olive Optimization

### Olive Conversion

In [1]:
import onnx
# Read the exported fp32 model back in so its graph can be edited below.
fp32_ort_dolly = onnx.load(
    "model/fp32/dolly_v2.onnx",
)

In [2]:
# Announce the step, then ask ONNX Runtime's conversion helper to derive an
# "initial run" decoder from the fp32 export and write it to the second path.
print("Creating an initial run dolly decoder from model/fp32/dolly_v2.onnx ")
from onnxruntime.transformers.convert_generation import generate_gpt2_init_decoder
# The call is the last expression of the cell, so its return value is displayed.
# NOTE(review): the recorded run displayed `False`, which presumably means no
# init decoder was produced for this model — confirm against the helper's docs.
# NOTE(review): the third positional argument looks like the external-data flag — confirm.
generate_gpt2_init_decoder("model/fp32/dolly_v2.onnx", "model/fp32/dolly_decoder_init_ort.pt", True)

Creating an initial run dolly decoder from model/fp32/dolly_v2.onnx 


False

In [3]:
import onnx
# Token ids used when configuring the generation operator.
eos_token_id = 0
bos_token_id = 0

# Rename the exported graph; it is later embedded as the decoder subgraph.
fp32_ort_dolly.graph.name = "Dolly decoder"

# Input slots of the Sampling operator: four named inputs followed by four
# empty names (an empty name leaves that slot unconnected).
inputs = ["input_ids", "max_length", "min_length", "repetition_penalty"] + [""] * 4
outputs = ["sequences"]

# Create the Sampling node directly in the "com.microsoft" operator domain.
node = onnx.helper.make_node(
    "Sampling",
    inputs=inputs,
    outputs=outputs,
    name="Sampling_dolly",
    domain="com.microsoft",
)

# Attribute protos for the node, built from (name, value) pairs; they are
# attached to the node in a later cell.
attr_to_extend = [
    onnx.helper.make_attribute(attr_name, attr_value)
    for attr_name, attr_value in (
        ("eos_token_id", eos_token_id),
        ("pad_token_id", -1),
        ("model_type", 0),
        ("no_repeat_ngram_size", 1),
        ("temperature", 1.0),
        ("top_p", 1.0),
        ("filter_value", -float("Inf")),
        ("min_tokens_to_keep", 1),
        ("custom", 1),
        ("presence_penalty", 0.0),
    )
]

In [4]:
# Attach the generation attributes to the Sampling node. Attributes whose name
# is already on the node are skipped, so re-running this cell does not append
# duplicate entries (a plain extend() would add them again on every run).
_present_attr_names = {attr.name for attr in node.attribute}
node.attribute.extend(
    [attr for attr in attr_to_extend if attr.name not in _present_attr_names]
)

In [6]:
from onnx import TensorProto
from onnxruntime.transformers.convert_generation import move_initializers

# Collect the initializers taken out of the decoder graph first; only afterwards
# is the graph attached to the Sampling node as its "decoder" attribute.
initializers = move_initializers(fp32_ort_dolly.graph)
print(f"{len(initializers)} initializers from the decoder are moved to the main graph")
node.attribute.append(onnx.helper.make_attribute("decoder", fp32_ort_dolly.graph))

# Float-typed value infos, each of shape [1].
repetition_penalty = onnx.helper.make_tensor_value_info("repetition_penalty", TensorProto.FLOAT, [1])
length_penalty = onnx.helper.make_tensor_value_info("length_penalty", TensorProto.FLOAT, [1])

# Int32-typed value infos; input_ids is declared with the fixed shape [1, 1024].
input_ids = onnx.helper.make_tensor_value_info("input_ids", TensorProto.INT32, [1, 1024])
max_length = onnx.helper.make_tensor_value_info("max_length", TensorProto.INT32, [1])
min_length = onnx.helper.make_tensor_value_info("min_length", TensorProto.INT32, [1])
num_beams = onnx.helper.make_tensor_value_info("num_beams", TensorProto.INT32, [1])
num_return_sequences = onnx.helper.make_tensor_value_info("num_return_sequences", TensorProto.INT32, [1])

# Only these four become inputs of the outer graph; num_beams,
# num_return_sequences and length_penalty are declared above but not listed.
graph_inputs = [input_ids, max_length, min_length, repetition_penalty]

388 initializers from the decoder are moved to the main graph


In [10]:
# Declare the int32 "sequences" graph output with two symbolic dimensions.
sequences = onnx.helper.make_tensor_value_info("sequences", TensorProto.INT32, ["batch_size", "max_length"])

In [None]:
# Assemble the outer graph: a single Sampling node, the declared inputs and
# output, and the initializers collected from the decoder.
graph_outputs = [sequences]
new_graph = onnx.helper.make_graph(
    nodes=[node],
    name="dolly v2 beam search",
    inputs=graph_inputs,
    outputs=graph_outputs,
    initializer=initializers,
)

In [9]:
    # Create the model
# Wrap the outer graph in a ModelProto, reusing the opset imports of the
# exported fp32 model.
new_model = onnx.helper.make_model(
    new_graph,
    opset_imports=fp32_ort_dolly.opset_import,
    producer_name="onnxruntime.transformers",
)

In [11]:

from onnxruntime.transformers.onnx_model import OnnxModel
# Write the assembled model to disk, keeping tensors as external data in a
# single companion file.
OnnxModel.save(
    new_model,
    "model/fp32/beam_search/dolly_v2.onnx",
    all_tensors_to_one_file=True,
    save_as_external_data=True,
)


In [None]:
from onnxruntime import InferenceSession

# InferenceSession expects a model path or serialized bytes, not an in-memory
# ModelProto, so open the file written by OnnxModel.save. Loading by path also
# lets ONNX Runtime find the external-data weights stored next to the model.
# Providers are listed explicitly (GPU first, CPU as fallback) because GPU
# builds of ONNX Runtime may refuse to create a session without them.
sess = InferenceSession(
    "model/fp32/beam_search/dolly_v2.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)