Update example scripts for ONNXRT DML EP (#1455)
Signed-off-by: chensuyue <suyue.chen@intel.com>
Co-authored-by: Wang, Mengni <mengni.wang@intel.com>
chensuyue and mengniwang95 committed Dec 13, 2023
1 parent f11c51b commit 098401d
Showing 8 changed files with 96 additions and 35 deletions.
@@ -264,7 +264,7 @@ def eval_func(model, dataloader, metric, backend):
     model = onnx.load(args.model_path)
     dataloader = Dataloader(args.dataset_location, args.label_path, args.batch_size)
     top1 = TopK()
-    backend = 'default' if args.device == 'cpu' else 'onnxrt_dml_ep'
+    backend = 'onnxrt_dml_ep' if args.device == 'npu' else 'default'
     def eval(onnx_model):
         return eval_func(onnx_model, dataloader, top1, backend)
 
@@ -289,7 +289,5 @@ def eval(onnx_model):
                                          device=args.device,
                                          backend=backend)
 
-        q_model = quantization.fit(model, config, calib_dataloader=dataloader,
-                                   eval_func=eval)
-
+        q_model = quantization.fit(model, config, calib_dataloader=dataloader, eval_func=eval)
         q_model.save(args.output_model)
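
The two hunks above establish the pattern this commit applies across every example: derive a backend string from the new `--device` flag, then thread the device/backend pair through the quantization config. A minimal sketch of that flow, assuming hypothetical placeholders for the model path, calibration loader, and eval function (none of these names come from this repository):

```python
# Sketch only: 'model.onnx', DummyLoader, and the lambda eval are placeholders.
import numpy as np
import onnx
from neural_compressor import PostTrainingQuantConfig, quantization

device = 'npu'  # the real scripts read this from --device
backend = 'onnxrt_dml_ep' if device == 'npu' else 'default'

class DummyLoader:
    """Stand-in calibration loader; yields (input, label) pairs like the examples' Dataloader."""
    batch_size = 1
    def __iter__(self):
        yield np.random.rand(1, 3, 224, 224).astype(np.float32), 0  # assumed NCHW input

model = onnx.load('model.onnx')  # placeholder path
config = PostTrainingQuantConfig(device=device, backend=backend)
q_model = quantization.fit(model, config,
                           calib_dataloader=DummyLoader(),
                           eval_func=lambda m: 1.0)  # stand-in accuracy callback
q_model.save('model-int8.onnx')
```
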
@@ -191,7 +191,8 @@ def fetcher(ids):
 
 def eval_func(model, dataloader, metric):
     metric.reset()
-    sess = ort.InferenceSession(model.SerializeToString(), providers=ort.get_available_providers())
+    provider = 'DmlExecutionProvider' if backend == 'onnxrt_dml_ep' else 'CPUExecutionProvider'
+    sess = ort.InferenceSession(model.SerializeToString(), providers=[provider])
     input_names = [i.name for i in sess.get_inputs()]
     for input_data, label in dataloader:
         output = sess.run(None, dict(zip(input_names, [input_data])))
@@ -252,29 +253,42 @@ def eval_func(model, dataloader, metric):
         default=1,
         type=int,
     )
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='cpu',
+        choices=['cpu', 'npu'],
+    )
     args = parser.parse_args()
+    backend = 'onnxrt_dml_ep' if args.device == 'npu' else 'default'
+
     model = onnx.load(args.model_path)
     dataloader = Dataloader(args.dataset_location, args.label_path, args.batch_size)
     top1 = TopK()
 
     def eval(onnx_model):
         return eval_func(onnx_model, dataloader, top1)
 
     if args.benchmark:
         if args.mode == 'performance':
             from neural_compressor.benchmark import fit
             from neural_compressor.config import BenchmarkConfig
-            conf = BenchmarkConfig(warmup=10, iteration=1000, cores_per_instance=4, num_of_instance=1)
+            conf = BenchmarkConfig(warmup=10,
+                                   iteration=1000,
+                                   cores_per_instance=4,
+                                   num_of_instance=1,
+                                   device=args.device,
+                                   backend=backend)
             fit(model, conf, b_dataloader=dataloader)
         elif args.mode == 'accuracy':
             acc_result = eval(model)
             print("Batch size = %d" % dataloader.batch_size)
             print("Accuracy: %.5f" % acc_result)
     if args.tune:
         from neural_compressor import quantization, PostTrainingQuantConfig
-        config = PostTrainingQuantConfig(quant_format=args.quant_format)
+        config = PostTrainingQuantConfig(quant_format=args.quant_format,
+                                         device=args.device,
+                                         backend=backend)
 
-        q_model = quantization.fit(model, config, calib_dataloader=dataloader,
-                                   eval_func=eval)
-
-        q_model.save(args.output_model)
\ No newline at end of file
+        q_model = quantization.fit(model, config, calib_dataloader=dataloader, eval_func=eval)
+        q_model.save(args.output_model)
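
The `eval_func` change above is the runtime half of the same pattern: the backend string now pins a single explicit execution provider instead of passing `ort.get_available_providers()`. A small sketch of that selection, with an availability guard that is my addition rather than part of the example scripts (which assume the DirectML build of onnxruntime is installed):

```python
import onnxruntime as ort

backend = 'onnxrt_dml_ep'  # as derived from --device npu
provider = 'DmlExecutionProvider' if backend == 'onnxrt_dml_ep' else 'CPUExecutionProvider'
if provider not in ort.get_available_providers():
    provider = 'CPUExecutionProvider'  # fall back rather than fail at session creation

sess = ort.InferenceSession('model.onnx', providers=[provider])  # placeholder path
print(sess.get_providers())  # shows which EP the session actually bound
```

Pinning the provider list also avoids a subtle footgun: with `get_available_providers()`, ONNX Runtime prefers whatever accelerator EP happens to be installed, so a CPU benchmark could silently measure the wrong device.
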
@@ -115,6 +115,7 @@ def result(self):
             return 0
         return self.num_correct / self.num_sample
 
+
 class Dataloader:
     def __init__(self, dataset_location, image_list, batch_size):
         self.batch_size = batch_size
@@ -206,15 +207,18 @@ def fetcher(ids):
         except StopIteration:
             return
 
+
 def eval_func(model, dataloader, metric):
     metric.reset()
-    sess = ort.InferenceSession(model.SerializeToString(), providers=ort.get_available_providers())
+    provider = 'DmlExecutionProvider' if backend == 'onnxrt_dml_ep' else 'CPUExecutionProvider'
+    sess = ort.InferenceSession(model.SerializeToString(), providers=[provider])
     input_names = [i.name for i in sess.get_inputs()]
     for input_data, label in dataloader:
         output = sess.run(None, dict(zip(input_names, [input_data])))
         metric.update(output, label)
     return metric.result()
 
+
 if __name__ == "__main__":
     logger.info("Evaluating ONNXRuntime full precision accuracy and performance:")
     parser = argparse.ArgumentParser(
@@ -275,7 +279,14 @@ def eval_func(model, dataloader, metric):
         default=1,
         type=int,
     )
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='cpu',
+        choices=['cpu', 'npu'],
+    )
     args = parser.parse_args()
+    backend = 'onnxrt_dml_ep' if args.device == 'npu' else 'default'
 
     model = onnx.load(args.model_path)
     dataloader = Dataloader(args.dataset_location, args.label_path, args.batch_size)
@@ -297,6 +308,8 @@ def eval(onnx_model):
                 cores_per_instance=4,
                 num_of_instance=1,
                 diagnosis=args.diagnose,
+                device=args.device,
+                backend=backend,
             )
             fit(model, conf, b_dataloader=dataloader)
         elif args.mode == 'accuracy':
@@ -308,9 +321,9 @@ def eval(onnx_model):
         config = PostTrainingQuantConfig(
             quant_format=args.quant_format,
             diagnosis=args.diagnose,
+            device=args.device,
+            backend=backend
         )
 
-        q_model = quantization.fit(model, config, calib_dataloader=dataloader,
-                                   eval_func=eval)
-
+        q_model = quantization.fit(model, config, calib_dataloader=dataloader, eval_func=eval)
         q_model.save(args.output_model)
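
The benchmark path gets the same treatment as tuning: `BenchmarkConfig` now carries `device` and `backend`. A sketch of just the benchmark call, assuming a placeholder model path and a stand-in loader (any iterable of `(input, label)` pairs with a `batch_size` attribute, matching the examples' Dataloader protocol):

```python
import numpy as np
import onnx
from neural_compressor.benchmark import fit
from neural_compressor.config import BenchmarkConfig

class Loader:  # stand-in loader, not from this repo
    batch_size = 1
    def __iter__(self):
        yield np.random.rand(1, 3, 224, 224).astype(np.float32), 0

model = onnx.load('model.onnx')  # placeholder path
conf = BenchmarkConfig(warmup=10,
                       iteration=1000,
                       cores_per_instance=4,
                       num_of_instance=1,
                       device='npu',            # from --device
                       backend='onnxrt_dml_ep')
fit(model, conf, b_dataloader=Loader())
```
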
17 changes: 15 additions & 2 deletions examples/onnxrt/nlp/bert/quantization/ptq_static/main.py
@@ -350,8 +350,17 @@ def result(self):
         choices=["distilbert", "bert", "mobilebert", "roberta"],
         help="model type"
     )
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='cpu',
+        choices=['cpu', 'npu'],
+    )
     args = parser.parse_args()
 
+    # set config for npu test
+    backend = 'onnxrt_dml_ep' if args.device == 'npu' else 'default'
+
     dataset = ONNXRTBertDataset(args.model_path,
                                 data_dir=args.data_path,
                                 model_name_or_path=args.model_name_or_path,
@@ -364,8 +373,8 @@ def result(self):
 
     def eval_func(model):
         metric.reset()
-        session = onnxruntime.InferenceSession(model.SerializeToString(),
-                                               providers=onnxruntime.get_available_providers())
+        provider = 'DmlExecutionProvider' if backend == 'onnxrt_dml_ep' else 'CPUExecutionProvider'
+        session = onnxruntime.InferenceSession(model.SerializeToString(), providers=[provider])
         ort_inputs = {}
         len_inputs = len(session.get_inputs())
         inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)]
@@ -388,6 +397,8 @@ def eval_func(model):
                 iteration=100,
                 cores_per_instance=4,
                 num_of_instance=1,
+                device=args.device,
+                backend=backend
             )
             fit(model, conf, b_dataloader=dataloader)
         elif args.mode == "accuracy":
@@ -425,6 +436,8 @@ def eval_func(model):
             quant_format=args.quant_format,
             calibration_sampling_size=[8, 16, 32],
             recipes={"optypes_to_exclude_output_quant": ["MatMul", "Gemm", "Attention", "FusedGemm"]},
+            device=args.device,
+            backend=backend
         )
         q_model = quantization.fit(model,
                                    config,
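
For the BERT static-PTQ example, the device/backend pair simply joins the existing quantization knobs. The config below mirrors the diff; the `quant_format` value is my assumption, since the flag's default is not visible here:

```python
from neural_compressor import PostTrainingQuantConfig

config = PostTrainingQuantConfig(
    quant_format='QOperator',  # assumed value of --quant_format
    calibration_sampling_size=[8, 16, 32],
    recipes={"optypes_to_exclude_output_quant": ["MatMul", "Gemm", "Attention", "FusedGemm"]},
    device='npu',
    backend='onnxrt_dml_ep',
)
```

The `recipes` entry keeps the outputs of the listed op types unquantized, a common accuracy-preserving choice for transformer MatMul/Attention blocks.
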
@@ -340,8 +340,15 @@ def result(self):
         default=768,
         type=int,
     )
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='cpu',
+        choices=['cpu', 'npu'],
+    )
 
     args = parser.parse_args()
+    backend = 'onnxrt_dml_ep' if args.device == 'npu' else 'default'
 
     dataset = ONNXRTBertDataset(args.model_path,
                                 data_dir=args.data_path,
@@ -352,8 +359,8 @@ def result(self):
 
     def eval_func(model, *args):
         metric.reset()
-        session = ort.InferenceSession(model.SerializeToString(),
-                                       providers=ort.get_available_providers())
+        provider = 'DmlExecutionProvider' if backend == 'onnxrt_dml_ep' else 'CPUExecutionProvider'
+        session = ort.InferenceSession(model.SerializeToString(), providers=[provider])
         ort_inputs = {}
         len_inputs = len(session.get_inputs())
         inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)]
@@ -374,7 +381,9 @@ def eval_func(model, *args):
             from neural_compressor.config import BenchmarkConfig
             conf = BenchmarkConfig(iteration=100,
                                    cores_per_instance=28,
-                                   num_of_instance=1)
+                                   num_of_instance=1,
+                                   device=args.device,
+                                   backend=backend)
             fit(model, conf, b_dataloader=dataloader)
         elif args.mode == 'accuracy':
             acc_result = eval_func(model)
@@ -413,6 +422,8 @@ def eval_func(model, *args):
         if args.model_name_or_path == 'Alireza1044/albert-base-v2-sst2':
             specific_quant_config['recipes'] = {'first_conv_or_matmul_quantization': False}
         config = PostTrainingQuantConfig(approach='dynamic',
+                                         device=args.device,
+                                         backend=backend,
                                          **specific_quant_config)
         q_model = quantization.fit(model,
                                    config,
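
This GLUE example uses dynamic post-training quantization, so there is no calibration step; the device/backend pair is merged with any model-specific recipe. A sketch with values taken from the diff:

```python
from neural_compressor import PostTrainingQuantConfig

model_name_or_path = 'Alireza1044/albert-base-v2-sst2'  # example value from the diff
specific_quant_config = {}
if model_name_or_path == 'Alireza1044/albert-base-v2-sst2':
    specific_quant_config['recipes'] = {'first_conv_or_matmul_quantization': False}

config = PostTrainingQuantConfig(approach='dynamic',
                                 device='npu',
                                 backend='onnxrt_dml_ep',
                                 **specific_quant_config)
```
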
@@ -27,9 +27,9 @@
 from data_utils import ComposeTransform, ResizeTransform, LabelBalanceCOCORawFilter
 
 logger = logging.getLogger(__name__)
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.WARN)
+logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                    datefmt='%m/%d/%Y %H:%M:%S',
+                    level=logging.WARN)
 logger.info("Evaluating ONNXRuntime full precision accuracy and performance:")
 parser = argparse.ArgumentParser(
     formatter_class=argparse.ArgumentDefaultsHelpFormatter
@@ -89,7 +89,14 @@
     default=16,
     help="quantization format"
 )
+parser.add_argument(
+    '--device',
+    type=str,
+    default='cpu',
+    choices=['cpu', 'npu'],
+)
 args = parser.parse_args()
+backend = 'onnxrt_dml_ep' if args.device == 'npu' else 'default'
 
 if __name__ == "__main__":
     model = onnx.load(args.model_path)
@@ -106,8 +113,8 @@
 
     def eval_func(model):
         metric.reset()
-        session = ort.InferenceSession(model.SerializeToString(),
-                                       providers=ort.get_available_providers())
+        provider = 'DmlExecutionProvider' if backend == 'onnxrt_dml_ep' else 'CPUExecutionProvider'
+        session = ort.InferenceSession(model.SerializeToString(), providers=[provider])
         ort_inputs = {}
         len_inputs = len(session.get_inputs())
         inputs_names = [session.get_inputs()[i].name for i in range(len_inputs)]
@@ -143,6 +150,8 @@ def eval_func(model):
                 cores_per_instance=4,
                 num_of_instance=1,
                 diagnosis=args.diagnose,
+                device=args.device,
+                backend=backend,
             )
             fit(model, conf, b_dataloader=eval_dataloader)
         elif args.mode == 'accuracy':
@@ -161,6 +170,8 @@ def eval_func(model):
             quant_format=args.quant_format,
             calibration_sampling_size=[50],
             diagnosis=args.diagnose,
+            device=args.device,
+            backend=backend,
         )
         q_model = quantization.fit(model, config, calib_dataloader=calib_dataloader, eval_func=eval_func)
-        q_model.save(args.output_model)
\ No newline at end of file
+        q_model.save(args.output_model)
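
Every example in this commit adds the identical argparse flag, so the CLI surface stays uniform. A standalone sketch of the flag and the derived backend; the inline argv is only for illustration:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--device', type=str, default='cpu', choices=['cpu', 'npu'])
args = parser.parse_args(['--device', 'npu'])  # inline argv for illustration
backend = 'onnxrt_dml_ep' if args.device == 'npu' else 'default'
print(args.device, backend)  # -> npu onnxrt_dml_ep
```
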
17 changes: 9 additions & 8 deletions neural_compressor/adaptor/onnxrt.py
@@ -1315,14 +1315,15 @@ def query_fw_capability(self, model):
         attention_matmul = []
         for _, node in enumerate(self.pre_optimized_model.nodes()):
             if node.op_type in ["Conv", "MatMul", "Attention"]:
-                # get first Conv or MatMul node
-                if len(first_quantizable_node) == 0:
-                    first_quantizable_node.append(node)
-
-                # get last Conv or MatMul node
-                if len(last_quantizable_node) != 0:
-                    last_quantizable_node.pop()
-                last_quantizable_node.append(node)
+                if node.op_type in optype_wise:
+                    # get first Conv or MatMul node
+                    if len(first_quantizable_node) == 0:
+                        first_quantizable_node.append(node)
+
+                    # get last Conv or MatMul node
+                    if len(last_quantizable_node) != 0:
+                        last_quantizable_node.pop()
+                    last_quantizable_node.append(node)
 
                 all_conv_matmul.append(node)
                 if node.op_type != "Conv":
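
The adaptor change nests the first/last-quantizable-node bookkeeping under an extra `node.op_type in optype_wise` check, so a Conv/MatMul/Attention node that the current backend cannot quantize no longer counts as the first or last quantizable node. A toy rendering of that logic with stand-in data:

```python
# Stand-ins: a linear "graph" of op types and a capability table lacking Attention.
nodes = ['Relu', 'Conv', 'MatMul', 'Attention']
optype_wise = {'Conv': {}, 'MatMul': {}}

first_quantizable_node, last_quantizable_node = [], []
for op_type in nodes:
    if op_type in ['Conv', 'MatMul', 'Attention']:
        if op_type in optype_wise:
            # get first Conv or MatMul node
            if len(first_quantizable_node) == 0:
                first_quantizable_node.append(op_type)
            # get last Conv or MatMul node
            if len(last_quantizable_node) != 0:
                last_quantizable_node.pop()
            last_quantizable_node.append(op_type)

print(first_quantizable_node, last_quantizable_node)  # ['Conv'] ['MatMul']
```

Here `Attention` is skipped because the capability table does not list it, which is exactly the situation the gate is for.
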
2 changes: 1 addition & 1 deletion neural_compressor/benchmark.py
@@ -513,7 +513,7 @@ def fit(model, conf, b_dataloader=None, b_func=None):
     assert sys.platform in ["linux", "win32", "darwin"], "platform not supported..."
     # disable multi-instance for running benchmark on GPU device
     set_all_env_var(conf)
-    if conf.device == "gpu" or sys.platform == "darwin":
+    if conf.device == "gpu" or conf.device == "npu" or sys.platform == "darwin":
         set_env_var("NC_ENV_CONF", True, overwrite_existing=True)
 
     if conf.diagnosis and os.environ.get("NC_ENV_CONF", None) in [None, "False"]:
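
`benchmark.py` previously forced single-instance mode only for GPU (and on macOS); the commit extends the same guard to `npu`, presumably because the multi-instance core-binding scheme targets CPU. A toy version of the condition, with a stand-in config object:

```python
import sys

class Conf:  # stand-in for the benchmark config object
    device = 'npu'

conf = Conf()
force_single_instance = (conf.device == 'gpu' or conf.device == 'npu'
                         or sys.platform == 'darwin')
print(force_single_instance)  # True when benchmarking on the DML EP
```
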
