-
Notifications
You must be signed in to change notification settings - Fork 203
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Engine] Inner Product FP8 weight compression format dispatch for LLM (…
- Loading branch information
Showing
7 changed files
with
248 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
77 changes: 77 additions & 0 deletions
77
intel_extension_for_transformers/backends/neural_engine/compile/optimizer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (c) 2021 Intel Corporation | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
"""The neural engine optimizer module.""" | ||
|
||
from .graph import Graph | ||
from . import graph_utils as util | ||
from . import logger | ||
|
||
# Weight storage tags supported per compression family. 'ANY' asks the
# optimizer to choose a format automatically (currently defaults to INT8).
OPTIMIZED_WEIGHT_FORMAT_TAG = {'FP8': ['ANY', 'INT8', 'FP8_4E3M', 'FP8_5E2M']}
|
||
|
||
class Optimizer:
    """The definition of the neural engine optimizer.

    Dispatches weight storage-format optimizations (FP8/INT8 weight
    compression for BF16 inference) by recording environment variables,
    then applies them before inference.
    """

    def __init__(self, graph, input_shape=None, *args, **kwargs):
        """The optimizer initialization.

        Args:
            graph: neural engine Graph class
            input_shape: list of list, model input data shape list
        """
        assert isinstance(graph, Graph), 'graph must be an instance of Graph class'
        self.graph = graph
        self.input_shape = input_shape
        self.cast_dtype = util.get_autocast_info()['cast_type']
        # 'native' means keep the weight in its original storage format.
        self.weight_dtype = util.get_autocast_info().get('weight_dtype', 'native')
        try:
            util.get_environ_info()
        except Exception:
            # Environ info store not initialized yet; create it. Narrowed from a
            # bare `except:` so SystemExit/KeyboardInterrupt are not swallowed.
            util.environ_info_init()

    def optimize(self):
        """Optimize the graph."""
        self.weight_optimization()
        # Set env vars before inference. These env vars could help accelerate inference speed.
        util.set_environ_vars(util.get_environ_info())

    def weight_optimization(self):
        """Optimize weight format (only for BF16 graphs with a supported tag)."""
        if self.cast_dtype == 'bf16' and \
           self.weight_dtype.upper() in OPTIMIZED_WEIGHT_FORMAT_TAG['FP8']:
            self._weight_fp8_dispatch(self.weight_dtype.upper())

    def _weight_fp8_dispatch(self, w_tag):
        """Optimize BF16 graph by using FP8 weight format.

        Args:
            w_tag: upper-cased weight format tag ('ANY', 'INT8', 'FP8_4E3M',
                   'FP8_5E2M'); anything else only logs a warning.
        """
        tag2env = {'INT8': 'NE_WEIGHT_INT8', 'FP8_4E3M': 'NE_WEIGHT_FP8_4E3M',
                   'FP8_5E2M': 'NE_WEIGHT_FP8_5E2M'}
        # Clear any stale weight-format settings before dispatching a new one.
        util.del_environ_vars(list(tag2env.values()))
        util.remove_environ_info_items(list(tag2env.values()))
        if w_tag == 'ANY':
            # TODO: Consider to add best fp8 weight format search
            w_tag = 'INT8'
        if w_tag in tag2env:
            logger.info('Using FP8 weight storage format {} for BF16 model inference'.format(
                w_tag))
            util.insert_environ_info(tag2env[w_tag], '1')
        else:
            logger.warning('Unknown FP8 weight compression format, please use {}'.format(
                OPTIMIZED_WEIGHT_FORMAT_TAG['FP8']))
89 changes: 89 additions & 0 deletions
89
intel_extension_for_transformers/backends/neural_engine/test/pytest/test_graph_optimizer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (c) 2022 Intel Corporation | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import unittest | ||
import numpy as np | ||
import shutil | ||
from intel_extension_for_transformers.backends.neural_engine.compile.ops.op import OPERATORS | ||
from intel_extension_for_transformers.backends.neural_engine.compile.ops.tensor import Tensor | ||
from intel_extension_for_transformers.backends.neural_engine.compile.graph import Graph | ||
from intel_extension_for_transformers.backends.neural_engine.compile import compile, autocast | ||
import copy | ||
|
||
|
||
def fp32_to_bf16(fp32_np):
    """Truncate an fp32 numpy array to bf16, returned as int16 bit patterns.

    Arrays already of dtype int16 are treated as pre-converted bf16 data and
    returned unchanged (same object).
    """
    if fp32_np.dtype == np.int16:
        return fp32_np
    # Reinterpret the fp32 bits as int32 and keep only the upper 16 bits
    # (sign + exponent + top 7 mantissa bits), i.e. truncating bf16 rounding.
    bits = copy.deepcopy(fp32_np).view(dtype=np.int32)
    return (bits >> 16).astype(np.int16)
|
||
class TestExecutionOptions(unittest.TestCase):
    """FP8/INT8 weight-compression dispatch test on a bf16 InnerProduct graph."""

    @classmethod
    def setUpClass(cls):
        """Build a minimal Input -> InnerProduct -> Output bf16 graph, save as IR.

        Note: classmethod parameter renamed from `self` to the conventional
        `cls` (it receives the class, not an instance).
        """
        cls.ir_path = 'optimizer_ir'
        graph = Graph()
        input_data_node = OPERATORS['Input']()
        input_tensors = []
        output_tensors = [Tensor(name="activation", shape=[-1, -1], dtype="bf16")]
        input_data_node.construct('input_data', 'Input', input_tensors=input_tensors,
                                  output_tensors=output_tensors)
        ip_node = OPERATORS['InnerProduct']()
        # Weight/bias data stored as bf16 bit patterns (int16) via fp32_to_bf16.
        input_tensors = [Tensor(name="activation", shape=[-1, -1], dtype="bf16"),
                         Tensor(name="weight", shape=[256, 256], dtype="bf16",
                                data=fp32_to_bf16(np.random.randn(256, 256).astype(np.float32))),
                         Tensor(name="bias", shape=[256], dtype="bf16",
                                data=fp32_to_bf16(np.random.randn(256).astype(np.float32)))]
        output_tensors = [Tensor(name='ip:0', source_op=['ip'], dest_op=['output_data'])]
        ip_node.construct('ip', 'InnerProduct', input_tensors=input_tensors,
                          output_tensors=output_tensors)
        output_node = OPERATORS['Output']()
        input_tensors = [Tensor(name='ip:0', source_op=['ip'], dest_op=['output_data'])]
        output_tensors = []
        output_node.construct('output_data', 'Output', input_tensors=input_tensors,
                              output_tensors=output_tensors)
        graph.insert_nodes(len(graph.nodes), [input_data_node, ip_node, output_node])
        graph.save(cls.ir_path)
        del graph

    @classmethod
    def tearDownClass(cls):
        """Remove the IR directory created in setUpClass."""
        shutil.rmtree(cls.ir_path)

    def test_fp8_weight_compression(self):
        """Every weight_dtype variant must stay close to the bf16 baseline."""
        data = fp32_to_bf16(np.random.randn(128, 256).astype(np.float32))
        # Baseline: plain bf16 inference without weight compression.
        graph = compile(self.ir_path)
        g_ret = copy.deepcopy(graph.inference([data])['ip:0'])
        fp8_ret = []
        for w_tag in ['any', 'int8', 'fp8_5e2m', 'fp8_4e3m']:
            with autocast('bf16', weight_dtype=w_tag):
                graph = compile(self.ir_path)
            fp8_ret.append(copy.deepcopy(graph.inference([data])['ip:0']))
        # Loose tolerance (atol=1.0): compressed weights only need to be close.
        for ret in fp8_ret:
            self.assertTrue(np.allclose(g_ret, ret, atol=1e0, equal_nan=True))
|
||
# Script entry point: run the unittest discovery/runner for this module.
if __name__ == "__main__":
    unittest.main()