Update lm-eval evaluate in ort llm example (#1813)
Signed-off-by: yuwenzho <yuwen.zhou@intel.com>
yuwenzho committed May 24, 2024
1 parent eaa3a58 commit 54f039d
Showing 5 changed files with 59 additions and 46 deletions.
@@ -26,7 +26,6 @@
 import onnxruntime as ort
 from torch.nn.functional import pad
 from torch.utils.data import DataLoader
-from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
 from optimum.onnxruntime import ORTModelForCausalLM
 from transformers import LlamaConfig, LlamaTokenizer

@@ -198,28 +197,33 @@ def replace_architectures(json_path):
         json.dump(data, file, indent=4)

 def eval_func(model):
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+
     model_dir = model
     if isinstance(model, str) and model.endswith(".onnx"):
         model_dir = os.path.dirname(model)

     replace_architectures(os.path.join(model_dir, "config.json"))

-    results = evaluate(
-        model="hf-causal",
-        model_args="pretrained=" + model_dir + ",tokenizer="+ args.tokenizer,
+    eval_args = LMEvalParser(
+        model="hf",
+        model_args="pretrained=" + model_dir + ",tokenizer=" + args.tokenizer + ",model_format=onnx",
         batch_size=args.batch_size,
-        tasks=args.tasks,
-        model_format="onnx",
+        tasks=','.join(args.tasks),
+        device="cpu",
     )
+    results = evaluate(eval_args)

     eval_acc = 0
     for task_name in args.tasks:
         if task_name == "wikitext":
-            print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity"]))
-            eval_acc += results["results"][task_name]["word_perplexity"]
+            print("Accuracy for %s is: %s" %
+                  (task_name, results["results"][task_name]["word_perplexity,none"]))
+            eval_acc += results["results"][task_name]["word_perplexity,none"]
         else:
-            print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc"]))
-            eval_acc += results["results"][task_name]["acc"]
+            print("Accuracy for %s is: %s" %
+                  (task_name, results["results"][task_name]["acc,none"]))
+            eval_acc += results["results"][task_name]["acc,none"]

     if len(args.tasks) != 0:
         eval_acc /= len(args.tasks)
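For reference, lm-eval 0.4.x is driven through a config object rather than keyword arguments, and its results dict keys metrics as "<metric>,<filter>" (hence "acc,none" and "word_perplexity,none" above). A minimal sketch of the updated flow, assuming intel-extension-for-transformers >= 1.4.1 and lm-eval 0.4.2 as pinned below; the model path and tokenizer name are hypothetical:

    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import (
        LMEvalParser,
        evaluate,
    )

    eval_args = LMEvalParser(
        model="hf",
        # Hypothetical ONNX model directory and tokenizer.
        model_args="pretrained=./llama-2-7b-onnx,tokenizer=meta-llama/Llama-2-7b-hf,model_format=onnx",
        batch_size=1,
        tasks="lambada_openai,wikitext",  # comma-separated string, not a list
        device="cpu",
    )
    results = evaluate(eval_args)

    print(results["results"]["lambada_openai"]["acc,none"])
    print(results["results"]["wikitext"]["word_perplexity,none"])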
@@ -7,6 +7,6 @@ onnxruntime-extensions; python_version < '3.11'
 datasets
 optimum
 evaluate
-intel-extension-for-transformers
+intel-extension-for-transformers >= 1.4.1
 peft
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
+lm-eval==0.4.2
@@ -27,7 +27,6 @@
 import onnxruntime as ort
 from torch.nn.functional import pad
 from torch.utils.data import DataLoader
-from intel_extension_for_transformers.llm.evaluation.lm_eval import evaluate
 from optimum.onnxruntime import ORTModelForCausalLM
 from transformers import LlamaConfig, LlamaTokenizer

@@ -135,28 +134,33 @@ def replace_architectures(json_path):
         json.dump(data, file, indent=4)

 def eval_func(model):
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+
     model_dir = model
     if isinstance(model, str) and model.endswith(".onnx"):
         model_dir = os.path.dirname(model)

     replace_architectures(os.path.join(model_dir, "config.json"))

-    results = evaluate(
-        model="hf-causal",
-        model_args="pretrained=" + model_dir + ",tokenizer="+ args.tokenizer,
+    eval_args = LMEvalParser(
+        model="hf",
+        model_args="pretrained=" + model_dir + ",tokenizer=" + args.tokenizer + ",model_format=onnx",
         batch_size=args.batch_size,
-        tasks=args.tasks,
-        model_format="onnx",
+        tasks=','.join(args.tasks),
+        device="cpu",
     )
+    results = evaluate(eval_args)

     eval_acc = 0
     for task_name in args.tasks:
         if task_name == "wikitext":
-            print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity"]))
-            eval_acc += results["results"][task_name]["word_perplexity"]
+            print("Accuracy for %s is: %s" %
+                  (task_name, results["results"][task_name]["word_perplexity,none"]))
+            eval_acc += results["results"][task_name]["word_perplexity,none"]
         else:
-            print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc"]))
-            eval_acc += results["results"][task_name]["acc"]
+            print("Accuracy for %s is: %s" %
+                  (task_name, results["results"][task_name]["acc,none"]))
+            eval_acc += results["results"][task_name]["acc,none"]

     if len(args.tasks) != 0:
         eval_acc /= len(args.tasks)
@@ -7,6 +7,6 @@ onnxruntime-extensions; python_version < '3.11'
 datasets
 optimum
 evaluate
-intel-extension-for-transformers
+intel-extension-for-transformers >= 1.4.1
 peft
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
+lm-eval==0.4.2
49 changes: 27 additions & 22 deletions neural_compressor/adaptor/ox_utils/weight_only.py
@@ -97,9 +97,8 @@ def make_matmul_weight_only_node(
         op_type = "MatMulNBits"

         # pack quantized weight
-        for i in range(q_weight.shape[0]):
-            for k in range(0, group_size, 2):
-                packed[i][k // 2] = q_weight[i][k] | q_weight[i][k + 1] << 4
+        q_weight_pairs = q_weight[:, ::2] | q_weight[:, 1::2] << 4
+        packed[:, :] = q_weight_pairs[:, :blob_size]
         packed = np.reshape(packed, (-1, k_blocks, blob_size))

         # build scale tensor
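The packing rewrite trades the per-element Python loop for one strided NumPy expression over even/odd columns. A self-contained sketch of the equivalence (array sizes here are illustrative, and the sketch packs full rows rather than the blob_size slices the real code uses):

    import numpy as np

    rng = np.random.default_rng(0)
    q_weight = rng.integers(0, 16, size=(4, 32), dtype=np.uint8)  # 4-bit values

    # Loop form: pack adjacent 4-bit values into one byte, low nibble first.
    packed_loop = np.zeros((4, 16), dtype=np.uint8)
    for i in range(q_weight.shape[0]):
        for k in range(0, q_weight.shape[1], 2):
            packed_loop[i][k // 2] = q_weight[i][k] | q_weight[i][k + 1] << 4

    # Vectorized form: even columns fill the low nibble, odd columns the high.
    packed_vec = q_weight[:, ::2] | q_weight[:, 1::2] << 4

    assert np.array_equal(packed_loop, packed_vec)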
@@ -120,15 +119,14 @@
             packed_zp = np.reshape(zero_point, (1, -1)).astype("uint8")
         else:
             packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype="uint8")
-            for i in range(zero_point.shape[0] // k_blocks):
-                for j in range(k_blocks):
-                    idx = i * k_blocks + j
-                    zp = zero_point[idx]
-                    packed_zp[idx // 2] = (
-                        ((packed_zp[idx // 2] & 0x0F) | (zp << 4))
-                        if (idx & 1)
-                        else ((packed_zp[idx // 2] & 0xF0) | zp)
-                    )
+            # create an index array
+            idx = np.arange(zero_point.shape[0] // k_blocks * k_blocks).reshape(-1)
+            # separate odd and even indices
+            even_idx = idx[::2]
+            odd_idx = idx[1::2]
+            # vectorized operation for even and odd indices
+            packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel()
+            packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4)

         zp_tensor = onnx.helper.make_tensor(
             name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True
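The zero points get the same treatment: entries at even indices land in the low nibble of a byte and odd indices in the high nibble. A simplified sketch of the pattern (it drops the k_blocks bookkeeping and uses a made-up length):

    import numpy as np

    zero_point = np.arange(8, dtype=np.uint8) % 16  # toy 4-bit zero points

    # 136 == 0x88: pre-fill both nibbles with the mid-point value 8.
    packed_zp = np.full((zero_point.shape[0] + 1) // 2, 136, dtype=np.uint8)

    idx = np.arange(zero_point.shape[0])
    even_idx, odd_idx = idx[::2], idx[1::2]

    # Even entries overwrite the low nibble, odd entries the high nibble.
    packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx]
    packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx] << 4)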
@@ -224,9 +222,8 @@ def quant_tensor(data, num_bits=4, group_size=32, scheme="asym", dtype="int", ratio=1.0):
     if scheme == "sym":
         max_range = np.maximum(np.abs(rmin), np.abs(rmax))
         scale = np.ones(rmax.shape)
-        scale[max_range > 0] = np.array(
-            [float(i) / (maxq - minq) for i in (max_range[max_range > 0] * 2.0).flatten().tolist()]
-        )
+        mask = max_range > 0
+        scale[mask] = (max_range[mask] * 2.0).astype(np.float64) / (maxq - minq)
         zero_point = (
             np.zeros(scale.shape) if dtype == "int" else np.ones(rmax.shape, dtype="uint8") * (1 << (num_bits - 1))
         )
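Here the list comprehension becomes a single boolean-mask assignment, which keeps everything in NumPy and sidesteps the float()/tolist() round trip. A small sketch of the pattern under assumed shapes and values:

    import numpy as np

    max_range = np.array([[0.0], [1.5], [3.0]])
    maxq, minq = 7, -8  # e.g. a 4-bit signed range

    scale = np.ones(max_range.shape)
    mask = max_range > 0
    # Only positive ranges get a real scale; zero rows keep 1 to avoid 0/0.
    scale[mask] = (max_range[mask] * 2.0).astype(np.float64) / (maxq - minq)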
@@ -240,7 +237,14 @@
             if dtype == "int"
             else np.maximum(0, np.minimum(maxq, ((np.zeros(scale.shape) - rmin) / scale).round())).astype("uint8")
         )
-    return np.clip((data / scale + zero_point).round(), minq, maxq), scale, zero_point
+
+    q_weight = np.empty_like(data, dtype=scale.dtype)
+    np.divide(data, scale, out=q_weight)
+    np.add(q_weight, zero_point, out=q_weight)
+    np.round(q_weight, out=q_weight)
+    np.clip(q_weight, minq, maxq, out=q_weight)
+
+    return q_weight, scale, zero_point


 def qdq_tensor(data, num_bits=4, group_size=32, scheme="asym", dtype="int", ratio=1.0):
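The new return path builds the quantized tensor with in-place ufuncs (the out= argument), so each step reuses one buffer instead of allocating a fresh intermediate for divide, add, round, and clip. The same pattern in isolation, with made-up inputs:

    import numpy as np

    data = np.linspace(-1.0, 1.0, 8, dtype=np.float32).reshape(2, 4)
    scale = np.full((2, 1), 0.1, dtype=np.float32)
    zero_point = np.zeros((2, 1), dtype=np.float32)
    minq, maxq = -8, 7

    # round(data / scale + zero_point), clipped, all in one reused buffer.
    q_weight = np.empty_like(data, dtype=scale.dtype)
    np.divide(data, scale, out=q_weight)
    np.add(q_weight, zero_point, out=q_weight)
    np.round(q_weight, out=q_weight)
    np.clip(q_weight, minq, maxq, out=q_weight)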
@@ -756,6 +760,7 @@ def awq_quantize(
         model.remove_tensors_from_outputs([i.name for i in org_output])

         output_names = []
+
         for node in model.nodes():
             if (
                 node.op_type in ["MatMul"]
@@ -927,8 +932,8 @@ def find_params(weight):
         perm = np.argsort(np.diag(H))[::-1]
         W = W[perm, :]
         H = H[perm, :][:, perm]
-    Losses = np.zeros(W.shape)
-    Q = np.zeros(W.shape)
+    Losses = np.zeros_like(W)
+    Q = np.zeros_like(W)
     damp = percdamp * np.mean(np.diag(H))
     diag = np.arange(shape[0])
     H[diag, diag] += damp  # add a average value of
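np.zeros_like also inherits the dtype (and memory layout) of W, where np.zeros(W.shape) would default to float64. A one-liner to see the difference:

    import numpy as np

    W = np.ones((2, 2), dtype=np.float32)
    assert np.zeros(W.shape).dtype == np.float64  # shape-only: default dtype
    assert np.zeros_like(W).dtype == np.float32   # matches W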
@@ -939,9 +944,9 @@
         count = i2 - i1

         W1 = copy.deepcopy(W[i1:i2, :])
-        Q1 = np.zeros(W1.shape)
-        Err1 = np.zeros(W1.shape)
-        Losses1 = np.zeros(W1.shape)
+        Q1 = np.zeros_like(W1)
+        Err1 = np.zeros_like(W1)
+        Losses1 = np.zeros_like(W1)
         Hinv1 = Hinv[i1:i2, i1:i2]

         for i in range(count):  # within a block, channel wise
@@ -952,7 +957,7 @@
             if (i1 + i) % group_size == 0:
                 scale, zp = find_params(W[(i1 + i) : (i1 + i + group_size), :])

-            q = (scale * (np.clip(np.round(np.expand_dims(w, axis=1) / scale) + zp, 0, maxq) - zp)).flatten()
+            q = (scale * (np.clip(np.round(w[:, np.newaxis] / scale) + zp, 0, maxq) - zp)).flatten()
             Q1[i, :] = q
             Losses1[i, :] = (w - q) ** 2 / d**2

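The touched line is the usual quantize-dequantize round trip, with w[:, np.newaxis] as a lighter spelling of np.expand_dims(w, axis=1); both insert the axis needed to broadcast against the per-group scale. A hedged sketch with toy values:

    import numpy as np

    w = np.array([0.05, -0.12, 0.30])  # one channel of weights
    scale = np.array([[0.02]])         # per-group scale
    zp = np.array([[8]])               # zero point for a 4-bit unsigned grid
    maxq = 15

    # Quantize to the integer grid, clip to [0, maxq], then dequantize.
    q = (scale * (np.clip(np.round(w[:, np.newaxis] / scale) + zp, 0, maxq) - zp)).flatten()

    # Identical to the previous spelling.
    q_old = (scale * (np.clip(np.round(np.expand_dims(w, axis=1) / scale) + zp, 0, maxq) - zp)).flatten()
    assert np.allclose(q, q_old)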
