diff --git a/examples/vllm/vllm_acceleration_example.py b/examples/vllm/vllm_acceleration_example.py
index b56487c38bd..468ef26c7cb 100644
--- a/examples/vllm/vllm_acceleration_example.py
+++ b/examples/vllm/vllm_acceleration_example.py
@@ -37,42 +37,46 @@ def main(args_in: Optional[List[str]] = None) -> None:
     print(args)
 
     if args.benchmark:
-        if args.use_neural_speed:
-            os.environ["NEURAL_SPEED_VERBOSE"] = "1"
-            woq_config = RtnConfig(bits=4, weight_dtype="int4", compute_dtype="int8", scale_dtype="bf16")
-            model_with_ns = AutoModelForCausalLM.from_pretrained(args.model_path, quantization_config=woq_config)
-
-            tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
-            inputs = tokenizer(args.prompt, return_tensors="pt").input_ids
+        sampling_params = SamplingParams(max_tokens=32)
+        config = RtnConfig(compute_dtype="int8",
+                           group_size=128,
+                           scale_dtype="bf16",
+                           weight_dtype="int4_clip",
+                           bits=4)
+        print(config)
+        prompts = [args.prompt]
+        llm = LLM(model=args.model_path, trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(args.model_path, use_vllm=True, config=config)
 
-            T5 = time.time()
-            output = model_with_ns.generate(inputs, max_new_tokens=32)
-            T6 = time.time()
-            print("neural speed output = ", output)
+        for prompt in prompts:
+            vllm_outputs = llm.generate(prompt, sampling_params)  # Generate texts from the prompts.
+            qbits_output = model.generate(prompt, sampling_params)
 
-        llm = LLM(model=args.model_path, trust_remote_code=True)
-        sampling_params = SamplingParams(max_tokens=32)
-        T1 = time.time()
-        original_outputs = llm.generate(args.prompt, sampling_params)  # Generate texts from the prompts.
-        T2 = time.time()
-        vllm_latency = (T2 - T1) * 1000
+            print("vLLM input_tokens_length = ", len(vllm_outputs[0].prompt_token_ids),
+                  "output_tokens_length = ", len(vllm_outputs[0].outputs[0].token_ids))
+            print('The vLLM generate = ',
+                  vllm_outputs[0].metrics.finished_time - vllm_outputs[0].metrics.arrival_time, "s")
+            print("The vLLM first token time = ",
+                  vllm_outputs[0].metrics.first_token_time - vllm_outputs[0].metrics.first_scheduled_time)
 
-        model = AutoModelForCausalLM.from_pretrained(args.model_path, use_vllm=True)
-        T3 = time.time()
-        optimized_output = model.generate(args.prompt, sampling_params)
-        T4 = time.time()
-        qbits_latency = (T4 - T3) * 1000
+            print("QBits_vLLM input_tokens_length = ", len(qbits_output[0].prompt_token_ids),
+                  "output_tokens_length = ", len(qbits_output[0].outputs[0].token_ids))
+            print('The QBits optimized generate = ',
+                  qbits_output[0].metrics.finished_time - qbits_output[0].metrics.arrival_time, "s")
+            print("The QBits first token time = ",
+                  qbits_output[0].metrics.first_token_time - qbits_output[0].metrics.first_scheduled_time)
 
-        print("original outputs = ", original_outputs)
-        print("input_tokens_length = ", len(original_outputs[0].prompt_token_ids))
-        print("output_tokens_length = ", len(original_outputs[0].outputs[0].token_ids))
+        if args.use_neural_speed:
+            os.environ["NEURAL_SPEED_VERBOSE"] = "1"
+            woq_config = RtnConfig(bits=4, weight_dtype="int4", compute_dtype="int8", scale_dtype="bf16")
+            model_with_ns = AutoModelForCausalLM.from_pretrained(args.model_path,
+                                                                 quantization_config=woq_config)
 
-        print("optimized outputs = ", optimized_output)
-        print("input_tokens_length = ", len(optimized_output[0].prompt_token_ids))
-        print("output_tokens_length = ", len(optimized_output[0].outputs[0].token_ids))
+            tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
+            inputs = tokenizer(args.prompt, return_tensors="pt").input_ids
 
-        print('The qbits optimized generate:%.2f ms' % qbits_latency)
-        print('The original vLLM generate:%.2f ms' % vllm_latency)
+            output = model_with_ns.generate(inputs, max_new_tokens=32)
+            print("neural speed output = ", output)
 
         return
 
diff --git a/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py b/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py
index 0e073b258bc..70e5eefc387 100644
--- a/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py
+++ b/intel_extension_for_transformers/transformers/llm/quantization/nn/modules.py
@@ -146,8 +146,10 @@ def forward(self, x: torch.Tensor):
         bias = None if self.bias is None else self.bias.data.float()
         if not x.is_contiguous():
             x = x.contiguous()
+
+        # Only FP32 activation supports gemv which benefits next-token.
         out = matmul_kbit(
-            x.view(m, shape[-1]),
+            x.view(m, shape[-1]).float(),
             self.weight,
             bias,
             out,
diff --git a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
index a56f0645376..8d89cb955fc 100644
--- a/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
+++ b/intel_extension_for_transformers/transformers/modeling/modeling_auto.py
@@ -419,11 +419,15 @@ def forward(self, input: torch.Tensor) -> tuple[torch.Tensor, None]:
             model.load_weights(weights_iterator)
 
         print("INC quantizing...")
-        config = RtnConfig(compute_dtype="bf16",
-                           group_size=128,
-                           scale_dtype="bf16",
-                           weight_dtype="int4_clip",
-                           bits=4)
+        config = kwargs.pop("config", None)
+        if config is None:
+            config = RtnConfig(compute_dtype="int8",
+                               group_size=128,
+                               scale_dtype="bf16",
+                               weight_dtype="int4_clip",
+                               bits=4)
+            print("using default RTNConfig = ", config)
+        print("Using customized config = ", config)
         model = convert_to_quantized_model(model, config)
 
     return llm
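
Usage note (not part of the diff): a minimal sketch of how a caller can pass a custom RtnConfig through the new `config` kwarg on the vLLM path instead of relying on the default built in modeling_auto.py. The model path and prompt are placeholders, and the import locations follow the example script above.

    from vllm import SamplingParams
    from intel_extension_for_transformers.transformers import AutoModelForCausalLM, RtnConfig

    model_path = "meta-llama/Llama-2-7b-hf"  # placeholder; any supported model path
    custom_config = RtnConfig(compute_dtype="int8",
                              group_size=128,
                              scale_dtype="bf16",
                              weight_dtype="int4_clip",
                              bits=4)

    # use_vllm=True takes the vLLM-backed path; passing config overrides the
    # default RtnConfig that modeling_auto.py would otherwise construct.
    model = AutoModelForCausalLM.from_pretrained(model_path, use_vllm=True, config=custom_config)
    outputs = model.generate("Hello, my name is", SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)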