intel-analytics · chtanch · Feb 27, 2024 · Feb 27, 2024 · Feb 27, 2024 · Feb 27, 2024
diff --git a/modules/benchmark_util.py b/modules/benchmark_util.py
diff --git a/modules/models.py b/modules/models.py
@@ -324,6 +324,7 @@ def AutoAWQ_loader(model_name):
 def bigdl_llm_loader(model_name):
 
     from bigdl.llm.transformers import AutoModelForCausalLM, AutoModel, AutoModelForSeq2SeqLM
+    from modules.benchmark_util import BenchmarkWrapper
 
     path_to_model = Path(f'{shared.args.model_dir}/{model_name}')
 
@@ -356,6 +357,10 @@ def bigdl_llm_loader(model_name):
 
     tokenizer = AutoTokenizer.from_pretrained(path_to_model, trust_remote_code=shared.args.trust_remote_code)
 
+    if shared.args.run_benchmark:
+        model = BenchmarkWrapper(model)
+        print('BenchmarkWrapper applied')
+
     return model, tokenizer
 
 

diff --git a/modules/shared.py b/modules/shared.py
@@ -169,6 +169,7 @@
 #group.add_argument('--lightweight-bmm', action='store_true', help='Whether to replace the torch.bmm ops, may need to set it to `True` when running BigDL-LLM on GPU on Windows.')
 group.add_argument('--use-cache', action='store_true', default=True, help='If use_cache is True, past key values are used to speed up decoding if applicable to model.')
 group.add_argument('--trust-remote-code', action='store_true', default=True, help='Set trust_remote_code=True while loading the model. Necessary for some models.')
+group.add_argument('--run-benchmark', action='store_true', default=False, help='Set run-benchmark to run model warm up and generate benchmark output.')
 
 # HQQ
 group = parser.add_argument_group('HQQ')

diff --git a/modules/text_generation.py b/modules/text_generation.py
@@ -375,6 +375,12 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings
         pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(filtered_params)
         print()
 
+    # Warm-up: when benchmarking is enabled, run one untimed generation before t0 is taken.
+    if shared.args.run_benchmark:
+        with torch.no_grad():
+            shared.model.generate(**generate_params)
+            torch.xpu.synchronize()
+
     t0 = time.time()
     try:
         if not is_chat and not shared.is_seq2seq:
@@ -403,11 +409,6 @@ def generate_with_callback(callback=None, *args, **kwargs):
             def generate_with_streaming(**kwargs):
                 return Iteratorize(generate_with_callback, [], kwargs, callback=None)
 
-            # warm-up
-            with torch.no_grad():
-                shared.model.generate(**generate_params)
-                torch.xpu.synchronize()
-
             with generate_with_streaming(**generate_params) as generator:
                 cumulative_reply = ''
                 starting_from = 0 if shared.is_seq2seq else len(input_ids[0])
@@ -431,6 +432,11 @@ def generate_with_streaming(**kwargs):
         original_tokens = len(original_input_ids[0])
         new_tokens = len(output) - (original_tokens if not shared.is_seq2seq else 0)
         print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
+
+        if shared.args.run_benchmark:
+            first_cost = shared.model.first_cost*1000
+            rest_cost_mean = np.mean(shared.model.last_token_time)*1000
+            print(f'Latency 1st/2nd+: {first_cost:.2f}ms, {rest_cost_mean:.2f}ms. Input/output tokens: {original_tokens}, {new_tokens}')
         return
 
 

diff --git a/modules/ui.py b/modules/ui.py
@@ -65,6 +65,7 @@ def list_model_elements():
         'cpu_embedding',
         #'lightweight_bmm',
         'use_cache',
+        'run_benchmark',
         'compute_dtype',
         'quant_type',
         'use_double_quant',

diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py
@@ -139,6 +139,7 @@ def create_ui():
                             #shared.gradio['lightweight_bmm'] = gr.Checkbox(label="lightweight-bmm", value=shared.args.lightweight_bmm, info="Whether to replace the torch.bmm ops.")
                             shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='To enable this option, start the web UI with the --trust-remote-code flag. It is necessary for some models.')
                             shared.gradio['use_cache'] = gr.Checkbox(label="use-cache", value=shared.args.use_cache, info="Wether to use past_key_values to speed up model decoding.")
+                            shared.gradio['run_benchmark'] = gr.Checkbox(label="run-benchmark", value=shared.args.run_benchmark, info='Enable this option to run model warm up and generate benchmark output.')
                             shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Create an additional cache for CFG negative prompts.')
                             shared.gradio['logits_all'] = gr.Checkbox(label="logits_all", value=shared.args.logits_all, info='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.')
                             shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')