Skip to content

Commit

Permalink
llm example run_accuracy.py: load model to meta device for quantization (#2195)
Browse files Browse the repository at this point in the history
…on (#2195)

* llm example run_accuracy.py: load model to meta device for quantization

* Add more print

* Print exception

* Fix typo ipex._IPEXOnDevice -> ipex.IPEXOnDevice

* ipex.IPEXOnDevice -> ipex.OnDevice

* Fix typo _from_config -> from_config

* Remove mem usage print
  • Loading branch information
Xia-Weiwen committed Oct 25, 2023
1 parent c3a77f3 commit 0cd2502
Showing 1 changed file with 21 additions and 7 deletions.
28 changes: 21 additions & 7 deletions examples/cpu/inference/python/llm/single_instance/run_accuracy.py
Expand Up @@ -124,13 +124,27 @@ def __init__(
config, torchscript=with_jit, trust_remote_code=True
)

self.model = model_class[0].from_pretrained(
model_id,
low_cpu_mem_usage=True,
config=self.config,
torch_dtype=load_dtype,
trust_remote_code=True,
)
# For int8 quantization, try to construct the model on the "meta" device so
# that full-precision weights are not materialized in host memory; real
# weights are supplied later by the quantization flow.  If meta-device
# construction is unsupported by the installed ipex/transformers versions
# (RuntimeError/AttributeError), warn and fall back to a regular load.
# NOTE(review): assumes self._dtype, model_id, load_dtype, model_class and
# self.config were set earlier in __init__ — confirm against the full file.
loaded_on_meta = False
if self._dtype == "int8":
    try:
        with ipex.OnDevice(dtype=torch.float, device="meta"):
            self.model = AutoModelForCausalLM.from_config(self.config)
        loaded_on_meta = True
    except (RuntimeError, AttributeError) as e:
        # Best-effort: report the failure and fall through to the
        # standard checkpoint-loading path below.
        print("Warning: Loading model to meta device failed:", e)
if not loaded_on_meta:
    # Standard path (non-int8 dtypes, or meta-device load failed):
    # materialize weights directly from the checkpoint.
    self.model = model_class[0].from_pretrained(
        model_id,
        low_cpu_mem_usage=True,
        config=self.config,
        torch_dtype=load_dtype,
        trust_remote_code=True,
    )

# Evaluation/inference mode only (disables dropout etc.).
self.model = self.model.eval()

Expand Down

0 comments on commit 0cd2502

Please sign in to comment.