import argparse
import time

import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
)

import intel_extension_for_pytorch as ipex

# Disable the TorchScript tensor-expression fuser and IPEX JIT linear repacking,
# which can interfere with the torch.compile path used below.
torch._C._jit_set_texpr_fuser_enabled(False)
try:
    ipex._C.disable_jit_linear_repack()
except Exception:
    pass

parser = argparse.ArgumentParser("Generation script (fp32/bf16 path)", add_help=False)
parser.add_argument("--deployment-mode", action="store_true")
args = parser.parse_args()

amp_dtype = getattr(torch, "bfloat16")

config = AutoConfig.from_pretrained(
    "EleutherAI/gpt-j-6b",
    torchscript=args.deployment_mode,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-j-6b",
    torch_dtype=amp_dtype,
    config=config,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b", trust_remote_code=True)

model = model.eval()
model = model.to(memory_format=torch.channels_last)

# Apply IPEX LLM-specific optimizations in bf16.
model = ipex.llm.optimize(
    model,
    dtype=amp_dtype,
    inplace=True,
    deployment_mode=args.deployment_mode,
)

# This script compiles the forward pass with the "ipex" backend, which is
# incompatible with deployment mode (TorchScript export).
if args.deployment_mode:
    raise SystemExit(
        "[ERROR] deployment_mode cannot co-work with torch.compile, "
        "please set deployment_mode to False if you want to use torch.compile."
    )
else:
    model.forward = torch.compile(model.forward, dynamic=True, backend="ipex")

num_warmup = 1
num_iter = 2
total_time = 0.0
batch_size = 24
seq_length = 384
vocab_size = model.config.vocab_size
sample_input = torch.randint(vocab_size, size=[batch_size, seq_length])

with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast(enabled=True):
    # Warm-up iterations are excluded from the timing.
    for i in range(int(num_warmup)):
        outputs = model(sample_input)

    for i in range(int(num_iter)):
        start_time = time.time()
        outputs = model(sample_input)
        end_time = time.time()
        total_time += (end_time - start_time) * 1e3  # ms

time_per_step = total_time / int(num_iter)
if batch_size == 1:
    print("%s: Latency = %.2f ms" % ("EleutherAI/gpt-j-6b", time_per_step))
else:
    print(
        "%s: Throughput = %.2f QPS"
        % ("EleutherAI/gpt-j-6b", (batch_size * 1000) / time_per_step)
    )
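
# Optional sketch (not part of the benchmark above): running actual text
# generation with the optimized, compiled model via the standard Hugging Face
# generate() API. The prompt string and max_new_tokens value below are
# illustrative assumptions, not taken from the source.
prompt = "What is the meaning of life?"  # hypothetical example prompt
inputs = tokenizer(prompt, return_tensors="pt")
with torch.inference_mode(), torch.cpu.amp.autocast(enabled=True):
    generated = model.generate(**inputs, max_new_tokens=32, do_sample=False)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])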