import argparse
import time

import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
)

import intel_extension_for_pytorch as ipex

# Disable the TorchScript tensor-expression fuser and IPEX JIT linear repacking,
# which can interfere with the torch.compile path used below.
torch._C._jit_set_texpr_fuser_enabled(False)
try:
    ipex._C.disable_jit_linear_repack()
except Exception:
    pass

parser = argparse.ArgumentParser("Generation script (fp32/bf16 path)", add_help=False)
parser.add_argument("--deployment-mode", action="store_true")
args = parser.parse_args()

amp_dtype = getattr(torch, "bfloat16")

config = AutoConfig.from_pretrained(
    "EleutherAI/gpt-j-6b",
    torchscript=args.deployment_mode,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-j-6b",
    torch_dtype=amp_dtype,
    config=config,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b", trust_remote_code=True)

model = model.eval()
model = model.to(memory_format=torch.channels_last)

# Apply IPEX LLM-specific optimizations in bf16.
model = ipex.llm.optimize(
    model,
    dtype=amp_dtype,
    inplace=True,
    deployment_mode=args.deployment_mode,
)

# This script compiles the forward pass with the "ipex" backend, which is
# incompatible with deployment mode (TorchScript export).
if args.deployment_mode:
    raise SystemExit(
        "[ERROR] deployment_mode cannot co-work with torch.compile, "
        "please set deployment_mode to False if you want to use torch.compile."
    )
else:
    model.forward = torch.compile(model.forward, dynamic=True, backend="ipex")

num_warmup = 1
num_iter = 2
total_time = 0.0
batch_size = 24
seq_length = 384
vocab_size = model.config.vocab_size
sample_input = torch.randint(vocab_size, size=[batch_size, seq_length])

with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast(enabled=True):
    # Warm-up iterations are excluded from the timing.
    for i in range(int(num_warmup)):
        outputs = model(sample_input)

    for i in range(int(num_iter)):
        start_time = time.time()
        outputs = model(sample_input)
        end_time = time.time()
        total_time += (end_time - start_time) * 1e3  # ms

time_per_step = total_time / int(num_iter)
if batch_size == 1:
    print("%s: Latency = %.2f ms" % ("EleutherAI/gpt-j-6b", time_per_step))
else:
    print(
        "%s: Throughput = %.2f QPS"
        % ("EleutherAI/gpt-j-6b", (batch_size * 1000) / time_per_step)
    )
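
# Optional sketch (not part of the benchmark above): running actual text
# generation with the optimized, compiled model via the standard Hugging Face
# generate() API. The prompt string and max_new_tokens value below are
# illustrative assumptions, not taken from the source.
prompt = "What is the meaning of life?"  # hypothetical example prompt
inputs = tokenizer(prompt, return_tensors="pt")
with torch.inference_mode(), torch.cpu.amp.autocast(enabled=True):
    generated = model.generate(**inputs, max_new_tokens=32, do_sample=False)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])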