# LLaMA2* inference on 4th Generation Intel® Xeon® Scalable Processors with Intel® Extension for PyTorch*


### <a href="https://ai.meta.com/llama/">LLaMA2</a> should need no introduction.

In [None]:
# Imports
import os
import time
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoConfig
import torch
# This demo uses a custom build of Intel® Extension for PyTorch*.
# The features used here are expected to ship in Intel® Extension for PyTorch* version 2.1.
# Until then, see https://github.com/intel/intel-extension-for-pytorch/tree/llm_feature_branch/examples/cpu/inference/python/llm/README.md for installation instructions.
import intel_extension_for_pytorch as ipex

### Select Config

In [None]:
# A single beam combined with do_sample=False below.
num_beams = 1

# Model configuration for the Llama-2-7b checkpoint, requested with
# torchscript=True; text_max_length is then set on the config object.
config = AutoConfig.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torchscript=True,
)
config.text_max_length = 64

# Keyword arguments forwarded to model.generate() in the later cells.
# min_new_tokens == max_new_tokens, so both bounds on generated length are 32.
# Reference: https://huggingface.co/docs/transformers/v4.33.2/en/main_classes/text_generation#transformers.GenerationConfig
generate_kwargs = {
    "do_sample": False,
    "temperature": 0.9,
    "num_beams": num_beams,
    "max_new_tokens": 32,
    "min_new_tokens": 32,
}

### Select Model

In [None]:
# Instantiate the Llama-2-7b causal LM with the config prepared above.
model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    config=config,
    low_cpu_mem_usage=True,
)

### Select Tokenizer

In [None]:
# Tokenizer loaded from the same checkpoint id as the model.
tokenizer = LlamaTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
)

#### Some optimizations, such as the indirect KV cache and the first-token optimization, cannot be applied directly to Hugging Face (HF) generation through TorchScript. They require changes to the generation loop (for example, the beam search function). Because these changes are specific to <i>Intel® Extension for PyTorch*</i> optimizations, they cannot be upstreamed to HF.

Please note that this is a "preview" feature: the Intel® Extension for PyTorch* (IPEX) release that accompanies PyTorch* 2.1 is expected to support these optimizations formally.

In [None]:
# Put the model in eval mode and hand it to the IPEX transformer optimizer,
# requesting int8 and in-place modification of the module.
model = ipex._optimize_transformers(model.eval(), dtype=torch.int8, inplace=True)

### Load prequantized model

In [None]:
# Reference: https://github.com/intel/intel-extension-for-pytorch/tree/llm_feature_branch/examples/cpu/inference/python/llm#single-instance-performance

# Disable the TorchScript tensor-expression fuser before loading the graph.
torch._C._jit_set_texpr_fuser_enabled(False)

# Load the saved int8 TorchScript module, switch it to eval mode and freeze it.
self_jit = torch.jit.freeze(
    torch.jit.load("saved_results/int8_quantized_llama2_7b.pt").eval()
)

# Attach the frozen graph to the model under the `trace_graph` attribute.
model.trace_graph = self_jit

### Select prompt

In [None]:
# Input text that the model is asked to continue in the cells below.
prompt = (
    "Once upon a time, there existed a little girl who liked to have adventures. "
    "She wanted to go to places and meet new people, and have fun"
)

### Benchmark

<i>Intel® Extension for PyTorch*</i> uses oneDNN Graph® fusions with int8 static quantization to speed up inference.
We also leverage BFloat16-based Automatic Mixed Precision.

In [None]:
# Benchmark end-to-end latency of one generate() call. The timed region covers
# tokenization, generation and decoding. The first `num_warmup` iterations are
# printed but excluded from the reported average.
total_time = 0.0
num_iter = 5
num_warmup = 3
# Wrap the prompt into a batch of size one. The isinstance guard keeps the
# cell safe to re-run: wrapping unconditionally would nest the list
# ([[prompt]]) on a second execution.
if isinstance(prompt, str):
    prompt = [prompt] * 1
total_list = []
with torch.inference_mode(), torch.no_grad(), torch.autocast(
    device_type="cpu",
    enabled=True,
    dtype=torch.bfloat16,
):
    for i in range(num_iter):
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        tic = time.perf_counter()
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(torch.device("cpu"))
        output = model.generate(input_ids, **generate_kwargs)
        gen_ids = output
        gen_text = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
        toc = time.perf_counter()
        elapsed = toc - tic

        # Per-sequence token counts before and after generation; their
        # difference is the number of newly generated tokens.
        input_tokens_lengths = [ids.shape[0] for ids in input_ids]
        output_tokens_lengths = [ids.shape[0] for ids in gen_ids]
        total_new_tokens = [
            n_out - n_in
            for n_in, n_out in zip(input_tokens_lengths, output_tokens_lengths)
        ]
        print(gen_text, total_new_tokens, flush=True)
        print("Iteration: %d, Time: %.6f sec" % (i, elapsed), flush=True)
        if i >= num_warmup:
            total_time += elapsed

print("\n", "-" * 10, "Summary:", "-" * 10)
# Average over the measured (non-warmup) iterations only.
latency = total_time / (num_iter - num_warmup)
print("Inference latency: %.4f sec." % latency)

### Let's profile it!

In [None]:
# Profile generation under the same execution context the benchmark cell uses
# (inference mode + BFloat16 autocast on CPU), so the operator table reflects
# the configuration that was actually timed.
with torch.inference_mode(), torch.no_grad(), torch.autocast(
    device_type="cpu",
    enabled=True,
    dtype=torch.bfloat16,
):
    # Schedule: skip 1 step, warm up for 3 steps, record 1 step. The 5 loop
    # iterations below therefore make up exactly one profiling cycle.
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU],
        schedule=torch.profiler.schedule(
            wait=1,
            warmup=3,
            active=1),
    ) as prof:
        for i in range(5):
            input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(torch.device("cpu"))
            output = model.generate(input_ids, **generate_kwargs)
            gen_ids = output
            gen_text = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
            # Advance the profiler schedule by one step per generate() call.
            prof.step()

# Per-operator summary, sorted by self CPU time, with no row limit.
print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1))

### Benchmark token latency

#### First restart the kernel, then run code cells 1 through 7 (up to and including the prompt selection step), and then run the cell below:

In [None]:
# Benchmark per-token latency. After token_latency is set on the model config,
# this cell reads output[0] as the generated ids and output[1] as a sequence of
# per-token latencies, whose first entry is reported as the first-token latency.
from itertools import chain

import numpy as np

model.config.token_latency = True

total_time = 0.0
num_iter = 5
num_warmup = 3
# Wrap the prompt into a batch of size one. The isinstance guard prevents
# double wrapping ([[prompt]]) if the prompt was already turned into a list,
# e.g. when this cell is re-run or the kernel was not restarted.
if isinstance(prompt, str):
    prompt = [prompt] * 1
total_list = []

with torch.inference_mode(), torch.no_grad(), torch.autocast(
    device_type="cpu",
    enabled=True,
    dtype=torch.bfloat16,
):
    for i in range(num_iter):
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        tic = time.perf_counter()
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(torch.device("cpu"))
        output = model.generate(input_ids, **generate_kwargs)
        gen_ids = output[0]
        gen_text = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
        toc = time.perf_counter()
        elapsed = toc - tic

        # Per-sequence token counts before and after generation; their
        # difference is the number of newly generated tokens.
        input_tokens_lengths = [ids.shape[0] for ids in input_ids]
        output_tokens_lengths = [ids.shape[0] for ids in gen_ids]
        total_new_tokens = [
            n_out - n_in
            for n_in, n_out in zip(input_tokens_lengths, output_tokens_lengths)
        ]
        print(gen_text, total_new_tokens, flush=True)
        print("Iteration: %d, Time: %.6f sec" % (i, elapsed), flush=True)
        if i >= num_warmup:
            # Only measured (non-warmup) iterations contribute to the stats.
            total_time += elapsed
            total_list.append(output[1])

print("\n", "-" * 10, "Summary:", "-" * 10)
latency = total_time / (num_iter - num_warmup)
print("Inference latency: %.4f sec." % latency)

# First-token latency: mean of the first entry of every measured run.
first_latency = np.mean([run[0] for run in total_list])
# Remaining-token latencies pooled across runs, sorted for percentile lookup.
average_2n = sorted(chain.from_iterable(run[1:] for run in total_list))
average_2n_latency = np.mean(average_2n)
p90_latency = average_2n[int(len(average_2n) * 0.9)]
p99_latency = average_2n[int(len(average_2n) * 0.99)]
print("First token average latency: %.4f sec." % first_latency)
print("Average 2... latency: %.4f sec." % average_2n_latency)
print("P90 2... latency: %.4f sec." % p90_latency)
print("P99 2... latency: %.4f sec." % p99_latency)

\* Other names and brands may be claimed as the property of others.