# Simple app to download models from Hugging Face and use the intel_npu_acceleration_library for running Inference on Intel NPU

## Pre-Requisites

Make sure you have a clean conda environment to begin with. <br>Python version should be 3.10<br>
You can create a conda env with conda create -n intel-npu python=3.10 <br><br>
Install VC++ redist from https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist <br>

## Note
If the notebook fails to execute on the first run, please restart the kernel and try again. Some dependent Python packages may require a kernel restart. <br>
To set the build env inside conda env, use <br>
"c:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\VsDevCmd.bat" <br>
Then create a new conda environment with python=3.10. Activate it and build the library from source using - <br>
pip install "intel-npu-acceleration-library @ git+https://github.com/intel/intel-npu-acceleration-library.git"


In [None]:
from pathlib import Path
import platform
import importlib
import time
from timeit import default_timer as timer
import subprocess

# Stop immediately when the processor string reported by the platform module does not mention Intel.
assert 'Intel' in platform.processor(), "Only INTEL processors are supported"
# IPython shell escape: install the packages listed in requirements.txt (-q keeps pip quiet).
!python -m pip install -r requirements.txt -q

# Import NPU acceleration library
import intel_npu_acceleration_library
import intel_npu_acceleration_library.backend

import os
import openvino
import openvino.torch
import numpy as np
from subprocess import Popen
from PIL import Image
import shutil
import evaluate
import toml
import json
import torchaudio
from huggingface_hub import login
import requests
import validators

# Metric used to score generated answers against the reference answers
gleu = evaluate.load('google_bleu')

#Clear cache
if os.path.exists("./cache"):
    shutil.rmtree("./cache")


def _download_file(url, destination):
    """Stream the resource at ``url`` into the local file ``destination``.

    Returns ``destination`` so the caller can rebind its input path in one
    step. Raises ``requests.HTTPError`` for a 4xx/5xx response so that an
    error page is never saved and later mistaken for the media file.
    """
    # A timeout keeps the notebook from hanging forever on a stalled server.
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        # ``raw`` skips content decoding unless asked; without this a
        # gzip/deflate-encoded body would be written to disk still compressed.
        response.raw.decode_content = True
        # Mode 'wb' truncates any file left over from a previous run.
        with open(destination, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    return destination


#Setup the test config
#Read the TOML config to get the tokenizer and model ID
with open('TestConfigNPU.toml', 'r') as f:
    testconfig = toml.load(f)

# Global settings shared by every model entry
_configuration = testconfig["Configuration"]

#Set the number of iterations here
numiter = _configuration["num-iter"]

# Flag to enable additional run on CPU
run_additional_cpu = _configuration["run_additional_cpu"]

#Question and answers
input_text_vqa = _configuration["input_text_vqa"]
ref_answer_vqa = _configuration["ref_answer_vqa"]
input_text_ttot = _configuration["input_text_ttot"]
ref_answer_ttot = _configuration["ref_answer_ttot"]
ref_answer_aud = _configuration["ref_answer_aud"]

#Input image file (local path, or a URL that is downloaded to ./test.jpg)
input_img_file = _configuration["input_img_file"]
assert ".jpg" in input_img_file, "Only jpg files are supported"
if validators.url(input_img_file):
    input_img_file = _download_file(input_img_file, "./test.jpg")

#Input Audio file (local path, or a URL that is downloaded to ./test.wav)
input_aud_file = _configuration["input_aud_file"]
assert ".wav" in input_aud_file, "Only wav files are supported"
if validators.url(input_aud_file):
    input_aud_file = _download_file(input_aud_file, "./test.wav")

#Clean previous results flag
clean_prev_results = _configuration["clean_prev_results"]

# Flag to enable Pytorch profiling
enable_pytorch_profiling = _configuration["enable_pytorch_profiling"]

# The profiler names are only imported when profiling is requested
if enable_pytorch_profiling:
    from torch.profiler import profile, record_function, ProfilerActivity

#HF Access Token
hfaccesstoken = _configuration["HFaccessToken"]
assert hfaccesstoken, "Hugging Face token is not set. Please set it in TestConfig TOML"
login(token=hfaccesstoken)

# Import NPU acceleration library
import intel_npu_acceleration_library

# Ask PowerShell for the signed-driver entry matching 'Intel(R) AI Boost'
# and keep whatever it prints as the NPU version string.
# NOTE(review): PowerShell switches are conventionally written with a single
# dash (-NoProfile); confirm powershell.exe accepts the double-dash form here.
_npu_query_pipeline = "Win32_PnPSignedDriver | select devicename, driverversion | Select-String -Pattern 'Intel(R) AI Boost' -CaseSensitive -SimpleMatch"
_npu_query_command = [
    "powershell.exe",
    "--NoProfile",
    "--ExecutionPolicy", "Bypass",
    "Get-WmiObject", _npu_query_pipeline,
]
npu_driver_version = subprocess.run(_npu_query_command, capture_output=True, text=True)
# Only stdout is kept; stderr and the return code are not inspected.
npu_driver_version_string = npu_driver_version.stdout
# Run bookkeeping: timestamp, shared result containers and output folders
run_time = time.strftime("%Y%m%d-%H%M%S")
results = {
    "NPU-Version": npu_driver_version_string,
    "runtimestamp": run_time,
}
inf_time_list = []
pytorch_accuracy_list = []

# Optionally wipe everything from earlier runs, then make sure the root exists
results_folder = './results'
if clean_prev_results and os.path.exists(results_folder):
    shutil.rmtree(results_folder)
if not os.path.exists(results_folder):
    os.makedirs(results_folder)

# Every run writes into its own timestamped sub-folder
results_folder = f"./results/results-{run_time}/"
if not os.path.exists(results_folder):
    os.makedirs(results_folder)

# Remove the cache directory and everything in it
if os.path.exists('./cache'):
    shutil.rmtree("./cache")
#Results dumper
def results_dumper(results):
    """Write ``results`` to a JSON file and a summary CSV in the results folder.

    Both files are named ``results-<run_time>`` and are placed in the
    module-level ``results_folder``; the CSV header text also depends on the
    module-level ``numiter``.

    Args:
        results: Mapping of entry name to value. Values that are dicts are
            treated as per-model records and become one CSV row each; any
            other value (for example the driver version string or the run
            timestamp) appears in the JSON only.
    """
    import csv  # local import: the file's import block does not provide csv

    base_name = results_folder + 'results-' + str(run_time)

    #Dump Results to File
    with open(base_name + '.json', 'w') as f:
        json.dump(results, f, indent=4)

    # Latency Results to CSV
    csv_column_headers = [
        "XPU",
        "Model",
        "Pytorch First Inference Latency(ms)",
        "PytorchInf(ms)(Avg over " + str(numiter - 1) + " iterations)",
        "Pytorch Accuracy Score(%)(Avg over " + str(numiter) + " iterations)",
    ]
    csv_data = [csv_column_headers]
    for record in results.values():
        # Scalar metadata is not a model record; testing `header in <str>`
        # on it would be a substring match and emit a meaningless N/A row.
        if not isinstance(record, dict):
            continue
        csv_data.append([record.get(header, "N/A") for header in csv_column_headers])

    #Dump Results CSV to File
    # csv.writer quotes embedded commas and stringifies non-string values;
    # newline='' lets the csv module control the line endings itself.
    with open(base_name + '.csv', 'w', newline='') as f:
        csv.writer(f).writerows(csv_data)


#Main inference function
def run_inference(tokenizer_type, model_id, model_gen, inputs_types, useNPUAccelerationLibrary, input_q, ref_answer, import_library, attn_impl, target_dtype):
    """Load one model, run ``numiter`` timed generations and record the results.

    The first iteration is reported separately as the first-inference
    latency; the remaining iterations are averaged. Every iteration's output
    is scored against ``ref_answer`` with the module-level ``gleu`` metric.
    The summary is stored in the module-level ``results`` dict under
    ``"<model_id>-<CPU|NPU>"`` and flushed to disk via ``results_dumper``.

    Args:
        tokenizer_type: Name of the tokenizer/processor class looked up in
            ``transformers``.
        model_id: Identifier passed to ``from_pretrained``.
        model_gen: Name of the model class looked up in ``import_library``.
        inputs_types: Value tested for "pixel_values", "image" and "audio"
            to select the input pipeline; anything else means text only.
        useNPUAccelerationLibrary: When true the model is compiled with
            ``intel_npu_acceleration_library`` and reported as "NPU".
        input_q: Prompt text (replaced by "N/A" for audio inputs).
        ref_answer: Reference text used for scoring.
        import_library: Module that provides ``model_gen``.
        attn_impl: Attention implementation forwarded to ``from_pretrained``
            for ``transformers`` models, or "none" to leave it unset.
        target_dtype: Name of a ``torch`` attribute used as compile dtype.
    """
    torch_lib = importlib.import_module("torch")
    torch_dtype = getattr(torch_lib, target_dtype)
    transformerlib = importlib.import_module("transformers")
    imported_lib = importlib.import_module(import_library)
    tokenattr = getattr(transformerlib, tokenizer_type)
    modelattr = getattr(imported_lib, model_gen)

    # NOTE(review): trust_remote_code=True lets the model repository supply
    # code that runs locally - only list model ids you trust in the config.
    load_kwargs = {"trust_remote_code": True}
    if import_library == "transformers" and attn_impl != "none":
        load_kwargs["attn_implementation"] = attn_impl
    model = modelattr.from_pretrained(model_id, **load_kwargs)

    xpu_type = "CPU"
    if useNPUAccelerationLibrary:
        xpu_type = "NPU"
        model = intel_npu_acceleration_library.compile(model.eval(), dtype=torch_dtype)
    tokenizer = tokenattr.from_pretrained(model_id, trust_remote_code=True)

    first_latency = 0
    first_output_str = ""
    # Per-call lists: a run that raises midway can no longer leave stale
    # timings or scores behind for the next model.
    inf_times = []
    accuracy_scores = []

    for i in range(1, numiter + 1):
        # The timed span covers input preparation as well as generation.
        start_time = timer()
        if "pixel_values" in inputs_types:
            image = Image.open(input_img_file).convert("RGB")
            pixel_values = tokenizer(images=image, text=input_q, return_tensors="pt").pixel_values
            gen_args, gen_kwargs = (pixel_values,), {"max_new_tokens": 50}
        elif "image" in inputs_types:
            image = Image.open(input_img_file).convert("RGB")
            gen_args, gen_kwargs = (), tokenizer(images=image, text=input_q, return_tensors="pt")
        elif "audio" in inputs_types:
            input_q = "N/A"
            waveform, sampling_rate = torchaudio.load(input_aud_file)
            # Resample to the rate the model's feature extractor declares.
            target_rate = tokenizer.feature_extractor.sampling_rate
            waveform = torchaudio.functional.resample(waveform, orig_freq=sampling_rate, new_freq=target_rate).squeeze().numpy()
            features = tokenizer(waveform, sampling_rate=target_rate, return_tensors="pt")
            gen_args, gen_kwargs = (features.input_features,), {}
        else:
            gen_args, gen_kwargs = (), tokenizer(input_q, return_tensors="pt")

        if enable_pytorch_profiling:
            with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
                with record_function("model_inference"):
                    outputs = model.generate(*gen_args, **gen_kwargs)
        else:
            outputs = model.generate(*gen_args, **gen_kwargs)

        latency_metric = (timer() - start_time) * 1000

        if enable_pytorch_profiling:
            proftracefile = results_folder + model_id.replace("/", "-") + "-prof-trace-" + str(i) + "-" + xpu_type
            with open(proftracefile + ".txt", "w") as f:
                f.write(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10))
            prof.export_chrome_trace(proftracefile + "-trace.json")

        output_str = str(tokenizer.decode(outputs[0], skip_special_tokens=True))
        # Drop the echoed prompt. An empty prompt is skipped because
        # str.split("") raises ValueError.
        if input_q and input_q in output_str:
            output_str = output_str.split(input_q)[1]
        if len(output_str) > 0:
            gleu_score = gleu.compute(predictions=[output_str], references=[ref_answer])
        else:
            gleu_score = {'google_bleu': 0, 'status': 'Output string is empty'}
        accuracy_scores.append(round(float(gleu_score['google_bleu']), 2))

        if i == 1:
            first_output_str = output_str
            first_latency = latency_metric
        else:
            inf_times.append(latency_metric)
    # End of iterations loop

    # With a single iteration there is nothing to average after the first
    # run; report "N/A" instead of the "nan" an empty average would give.
    inf_time = str(round(np.average(inf_times), 2)) if inf_times else "N/A"
    # Round after scaling to percent so values such as 28.999999999999996
    # do not end up in the report.
    if accuracy_scores:
        acc_percent = str(round(float(np.average(accuracy_scores)) * 100, 2))
    else:
        acc_percent = "N/A"

    results[model_id + "-" + xpu_type] = {
        "XPU": xpu_type,
        "Model": model_id,
        "Precision": str(torch_dtype),
        "ATTN Impl": attn_impl,
        "Pytorch First Inference Latency(ms)": str(round(first_latency, 2)),
        "Pytorch Raw inf (ms)": ",".join(str(round(inf, 2)) for inf in inf_times),
        "PytorchInf(ms)(Avg over " + str(numiter - 1) + " iterations)": inf_time,
        "Pytorch Raw Accuracy (%)": ",".join(str(round(acc * 100, 2)) for acc in accuracy_scores),
        "Pytorch Accuracy Score(%)(Avg over " + str(numiter) + " iterations)": acc_percent,
        "Input-Question": input_q,
        "First iteration Response": first_output_str,
        "Ref Answer": ref_answer
    }
    # Dump intermediate results
    results_dumper(results)


#Main Function
# Run every model described in the config, then clean up and write the report.
for model_blob in testconfig:
    entry = testconfig[model_blob]
    # Only tables that declare a tokenizer describe a model; the isinstance
    # guard stops a plain string value from being substring-tested.
    if not isinstance(entry, dict) or "tokenizer" not in entry:
        continue

    tokenizer_type = entry["tokenizer"]
    model_id = entry["id"]
    model_gen = entry["modelgen"]
    inputs_types = entry.get("inputs", "text")
    useNPUAccelerationLibrary = entry.get("useNPUAccelerationLibrary", False)
    input_q = entry.get("input_q", input_text_ttot)
    input_q_vqa = entry.get("input_q_vqa", input_text_vqa)
    ref_answer = entry.get("ref_answer", ref_answer_ttot)
    # Per-model overrides get their own names so the global defaults
    # (ref_answer_vqa / ref_answer_aud) are not overwritten and leaked into
    # later models that do not specify an override.
    model_ref_answer_vqa = entry.get("ref_answer_vqa", ref_answer_vqa)
    input_q_aud = entry.get("input_q_aud", input_text_ttot)
    model_ref_answer_aud = entry.get("ref_answer_aud", ref_answer_aud)
    import_library = entry.get("import_library", "transformers")
    attn_impl = entry.get("attn_impl", "none")
    target_dtype = entry.get("dtype", "int8")

    # Pick the question/reference pair that matches the input modality
    if "image" in inputs_types or "pixel_values" in inputs_types:
        input_q = input_q_vqa
        ref_answer = model_ref_answer_vqa
    if "audio" in inputs_types:
        input_q = input_q_aud
        ref_answer = model_ref_answer_aud

    print(f"Model inference started for {model_id}")
    run_inference(tokenizer_type, model_id, model_gen, inputs_types, useNPUAccelerationLibrary, input_q, ref_answer, import_library, attn_impl, target_dtype)
    intel_npu_acceleration_library.backend.runtime._model_cache={} #Clear cache after every iteration
    # Logical `and`: bitwise `&` gives the wrong answer for truthy non-bool
    # flags (for example 1 & 2 == 0).
    if useNPUAccelerationLibrary and run_additional_cpu:
        #Run inference on CPU also for comparison
        run_inference(tokenizer_type, model_id, model_gen, inputs_types, False, input_q, ref_answer, import_library, attn_impl, target_dtype)
    print("Model Inference complete")

#Clear cache
if os.path.exists("./cache"):
    shutil.rmtree("./cache")

#Dump Final results to file
results_dumper(results)
