In [None]:
from nntool.api import NNGraph
from nntool.api.utils import RandomIter
from nntool.api.utils import model_settings
import numpy as np

### Load network and prepare it for nntool

TensorFlow already collects the necessary statistics during [post training static quantization](https://huggingface.co/docs/optimum/concept_guides/quantization). We can import them into nntool using `load_quantization=True`.

In [None]:
# Import the TFLite model; load_quantization=True asks nntool to also import
# the quantization information already stored in the file.
G = NNGraph.load_graph(
    "model.tflite",
    load_quantization=True,
)

# Prepare the graph before quantizing it.
# NOTE(review): presumably adjust_order() adapts tensor ordering for the target
# kernels and "scaled_match_group" selects the fusion set matching scaled
# quantization - confirm against the nntool documentation.
G.adjust_order()
G.fusions("scaled_match_group")

In [None]:
# Quantize the graph using the options below.
G.quantize(
    graph_options={
        # Specify the quantization. You can choose between "float" and "SQ8".
        "scheme": "SQ8",
        # Specify if we want to use the NE16 accelerator.
        "use_ne16": True,
    },
)

Plotting the memory consumption of each layer can give a good indication of how to improve the model. Notice the difference between the quantization schemes. Make sure to use `SQ8` when deploying to gvsoc or GAP9.

In [None]:
# Show the memory usage of the graph and print whatever plot_mem_usage() returns.
# NOTE(review): if plot_mem_usage() only draws a figure, print() will also emit
# its return value (possibly "None") - confirm whether print() is needed here.
print(G.plot_mem_usage())

### Deploy to gvsoc

In [None]:
# Load one input sample and its label from the working directory.
test_input = np.load("sample.npy")
test_label = np.load("label.npy")

# Class names; the numeric label read from label.npy is used as an index here.
labels = [
    "down",
    "go",
    "left",
    "no",
    "right",
    "stop",
    "up",
    "yes",
]

# .item() turns the loaded array into a plain Python scalar, which is shown
# as-is and also used to look up the class name.
print(
    "The loaded sample has label:",
    test_label.item(),
    "which corresponds to:",
    labels[test_label.item()],
)

In [None]:

# Run the graph on the target with the loaded sample as its only input tensor
# and keep the returned result in `res`.
res = G.execute_on_target(
    # NOTE(review): presumably the directory where the generated project is
    # written and built - confirm.
    directory="kws_test_deploy_gap",
    input_tensors=[test_input],
    # NOTE(review): at_loglevel / at_log look like logging controls for the
    # code-generation step - confirm their exact meaning in the nntool docs.
    at_loglevel=0,
    at_log=True,
    print_output=True,
    settings=model_settings(
        tensor_directory="tensors",
        model_directory="model_dir",
        # NOTE(review): presumably L1 / L2 memory budgets in bytes - confirm
        # against the memory available on the chosen target.
        l1_size=128000,
        l2_size=1000000,
        # NOTE(review): presumably keeps graph constants in flash during
        # execution instead of copying them to RAM - confirm.
        graph_const_exec_from_flash=True
    ),
)