https://gemini.google.com/app/eb62c40f8924dab1

In [1]:
import os

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive/',force_remount=True)

Mounted at /content/drive/


# TinyLlama/TinyLlama-1.1B-Chat-v1.0 (quantized)



https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0

## setup

In [None]:
# Install remaining required libraries
!pip install -U "optimum[onnxruntime]" transformers accelerate datasets fsspec huggingface_hub

Collecting transformers
  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [None]:
import torch
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
from optimum.onnxruntime import AutoQuantizationConfig

In [None]:
# Define path for unquantized model
source_path = '/content/drive/MyDrive/dev/.models/TinyLlama-1.1B-Chat-v1.0/'

# Verify that the model folders exist
print("Checking for model directories...")
if os.path.exists(source_path):
    print(f"Found base directory: {source_path}")
    print("Contents:", os.listdir(source_path))
else:
    print(f"ERROR: The directory '{source_path}' was not found.")
    print("Please check the folder name and its location in your Google Drive.")

Checking for model directories...
Found base directory: /content/drive/MyDrive/dev/.models/TinyLlama-1.1B-Chat-v1.0/
Contents: ['.git', 'README.md', 'config.json', '.gitattributes', 'eval_results.json', 'generation_config.json', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'tokenizer.model', 'model.safetensors']


In [None]:
# Create save_path only if it does not already exist
save_path = '/content/drive/MyDrive/dev/.models/TinyLlama-1.1B-Chat-v1.0_quantized_onnx/'
os.makedirs(save_path, exist_ok=True)

# Add tokenizer model directory inside save_path
tokenizer_path = save_path+'tokenizer/'
os.makedirs(tokenizer_path, exist_ok=True)

# Add CPU optimized quantized model directory inside save_path
model_cpu = save_path+'model_cpu/'
os.makedirs(model_cpu, exist_ok=True)

# Add GPU optimized quantized model directory inside save_path
model_gpu = save_path+'model_gpu/'
os.makedirs(model_gpu, exist_ok=True)

In [None]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(source_path)

In [None]:
tokenizer.save_pretrained(tokenizer_path)

print(f"\nTokenizer saved to: {tokenizer_path}")
print("\nFiles in the tokenizer directory:")
!ls -lh {tokenizer_path}


Tokenizer saved to: /content/drive/MyDrive/dev/.models/TinyLlama-1.1B-Chat-v1.0_quantized_onnx/tokenizer/

Files in the tokenizer directory:
total 4.0M
-rw------- 1 root root  551 Jun 11 16:13 special_tokens_map.json
-rw------- 1 root root 1.4K Jun 11 16:13 tokenizer_config.json
-rw------- 1 root root 3.5M Jun 11 16:13 tokenizer.json
-rw------- 1 root root 489K Jun 11 16:13 tokenizer.model


In [None]:
# Load the base model
model = ORTModelForCausalLM.from_pretrained(source_path, export=True)

  or not self.key_cache[layer_idx].numel()  # the layer has no cache
  if sequence_length != 1:
  elif (
  is_causal = query.shape[2] > 1 and causal_mask is None


In [None]:
# Save ONNX unquantized model to local drive
# model.save_pretrained(unquantized_path)

# print(f"Unquantized ONNX model saved to: {unquantized_path}")
# print("\nFiles in the unquantized directory:")
# !ls -lh {unquantized_path}


Unquantized ONNX model saved to: /content/drive/MyDrive/dev/.models/TinyLlama-1.1B-Chat-v1.0_quantized_onnx/onnx_unquantized/

Files in the unquantized directory:
total 4.1G
-rw------- 1 root root  675 Jun 11 15:40 config.json
-rw------- 1 root root  124 Jun 11 15:40 generation_config.json
-rw------- 1 root root 965K Jun 11 15:40 model.onnx
-rw------- 1 root root 4.1G Jun 11 15:40 model.onnx_data


In [None]:
# Create a quantizer object from loading the ONNX unquantized model
quantizer = ORTQuantizer.from_pretrained(model)

In [None]:
# Create the quantization config using a pre-made recipe for dynamic quantization on CPU
# is_static=True optimizes the quantized model upfront for faster inference
# is_static=False allows the quantized model to optimize on the fly, with slower inference
quantization_config = AutoQuantizationConfig.avx512_vnni(is_static=False)

In [None]:
# Apply quantization and save the new, smaller model optimized for CPU
quantizer.quantize(
    save_dir=model_cpu,
    quantization_config=quantization_config,
)

print(f"Quantized model saved to: {model_cpu}")
print("\nFiles in your final quantized directory:")
!ls -lh {model_cpu}

Quantized model saved to: /content/drive/MyDrive/dev/.models/TinyLlama-1.1B-Chat-v1.0_quantized_onnx/model_cpu/

Files in your final quantized directory:
total 1.1G
-rw------- 1 root root  675 Jun 11 15:57 config.json
-rw------- 1 root root 1.1G Jun 11 15:57 model_quantized.onnx
-rw------- 1 root root  762 Jun 11 15:57 ort_config.json
-rw------- 1 root root  551 Jun 11 15:57 special_tokens_map.json
-rw------- 1 root root 1.4K Jun 11 15:57 tokenizer_config.json
-rw------- 1 root root 3.5M Jun 11 15:57 tokenizer.json
-rw------- 1 root root 489K Jun 11 15:57 tokenizer.model


## inference

In [None]:
# Load the tokenizer directly from local drive
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [None]:
# Load the quantized model  directly from local drive
model = ORTModelForCausalLM.from_pretrained(model_cpu)

In [None]:
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

Device set to use cpu


In [None]:
prompt = "The main benefits of using a small language model are"
prompt = "What's the meaning of life?"
prompt = "1 2 3 4 5 6 6 6 6 5 5 5 5 7 7 7 7 7 8 8 8 8 5 5 5 5 6 6 6 6 7 7 7 7 4 4 4 4 3 3 3 3 5 5 5 5 2 2 2 2 1"
prompt = "1 2 3 7 5 6 1 2 3 4 1 2"

In [None]:
result = generator(prompt, max_new_tokens=100)

print("\n--- INFERENCE RESULT ---")
print(result[0]['generated_text'])


--- INFERENCE RESULT ---
1 2 3 7 5 6 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4


# OpenVINO/TinyLlama-1.1B-Chat-v1.0-int8-ov

https://huggingface.co/OpenVINO/TinyLlama-1.1B-Chat-v1.0-int8-ov

In [None]:
!pip install -q "optimum[openvino]" "transformers" "openvino-tokenizers"

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/41.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.6/70.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.7/13.7 MB[0m [31m107.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m342.6/342.6 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
from transformers import AutoTokenizer, pipeline
from optimum.intel import OVModelForCausalLM

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


In [None]:
model_path = "/content/drive/MyDrive/dev/.models/TinyLlama-1.1B-Chat-v1.0-int8-ov"

In [None]:
# For OpenVINO models, it's recommended to use the model's tokenizer if available
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# The key step: Use OVModelForCausalLM to load the optimized model
model = OVModelForCausalLM.from_pretrained(model_path)

In [None]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

Device set to use cpu


In [None]:
prompt = "The best advice for a new programmer is"
prompt = "1 2 3 7 5 6 1 2 3 4 1 2"

In [None]:
result = pipe(prompt, max_new_tokens=100)

print("--- Model Output ---")
print(result[0]['generated_text'])

--- Model Output ---
1 2 3 7 5 6 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4


# TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF

https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF

In [None]:
# 1. Install the necessary library# 1. Install the necessary library
# llama-cpp-python is specifically designed to run GGUF models efficiently.
# The CMAKE_ARGS are flags to compile the library to take advantage of modern CPU features.
!CMAKE_ARGS="-DLLAMA_CUBLAS=OFF -DLLAMA_CUDA_F16=OFF -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_F16C=ON -DLLAMA_FMA=ON -DLLAMA_SSE3=ON -DLLAMA_SSSE3=ON" pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.9.tar.gz (67.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.9-cp311-cp311-linux_x86_64.whl size=4067751 sha256=186401f69ff5a5e603b0

In [None]:
from llama_cpp import Llama

In [None]:
#model_path = "/content/drive/MyDrive/dev/.models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
model_path = "/content/drive/MyDrive/dev/.models/tinyllama-1.1b-chat-v1.0.Q2_K.gguf"

In [None]:
# 6. Load the GGUF model
#    n_ctx: The context window size (max number of tokens).
#    n_threads: Number of CPU threads to use. Colab usually has 2.
print("Loading GGUF model...")
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
)

Loading GGUF model...


llama_model_loader: loaded meta data with 23 key-value pairs and 201 tensors from /content/drive/MyDrive/dev/.models/tinyllama-1.1b-chat-v1.0.Q2_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = tinyllama_tinyllama-1.1b-chat-v1.0
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   4:                          llama.block_count u32              = 22
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 5632
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 64
llama_model_loade

In [None]:
# 7. Set up for inference and run a prompt
prompt = "The best advice for a new programmer is"
# Note: Llama.cpp uses a specific chat format. We can create it manually.
# For TinyLlama, the format is: <|system|>\n{system_prompt}<|end|>\n<|user|>\n{user_prompt}<|end|>\n<|assistant|>
prompt_template = f"<|system|>\nYou are a helpful assistant.<|end|>\n<|user|>\n{prompt}<|end|>\n<|assistant|>"

In [None]:
print(f"\nGenerating response for prompt: '{prompt}'")
response = llm(
    prompt_template,
    max_tokens=100,
    stop=["<|end|>", "user:"], # Stop generating when the model thinks its turn is over
    echo=False # Don't repeat the prompt in the output
)

print("\n--- Model Output ---")
print(response["choices"][0]["text"])


Generating response for prompt: 'The best advice for a new programmer is'


llama_perf_context_print:        load time =     754.07 ms
llama_perf_context_print: prompt eval time =     753.85 ms /    44 tokens (   17.13 ms per token,    58.37 tokens per second)
llama_perf_context_print:        eval time =    2092.19 ms /    41 runs   (   51.03 ms per token,    19.60 tokens per second)
llama_perf_context_print:       total time =    2867.08 ms /    85 tokens



--- Model Output ---

It's a cliché, but it's a great piece of advice. It's a reminder that a new programmer can learn and grow, and that their efforts are appreciated.


In [None]:
prompt = "1 2 3 4 5 6"

In [None]:
response = llm(
    prompt,
    max_tokens=100,
    stop=["\n"], # Stop generating when the model thinks its turn is over
    echo=False # Don't repeat the prompt in the output
)

print("\n--- Model Output ---")
print(response["choices"][0]["text"])

Llama.generate: 1 prefix-match hit, remaining 12 prompt tokens to eval
llama_perf_context_print:        load time =     754.07 ms
llama_perf_context_print: prompt eval time =     682.88 ms /    12 tokens (   56.91 ms per token,    17.57 tokens per second)
llama_perf_context_print:        eval time =    3468.65 ms /    69 runs   (   50.27 ms per token,    19.89 tokens per second)
llama_perf_context_print:       total time =    4188.05 ms /    81 tokens



--- Model Output ---
 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30


# Tests

## Setup

In [2]:
!pip install -U "optimum[onnxruntime]" "optimum[openvino]" "transformers" "openvino-tokenizers"

Collecting openvino-tokenizers
  Downloading openvino_tokenizers-2025.1.0.0-py3-none-manylinux2014_x86_64.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optimum[onnxruntime]
  Downloading optimum-1.25.3-py3-none-any.whl.metadata (16 kB)
Collecting onnx (from optimum[onnxruntime])
  Downloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting onnxruntime>=1.11.0 (from optimum[onnxruntime])
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting optimum-intel>=1.23.0 (from optimum-intel[openvino]>=1.23.0; extra == "openvino"->optimum[openvino])
  Downloading optimum_intel-1.23.0-py3-none-any.whl.metadata (14 kB)
Collecting openvino~=2025.1.0.dev (from openvino-tokenizers)
 

In [3]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=OFF -DLLAMA_CUDA_F16=OFF -DLLAMA_AVX=ON -DLLAMA_AVX2=ON -DLLAMA_F16C=ON -DLLAMA_FMA=ON -DLLAMA_SSE3=ON -DLLAMA_SSSE3=ON" pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.9.tar.gz (67.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.9-cp311-cp311-linux_x86_64.whl size=4067704 sha256=569aeeb8acb4bdf51539

In [4]:
import torch
from transformers import AutoTokenizer, pipeline

## Load Model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 (quantized)

In [5]:
from optimum.onnxruntime import ORTModelForCausalLM, ORTQuantizer
from optimum.onnxruntime import AutoQuantizationConfig

In [6]:
# Create save_path only if it does not already exist
save_path = '/content/drive/MyDrive/dev/.models/TinyLlama-1.1B-Chat-v1.0_quantized_onnx/'

# Add tokenizer model directory inside save_path
tokenizer_path = save_path+'tokenizer/'

# Add CPU optimized quantized model directory inside save_path
model_cpu = save_path+'model_cpu/'

In [10]:
# Load the tokenizer directly from local drive
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [8]:
# Load the quantized model  directly from local drive
model = ORTModelForCausalLM.from_pretrained(model_cpu)

In [11]:
generator_quantized_onnx = pipeline("text-generation", model=model, tokenizer=tokenizer)

Device set to use cpu


## Load Model: OpenVINO/TinyLlama-1.1B-Chat-v1.0-int8-ov

In [12]:
from optimum.intel import OVModelForCausalLM

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


In [13]:
model_path = "/content/drive/MyDrive/dev/.models/TinyLlama-1.1B-Chat-v1.0-int8-ov"

In [14]:
# For OpenVINO models, it's recommended to use the model's tokenizer if available
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [15]:
# The key step: Use OVModelForCausalLM to load the optimized model
model = OVModelForCausalLM.from_pretrained(model_path)

In [16]:
generator_int8_ov = pipeline("text-generation", model=model, tokenizer=tokenizer)

Device set to use cpu


## Load Model: TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF

In [17]:
from llama_cpp import Llama

In [66]:
model_path = "/content/drive/MyDrive/dev/.models/tinyllama-1.1b-chat-v1.0.Q2_K.gguf"
generator_q2_k = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    verbose=False,
)

In [65]:
model_path = "/content/drive/MyDrive/dev/.models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
generator_q4_k_m = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    verbose=False,
)

In [77]:
model_path = "/content/drive/MyDrive/dev/.models/tinyllama-1.1b-chat-v1.0.Q6_K.gguf"
generator_q6_k = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    verbose=False,
)

## Results

In [20]:
import time

In [85]:
generators = {
    'quantized_onnx': {'func': generator_quantized_onnx, 'max_tokens': 'max_new_tokens'},
    'int8_ov': {'func': generator_int8_ov, 'max_tokens': 'max_new_tokens'},
    'q2_k': {'func': generator_q2_k, 'max_tokens': 'max_tokens', 'stop': 'stop'},
    'q4_k_m': {'func': generator_q4_k_m, 'max_tokens': 'max_tokens', 'stop': 'stop'},
    'q6_k': {'func': generator_q6_k, 'max_tokens': 'max_tokens', 'stop': 'stop'}
}

In [79]:
def time_execution(generator, prompt, params):
    start_time = time.time()
    response = generator(prompt, **params)
    end_time = time.time()
    elapsed_time = end_time - start_time
    return {'response': response, 'elapsed_time': elapsed_time}

In [80]:
prompt = "1 2 3 4 5"
max_tokens = 100
stop = ["\n"]

In [86]:
results = {}
for key in generators:
    params = {
        generators[key]['max_tokens']: max_tokens
    }
    if key=='q2_k' or key=='q4_k_m':
        params['stop'] = stop
    print('Running ',key)
    results[key] = time_execution(generators[key]['func'], prompt, params)


Running  quantized_onnx
Running  int8_ov
Running  q2_k
Running  q4_k_m
Running  q6_k


In [87]:
for key in results:
    print(key,': ',results[key]['elapsed_time'])

quantized_onnx :  7.08148193359375
int8_ov :  8.38093113899231
q2_k :  2.451695680618286
q4_k_m :  6.073390245437622
q6_k :  8.125742197036743


In [88]:
print('quantized_onnx: ',results['quantized_onnx']['response'][0]['generated_text'])
print('int8_ov: ', results['int8_ov']['response'][0]['generated_text'])
print('q2_k: ', results['q2_k']['response']['choices'][0]['text'])
print('q4_k_m: ', results['q4_k_m']['response']['choices'][0]['text'])
print('q6_k: ', results['q6_k']['response']['choices'][0]['text'])

quantized_onnx:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 4
int8_ov:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 4
q2_k:   6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
q4_k_m:   6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 4
q6_k:   6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 4
