In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [2]:
!nvidia-smi

Thu Jun 12 17:03:34 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P0             41W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

# Setup

I struggled with the installation witb GPU support, only able to fix from https://gemini.google.com/app/f3ec74b59f92f56f

In [8]:
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

CUDA available: True
CUDA version: 12.4


In [5]:
# Install the pre-built wheel for CUDA 12.4
!pip uninstall -y llama-cpp-python
!pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124

Looking in indexes: https://pypi.org/simple, https://abetlen.github.io/llama-cpp-python/whl/cu124
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.9.tar.gz (67.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: fil

In [6]:
import os

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive/',force_remount=True)

Mounted at /content/drive/


In [7]:
from llama_cpp import Llama

In [34]:
model_path = "/content/drive/MyDrive/dev/.models/tinyllama-1.1b-chat-v1.0.Q2_K.gguf"
generator_q2_k = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    n_gpu_layers=33,
    verbose=False,
)

In [35]:
# Check the number of layers offloaded to the GPU
offloaded_layers = generator_q2_k.model_params.n_gpu_layers
print(f"✅ Successfully offloaded {offloaded_layers} layers to the GPU")

✅ Successfully offloaded 33 layers to the GPU.


In [36]:
model_path = "/content/drive/MyDrive/dev/.models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"
generator_q4_k_m = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    n_gpu_layers=33,
    verbose=False,
)

In [37]:
# Check the number of layers offloaded to the GPU
offloaded_layers = generator_q4_k_m.model_params.n_gpu_layers
print(f"✅ Successfully offloaded {offloaded_layers} layers to the GPU")

✅ Successfully offloaded 33 layers to the GPU.


In [38]:
model_path = "/content/drive/MyDrive/dev/.models/tinyllama-1.1b-chat-v1.0.Q6_K.gguf"
generator_q6_k = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    n_gpu_layers=33,
    verbose=False,
)

In [39]:
# Check the number of layers offloaded to the GPU
offloaded_layers = generator_q6_k.model_params.n_gpu_layers
print(f"✅ Successfully offloaded {offloaded_layers} layers to the GPU")

✅ Successfully offloaded 33 layers to the GPU.


In [44]:
model_path = "/content/drive/MyDrive/dev/.models/tinyllama-1.1b-chat-v1.0.Q8_0.gguf"
generator_q8_0 = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=2,
    n_gpu_layers=-1,
    verbose=False,
)

In [46]:
# Check the number of layers offloaded to the GPU
offloaded_layers = generator_q8_0.model_params.n_gpu_layers
print(f"✅ Successfully offloaded {offloaded_layers} layers to the GPU")

✅ Successfully offloaded 2147483647 layers to the GPU


# Tests

In [26]:
import time
import numpy as np

In [27]:
tokens = [1, 2, 3, 4, 5, 6]
probabilities = [0.1, 0.2, 0.3, 0.2, 0.1, 0.1]
size = (50, 20)

In [28]:
def create_prompt(tokens, probabilities, size, seed=1293):
    x = []
    np.random.seed(seed)
    prompts = np.random.choice(tokens, size=size, p=probabilities)
    for prompt in prompts:
        x = x+[' '.join(str(s) for s in prompt)]
    return x

In [29]:
prompts = create_prompt(tokens=tokens, probabilities=probabilities, size=size)
for prompt in prompts:
    print(prompt)

1 3 2 5 3 5 3 3 6 1 2 2 2 1 3 3 2 3 2 2
4 3 1 3 3 5 3 4 6 1 2 5 6 2 3 4 1 4 4 3
3 1 4 4 2 6 1 3 2 3 2 3 3 5 6 5 5 5 1 3
4 5 1 3 5 2 3 1 3 3 1 3 1 6 4 1 3 4 3 1
3 4 3 2 4 6 4 2 4 3 4 6 4 4 3 4 1 3 2 2
5 3 3 3 4 1 3 3 5 2 1 2 1 6 3 2 2 2 3 5
6 6 3 5 2 1 1 4 2 3 4 6 3 4 6 2 4 3 1 4
3 3 4 3 1 4 6 3 4 2 2 2 3 4 2 3 3 3 2 3
1 5 3 3 3 2 2 2 5 1 3 6 3 3 3 2 4 2 6 3
4 4 2 3 4 4 1 1 4 1 4 1 4 5 5 4 1 3 4 2
1 4 3 3 3 3 3 6 6 2 4 3 2 4 6 6 1 2 3 3
6 4 5 3 4 4 6 2 2 6 2 6 6 2 1 2 1 3 4 3
1 3 3 2 1 4 2 3 1 3 5 4 3 3 4 4 4 3 3 6
3 3 5 3 5 4 2 5 3 2 3 2 5 3 3 4 1 4 3 4
2 3 1 1 4 4 5 4 3 2 3 5 4 3 4 4 1 6 3 5
3 3 3 6 4 2 5 2 5 4 6 3 2 2 4 2 2 3 1 6
2 2 3 5 6 6 3 6 6 1 3 6 3 3 6 5 4 5 3 1
4 2 2 3 4 2 3 3 4 3 1 2 2 3 1 3 2 4 2 2
2 3 3 4 2 3 1 4 1 3 5 4 5 4 4 6 4 1 2 3
3 2 2 2 3 3 3 6 4 3 2 3 5 3 3 2 6 4 2 2
3 1 6 3 1 1 4 4 5 3 2 2 3 5 4 1 1 6 6 3
1 4 2 3 5 4 3 3 3 3 4 4 3 3 4 4 6 4 2 3
2 2 3 1 4 4 4 6 4 3 4 1 3 3 3 5 5 4 3 5
6 3 1 3 5 4 3 5 4 3 3 3 2 6 2 4 4 3 3 3
6 3 2 4 3 1 2 2 6 1 3 5 6 6 4 5 3 2 6 1


In [48]:
generators = {
    'q2_k': {'func': generator_q2_k, 'max_tokens': 'max_tokens', 'stop': 'stop'},
    'q4_k_m': {'func': generator_q4_k_m, 'max_tokens': 'max_tokens', 'stop': 'stop'},
    'q6_k': {'func': generator_q6_k, 'max_tokens': 'max_tokens', 'stop': 'stop'},
    'q8_0': {'func': generator_q8_0, 'max_tokens': 'max_tokens', 'stop': 'stop'}
}

In [31]:
def time_execution(generator, prompt, params):
    start_time = time.time()
    response = generator(prompt, **params)
    end_time = time.time()
    elapsed_time = end_time - start_time
    return {'response': response, 'elapsed_time': elapsed_time}

In [32]:
max_tokens = 100
stop = ["\n"]

In [41]:
results = {}
r = []
t = []

for key in generators:
    results[key] = {}
    params = {
        generators[key]['max_tokens']: max_tokens
    }
    if key=='q2_k' or key=='q4_k_m':
        params['stop'] = stop
    print('Running ',key)
    for prompt in prompts:
        result = time_execution(generators[key]['func'], prompt, params)
        r = r+[result['response']['choices'][0]['text']]
        t = t+[result['elapsed_time']]
    results[key]['response'] = r
    results[key]['elapsed_time'] = t

Running  q2_k
Running  q4_k_m
Running  q6_k


In [49]:
# Running only q8_0
results = {}
r = []
t = []

for key in ['q8_0']:
    results[key] = {}
    params = {
        generators[key]['max_tokens']: max_tokens
    }
    if key=='q2_k' or key=='q4_k_m':
        params['stop'] = stop
    print('Running ',key)
    for prompt in prompts:
        result = time_execution(generators[key]['func'], prompt, params)
        r = r+[result['response']['choices'][0]['text']]
        t = t+[result['elapsed_time']]
    results[key]['response'] = r
    results[key]['elapsed_time'] = t

Running  q8_0


In [50]:
# GPU running slower than in CPUs
for key in results:
    print(key,': ',np.mean(results[key]['elapsed_time']))

q8_0 :  7.859624714851379


In [42]:
# GPU running slower than in CPUs
results

{'q2_k': {'response': [' 5 3 5 3 3 5 3 5 3 3 4 3 3 3 5 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3',
   ' 1 4 3 3 5 0 5 1 7 6 3 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
   ' 1 0 1 0 1 1 2 3 3 3 3 3 6 5 6 7 7 6 6 6 7 7 7 7 7 6 7 7 7 6 6 6 6 7 7 6 7 7 6 7 7 6 6 7 7 7 7 7 7 7',
   ' 3 2 3 1 3 1 3 4 5 4 5 3 1 1 1 1 3 3 1 1 3 1 1 5 1 1 1 3 1 1 3 1 1 1 1 1 3 1 1 1 3 3 1 3 1 1 1 3 1 3',
   ' 4 3 4 6 4 4 1 3 2 2 4 3 4 6 4 4 3 4 6 4 4 3 4 6 4 4 3 4 6 4 4 3 4 6 4 4 3 4 6 4 4 3 4 6 4 4 3 4 6 4',
   ' 3 6 3 3 3 3 4 4 3 5 3 5 3 3 6 3 4 4 4 3 4 5 2 4 5 4 4 4 4 5 2 4 5 3 5 5 6 5 4 5 5 5 4 5 4 4 5 5 3 5',
   ' 3 1 5 2 3 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
   ' 4 4 3 4 4 5 5 6 4 5 6 6 5 5 6 6 5 6 6 6 6 6 6 6 5 6 5 6 6 5 6 6 6 6 6 6 6 6 6 5 6 7 5 7 6 7 6 7 5 6',
   ' 7 3 5 2 3 5 3 2 6 3 4 3 3 3 2 6 3 3 6 2 3 3 2 2 5 3 3 3 3 3 3 2 6 3 3 3 3 3 2 3 2 4 3 3 3 4 3 4 3 3',
   ' 3 3 3 4 4 5 

In [43]:
# GPU running slower than in CPUs
for key in results:
    print(key,': ',np.mean(results[key]['elapsed_time']))

q2_k :  5.874327306747436
q4_k_m :  6.052161073684692
q6_k :  6.56054488658905


In [39]:
# Results on CPU
print(results)

Results in CPU:
{'q2_k': {'response': [' 1 1 1 2 2', '', ' 3 8 5 0', ' 5 7 1', ' 4 ', ' 5 0 3 7 3 3 3 4 3 6 3 3 3 6 3 3 5 3 5 3 8 3 3 3 5 2 1 2 3 4 3 3 4 1 2 5 0 0 3 7 5 3 2 1 1 1 6 1 0 ', ' 7 7 7 7', ' ', ' 2 3 0 0 0 0 0 0', ' 1 ', ' 1', ' 5 5 ', ' 5 2 4 2 1 0 7 8', ' ', '', '', ' 0 1 6 4 4 0', ' 3 ', ' 5 6 7 8 9 10 11 12 13 14 15 16 17 18', ' 3 2 0 0 0 0 0 0 0 0 0 0 0 0 0', '', ' 5 4', ' 4 3 4', ' ', ' 1 ', '', ' 8 7 5 5 1 1 0', ' 2 2 2 0 4 8 9 7 2 0 3 4 3 3 5 3', ' 0', ' 3 2 3 3', ' 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 4', ' ', ' 6 8 7', ' 5 1 ', ' 1 1 0 9 7 6 5 5 4 3 3 3 3 3 4 3 3 2 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 ', '', ' 4 ', ' 2', ' 4', ' 0 0', ' 7', ' 0 0 0', ' 2 2 4 5 6 7 8 9 ', ' 1 2 10 8 1', ' 5 1 0 9 7 6', ' ', ' 1 4 1 2 0 2 0 4 9 4', ' 2 4 3 6 3 2 5 2 ', '', ' 4 5 6 0 0 0 2'], 'elapsed_time': [1.0324163436889648, 0.46370482444763184, 0.9405839443206787, 0.8096699714660645, 0.6914944648742676, 7.5140745639801025, 