In [1]:
!pip install llama-cpp-python



In [2]:
!pip install datasets



In [3]:
!pip install -U jupyterlab ipywidgets jupyterlab-widgets



In [39]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [40]:
!huggingface-cli whoami

arnavk2001


In [41]:
!pip install transformers



In [42]:
from llama_cpp import Llama

In [101]:
import time
import torch
from transformers import AutoTokenizer
from huggingface_hub import hf_hub_download
import psutil

In [110]:
class CPUModel:
    def __init__(self, args):
        """Initialize the model."""
        model_path = hf_hub_download(repo_id=args["repo_id"], filename=args["filename"])
        self.model = Llama(model_path=model_path, verbose=False)
        self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B", use_auth_token=args.get("auth"))

    def get_response(self, args):
        """Generate response from the model."""
        return self.model(args["input_text"], max_tokens=args.get("max_tokens", 100))

    def get_text_response(self, args):
        """Extract text response from model output."""
        output = self.get_response(args)
        return output['choices'][0]['text']

    def evaluate(self, args):
        """Measure accuracy, latency, throughput, and RAM usage."""
        torch.cuda.empty_cache()
        start_time = time.time()
        
        output_text = self.get_text_response(args)
        latency = time.time() - start_time

        # Get initial RAM usage
        process = psutil.Process()
        
        # Get peak RAM usage after execution
        max_ram = process.memory_info().rss / (1024 ** 3)  # Convert to GB
        
        # Tokenize the output for accurate measurement
        tokenized_output = self.tokenizer(output_text)
        tokens_generated = len(tokenized_output['input_ids'])
        
        # Simulating accuracy calculation (this should be replaced with a real benchmark)
        accuracy = tokens_generated / len(args["prompt"].split())  # Placeholder metric
        
        # Throughput calculation (tokens per second)
        throughput = tokens_generated / latency
        
        return {
            "latency": latency,
            "throughput": throughput,
            "max_ram": max_ram,
            "accuracy": accuracy
        }

In [135]:
    models = [
        {"repo_id": "MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF", "filename": "Llama-3.2-1B-Instruct.Q4_K_M.gguf"},
        {"repo_id": "bartowski/Llama-3.2-3B-Instruct-GGUF", "filename": "Llama-3.2-3B-Instruct-Q4_K_M.gguf"},
        {"repo_id": "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", "filename": "Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"},
        {"repo_id": "Qwen/Qwen2-1.5B-Instruct-GGUF", "filename": "qwen2-1_5b-instruct-q4_k_m.gguf"},
        {"repo_id": "Qwen/Qwen2-7B-Instruct-GGUF", "filename": "qwen2-7b-instruct-q4_k_m.gguf"}
    ]

In [None]:
max_tokens_list = [50, 100, 250, 500, 1000]  # Different max token values
args_base = {"prompt": "What is the capital of France?", "auth": "access token"}

results = []

for model in models:
    print(f"\nTesting model: {model['repo_id']}")
    cpu_model = CPUModel(model)
    
    for max_tokens in max_tokens_list:
        args = args_base.copy()
        args["max_tokens"] = max_tokens  # Set max tokens for this run
        
        print(f"  Running with max tokens: {max_tokens}")
        result = cpu_model.evaluate(args)
        results.append({"model": model['repo_id'], **result})
        print(result)

print("\nFinal Results:")
for res in results:
    print(res)


Testing model: MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF


llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_b

  Running with max tokens: 50
{'latency': 1.021738052368164, 'throughput': 16.638315427910058, 'max_ram': 1.1270599365234375, 'accuracy': 2.8333333333333335}
  Running with max tokens: 100
{'latency': 0.2308661937713623, 'throughput': 73.63572692169865, 'max_ram': 1.12896728515625, 'accuracy': 2.8333333333333335}
  Running with max tokens: 250
{'latency': 0.23482012748718262, 'throughput': 72.39583838880219, 'max_ram': 1.130523681640625, 'accuracy': 2.8333333333333335}
  Running with max tokens: 500
{'latency': 0.2342391014099121, 'throughput': 72.57541502539517, 'max_ram': 1.13055419921875, 'accuracy': 2.8333333333333335}
  Running with max tokens: 1000
{'latency': 0.2314600944519043, 'throughput': 73.4467858930753, 'max_ram': 1.13055419921875, 'accuracy': 2.8333333333333335}

Testing model: bartowski/Llama-3.2-3B-Instruct-GGUF


llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_b

  Running with max tokens: 50
{'latency': 2.2254068851470947, 'throughput': 7.63905248674394, 'max_ram': 2.161407470703125, 'accuracy': 2.8333333333333335}
  Running with max tokens: 100
{'latency': 0.5209472179412842, 'throughput': 32.632864548507996, 'max_ram': 2.161895751953125, 'accuracy': 2.8333333333333335}
  Running with max tokens: 250
{'latency': 0.5021388530731201, 'throughput': 33.855177499129915, 'max_ram': 2.16192626953125, 'accuracy': 2.8333333333333335}
  Running with max tokens: 500
{'latency': 0.48175692558288574, 'throughput': 35.287505165455414, 'max_ram': 2.161773681640625, 'accuracy': 2.8333333333333335}
  Running with max tokens: 1000
{'latency': 0.48558902740478516, 'throughput': 35.00902829467945, 'max_ram': 2.161773681640625, 'accuracy': 2.8333333333333335}

Testing model: bartowski/Meta-Llama-3.1-8B-Instruct-GGUF


llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_b

  Running with max tokens: 50
{'latency': 4.635629892349243, 'throughput': 3.667247039729642, 'max_ram': 4.4509124755859375, 'accuracy': 2.8333333333333335}
  Running with max tokens: 100
{'latency': 1.0599491596221924, 'throughput': 16.03850509779117, 'max_ram': 4.4394378662109375, 'accuracy': 2.8333333333333335}
  Running with max tokens: 250
{'latency': 1.0653948783874512, 'throughput': 15.956524988866734, 'max_ram': 4.4354705810546875, 'accuracy': 2.8333333333333335}
  Running with max tokens: 500
{'latency': 1.2351281642913818, 'throughput': 13.7637538285375, 'max_ram': 4.4061737060546875, 'accuracy': 2.8333333333333335}
  Running with max tokens: 1000
{'latency': 0.9801740646362305, 'throughput': 17.343858211866856, 'max_ram': 4.4109039306640625, 'accuracy': 2.8333333333333335}

Testing model: Qwen/Qwen2-1.5B-Instruct-GGUF


llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf

  Running with max tokens: 50
{'latency': 1.3070688247680664, 'throughput': 13.006201110348245, 'max_ram': 1.2251434326171875, 'accuracy': 2.8333333333333335}
  Running with max tokens: 100
{'latency': 0.2666940689086914, 'throughput': 59.99383512903675, 'max_ram': 1.1796722412109375, 'accuracy': 2.6666666666666665}
  Running with max tokens: 250
{'latency': 0.2932560443878174, 'throughput': 57.969819634879535, 'max_ram': 1.1798095703125, 'accuracy': 2.8333333333333335}
  Running with max tokens: 500
{'latency': 0.2812838554382324, 'throughput': 56.88204171928903, 'max_ram': 1.1815948486328125, 'accuracy': 2.6666666666666665}
  Running with max tokens: 1000
{'latency': 0.2927968502044678, 'throughput': 58.06073387786942, 'max_ram': 1.18182373046875, 'accuracy': 2.8333333333333335}

Testing model: Qwen/Qwen2-7B-Instruct-GGUF


llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (32768) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf

  Running with max tokens: 50
{'latency': 4.8487389087677, 'throughput': 3.5060662823605253, 'max_ram': 4.1511077880859375, 'accuracy': 2.8333333333333335}
  Running with max tokens: 100
{'latency': 1.01888108253479, 'throughput': 16.684969709818446, 'max_ram': 4.1476287841796875, 'accuracy': 2.8333333333333335}
  Running with max tokens: 250
{'latency': 1.025998830795288, 'throughput': 16.569219661607896, 'max_ram': 4.1506500244140625, 'accuracy': 2.8333333333333335}
  Running with max tokens: 500
{'latency': 1.0183849334716797, 'throughput': 16.693098494738045, 'max_ram': 4.152801513671875, 'accuracy': 2.8333333333333335}
  Running with max tokens: 1000
{'latency': 0.9887049198150635, 'throughput': 17.19420998044577, 'max_ram': 4.1533660888671875, 'accuracy': 2.8333333333333335}

Final Results:
{'model': 'MaziyarPanahi/Llama-3.2-1B-Instruct-GGUF', 'latency': 1.021738052368164, 'throughput': 16.638315427910058, 'max_ram': 1.1270599365234375, 'accuracy': 2.8333333333333335}
{'model': '