In [2]:
import os

# Directory that holds the downloaded GGUF files. Defaults to the current
# user's ~/Downloads folder; set the GGUF_MODEL_DIR environment variable to
# point somewhere else instead of editing absolute paths in the notebook.
MODEL_DIR = os.environ.get('GGUF_MODEL_DIR', os.path.expanduser('~/Downloads'))

# File names of the candidate models. The order matters: other cells pick a
# model by its position in `models`.
MODEL_FILES = [
    'DeepSeek-R1-Distill-Qwen-7B-Q6_K.gguf',
    'deepseek-r1-distill-llama-8b-q2_k.gguf',
    'Llama-3.2-1B-Instruct.Q5_K_S.gguf',            # smaller llama model
    'Llama-3.2-1B-Instruct.Q8_0.gguf',              # larger llama model
    'DeepSeek-R1-Distill-Qwen-1.5B-Q6_K.gguf',      # smaller supposedly better model
    'DeepSeek-R1-Distill-Qwen-1.5B-f32.gguf',       # 3 GB model
    'DeepSeek-R1-Distill-Qwen-1.5B-Q8_0.gguf',      # larger size model
    'DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf',
]

# Full path (plain str) for every model file, in the same order as MODEL_FILES.
models = [os.path.join(MODEL_DIR, file_name) for file_name in MODEL_FILES]

In [3]:
import json

def load_prompts(file_path):
    """
    Load the contents of a JSON file of prompts.

    The file is always decoded as UTF-8 so that non-ASCII characters in the
    prompts (for example typographic apostrophes) are read the same way on
    every platform, regardless of the locale's default encoding.

    Args:
        file_path (str): Path to the JSON file containing the prompt strings.

    Returns:
        list: The parsed JSON content (expected to be a list of prompt strings).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        prompts = json.load(f)
    return prompts

# JSON file that holds the prompts used throughout the notebook
output_file = 'new_prompts.json'
# Load the prompts into `loaded_prompts` (used by later cells) and report
# how many entries were read as a quick sanity check
loaded_prompts = load_prompts(output_file)
print(f"Loaded {len(loaded_prompts)} prompts from {output_file}")


Loaded 1319 prompts from new_prompts.json


In [4]:
# Show the first prompt to inspect the prompt template and question format
print(loaded_prompts[0])

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?

### Response: Let's think step by step.


In [5]:
'''
This class takes in the path to a gguf file representing an LLM
and allows a user to interact with the LLM running on a CPU.

'''

from llama_cpp import Llama

class CPUModel:
    """
    Thin wrapper around a llama_cpp ``Llama`` instance loaded from a GGUF file.

    The request is passed around as a single ``args`` dict with these keys:
        input_text (str): required, the prompt sent to the model.
        max_tokens (int): optional generation limit.
        temperature (float): optional sampling temperature.
        top_p (float): optional nucleus-sampling threshold.
    """

    # Optional float sampling settings. They are forwarded only when the caller
    # actually supplies a value: forwarding None would override the library's
    # own defaults for these settings with a non-numeric value.
    _OPTIONAL_SAMPLING_KEYS = ('temperature', 'top_p')

    def __init__(self, model_path):
        """Load the model stored at ``model_path`` (path to a gguf file)."""
        self.llm = Llama(model_path=model_path)

    def get_response(self, args: dict):
        """
        Run the model on ``args['input_text']`` and return its complete response.

        Args:
            args (dict): Request dict as described on the class. ``max_tokens``
                is always forwarded (None when absent, exactly as before);
                ``temperature`` and ``top_p`` are forwarded only when present
                and not None, so omitting them falls back to the library
                defaults.

        Returns:
            The object returned by the underlying ``Llama`` call (indexed as a
            dict with a 'choices' list by ``get_text_response``).

        Raises:
            KeyError: If ``args`` has no 'input_text' entry.
        """
        prompt = args['input_text']

        # NOTE(review): llama-cpp-python documents max_tokens=None as "no
        # explicit limit" -- confirm for the installed version.
        call_kwargs = {'max_tokens': args.get('max_tokens')}
        for key in self._OPTIONAL_SAMPLING_KEYS:
            value = args.get(key)
            if value is not None:  # keep legitimate falsy values such as 0.0
                call_kwargs[key] = value

        return self.llm(prompt, **call_kwargs)

    def get_text_response(self, args: dict):
        """
        Return only the generated text of the first "choice" in the response.

        Args:
            args (dict): Same request dict accepted by ``get_response``.

        Returns:
            The 'text' field of the first entry in the response's 'choices'.
        """
        response = self.get_response(args)
        return response['choices'][0]['text']

    def evaluate(self):
        """Placeholder: to be implemented. Currently does nothing."""
        pass

In [7]:
# Load the first model in `models`; constructing CPUModel builds a llama_cpp
# Llama from that path, which prints the loader log shown below
cpu_model = CPUModel(models[0])

llama_model_load_from_file_impl: using device Metal (Apple M3 Pro) - 8891 MiB free
llama_model_loader: loaded meta data with 30 key-value pairs and 339 tensors from /Users/kunalbhandarkar/Downloads/DeepSeek-R1-Distill-Qwen-7B-Q6_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = DeepSeek R1 Distill Qwen 7B
llama_model_loader: - kv   3:                           general.basename str              = DeepSeek-R1-Distill-Qwen
llama_model_loader: - kv   4:                         general.size_label str              = 7B
llama_model_loader: - kv   5:                          qwen2.block_count u32              = 28
llama_model_load

In [8]:
# Sampling settings for generation; the prompt is attached under 'input_text'
args = dict(temperature=0.5, top_p=0.3, max_tokens=1000)
args['input_text'] = loaded_prompts[0]

# Echo the prompt that is about to be sent to the model
print('prompt:', loaded_prompts[0], sep='\n')

# Generate, then show the text together with its length in characters
response = cpu_model.get_text_response(args)
print('RESPONSE:', response, sep='\n')
print(len(response))

prompt:
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?

### Response: Let's think step by step.


KeyboardInterrupt: 

In [23]:
# get a sense for average time taken

import time

LIMIT = 10  # number of prompts (from the start of loaded_prompts) to time

# Accumulator is deliberately not called `sum`: assigning to that name would
# shadow the builtin sum() for every later cell run in this kernel.
total_time = 0.0
for i in range(LIMIT):
    print('ITERATION', i)
    args['input_text'] = loaded_prompts[i]
    # perf_counter() is a monotonic high-resolution clock meant for measuring
    # elapsed intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    cpu_model.get_response(args)
    time_taken = time.perf_counter() - start_time
    total_time += time_taken

print('Average time taken:', total_time / LIMIT)

ITERATION 0


Llama.generate: 95 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =    4649.61 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =   15979.73 ms /   252 runs   (   63.41 ms per token,    15.77 tokens per second)
llama_perf_context_print:       total time =   16045.59 ms /   253 tokens
Llama.generate: 22 prefix-match hit, remaining 36 prompt tokens to eval


ITERATION 1


KeyboardInterrupt: 

# Looking at other ways to get models

In [1]:
from mlx_lm import load, generate

# NOTE: this cell needs `loaded_prompts` from the prompt-loading cell earlier
# in the notebook; running it on a fresh kernel raises NameError.

# Load the model and its tokenizer (single assignment; the previous duplicated
# `model, tokenizer = model, tokenizer = ...` was a typo).
model, tokenizer = load("mlx-community/DeepSeek-R1-Distill-Qwen-1.5B")

prompt = loaded_prompts[0]
# When the tokenizer ships a chat template, wrap the raw prompt as a single
# user message and let the template add the generation prompt.
if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
    messages = [{"role": "user", "content": prompt}]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

response = generate(model, tokenizer, prompt=prompt, verbose=True)



Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

model.safetensors.index.json:   0%|          | 0.00/24.0k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/6.76k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/485 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

NameError: name 'loaded_prompts' is not defined