In [21]:
! pip install -U --quiet llama-cpp-python

### Repo

Clone and build llama.cpp, following the instructions here:

https://github.com/ggerganov/llama.cpp

### Model

Download a local LLM, ideally one capable of tool calling so that every feature discussed below can be used:
 
* Weights: `meta-llama-3-8b-instruct.Q8_0.gguf`
* Link: https://huggingface.co/SanctumAI/Meta-Llama-3-8B-Instruct-GGUF

### Function calling

First, confirm that llama-cpp-python itself supports function calling (see its documentation):

https://llama-cpp-python.readthedocs.io/en/latest/

In [4]:
import os

# Path to the GGUF weights. Set the LLAMA_MODEL_PATH environment variable to
# point at your own copy; the fallback is the location this notebook was
# originally run with.
local_model = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/Users/rlm/Desktop/Code/llama.cpp/models/meta-llama-3-8b-instruct.Q8_0.gguf",
)

In [None]:
from llama_cpp import Llama

# Load the GGUF weights directly through llama-cpp-python, selecting the
# chat format used for the function-calling request made with this object.
llm = Llama(
    model_path=local_model,
    chat_format="chatml-function-calling",
)

In [3]:
# Conversation: a system prompt that encourages function use, followed by
# the sentence the details should be extracted from.
extraction_messages = [
    {
        "role": "system",
        "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary",
    },
    {"role": "user", "content": "Extract Jason is 25 years old"},
]

# JSON-schema description of the single function offered to the model.
user_detail_tool = {
    "type": "function",
    "function": {
        "name": "UserDetail",
        "parameters": {
            "type": "object",
            "title": "UserDetail",
            "properties": {
                "name": {"title": "Name", "type": "string"},
                "age": {"title": "Age", "type": "integer"},
            },
            "required": ["name", "age"],
        },
    },
}

# Name the function explicitly as the tool choice for this request.
forced_choice = {"type": "function", "function": {"name": "UserDetail"}}

# Last expression of the cell, so the raw completion dict is displayed.
llm.create_chat_completion(
    messages=extraction_messages,
    tools=[user_detail_tool],
    tool_choice=forced_choice,
)


llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /Users/rlm/Desktop/Code/llama.cpp/models/meta-llama-3-8b-instruct.Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_model_loader: 

{'id': 'chatcmpl-52b952a7-6847-4b6d-88b3-102c0dcb49e5',
 'object': 'chat.completion',
 'created': 1717799438,
 'model': '/Users/rlm/Desktop/Code/llama.cpp/models/meta-llama-3-8b-instruct.Q8_0.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': None,
    'function_call': {'name': 'UserDetail',
     'arguments': '{"name": "Jason", "age": 25}'},
    'tool_calls': [{'id': 'call__0_UserDetail_cmpl-52b952a7-6847-4b6d-88b3-102c0dcb49e5',
      'type': 'function',
      'function': {'name': 'UserDetail',
       'arguments': '{"name": "Jason", "age": 25}'}}]},
   'logprobs': None,
   'finish_reason': 'tool_calls'}],
 'usage': {'prompt_tokens': 268, 'completion_tokens': 12, 'total_tokens': 280}}

### Test with ChatLlamaCpp integration

In [1]:
from langchain_community.chat_models import ChatLlamaCpp

In [5]:
import multiprocessing

from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler

# Chat-model wrapper around the same local GGUF weights.
llm = ChatLlamaCpp(
    temperature=0.3,
    model_path=local_model,
    # The model metadata reports llama.context_length = 8192; do not request
    # a larger window than the model was built for.
    n_ctx=8192,
    n_gpu_layers=4,
    n_batch=200,  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    max_tokens=512,
    # Leave one core free, but never ask for fewer than one thread
    # (cpu_count() - 1 is 0 on a single-core machine).
    n_threads=max(1, multiprocessing.cpu_count() - 1),
    callback_manager=CallbackManager(
        [StreamingStdOutCallbackHandler()]
    ),  # Callbacks support token-wise streaming
    streaming=True,
    repeat_penalty=1.5,
    top_p=0.5,
    stop=["<|end_of_text|>", "<|eot_id|>"],
    verbose=True,
)

llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /Users/rlm/Desktop/Code/llama.cpp/models/meta-llama-3-8b-instruct.Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B-Instruct
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_model_loader: 

### Invoke

In [11]:
# Prompt as (role, text) pairs: the instruction first, then the sentence to translate.
system_turn = (
    "system",
    "You are a helpful assistant that translates English to French. Translate the user sentence.",
)
human_turn = ("human", "I love programming.")
messages = [system_turn, human_turn]

# Keep the reply in `ai_msg` and end the cell with it so the message is rendered.
ai_msg = llm.invoke(messages)
ai_msg

llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit


Je adore le programmation.

(Note: "programming" is translated as both an activity ("je programme") and also referring specifically to computer science, which I've tried to convey with my translation.)


llama_print_timings:        load time =    2005.08 ms
llama_print_timings:      sample time =      11.32 ms /    40 runs   (    0.28 ms per token,  3532.94 tokens per second)
llama_print_timings: prompt eval time =    1829.14 ms /    33 tokens (   55.43 ms per token,    18.04 tokens per second)
llama_print_timings:        eval time =    4734.94 ms /    39 runs   (  121.41 ms per token,     8.24 tokens per second)
llama_print_timings:       total time =    6614.99 ms /    72 tokens


AIMessage(content='Je adore le programmation.\n\n(Note: "programming" is translated as both an activity ("je programme") and also referring specifically to computer science, which I\'ve tried to convey with my translation.)', response_metadata={'finish_reason': 'stop'}, id='run-0eb329b2-d351-4cf6-b355-04d24c09d2cb-0')

### Chain

In [10]:
from langchain_core.prompts import ChatPromptTemplate

# Template with three placeholders: source language, target language and the text itself.
system_template = "You are a helpful assistant that translates {input_language} to {output_language}."
prompt = ChatPromptTemplate.from_messages(
    [("system", system_template), ("human", "{input}")]
)

# Pipe the formatted prompt straight into the chat model.
chain = prompt | llm

# Values for the template placeholders; the invoke result is the cell output.
translation_request = {
    "input_language": "English",
    "output_language": "German",
    "input": "I love programming.",
}
chain.invoke(translation_request)

llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit


Das Programmieren ist toll! (That's "Programming is great!" in case you didn't know.) Do you have any favorite languages or projects? Ich bin hier, um dir zu helfen und über das Thema sprechen! () - I'm here to help and talk about the topic with you!)


llama_print_timings:        load time =   15285.72 ms
llama_print_timings:      sample time =      17.26 ms /    61 runs   (    0.28 ms per token,  3534.59 tokens per second)
llama_print_timings: prompt eval time =     629.70 ms /    16 tokens (   39.36 ms per token,    25.41 tokens per second)
llama_print_timings:        eval time =    6864.21 ms /    60 runs   (  114.40 ms per token,     8.74 tokens per second)
llama_print_timings:       total time =    7570.66 ms /    76 tokens


AIMessage(content='Das Programmieren ist toll! (That\'s "Programming is great!" in case you didn\'t know.) Do you have any favorite languages or projects? Ich bin hier, um dir zu helfen und über das Thema sprechen! () - I\'m here to help and talk about the topic with you!)', response_metadata={'finish_reason': 'stop'}, id='run-80e2fac4-152f-4ec0-ac5c-3c6e63be345f-0')

### Structured output

In [6]:
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.utils.function_calling import convert_to_openai_tool

# Schema for the structured reply: two string fields, neither with a default.
class AnswerWithJustification(BaseModel):
    '''An answer to the user question along with justification for the answer.'''
    answer: str
    justification: str

# Convert the model class into an OpenAI-style tool dict, then hand that dict
# to with_structured_output to obtain the runnable used for structured replies.
dict_schema = convert_to_openai_tool(AnswerWithJustification)
structured_llm = llm.with_structured_output(dict_schema)

In [8]:
# Send a single question through the structured-output runnable and keep the reply.
result = structured_llm.invoke(
    "What weighs more a pound of bricks or a pound of feathers"
)

from_string grammar:
answer-kv ::= ["] [a] [n] [s] [w] [e] [r] ["] space [:] space string 
space ::= space_7 
string ::= ["] string_8 ["] space 
char ::= [^"\] | [\] char_4 
char_4 ::= ["\/bfnrt] | [u] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] 
justification-kv ::= ["] [j] [u] [s] [t] [i] [f] [i] [c] [a] [t] [i] [o] [n] ["] space [:] space string 
root ::= [{] space answer-kv [,] space justification-kv [}] space 
space_7 ::= [ ] | 
string_8 ::= char string_8 | 

llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit

llama_print_timings:        load time =    2005.08 ms
llama_print_timings:      sample time =     262.96 ms /    30 runs   (    8.77 ms per token,   114.08 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     0 tokens (     nan ms per token,      nan to

In [9]:
# Display the value returned by the structured-output call.
result

### Tool calling

* The model cannot decide on its own to trigger a function/tool.
* We need to force the call by specifying the `tool_choice` parameter, formatted as shown in the cell below.

In [11]:
from langchain.tools import tool
from langchain_core.pydantic_v1 import BaseModel, Field

# Argument schema for the weather tool.
class WeatherInput(BaseModel):
    location: str = Field(description="The city and state, e.g. San Francisco, CA")
    # Lists the two accepted unit strings.
    unit: str = Field(enum=["celsius", "fahrenheit"])

# Registered under the tool name "get_current_weather". The body is a stub:
# it always reports 22 in whatever unit was requested.
@tool("get_current_weather", args_schema=WeatherInput)
def get_weather(location: str, unit: str):
    """Get the current weather in a given location"""
    return f"Now the weather in {location} is 22 {unit}"

# Bind the tool to the chat model and pin tool_choice to it by name.
llm_with_tools = llm.bind_tools(
    tools=[get_weather],
    tool_choice={"type": "function", "function": {"name": "get_current_weather"}},
)

In [17]:
# Ask a weather question through the tool-bound model and keep the reply.
ai_msg = llm_with_tools.invoke("what is the weather in San Francisco, CA")

from_string grammar:
char ::= [^"\] | [\] char_1 
char_1 ::= ["\/bfnrt] | [u] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] 
location-kv ::= ["] [l] [o] [c] [a] [t] [i] [o] [n] ["] space [:] space string 
space ::= space_7 
string ::= ["] string_8 ["] space 
root ::= [{] space location-kv [,] space unit-kv [}] space 
unit-kv ::= ["] [u] [n] [i] [t] ["] space [:] space unit 
space_7 ::= [ ] | 
string_8 ::= char string_8 | 
unit ::= ["] [c] [e] [l] [s] [i] [u] [s] ["] | ["] [f] [a] [h] [r] [e] [n] [h] [e] [i] [t] ["] 

llama_tokenize_internal: Added a BOS token to the prompt as specified by the model but the prompt also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. Are you sure this is what you want?
Llama.generate: prefix-match hit

llama_print_timings:        load time =   15285.72 ms
llama_print_timings:      sample time =     137.57 ms /    18 runs   (    7.64 ms per token,   130.84 tokens per second)
llama_print_timings: prompt eval time =     573.48 m

In [20]:
# Display the reply. Its `content` is empty and finish_reason is 'tool_calls',
# i.e. the model answered with a tool call rather than text.
# NOTE(review): the parsed call is presumably on `ai_msg.tool_calls` — confirm for the installed version.
ai_msg

AIMessage(content='', response_metadata={'finish_reason': 'tool_calls'}, id='run-e504702e-f0fd-4916-9e40-50efe8ca1549-0')