In [1]:
from llama_index import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
)
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

In [22]:
# Load a local Llama-2 7B chat model (quantized GGUF file) through llama.cpp.
llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    #model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path="../../../../llms/the_bloke/llama-2-7b-chat.Q4_K_M.gguf",
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    # NOTE: n_ctx is a model-load (constructor) option of llama.cpp, not a
    # per-call option, so it must not be listed here; the context size is
    # controlled through context_window above.
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # verbose=True makes llama.cpp print its loader and timing logs
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../../../../llms/the_bloke/llama-2-7b-chat.Q4_K_M.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_K     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_K     [  

In [12]:
# Blocking completion: send one prompt, wait for the full answer, then show
# only the generated text.
cats_and_dogs_prompt = "Hello! Can you tell me a poem about cats and dogs?"
response = llm.complete(cats_and_dogs_prompt)
print(response.text)

  Of course, I'd be happy to help! Here is a short poem about cats and dogs:
Cats and dogs, so furry and sweet,
Bringing joy to our hearts, with their playful treat.
Their wagging tails and purrs, bring us delight,
As they curl up by our side, on a cozy night.
With their cuddles and kisses, they brighten our day,
And fill our lives with love, in every way.
I hope you enjoy this poem! Let me know if you have any other questions or requests.



llama_print_timings:        load time = 34668.42 ms
llama_print_timings:      sample time =   127.79 ms /   132 runs   (    0.97 ms per token,  1032.97 tokens per second)
llama_print_timings: prompt eval time = 34666.83 ms /    81 tokens (  427.99 ms per token,     2.34 tokens per second)
llama_print_timings:        eval time = 41221.51 ms /   131 runs   (  314.67 ms per token,     3.18 tokens per second)
llama_print_timings:       total time = 76635.50 ms


In [20]:
# Streaming completion: iterate over the partial responses and print each
# one's `delta` as it is produced.
fast_cars_prompt = "Can you write me a poem about fast cars?"
response_iter = llm.stream_complete(fast_cars_prompt)
for response in response_iter:
    # end="" keeps the pieces on the same line; flush=True pushes them to the
    # output immediately instead of waiting for a newline.
    print(response.delta, end="", flush=True)

  Of course! Here is a poem about fast cars:
Racing down the highway, wind in my hair
Fast car taking me there, without a care
Adrenaline pumping, heart beating fast
The thrill of speeding, never lasts too long at last

I hope you enjoy this poem! Let me know if you have any other requests.


llama_print_timings:        load time = 28951.87 ms
llama_print_timings:      sample time =    67.55 ms /    82 runs   (    0.82 ms per token,  1213.95 tokens per second)
llama_print_timings: prompt eval time = 28067.51 ms /    77 tokens (  364.51 ms per token,     2.74 tokens per second)
llama_print_timings:        eval time = 27190.04 ms /    81 runs   (  335.68 ms per token,     2.98 tokens per second)
llama_print_timings:       total time = 55721.14 ms
llama_print_timings:        load time = 28951.87 ms
llama_print_timings:      sample time =    67.55 ms /    82 runs   (    0.82 ms per token,  1213.95 tokens per second)
llama_print_timings: prompt eval time = 28067.51 ms /    77 tokens (  364.51 ms per token,     2.74 tokens per second)
llama_print_timings:        eval time = 27190.04 ms /    81 runs   (  335.68 ms per token,     2.98 tokens per second)
llama_print_timings:       total time = 55721.14 ms


## Query engine setup with LlamaCPP
We can simply pass in the LlamaCPP LLM abstraction to the LlamaIndex query engine as usual.

But first, let’s change the global tokenizer to match our LLM, so that LlamaIndex counts tokens the same way the model does.

In [14]:
from llama_index import set_global_tokenizer
from transformers import AutoTokenizer

# Load the Hugging Face tokenizer for the Llama-2 chat checkpoint and register
# its `encode` method as the global tokenizer.
llama2_tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-chat-hf")
set_global_tokenizer(llama2_tokenizer.encode)

  from .autonotebook import tqdm as notebook_tqdm
tokenizer_config.json: 100%|██████████| 746/746 [00:00<00:00, 3.60MB/s]
tokenizer.model: 100%|██████████| 500k/500k [00:01<00:00, 444kB/s]
tokenizer.json: 100%|██████████| 1.84M/1.84M [00:01<00:00, 1.45MB/s]
added_tokens.json: 100%|██████████| 21.0/21.0 [00:00<00:00, 78.4kB/s]
special_tokens_map.json: 100%|██████████| 435/435 [00:00<00:00, 1.08MB/s]


In [15]:
# use Huggingface embeddings
from llama_index.embeddings import HuggingFaceEmbedding

# Name of the Hugging Face embedding model to load.
embed_model_name = "BAAI/bge-small-en-v1.5"
embed_model = HuggingFaceEmbedding(model_name=embed_model_name)

config.json: 100%|██████████| 743/743 [00:00<00:00, 3.32MB/s]
pytorch_model.bin: 100%|██████████| 134M/134M [00:19<00:00, 6.82MB/s] 
tokenizer_config.json: 100%|██████████| 366/366 [00:00<00:00, 1.17MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 449kB/s]
tokenizer.json: 100%|██████████| 711k/711k [00:00<00:00, 1.41MB/s]
special_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 847kB/s]


In [23]:
# Bundle the local LlamaCPP LLM and the Hugging Face embedding model into a
# single ServiceContext.
# NOTE(review): the saved output of this cell is
# "AttributeError: 'Llama' object has no attribute 'context_params'".
# Presumably the installed llama-cpp-python is older than this llama-index
# release expects; verify the package versions before re-running.
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)

AttributeError: 'Llama' object has no attribute 'context_params'