In [1]:
import os.path, os
from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
    set_global_tokenizer,
)
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)
from transformers import AutoTokenizer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

llm = LlamaCPP(
    # You can pass in the URL to a GGML model to download it automatically
    # model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    # model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin",
    model_path="/Users/u1155516/Dropbox/Technical/llms/models/llama/llama-mac-metal/llama.cpp/models/llama-2-13b-chat.Q8_0.gguf",
    temperature=0.1,
    max_new_tokens=256,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=3900,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 20},
    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /Users/u1155516/Dropbox/Technical/llms/models/llama/llama-mac-metal/llama.cpp/models/llama-2-13b-chat.Q8_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_mo

In [3]:
response = llm.complete("Hello! Can you tell me a poem about cats and dogs?")
print(response.text)


llama_print_timings:        load time =    4869.51 ms
llama_print_timings:      sample time =      10.93 ms /   115 runs   (    0.10 ms per token, 10525.35 tokens per second)
llama_print_timings: prompt eval time =    4869.32 ms /    79 tokens (   61.64 ms per token,    16.22 tokens per second)
llama_print_timings:        eval time =   11181.49 ms /   114 runs   (   98.08 ms per token,    10.20 tokens per second)
llama_print_timings:       total time =   16213.41 ms /   193 tokens


  Of course! Here's a short poem about cats and dogs:

Cats and dogs, so different yet so dear,
Both furry friends, but oh so clear.

Cats purr and curl up tight,
Dogs wag their tails with delight.

Cats hunt at night, with stealthy grace,
Dogs bark and chase with joyful pace.

Both bring us joy, both bring us love,
Cats and dogs, sent from above.


In [None]:
response_iter = llm.stream_complete("Can you write me a poem about aliens?")
for response in response_iter:
    print(response.delta, end="", flush=True)

In [4]:
# Load the RAG vector index

set_global_tokenizer(
    AutoTokenizer.from_pretrained("NousResearch/Nous-Hermes-Llama2-13b").encode
)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# create vector store index
# check if storage already exists
if not os.path.exists("./storage_segments"):
    # load the documents and create the index
    documents = SimpleDirectoryReader("data_segments").load_data()
    index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)
    # store it for later
    index.storage_context.persist("./storage_segments")
else:
    # load the existing index
    storage_context = StorageContext.from_defaults(persist_dir="./storage_segments")
    index = load_index_from_storage(storage_context)
    

FileNotFoundError: [Errno 2] No such file or directory: '/Users/u1155516/Dropbox/Technical/llms/dev/hand-rolled/storage_segments/docstore.json'

In [None]:
# Request a basic summary of the documents in the corpus
query_engine = index.as_query_engine(llm=llm)
response = query_engine.query("Summarize all of the Hansard documents for 1901 please. It is a big document set so you might want to summarize by themes, noting which sessions emphasised which themes and which speakers spoke the most")
print(response)