In [1]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [28]:
import nest_asyncio
nest_asyncio.apply()

In [2]:
import os
from tqdm import tqdm
from llama_index.core import (
    Settings,
    VectorStoreIndex, SummaryIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage
)
from llama_index.core.objects import ObjectIndex
from llama_index.core.agent import ReActAgent
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.schema import IndexNode
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.callbacks import CallbackManager
from IPython.display import Markdown, display
import agent_utils, data_utils

  from .autonotebook import tqdm as notebook_tqdm



INFO:datasets:PyTorch version 2.3.0+cu118 available.
PyTorch version 2.3.0+cu118 available.


In [3]:
device_map = "cuda:1"
llm_hf, embed_model = agent_utils.load_llm_embed_models(
    llm_name="models/Meta-Llama-3.1-8B-Instruct", embed_name="models/bge-base-en-v1.5", device_map=device_map
)

Loading embedding model: models/bge-base-en-v1.5
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: models/bge-base-en-v1.5
Load pretrained SentenceTransformer: models/bge-base-en-v1.5
INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']
Loading LLM: models/Meta-Llama-3.1-8B-Instruct
Loading tokenizer and model with quantization config from: models/Meta-Llama-3.1-8B-Instruct


Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.97s/it]


Loaded LLM and embedding models


In [4]:
Settings.llm = llm_hf
Settings.embed_model = embed_model

## Load documents/ nodes

In [5]:
documents = data_utils.load_md_documents(
    docs_dir="data/llama-blogs-md", docs_metadata="data/llama_blogs_metadata.json"
)

Num documents: 166


Parsing documents: 100%|██████████| 166/166 [00:00<00:00, 12504.79it/s]


In [6]:
test_doc = documents[30]
test_doc.metadata

{'filename': 'using-llamaindex-and-llamafile-to-build-a-local-private-research-assistant.md',
 'extension': '.md',
 'title': 'Using LlamaIndex and llamafile to build a local, private research assistant',
 'date': 'May 14, 2024',
 'url': 'https://www.llamaindex.ai/blog/using-llamaindex-and-llamafile-to-build-a-local-private-research-assistant'}

In [7]:
nodes = data_utils.parse_md_doc([test_doc])

Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 532.27it/s]


In [8]:
len(nodes)

11

## Get tools

In [9]:
vector_tool, summary_tool = agent_utils.get_tools_from_nodes(nodes, doc_metadata=test_doc.metadata)

Found existing index from using-llamaindex-and-llamafile-to-build-a-local-private-research-assistant
INFO:llama_index.core.indices.loading:Loading all indices.
Loading all indices.


In [10]:
print("Loaded tools:", vector_tool.metadata.name, summary_tool.metadata.name, sep="\n")

Loaded tools:
vector_tool_using_llamaindex_and_llamafile_to_build_a_local_private_research_assistant
summary_tool_using_llamaindex_and_llamafile_to_build_a_local_private_research_assistant


## Define Agent

In [11]:
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [12]:
from llama_index.core.agent import AgentRunner, ReActAgentWorker, ReActAgent

In [13]:
agent_context = '''\
You are CTO of LLamaIndex, you can answer question related to LlamaIndex blog'''
agent = ReActAgent.from_tools(
    [vector_tool, summary_tool],
    llm=Settings.llm,
    context=agent_context,
    verbose=True
)

ReActChatFormatter.from_context is deprecated, please use `from_defaults` instead.
ReActChatFormatter.from_context is deprecated, please use `from_defaults` instead.


In [14]:
response = agent.chat("What is llamafile?")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


> Running step 9f314a31-7d6b-4585-af13-7b42eeeb96c3. Step input: What is llamafile?
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: vector_tool_using_llamaindex_and_llamafile_to_build_a_local_private_research_assistant
Action Input: {'input': 'What is llamafile?'}
[0m

Batches: 100%|██████████| 1/1 [00:00<00:00, 20.37it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;34mObservation: 

A llamafile is an executable Large Language Model (LLM) that you can run on your own computer. It contains the weights for a given open-source LLM, as well as everything needed to actually run that model on your computer.
[0m> Running step 1e465e29-06fe-4dbc-8fdb-8e0d482a92ef. Step input: None
[1;3;38;5;200mThought: I can answer without using any more tools. I'll use the user's language to answer
Answer: A llamafile is an executable Large Language Model (LLM) that you can run on your own computer. It contains the weights for a given open-source LLM, as well as everything needed to actually run that model on your own computer.
[0m

In [18]:
response.response

'A llamafile is an executable Large Language Model (LLM) that you can run on your own computer. It contains the weights for a given open-source LLM, as well as everything needed to actually run that model on your own computer.'

In [21]:
response = agent.chat("How to download and run a llamafile?")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


> Running step 8fc150f2-322c-445e-a434-bf493b6270c2. Step input: How to download and run a llamafile?
[1;3;38;5;200mThought: The user wants to know how to download and run a llamafile. I need to use a tool to help me answer the question.
Action: vector_tool_using_llamaindex_and_llamafile_to_build_a_local_private_research_assistant
Action Input: {'input': 'downloading and running a llamafile'}
[0m

Batches: 100%|██████████| 1/1 [00:00<00:00, 59.65it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;34mObservation: 

To download and run a llamafile, follow these steps:

1. **Download the llamafile-ized model**: You can either click the download link or use a command like `wget` to download the model. For this example, we'll use the `wget` command:
   ```
   wget https://huggingface.co/Mozilla/TinyLlama-1.1B-Chat-v1.0-llamafile/resolve/main/TinyLlama-1.1B-Chat-v1.0.F16.llamafile
   ```
2. **Make it executable (you only need to do this once)**: If you're on macOS, Linux, or BSD, you'll need to grant permission for your computer to execute this new file by running:
   ```
   chmod +x TinyLlama-1.1B-Chat-v1.0.Q5_K_M.llamafile
   ```
   If you're on Windows, simply rename the file by adding ".exe" on the end.
3. **Run in server mode**: Once the file is executable, you can run it in server mode using the following command:
   ```
  ./TinyLlama-1.1B-Chat-v1.0.Q5_K_M.llamafile --server --nobrowser --embedding
   ```

Note: Make sure to replace the file name with the actual name of t

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;34mObservation: Error: Could not parse output. Please follow the thought-action-input format. Try again.
[0m> Running step 1d87297d-4676-4f22-8490-e0176bc8fcea. Step input: None
[1;3;38;5;200mThought: The user wants to know how to download and run a llamafile. I need to use a tool to help me answer the question.
Action: vector_tool_using_llamaindex_and_llamafile_to_build_a_local_private_research_assistant
Action Input: {'input': 'downloading and running a llamafile, step-by-step instructions'}
[0m

Batches: 100%|██████████| 1/1 [00:00<00:00, 77.06it/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[1;3;34mObservation: 

To download and run a llamafile, follow these step-by-step instructions:

**Step 1: Download a llamafile**

1. Go to the [HuggingFace model hub](https://huggingface.co/models?sort=trending&search=llamafile) and search for 'llamafile'.
2. Choose a llamafile, such as [TinyLlama-1.1B](https://huggingface.co/Mozilla/TinyLlama-1.1B-Chat-v1.0-llamafile/resolve/main/TinyLlama-1.1B-Chat-v1.0.F16.llamafile) (0.67 GB).
3. Click the download link or use a command like `wget` to download the model. The download should take 5-10 minutes depending on your internet connection.

**Step 2: Make the llamafile executable**

1. If you're using macOS, Linux, or BSD, you need to grant permission for your computer to execute the file. Run the command `chmod +x TinyLlama-1.1B-Chat-v1.0.Q5_K_M.llamafile`.
2. If you're on Windows, rename the file by adding ".exe" on the end, e.g., `TinyLlama-1.1B-Chat-v1.0.Q5_K_M.llamafile.exe`.

**Step 3: Run the llamafile in server mode**

1. Open your

In [22]:
display(Markdown(response.response))

To download and run a llamafile, follow these step-by-step instructions: First, go to the HuggingFace model hub and search for 'llamafile'. Choose a llamafile, such as TinyLlama-1.1B, and click the download link or use a command like `wget` to download the model. Then, make the llamafile executable by running `chmod +x TinyLlama-1.1B-Chat-v1.0.Q5_K_M.llamafile` on macOS, Linux, or BSD, or by renaming the file with ".exe" on Windows. Finally, run the llamafile in server mode by navigating to the directory where the llamafile is stored and running `./TinyLlama-1.1B-Chat-v1.0.Q5_K_M.llamafile --server --nobrowser --embedding`.

In [23]:
agent.chat_history

[ChatMessage(role=<MessageRole.USER: 'user'>, content='What is llamafile?', additional_kwargs={}),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='A llamafile is an executable Large Language Model (LLM) that you can run on your own computer. It contains the weights for a given open-source LLM, as well as everything needed to actually run that model on your own computer.', additional_kwargs={}),
 ChatMessage(role=<MessageRole.USER: 'user'>, content='How to download and run a llamafile?', additional_kwargs={}),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='To download and run a llamafile, follow these step-by-step instructions: First, go to the HuggingFace model hub and search for \'llamafile\'. Choose a llamafile, such as TinyLlama-1.1B, and click the download link or use a command like `wget` to download the model. Then, make the llamafile executable by running `chmod +x TinyLlama-1.1B-Chat-v1.0.Q5_K_M.llamafile` on macOS, Linux, or BSD, or by renami

In [32]:
agent.memory

ChatMemoryBuffer(chat_store=SimpleChatStore(store={'chat_history': [ChatMessage(role=<MessageRole.USER: 'user'>, content='What is llamafile?', additional_kwargs={}), ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='A llamafile is an executable Large Language Model (LLM) that you can run on your own computer. It contains the weights for a given open-source LLM, as well as everything needed to actually run that model on your own computer.', additional_kwargs={}), ChatMessage(role=<MessageRole.USER: 'user'>, content='How to download and run a llamafile?', additional_kwargs={}), ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='To download and run a llamafile, follow these step-by-step instructions: First, go to the HuggingFace model hub and search for \'llamafile\'. Choose a llamafile, such as TinyLlama-1.1B, and click the download link or use a command like `wget` to download the model. Then, make the llamafile executable by running `chmod +x TinyLlama-1.1B-C