## Crawl Data

In [5]:
from bs4 import BeautifulSoup
import requests
import os
from tqdm import tqdm

In [6]:
# Collect (and print) the title, publication date and URL of every blog post card
def get_blog_urls():
    """Scrape the LlamaIndex blog index page and collect metadata for each post card.

    For every card found, the title, publication date and absolute URL are
    printed and appended to the result. Cards that do not contain the expected
    title link are skipped instead of aborting the whole crawl.

    Returns:
        list[dict]: one dict per post with the keys "title", "date" and "url".

    Raises:
        requests.HTTPError: if the blog index responds with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    base_url = "https://www.llamaindex.ai"
    # A timeout keeps the cell from hanging forever on a stalled connection,
    # and raise_for_status stops us from parsing an error page as if it were the blog.
    response = requests.get(f"{base_url}/blog", timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE: these class names look auto-generated; they must be updated if the
    # site's markup changes (find_all will then simply return no cards).
    blog_cards = soup.find_all('div', class_='CardBlog_card__mm0Zw')

    blog_data = []
    for card in blog_cards:
        # Title and link live in <p class="CardBlog_title__qC51U"><a href=...>
        title_paragraph = card.find('p', class_='CardBlog_title__qC51U')
        title_element = title_paragraph.find('a') if title_paragraph else None
        if title_element is None or not title_element.get('href'):
            # Malformed / unexpected card: nothing usable to record.
            continue
        title = title_element.text.strip()
        url = base_url + title_element['href']

        # Publication date; tolerate cards without one rather than crashing.
        date_element = card.find('p', class_='Text_text__zPO0D Text_text-size-16__PkjFu')
        date = date_element.text.strip() if date_element else ""

        # Print the extracted information
        print(f"Title: {title}")
        print(f"Date: {date}")
        print(f"URL: {url}")
        print("---")

        blog_data.append({
            "title": title,
            "date": date,
            "url": url
        })

    return blog_data

In [7]:
def extract_page(page_url):
    """Download one blog post and return its cleaned body as pretty-printed HTML.

    Script/style tags and tags without any text are removed, then only the
    post container inside <main> is kept (related-post sections are ignored).

    Args:
        page_url: absolute URL of the blog post to fetch.

    Returns:
        str: prettified HTML of the blog post container.

    Raises:
        requests.HTTPError: if the page responds with an error status.
        requests.RequestException: on connection problems or timeout.
        ValueError: if the page does not contain the expected post container.
    """
    # Fail fast on HTTP errors and never block indefinitely on a slow server.
    response = requests.get(page_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove script and style tags
    for tag in soup(['script', 'style']):
        tag.extract()

    # Get rid of tags that carry no text. Note that this also drops text-less
    # elements such as <img> or <br>, since their .text is empty.
    for tag in soup.find_all():
        if not tag.text.strip():
            tag.extract()

    # only get main blog content, ignore related blogs
    main_section = soup.find("main")
    blog_content = (
        main_section.find("div", class_="BlogPost_htmlPost__Z5oDL")
        if main_section is not None
        else None
    )
    if blog_content is None:
        # Previously this surfaced as an opaque AttributeError on None.
        raise ValueError(f"No blog post content found at {page_url}")

    return blog_content.prettify()

In [8]:
blogs_data = get_blog_urls()

Title: LlamaIndex Newsletter 2024-07-23
Date: Jul 23, 2024
URL: https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-07-23
---
Title: Improving Vector Search - Reranking with PostgresML and LlamaIndex
Date: Jul 19, 2024
URL: https://www.llamaindex.ai/blog/improving-vector-search-reranking-with-postgresml-and-llamaindex
---
Title: The latest updates to LlamaCloud
Date: Jul 19, 2024
URL: https://www.llamaindex.ai/blog/the-latest-updates-to-llamacloud
---
Title: Case Study: How Scaleport.ai Accelerated Development and Improved Sales with LlamaCloud
Date: Jul 17, 2024
URL: https://www.llamaindex.ai/blog/case-study-how-scaleport-ai-accelerated-development-and-improved-sales-with-llamacloud
---
Title: Building a multi-agent concierge system
Date: Jul 17, 2024
URL: https://www.llamaindex.ai/blog/building-a-multi-agent-concierge-system
---
Title: LlamaIndex Newsletter 2024-07-16
Date: Jul 16, 2024
URL: https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-07-16
---
Title: Arize AI and

In [9]:
blogs_data[0]

{'title': 'LlamaIndex Newsletter 2024-07-23',
 'date': 'Jul 23, 2024',
 'url': 'https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-07-23'}

In [34]:
# raw blog
# Save the unprocessed HTML of the first crawled post so it can be compared
# with the cleaned version. `blogs_data` is the list returned by get_blog_urls().
response = requests.get(blogs_data[0]["url"])
raw_blog = BeautifulSoup(response.text, 'html.parser')
with open('raw_page.html', 'w', encoding='utf-8') as f:
    f.write(raw_blog.prettify())

In [31]:
# clean blog
# Run the same first post through extract_page and save the cleaned HTML.
blog_content = extract_page(blogs_data[0]["url"])
with open('sample_clean.html', 'w', encoding='utf-8') as f:
    f.write(blog_content)

In [38]:
# Extract every blog page and save the cleaned HTML into ./data.
# Pages that are already on disk are skipped, so re-running the notebook does
# not re-crawl the whole site; delete a file (or the folder) to force a refresh.
save_folder = "./data"
os.makedirs(save_folder, exist_ok=True)
for blog in tqdm(blogs_data, desc="Crawling data"):
    # Use the last URL path segment (the post slug) as the file name.
    save_name = blog["url"].rstrip("/").split("/")[-1]
    save_path = f'{save_folder}/{save_name}.html'
    if os.path.exists(save_path):
        continue
    cleaned_html = extract_page(blog["url"])
    with open(save_path, 'w', encoding='utf-8') as f:
        f.write(cleaned_html)

Crawling data: 100%|██████████| 159/159 [01:03<00:00,  2.51it/s]


## Loading and Ingestion

### Load documents

In [10]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings

In [11]:
documents = SimpleDirectoryReader(
    input_dir="./data",
).load_data()

In [3]:
len(documents[0].text), len(documents)

(24684, 159)

In [12]:
documents[0].metadata

{'file_path': '/workspace/projects/LlamaRAG/data/a-cheat-sheet-and-some-recipes-for-building-advanced-rag-803a9d94c41b.html',
 'file_name': 'a-cheat-sheet-and-some-recipes-for-building-advanced-rag-803a9d94c41b.html',
 'file_type': 'text/html',
 'file_size': 24708,
 'creation_date': '2024-07-21',
 'last_modified_date': '2024-07-21'}

### Split documents

In [13]:
splitter = SentenceSplitter(
    chunk_size=2048,
    chunk_overlap=200
)

In [14]:
nodes = splitter.get_nodes_from_documents(documents)
print(f"Created {len(nodes)} nodes.")

Created 427 nodes.


## Indexing and Embedding

In [15]:
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, Settings
from llama_index.core import load_index_from_storage
from llama_index.core import VectorStoreIndex
from llama_index.core.indices import SummaryIndex
from llama_index.core import DocumentSummaryIndex
from llama_index.core.prompts import PromptTemplate
import chromadb
import torch
import model_utils




In [16]:
import nest_asyncio
nest_asyncio.apply()

In [8]:
model, tokenizer = model_utils.load_quantized_model(
    model_name_or_path="models/Llama-2-7b-chat-hf",
    device="cuda"
)

Loading tokenizer and model with quantization config from: models/Llama-2-7b-chat-hf


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# loads BAAI/bge-small-en-v1.5
embed_model = HuggingFaceEmbedding(model_name="models/bge-small-en-v1.5", device="cuda")

In [10]:
# Wrap the already-loaded quantized model and tokenizer in a LlamaIndex LLM.
# The query wrapper puts each query inside Llama-2 chat [INST] ... [/INST] markers.
llm_hf = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    model_name="models/Llama-2-7b-chat-hf",
    device_map="cuda",
    context_window=4096,
    max_new_tokens=512,
    generate_kwargs=dict(temperature=0.7, do_sample=True),
    query_wrapper_prompt=PromptTemplate("<s> [INST] {query_str} [/INST] "),
)

In [19]:
Settings.embed_model = embed_model
Settings.llm = llm_hf

In [20]:
Settings.context_window

4096

### Init chromadb 

In [21]:
# Creates a persistent instance of Chroma that saves to disk
chroma_client = chromadb.PersistentClient(path="./chroma_db")
# Get or create a collection with the given name and metadata.
vector_collection = chroma_client.get_or_create_collection("llama_index_blogs")
vector_collection, vector_collection.count()

(Collection(id=3186f0ec-26e5-46fa-b687-281a3a26066f, name=llama_index_blogs),
 427)

In [32]:
summary_collection = chroma_client.get_or_create_collection("llma_blogs_summary")
summary_collection, summary_collection.count()

(Collection(id=cf744ea1-c23d-41c8-8204-363adcd4b3fe, name=llma_blogs_summary),
 0)

### Create vector store

In [29]:
# Init chromadb storage
# Wrap each Chroma collection in a LlamaIndex vector store and build a
# StorageContext around it.
vector_store = ChromaVectorStore(chroma_collection=vector_collection)
vector_storage_context = StorageContext.from_defaults(vector_store=vector_store)

# NOTE(review): persist_dir is set here, so from_defaults presumably tries to load
# previously persisted stores from "./chroma_db/summary/"; that would require an
# earlier persist() call to have written that directory — confirm on a fresh checkout.
# NOTE(review): a SummaryIndex may not write anything into the Chroma collection
# (summary_collection.count() is reported as 0) — verify this vector store is needed.
summary_store = ChromaVectorStore(chroma_collection=summary_collection)
summary_storage_context = StorageContext.from_defaults(vector_store=summary_store, persist_dir="./chroma_db/summary/")

In [14]:
# # First run: Create vector index from documents.
# # (Uncomment once to embed `nodes` into the Chroma-backed vector store;
# #  the storage context defined for it is named `vector_storage_context`.)
# vector_index = VectorStoreIndex(
#     nodes, storage_context=vector_storage_context, show_progress=True
# )

In [30]:
# create summary index
summary_index = SummaryIndex.from_documents(
    documents=documents,
    storage_context=summary_storage_context, 
    show_progress=True
)

Parsing nodes:   0%|          | 0/159 [00:00<?, ?it/s]

In [25]:
# summary_storage_context.persist("./chroma_db/summary")

In [23]:
# load your index from stored vectors
# Rebuilds the index object on top of the existing Chroma-backed vector store.
# NOTE(review): from_vector_store appears to build its own StorageContext from
# `vector_store`, so the `storage_context` argument is likely redundant —
# confirm against the installed llama-index version.
vector_index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, storage_context=vector_storage_context
)

In [27]:
# summary_storage_context = StorageContext.from_defaults(persist_dir="./chroma_db/summary/")
# # doc_summary_index = load_index_from_storage(storage_context)
summary_index = load_index_from_storage(
    storage_context=summary_storage_context
)

## Querying

#### Logging setup

In [33]:
from llama_index.core.response.notebook_utils import display_response
import logging
import sys

# Send log records (INFO and above) to stdout so they appear in the cell output.
# basicConfig already attaches a stdout StreamHandler to the root logger; adding
# a second one by hand made every record print twice (and once more for each
# re-run of this cell), so no extra handler is registered.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

#### Compact query

In [34]:
# compact query
query_engine = vector_index.as_query_engine(response_mode="compact")

response = query_engine.query("What are key features of llama-agents?")

display_response(response)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

**`Final Response:`** Based on the provided context information, the key features of llama-agents are:

1. Distributed Service Oriented Architecture: llama-agents allows for the creation of independently running microservices, each with its own LLM-powered control plane that routes and distributes tasks.
2. Standardized API Interfaces: llama-agents provides a standardized API interface between agents using a central control plane orchestrator, allowing for seamless communication between agents.
3. Define Agentic and Explicit Orchestration Flows: Developers have the flexibility to directly define the sequence of interactions between agents or leave it up to an "agentic orchestrator" that decides which agents are relevant to the task.
4. Ease of Deployment: llama-agents provides a simple and easy-to-use interface for deploying and managing agents, allowing for efficient and scalable deployment.
5. Scalability and Resource Management: llama-agents provide built-in observability tools for monitoring the quality and performance of the system and each individual agent service, enabling efficient scaling and resource management.

By leveraging these key features, llama-agents provides a powerful framework for building and deploying multi-agent systems, enabling developers to create complex question-answering systems, collaborative AI assistants, and distributed AI workflows with ease.

In [35]:
response.metadata

{'9b674962-37bb-43af-8ef0-a1398491ca21': {'file_path': '/workspace/projects/LlamaRAG/data/introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems.html',
  'file_name': 'introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems.html',
  'file_type': 'text/html',
  'file_size': 18790,
  'creation_date': '2024-07-21',
  'last_modified_date': '2024-07-21'},
 '902d9937-03d2-4744-8914-2095f695ae36': {'file_path': '/workspace/projects/LlamaRAG/data/how-to-build-llm-agents-in-typescript-with-llamaindex-ts-a88ed364a7aa.html',
  'file_name': 'how-to-build-llm-agents-in-typescript-with-llamaindex-ts-a88ed364a7aa.html',
  'file_type': 'text/html',
  'file_size': 16248,
  'creation_date': '2024-07-21',
  'last_modified_date': '2024-07-21'}}

In [30]:
len(response.source_nodes)

2

In [32]:
print(response.response)

Based on the context information provided, the key features of llama-agents are:

1. Distributed Service Oriented Architecture: llama-agents allows for the creation of independently running microservices, each with its own LLM-powered control plane that orchestrates the tasks.
2. Standardized API Interfaces: llama-agents provides a standardized API interface for communication between agents using a central control plane orchestrator.
3. Define Agentic and Explicit Orchestration Flows: Developers have the flexibility to directly define the sequence of interactions between agents or leave it up to an "agentic orchestrator" that decides which agents are relevant to the task.
4. Ease of Deployment: llama-agents provides a simple way to launch, scale, and monitor each agent and the control plane independently.
5. Scalability and Resource Management: llama-agents provide built-in observability tools to monitor the quality and performance of the system and each individual agent service.

Thes

#### Refine Query

In [42]:
# refine query: same vector index, but response_mode="refine" with streaming enabled
query_engine = vector_index.as_query_engine(response_mode="refine", streaming=True)

response = query_engine.query('''
What are the two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook?
''')

# Streaming engine: print the response stream instead of using display_response.
response.print_response_stream()
# display_response(response)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Based on the new context provided, the two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook are:

1. Response Generation performance: This area of evaluation assesses the ability of the RAG system to generate coherent and contextually relevant responses to given prompts.
2. Contextual Relevance performance: This area of evaluation assesses the ability of the RAG system to retrieve relevant information from the input context.

These two areas are critical because they directly impact the quality of the responses generated by the RAG system. A RAG system that can generate coherent and contextually relevant responses is more likely to be useful and accurate in its output, while a RAG system that can retrieve relevant information from the input context is more likely to provide accurate and informative responses.

The provided context provides additional information on the importance of these areas and how the

#### Tree summarize query

In [36]:
query_engine = vector_index.as_query_engine(response_mode="tree_summarize", streaming=True)
response = query_engine.query("What are key features of llama-agents?")
response.print_response_stream()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Based on the information provided in the article, the key features of llama-agents are:

1. Distributed Service Oriented Architecture: llama-agents are designed to be distributed and independent, allowing each agent to be its own microservice.
2. Communication via standardized API interfaces: llama-agents use a standardized API interface for communication between agents, making it easy to define the sequence of interactions between agents.
3. Define agentic and explicit orchestration flows: developers have the flexibility to directly define the sequence of interactions between agents or leave it up to an "agentic orchestrator" that decides which agents are relevant to the task.
4. Ease of deployment: llama-agents can be easily launched and scaled independently, allowing for efficient management of multiple agents.
5. Scalability and resource management: llama-agents provide built-in observability tools for monitoring the quality and performance of the system and each individual agent s

## Router query engine

In [36]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector

In [37]:
# Tool backed by the vector index, using its default query engine settings.
vector_tool = QueryEngineTool(
    metadata=ToolMetadata(
        description="Useful for searching for specific facts.",
        name="vector_search",
    ),
    query_engine=vector_index.as_query_engine(),
)

# Tool backed by the summary index, answering with tree_summarize.
summary_tool = QueryEngineTool(
    metadata=ToolMetadata(
        description="Useful for summarizing an entire document.",
        name="summary",
    ),
    query_engine=summary_index.as_query_engine(response_mode="tree_summarize"),
)



In [38]:
# create the router query engine
# Router: an LLM-based selector chooses which tool handles each query.
# Tool order matters for the logged index (0 = summary, 1 = vector search).
query_engine = RouterQueryEngine(
    query_engine_tools=[summary_tool, vector_tool],
    selector=LLMSingleSelector.from_defaults(),
    verbose=True,
)
print("Created the router query engine.")

Created the router query engine.


In [39]:
response = query_engine.query("What are key features of llama-agents?")
response

INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 0: Llama-agents are useful for summarizing an entire document, which would allow for a quick understanding of the key features and main points of the document..
Selecting query engine 0: Llama-agents are useful for summarizing an entire document, which would allow for a quick understanding of the key features and main points of the document..
[1;3;38;5;200mSelecting query engine 0: Llama-agents are useful for summarizing an entire document, which would allow for a quick understanding of the key features and main points of the document..
[0m

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


KeyboardInterrupt: 

In [40]:
del query_engine