## Crawl Data

In [1]:
from bs4 import BeautifulSoup
import requests
import os
from tqdm import tqdm

In [2]:
# Extract and print the main text from each card
def get_blog_urls():
    # get all blog
    base_url = "https://www.llamaindex.ai"
    response = requests.get(f"{base_url}/blog")
    html = response.text
    
    soup = BeautifulSoup(html, 'html.parser')
    # Find all blog post cards
    blog_cards = soup.find_all('div', class_='CardBlog_card__mm0Zw')
    blog_data = []
    for card in blog_cards:
        # Extract title
        title_element = card.find('p', class_='CardBlog_title__qC51U').find('a')
        title = title_element.text.strip()
        url = base_url + title_element['href']
    
        # Extract publication date
        date = card.find('p', class_='Text_text__zPO0D Text_text-size-16__PkjFu').text.strip()
    
        # Print the extracted information
        print(f"Title: {title}")
        print(f"Date: {date}")
        print(f"URL: {url}")
        print("---")

        blog_data.append({
            "title": title,
            "date": date,
            "url": url
        })
        
    return blog_data

In [3]:
def extract_page(page_url):
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Remove script and style tags
    for script in soup(['script', 'style']):
        script.extract()

    # Get rid of empty tags
    for tag in soup.find_all():
        if not tag.text.strip():
            tag.extract()

    # only get main blog content, ignore related blogs
    blog_content = soup.find("main").find("div", class_="BlogPost_htmlPost__Z5oDL")

    return blog_content.prettify()

In [4]:
blogs_data = get_blog_urls()

Title: Introducing LlamaExtract Beta: structured data extraction in just a few clicks
Date: Jul 25, 2024
URL: https://www.llamaindex.ai/blog/introducing-llamaextract-beta-structured-data-extraction-in-just-a-few-clicks
---
Title: LlamaIndex Newsletter 2024-07-23
Date: Jul 23, 2024
URL: https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-07-23
---
Title: Improving Vector Search - Reranking with PostgresML and LlamaIndex
Date: Jul 19, 2024
URL: https://www.llamaindex.ai/blog/improving-vector-search-reranking-with-postgresml-and-llamaindex
---
Title: The latest updates to LlamaCloud
Date: Jul 19, 2024
URL: https://www.llamaindex.ai/blog/the-latest-updates-to-llamacloud
---
Title: Case Study: How Scaleport.ai Accelerated Development and Improved Sales with LlamaCloud
Date: Jul 17, 2024
URL: https://www.llamaindex.ai/blog/case-study-how-scaleport-ai-accelerated-development-and-improved-sales-with-llamacloud
---
Title: Building a multi-agent concierge system
Date: Jul 17, 2024
URL: htt

In [10]:
blogs_data[0]

{'title': 'Introducing LlamaExtract Beta: structured data extraction in just a few clicks',
 'date': 'Jul 25, 2024',
 'url': 'https://www.llamaindex.ai/blog/introducing-llamaextract-beta-structured-data-extraction-in-just-a-few-clicks'}

In [7]:
# raw blog 
response = requests.get(blogs_data[0]["url"])
raw_blog = BeautifulSoup(response.text, 'html.parser')
with open(f'raw_page.html', 'w', encoding='utf-8') as f:
    f.write(raw_blog.prettify())

In [12]:
# clean blog
cleaned_html = extract_page(blogs_data[0]["url"])
with open(f'sample_clean.html', 'w', encoding='utf-8') as f:
    f.write(cleaned_html)

In [38]:
# # extract all blog page and save to folder data
# save_folder = "./data"
# for blog_data in tqdm(blogs_data,desc="Crawling data"):
#     cleaned_html = extract_page(blog_data["url"])
#     save_name =  blog_data["url"].split("/")[-1]
#     with open(f'{save_folder}/{save_name}.html', 'w', encoding='utf-8') as f:
#         f.write(cleaned_html)

Crawling data: 100%|██████████| 159/159 [01:03<00:00,  2.51it/s]


## Loading and Injestion

### Load documents

In [1]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings

In [2]:
documents = SimpleDirectoryReader(
    input_dir="./data",
).load_data()

In [3]:
len(documents[0].text), len(documents)

(24684, 159)

In [12]:
documents[0].metadata

{'file_path': '/workspace/projects/LlamaRAG/data/a-cheat-sheet-and-some-recipes-for-building-advanced-rag-803a9d94c41b.html',
 'file_name': 'a-cheat-sheet-and-some-recipes-for-building-advanced-rag-803a9d94c41b.html',
 'file_type': 'text/html',
 'file_size': 24708,
 'creation_date': '2024-07-21',
 'last_modified_date': '2024-07-21'}

### Split documents

In [4]:
splitter = SentenceSplitter(
    chunk_size=2048,
    chunk_overlap=200
)

In [5]:
nodes = splitter.get_nodes_from_documents(documents)
print(f"Created {len(nodes)} nodes.")

Created 427 nodes.


In [12]:
nodes[0]

TextNode(id_='9c7d28fb-4c73-4b47-afb7-e1626ddc3d06', embedding=None, metadata={'file_path': '/workspace/projects/LlamindexHelper/data/a-cheat-sheet-and-some-recipes-for-building-advanced-rag-803a9d94c41b.html', 'file_name': 'a-cheat-sheet-and-some-recipes-for-building-advanced-rag-803a9d94c41b.html', 'file_type': 'text/html', 'file_size': 24708, 'creation_date': '2024-07-21', 'last_modified_date': '2024-07-21'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='86c99a39-954b-4648-afc3-d83765458f82', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/workspace/projects/LlamindexHelper/data/a-cheat-sheet-and-some-recipes-for-building-advanced-rag-803a9d94c41b.html', 'file_name': 'a-cheat-sheet-and-s

## Indexing and Embedding

In [5]:
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext, Settings
from llama_index.core import load_index_from_storage
from llama_index.core import VectorStoreIndex
from llama_index.core.indices import SummaryIndex
from llama_index.core import DocumentSummaryIndex
from llama_index.core.prompts import PromptTemplate
import chromadb
import torch
import model_utils

In [6]:
import nest_asyncio
nest_asyncio.apply()

In [7]:
model, tokenizer = model_utils.load_quantized_model(
    model_name_or_path="models/Llama-2-7b-chat-hf",
    device="cuda"
)

Loading tokenizer and model with quantization config from: models/Llama-2-7b-chat-hf


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# loads BAAI/bge-small-en-v1.5
embed_model = HuggingFaceEmbedding(model_name="models/bge-small-en-v1.5", device="cuda")

In [9]:
# config llm and embed_model to llamaindex
llm_hf = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=512,
    query_wrapper_prompt=PromptTemplate("<s> [INST] {query_str} [/INST] "),
    generate_kwargs={
        "temperature": 0.7,
        "do_sample": True
    },
    device_map="cuda",
    model_name="models/Llama-2-7b-chat-hf",
    model=model,
    tokenizer=tokenizer
)

In [10]:
Settings.embed_model = embed_model
Settings.llm = llm_hf

### Init chromadb 

In [11]:
# Creates a persistent instance of Chroma that saves to disk
chroma_client = chromadb.PersistentClient(path="./chroma_db")
# Get or create a collection with the given name and metadata.
vector_collection = chroma_client.get_or_create_collection("llama_index_blogs")
vector_collection, vector_collection.count()

(Collection(id=3186f0ec-26e5-46fa-b687-281a3a26066f, name=llama_index_blogs),
 427)

### Create vetor store

In [12]:
# Init chromadb storage
vector_store = ChromaVectorStore(chroma_collection=vector_collection)
vector_storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [14]:
# # First run: Create vector index from documents.
# vector_index = VectorStoreIndex(
#     nodes, storage_context=storage_context, show_progress=True
# )

In [13]:
# load your index from stored vectors
vector_index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store, storage_context=vector_storage_context
)

## Querying

#### Logging setup

In [14]:
from llama_index.core.response.notebook_utils import display_response
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

#### Questions

In [15]:
question1 = "What are key features of llama-agents?"
question2 = '''
What are the two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook?
'''
question3 = '''
What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?
'''

In [31]:
def print_ref_docs(resp_metadata):
    print("References:")
    base_url = "https://www.llamaindex.ai/"
    for idx, (_, doc_metatada) in enumerate(resp_metadata.items()):
        ref_url = base_url + doc_metatada["file_name"]
        print(f"{idx+1}.", ref_url)

#### Compact query

In [39]:
# compact query
query_engine = vector_index.as_query_engine(response_mode="compact")

In [40]:
print("Question:", question1)
response1 = query_engine.query(question1)
display_response(response1)
print_ref_docs(response1.metadata)

Question: What are key features of llama-agents?
Answer:


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

**`Final Response:`** Based on the provided context information, the key features of llama-agents are:

1. Distributed Service Oriented Architecture: llama-agents allow each agent to be its own independently running microservice, orchestrated by a fully customizable LLM-powered control plane that routes and distributes tasks.
2. Standardized API Interfaces: llama-agents provide a standardized API interface for communication between agents, making it easy to define and implement complex distributed systems.
3. Explicit Orchestration Flows: Developers have the flexibility to directly define the sequence of interactions between agents or leave it up to an "agentic orchestrator" that decides which agents are relevant to the task.
4. Ease of Deployment: llama-agents provide a simple and easy-to-use interface for deploying and scaling agents and the control plane independently.
5. Scalability and Resource Management: llama-agents provide built-in observability tools to monitor the quality and performance of the system and each individual agent service.
6. Build Your Own Multi-Agent Systems: llama-agents provide a simple example of how to set up a basic multi-agent system using LLMs, including how to create tools, define agents, and launch and control the system.

References:
1. https://www.llamaindex.ai/introducing-llama-agents-a-powerful-framework-for-building-production-multi-agent-ai-systems.html
2. https://www.llamaindex.ai/how-to-build-llm-agents-in-typescript-with-llamaindex-ts-a88ed364a7aa.html


In [41]:
print("Question:", question2)
response2 = query_engine.query(question2)
display_response(response2)
print_ref_docs(response2.metadata)

Question: 
What are the two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

**`Final Response:`** Based on the context information provided, the two critical areas of RAG system performance that are assessed in the "Evaluating RAG with LlamaIndex" section of the OpenAI Cookbook are:

1. Retrieval System: This area assesses the quality of the retrieved context, which is an essential component of the RAG system. It evaluates how well the system can retrieve relevant and accurate context information to augment the response generation process.
2. Response Generation: This area assesses the quality of the generated responses, which is the final output of the RAG system. It evaluates how well the system can generate coherent and accurate responses to the input prompt, taking into account the retrieved context.

References:
1. https://www.llamaindex.ai/openai-cookbook-evaluating-rag-systems-fe393c61fb93.html
2. https://www.llamaindex.ai/supercharge-your-llamaindex-rag-pipeline-with-uptrain-evaluations.html


In [42]:
print("Question:", question3)
response3 = query_engine.query(question3)
display_response(response3)
print_ref_docs(response3.metadata)

Question: 
What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

**`Final Response:`** The two main metrics used to evaluate the performance of the different rerankers in the RAG system are:

1. Hit Rate: It measures the fraction of queries where the correct answer is found within the top-k retrieved documents.
2. Mean Reciprocal Rank (MRR): It evaluates the system’s accuracy by looking at the rank of the highest-placed relevant document.

In the context of the RAG system, these metrics are used to assess the performance of the embedding model and the reranker in retrieving relevant information from the corpus.

References:
1. https://www.llamaindex.ai/boosting-rag-picking-the-best-embedding-reranker-models-42d079022e83.html
2. https://www.llamaindex.ai/openai-cookbook-evaluating-rag-systems-fe393c61fb93.html
