## Load documents

In [1]:
import glob
import json
import os
from tqdm import tqdm
from pathlib import Path
from llama_index.core import (
    SimpleDirectoryReader,
    Document,
    Settings,
    VectorStoreIndex,
    StorageContext,
    SummaryIndex
)
from llama_index.readers.file import HTMLTagReader, FlatReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import HTMLNodeParser, MarkdownNodeParser, SentenceSplitter
from llama_index.core.storage.docstore import SimpleDocumentStore
from IPython.display import display, HTML, Markdown

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the blog-post metadata records (dicts with e.g. title/date/url keys).
# The context manager closes the file handle instead of leaking it, and the
# explicit encoding keeps parsing independent of the platform default.
with open("./data/llama_blogs_metadata.json", "r", encoding="utf-8") as metadata_file:
    docs_metadata = json.load(metadata_file)
len(docs_metadata)

166

### Load HTML documents

In [2]:
# Directory reader over the downloaded HTML blog posts; filename_as_id=True
# asks the reader to derive document ids from the file names.
html_loader = SimpleDirectoryReader(input_dir="./data/llama-blogs-html/", filename_as_id=True)

In [3]:
# Read every file under the HTML directory, using 8 workers and a progress bar.
html_documents = html_loader.load_data(show_progress=True, num_workers=8)

In [4]:
# Pick one loaded document (index 10, arbitrary) to inspect below.
test_doc = html_documents[10]

In [5]:
# Show the file-level metadata the reader attached to the sample document.
test_doc.metadata

{'file_path': '/workspace/projects/LlamindexHelper/data/llama-blogs-html/bridging-the-gap-in-crisis-counseling-introducing-counselor-copilot-db42e26ab4f3.html',
 'file_name': 'bridging-the-gap-in-crisis-counseling-introducing-counselor-copilot-db42e26ab4f3.html',
 'file_type': 'text/html',
 'file_size': 11333,
 'creation_date': '2024-08-08',
 'last_modified_date': '2024-08-08'}

In [6]:
# Render the sample document's raw text as HTML inside the notebook.
display(HTML(test_doc.text))

### Preview markdown document loader

In [3]:
# Markdown node parser and file reader used by the preview cells below.
parser = MarkdownNodeParser()
md_reader = FlatReader()

In [27]:
# Take one metadata record (index 50, arbitrary) as the preview example.
test_doc_metadata = docs_metadata[50]
test_doc_metadata

{'title': 'Towards Long Context RAG',
 'date': 'Mar 1, 2024',
 'url': 'https://www.llamaindex.ai/blog/towards-long-context-rag'}

In [28]:
# Directory that holds the markdown versions of the blog posts.
md_dir = "./data/llama-blogs-md/"

In [29]:
# The markdown file name is the last path segment of the post URL plus ".md".
file_path = os.path.join(md_dir, test_doc_metadata["url"].split("/")[-1] + ".md")
file_path

'./data/llama-blogs-md/towards-long-context-rag.md'

In [15]:
# Load the sample markdown file; the blog metadata passed as extra_info is
# attached to the resulting document. The previous argument, `doc0_metadata`,
# is not defined anywhere in the notebook and raised NameError on a fresh
# kernel; `test_doc_metadata` is the record selected above.
md_docs = md_reader.load_data(Path(file_path), extra_info=test_doc_metadata)

In [16]:
# Inspect the metadata of the first loaded document.
md_docs[0].metadata

{'filename': 'towards-long-context-rag.md',
 'extension': '.md',
 'title': 'Towards Long Context RAG',
 'date': 'Mar 1, 2024',
 'url': 'https://www.llamaindex.ai/blog/towards-long-context-rag'}

In [18]:
# Split the single preview document into nodes with the markdown parser.
nodes = parser.get_nodes_from_documents(md_docs, show_progress=True)

Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 400.49it/s]


In [19]:
# Number of nodes produced for the preview document.
len(nodes)

9

In [22]:
# Print each node's metadata dict, one per line, to see what the parser added.
for node in nodes:
    print(node.metadata)

{'filename': 'towards-long-context-rag.md', 'extension': '.md', 'title': 'Towards Long Context RAG', 'date': 'Mar 1, 2024', 'url': 'https://www.llamaindex.ai/blog/towards-long-context-rag'}
{'Header_2': ' Our Mission Goes Beyond RAG', 'filename': 'towards-long-context-rag.md', 'extension': '.md', 'title': 'Towards Long Context RAG', 'date': 'Mar 1, 2024', 'url': 'https://www.llamaindex.ai/blog/towards-long-context-rag'}
{'Header_2': ' Initial Gemini 1.5 Pro Observations', 'filename': 'towards-long-context-rag.md', 'extension': '.md', 'title': 'Towards Long Context RAG', 'date': 'Mar 1, 2024', 'url': 'https://www.llamaindex.ai/blog/towards-long-context-rag'}
{'Header_2': ' Long Contexts Resolve Some Pain Points, but some Challenges Remain', 'filename': 'towards-long-context-rag.md', 'extension': '.md', 'title': 'Towards Long Context RAG', 'date': 'Mar 1, 2024', 'url': 'https://www.llamaindex.ai/blog/towards-long-context-rag'}
{'Header_2': ' Towards New RAG Architectures', 'filename': 't

### Markdown documents ingestion

In [4]:
# Markdown node parser and file reader used by the ingestion cells below.
parser = MarkdownNodeParser()
md_reader = FlatReader()

In [5]:
# Directory that holds the markdown versions of the blog posts.
md_dir = "./data/llama-blogs-md/"

In [6]:
# Load the markdown file for every metadata record and collect the documents.
loaded_documents = []
for doc_metadata in tqdm(docs_metadata, desc="Parsing documents"):
    # The markdown file name is the last path segment of the post URL plus ".md".
    file_path = os.path.join(md_dir, doc_metadata["url"].split("/")[-1] + ".md")
    # The record is passed as extra_info so it ends up in the document metadata.
    md_docs = md_reader.load_data(Path(file_path), extra_info=doc_metadata)
    # extend() appends in place; `a = a + b` re-copied the whole list each pass.
    loaded_documents.extend(md_docs)

Parsing documents: 100%|██████████| 166/166 [00:00<00:00, 11530.83it/s]


In [7]:
# Number of documents loaded by the ingestion loop.
len(loaded_documents)

166

In [21]:
# store documents
# Create an empty document store to hold the parsed nodes.
md_docs_store = SimpleDocumentStore()

In [22]:
# Build the nodes from the full document set in this cell so the store does
# not depend on whichever `nodes` value an earlier cell left behind (in
# top-to-bottom order that is the single-document preview).
nodes = parser.get_nodes_from_documents(loaded_documents)
md_docs_store.add_documents(nodes)

In [25]:
# Write the document store to disk at this JSON path.
md_docs_store.persist("database/llama-blogs-nodes-md-parser.json")

### Split documents into nodes

In [8]:
# Split every loaded document into nodes with the markdown parser.
nodes = parser.get_nodes_from_documents(loaded_documents, show_progress=True)

Parsing nodes: 100%|██████████| 166/166 [00:00<00:00, 1001.43it/s]


In [9]:
# Total number of nodes across all documents.
len(nodes)

1221

In [11]:
# Preview the first ten nodes: the text rendered as Markdown, followed by the
# metadata dict and a 100-character divider line.
for node in nodes[:10]:
    display(Markdown(node.text))
    print(node.metadata, "=" * 100, sep="\n")

Greetings, Llama Lovers!

Welcome to this week’s edition of the LlamaIndex newsletter! We’re excited to
share our latest updates including dynamic features like LlamaIndex Workflows
and retrieval capabilities in LlamaCloud. Check out our in-depth guides,
tutorials, and the upcoming webinars that will help you make the most of these
new developments.

{'filename': 'llamaindex-newsletter-2024-08-06.md', 'extension': '.md', 'title': 'LlamaIndex Newsletter 2024-08-06', 'date': 'Aug 6, 2024', 'url': 'https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-08-06'}


**The highlights:**

  1. **LlamaIndex Workflows Launched:** LlamaIndex Workflows, a new event-driven architecture for building multi-agent applications, supports batching, async operations, and streaming. Agents subscribe to and emit events for complex, readable, Pythonic orchestration. [ Blogpost ](https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex) , [ Tweet ](https://x.com/llama_index/status/1819048068798616058) . 
  2. **Dynamic Retrieval Feature in LlamaCloud:** A new feature in LlamaCloud now supports dynamic retrieval for QA assistants, enabling both chunk-level and file-level document retrieval based on query similarity to intelligently route queries. [ Blogpost ](https://www.llamaindex.ai/blog/dynamic-retrieval-with-llamacloud) , [ Notebook ](https://github.com/run-llama/llamacloud-demo/blob/main/examples/10k_apple_tesla/demo_file_retrieval.ipynb) , [ Tweet ](https://x.com/llama_index/status/1818337133746360623) . 
  3. **LongRAG LlamaPack:** LongRAG is now available as a LlamaPack in LlamaIndex, utilizing larger document chunks and long-context LLMs for more effective synthesis. [ Notebook ](https://github.com/run-llama/llama_index/blob/main/llama-index-packs/llama-index-packs-longrag/examples/longrag.ipynb) , [ Tweet ](https://x.com/llama_index/status/1818802688274100578) .

{'Header_2': ' **The highlights:**', 'filename': 'llamaindex-newsletter-2024-08-06.md', 'extension': '.md', 'title': 'LlamaIndex Newsletter 2024-08-06', 'date': 'Aug 6, 2024', 'url': 'https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-08-06'}


**Feature Releases and Enhancements:**

  * We have launched LlamaIndex Workflows, a new event-driven way to build multi-agent applications where each agent acts as a component that subscribes to and emits events, allowing for complex, readable, and Pythonic orchestration with enhanced support for batching, async operations, and streaming. [ Blogpost ](https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex) , [ Tweet ](https://x.com/llama_index/status/1819048068798616058) . 
  * We have introduced a new feature in LlamaCloud to improve your QA assistant with our latest capability for dynamic retrieval, allowing both chunk-level and file-level retrieval. This feature enables the retrieval of entire documents based on query similarity, which supports building agents that can intelligently route queries based on their content. [ Blogpost ](https://www.llamaindex.ai/blog/dynamic-retrieval-with-llamacloud) , [ Notebook ](https://github.com/run-llama/llamacloud-demo/blob/main/examples/10k_apple_tesla/demo_file_retrieval.ipynb) , [ Tweet ](https://x.com/llama_index/status/1818337133746360623) . 
  * We have launched LongRAG as a LlamaPack in LlamaIndex. LongRAG simplifies retrieval by using larger document chunks and leveraging long-context LLMs for synthesis. [ Notebook ](https://github.com/run-llama/llama_index/blob/main/llama-index-packs/llama-index-packs-longrag/examples/longrag.ipynb) , [ Tweet ](https://x.com/llama_index/status/1818802688274100578) . 

**Guides:**

  * [ Guide ](https://docs.llamaindex.ai/en/latest/examples/workflow/react_agent/) to building a ReAct agent from scratch using LlamaIndex workflows. 
  * [ Guide ](https://docs.llamaindex.ai/en/latest/examples/workflow/rag/) to Building an Event-Driven RAG Pipeline with LlamaIndex, featuring distinct event-driven steps for retrieval, reranking, and synthesis, enhanced with graph tracing and async processing. 
  * [ Guide ](https://docs.llamaindex.ai/en/latest/module_guides/observability/#mlflow) to MLflow in LlamaIndex to manage, deploy, and monitor your genAI applications with MLflow's tracking, packaging, evaluation, and tracing capabilities. 

**Tutorials:**

  * [ Pavan Kumar’s ](https://x.com/pavan_mantha1) [ tutorial ](https://blog.gopenai.com/building-smarter-agents-using-llamaindex-agents-and-qdrants-hybrid-search-50c0ecbbfb0d) on Building Smarter Agents using LlamaIndex Agents and Qdrant’s Hybrid Search. 
  * [ Farzad Sunavala’s ](https://www.linkedin.com/in/farzadsunavala) [ tutorial ](https://farzzy.hashnode.dev/rag-observability-and-evaluation-with-azure-ai-search-azure-openai-llamaindex-and-arize-phoenix) on RAG Observability and Evaluation with Azure AI Search, Azure OpenAI, LlamaIndex, and Arize Phoenix. 
  * [ Composio’s ](https://x.com/composiohq) [ tutorial ](https://github.com/ComposioHQ/composio/tree/master/python/examples/pr_agent/pr_agent_llama_index) on building a PR review agent using Composio's GitHub/Slack tools and LlamaIndex agent abstractions. 
  * [ Benito Martin’s ](https://medium.com/@benitomartin) [ tutorial ](https://medium.com/@benitomartin/find-your-code-scaling-a-llamaindex-and-qdrant-application-with-google-kubernetes-engine-2db126f16344) on Scaling a LlamaIndex and Qdrant Application with Google Kubernetes Engine. 
  * [ Chew Loong Nian’s ](https://medium.com/@chewloongnian) [ tutorial ](https://pub.towardsai.net/introducing-llamaextract-beta-transforming-metadata-extraction-for-enhanced-rag-queries-de3d74d34cd7) on Transforming Metadata Extraction for Enhanced RAG Queries using LlamaExtract. 
  * [ Pavan Kumar’s ](https://x.com/pavan_mantha1) [ tutorial ](https://medium.com/@manthapavankumar11/practical-implementation-of-agentic-rag-workflows-with-llama-index-and-qdrant-3b6622cd3124) on Practical Implementation of Agentic RAG Workflows with Llama-Index and Qdrant. 
  * AI21 Labs [ tutorial ](https://www.llamaindex.ai/blog/jamba-instruct-s-256k-context-window-on-llamaindex) on using Jamba-Instruct Model with LlamaIndex. 

**Webinars And Hackathons:**

  * [ Join us ](https://lu.ma/ka5xtyqo) for a webinar on August 8th with [ Dedy Kredo ](https://x.com/DedyKredo) from [ CodiumAI ](https://x.com/CodiumAI) on using RAG with LlamaIndex to help build a code generation solution that’s contextually aware of the right elements of source code. 
  * [ Join us ](https://lu.ma/p13pkknm?tk=SsniSt) on RAG Hack Night at GitHub with [ Weaviate ](https://x.com/weaviate_io) , [ Neosync ](https://x.com/neosynccloud) , [ Arize AI ](https://x.com/arizeai) on August 13th.

{'Header_2': ' **Feature Releases and Enhancements:**', 'filename': 'llamaindex-newsletter-2024-08-06.md', 'extension': '.md', 'title': 'LlamaIndex Newsletter 2024-08-06', 'date': 'Aug 6, 2024', 'url': 'https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-08-06'}


We’re pleased to be introducing a brand-new beta feature of LlamaIndex:
workflows, a mechanism for orchestrating actions in the increasingly-complex
AI application we see our users building.

What started as a trend with the advent of LLMs is now a de-facto standard: AI
applications are made of multiple tasks implemented by different components.
Open source frameworks in the market strive to make the life of AI engineers
easier by providing easy-to-use abstractions for foundational components like
data loaders, LLMs, vector databases, and rerankers, all the way up to
external services. Meanwhile, all of those frameworks are also on a quest to
find what’s the best abstraction to orchestrate such components, researching
what’s most intuitive and efficient for an AI developer in order to implement
the logic that keeps together a compound AI system.

Two of those potential orchestration patterns are chains and pipelines, both
of which are implementations of the same Directed Acyclic Graph (DAG)
abstraction. We took a stab at this with our [ Query Pipelines
](https://www.llamaindex.ai/blog/introducing-query-pipelines-025dc2bb0537)
release at the beginning of the year - it was a declarative API that let you
orchestrate simple-to-advanced query workflows over your data for different
use cases, like QA, structured extraction, and agentic automation. But as we
tried to build upon it and experimented with adding cycles to better support
more complex workflows, we noticed several issues, causing us to reflect on
why a DAG may not be the right fit for an agentic landscape, and what
alternatives we could introduce in the framework.

{'filename': 'introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex.md', 'extension': '.md', 'title': 'Introducing workflows beta: a new way to create complex AI applications with LlamaIndex', 'date': 'Aug 1, 2024', 'url': 'https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex'}


Limitations of a Graph-based UX

A fundamental aspect of DAGs is the “A” in DAGs: they are acyclic, meaning
there are no loops. But in a world that’s more and more agentic, the inability
to perform loops in an AI application’s logic is simply unacceptable. For
example, if one component provides bad results, an AI developer should have a
way to tell the system to self-correct and try again.

Even without adding cycles and loops to a DAG, the query pipeline suffered
from a few noticeable issues:

  * hard to debug when things go wrong 
  * they obscure how components and modules are being executed 
  * our pipeline orchestrator became increasingly extremely complex and had to handle a ton of different edge cases 
  * they were hard to read for complex pipelines 

Once we added cycles to query pipelines, these developer UX issues around
graphs were amplified. We experienced first-hand developer pain in areas like:

  * A lot of core orchestration logic like ` if-else ` statements and ` while ` loops get baked into the edges of the graph. Defining these edges becomes cumbersome and verbose. 
  * It became hard to handle edge cases around optional and default values. It was hard for us as a framework to figure out whether a parameter would get passed from upstream nodes. 
  * Defining graphs with cycles didn’t always feel as natural to developers building agents. An agent encapsulates a general LLM-powered entity that can take in observations and generate responses. Here the graph UX enforced that “agent” node had the incoming edges and outgoing edges explicitly defined, forcing users to define verbose communication patterns with other nodes. 

We asked: are graphs really the only abstraction we can use to orchestrate
components in a compound AI system?

{'Header_2': ' Limitations of a Graph-based UX', 'filename': 'introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex.md', 'extension': '.md', 'title': 'Introducing workflows beta: a new way to create complex AI applications with LlamaIndex', 'date': 'Aug 1, 2024', 'url': 'https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex'}


From Graphs to EDA: go event-driven

A compound AI system can be implemented with a LlamaIndex _workflow_ . The
workflow dispatches events back and forth through a collection of Python
functions called _steps_ . Each step can be seen as one component of your
system: one to process a query, one to talk with an LLM, one to load data from
a vector database and so on. Every step receives one or more events to process
and can optionally send back events that will be relayed to other components
if needed.

Moving to an event-driven architecture causes a fundamental shift in design.
In many graph implementations the graph traversal algorithm is responsible for
determining what component should run next and what data should be passed. In
an event-driven architecture, the component subscribes to a certain types of
events and it’s ultimately responsible for deciding what to do based on the
data it received.

In an event-driven system, concepts like optionality of inputs and default
values are sorted out at the component level, dramatically simplifying the
orchestration code.

{'Header_2': ' From Graphs to EDA: go event-driven', 'filename': 'introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex.md', 'extension': '.md', 'title': 'Introducing workflows beta: a new way to create complex AI applications with LlamaIndex', 'date': 'Aug 1, 2024', 'url': 'https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex'}


A workflow primer

To help clarify this idea, let’s look at an example. A minimal LlamaIndex
workflow looks like this:

    
    
    from llama_index.core.workflow import (
        StartEvent,
        StopEvent,
        Workflow,
        step,
    )
    
    from llama_index.llms.openai import OpenAI
    
    class OpenAIGenerator(Workflow):
        @step()
        async def generate(self, ev: StartEvent) -> StopEvent:
            query = ev.get("query")
            llm = OpenAI()
            response = await llm.acomplete(query)
            return StopEvent(result=str(response))
    
    w = OpenAIGenerator(timeout=10, verbose=False)
    result = await w.run(query="What's LlamaIndex?")
    print(result)

The ` generate ` function is marked as a workflow step using the ` @step `
decorator and it declares which events it wants to receive and which events it
will send back using the method signature with proper typing annotations. In
order to run a workflow, we create an instance of the ` OpenAIGenerator `
class passing some configuration parameters like the desired timeout and we
then call the ` run ` method. Any keyword argument passed to ` run ` will be
packed into a special event of type ` StartEvent ` that will be relayed to the
steps that requested it (in this case, only the ` generate ` step). The `
generate ` step returns a special event of type ` StopEvent ` that will signal
the workflow to gracefully halt its execution. A ` StopEvent ` carries any
data that we want to return to the caller as the workflow result, in this case
the LLM response.

{'Header_2': ' A workflow primer', 'filename': 'introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex.md', 'extension': '.md', 'title': 'Introducing workflows beta: a new way to create complex AI applications with LlamaIndex', 'date': 'Aug 1, 2024', 'url': 'https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex'}


Workflows can loop

In event-driven architectures, loops have to do with communication rather than
topology. Any step can decide to call another step multiple times by crafting
and sending the proper event. Let’s see a self-correction loop for example
(check the [ notebook
](https://docs.llamaindex.ai/en/latest/examples/workflow/reflection/) for the
full code):

    
    
    class ExtractionDone(Event):
        output: str
        passage: str
    
    
    class ValidationErrorEvent(Event):
        error: str
        wrong_output: str
        passage: str
        
        
    class ReflectionWorkflow(Workflow):
        @step()
        async def extract(
            self, ev: StartEvent | ValidationErrorEvent
        ) -> StopEvent | ExtractionDone:
            if isinstance(ev, StartEvent):
                passage = ev.get("passage")
                if not passage:
                    return StopEvent(result="Please provide some text in input")
                reflection_prompt = ""
            elif isinstance(ev, ValidationErrorEvent):
                passage = ev.passage
                reflection_prompt = REFLECTION_PROMPT.format(
                    wrong_answer=ev.wrong_output, error=ev.error
                )
    
            llm = Ollama(model="llama3", request_timeout=30)
            prompt = EXTRACTION_PROMPT.format(
                passage=passage, schema=CarCollection.schema_json()
            )
            if reflection_prompt:
                prompt += reflection_prompt
    
            output = await llm.acomplete(prompt)
    
            return ExtractionDone(output=str(output), passage=passage)
    
        @step()
        async def validate(
            self, ev: ExtractionDone
        ) -> StopEvent | ValidationErrorEvent:
            try:
                json.loads(ev.output)
            except Exception as e:
                print("Validation failed, retrying...")
                return ValidationErrorEvent(
                    error=str(e), wrong_output=ev.output, passage=ev.passage
                )
    
            return StopEvent(result=ev.output)
    
    w = ReflectionWorkflow(timeout=60, verbose=True)
    result = await w.run(
        passage="There are two cars available: a Fiat Panda with 45Hp and a Honda Civic with 330Hp."
    )
    print(result)

In this example, the ` validate ` step receives the result of the tentative
schema extraction as an event and it can decide to try again by returning a `
ValidationErrorEvent ` that will be eventually delivered to the ` extract `
step which will perform another attempt. Note that in this example the
workflow might time out if this extract/validate loop keeps providing poor
results for too long, but another strategy might be giving up after a precise
number of attempts, just to give an example.

{'Header_2': ' A workflow primer', 'Header_3': ' Workflows can loop', 'filename': 'introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex.md', 'extension': '.md', 'title': 'Introducing workflows beta: a new way to create complex AI applications with LlamaIndex', 'date': 'Aug 1, 2024', 'url': 'https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex'}


Workflows keep state

Workflows keep a global state during the execution, and this state can be
shared and propagated to its steps upon request. This shared state is
implemented as a ` Context ` object and can be used by steps to store data in
between iterations but also as an alternative form of communication among
different steps. Let’s see an excerpt from a more complex RAG example as an
example showing how to use the global context (check [ notebook
](https://docs.llamaindex.ai/en/latest/examples/workflow/rag/) for full code):

    
    
    class RAGWorkflow(Workflow):
        @step(pass_context=True)
        async def ingest(self, ctx: Context, ev: StartEvent) -> Optional[StopEvent]:
            dataset_name = ev.get("dataset")
            _, documents = download_llama_dataset(dsname, "./data")
            ctx.data["INDEX"] = VectorStoreIndex.from_documents(documents=documents)
            return StopEvent(result=f"Indexed {len(documents)} documents.")
            
        ...

In this case the ` ingest ` step creates an index, and it wants to make it
available to any other step that might needed it later during workflow
execution. The idiomatic way of doing that in a LlamaIndex workflow is to
declare the step requires an instance of the global context ( `
@step(pass_context=True) ` does the trick) and store the index in the context
itself with a predefined key that other steps might access later.

{'Header_2': ' A workflow primer', 'Header_3': ' Workflows keep state', 'filename': 'introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex.md', 'extension': '.md', 'title': 'Introducing workflows beta: a new way to create complex AI applications with LlamaIndex', 'date': 'Aug 1, 2024', 'url': 'https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex'}


Workflows can be customized

Alongside Workflows, we’ll be releasing a set of predefined workflows so that
the most common use cases can be implemented with a single line of code. Using
these predefined flows, users still might want to just _slightly_ change a
predefined workflow to introduce some custom behavior without having to
rewrite a whole workflow from scratch. Let’s say you want to customize a RAG
workflow and use a custom re-ranking step, all you would need to do is
subclass a hypothetical built-in ` RAGWorkflow ` class and override the `
rerank ` step like this:

    
    
    class MyWorkflow(RAGWorkflow):
        @step(pass_context=True)
        def rerank(
            self, ctx: Context, ev: Union[RetrieverEvent, StartEvent]
        ) -> Optional[QueryResult]:
            # my custom reranking logic here
            
     
    w = MyWorkflow(timeout=60, verbose=True)
    result = await w.run(query="Who is Paul Graham?")

{'Header_2': ' A workflow primer', 'Header_3': ' Workflows can be customized', 'filename': 'introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex.md', 'extension': '.md', 'title': 'Introducing workflows beta: a new way to create complex AI applications with LlamaIndex', 'date': 'Aug 1, 2024', 'url': 'https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex'}


## Semantic Chunking

In [14]:
from llama_index.core.node_parser import (
    SentenceSplitter,
    SemanticSplitterNodeParser,
)

In [13]:
# Register the embedding model on the global Settings object. The model name
# is presumably a local directory (TODO confirm); it runs on CUDA device 0.
Settings.embed_model = HuggingFaceEmbedding(model_name="models/bge-base-en-v1.5", device="cuda:0")

In [17]:
# Semantic splitter that reuses the embedding model configured on Settings.
splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=95,
    embed_model=Settings.embed_model,
)

In [18]:
# Split all loaded documents with the semantic splitter.
semantic_nodes = splitter.get_nodes_from_documents(loaded_documents, show_progress=True)

Parsing nodes:   0%|          | 0/166 [00:00<?, ?it/s]
Generating embeddings:   0%|          | 0/34 [00:00<?, ?it/s][A
Generating embeddings:  29%|██▉       | 10/34 [00:00<00:00, 28.39it/s][A
Generating embeddings: 100%|██████████| 34/34 [00:00<00:00, 66.55it/s][A
Parsing nodes:   1%|          | 1/166 [00:00<01:27,  1.90it/s]
Generating embeddings:   0%|          | 0/52 [00:00<?, ?it/s][A
Generating embeddings:  58%|█████▊    | 30/52 [00:00<00:00, 191.79it/s][A
Generating embeddings: 100%|██████████| 52/52 [00:00<00:00, 128.37it/s][A
Parsing nodes:   1%|          | 2/166 [00:00<01:15,  2.16it/s]
Generating embeddings:   0%|          | 0/42 [00:00<?, ?it/s][A
Generating embeddings:  48%|████▊     | 20/42 [00:00<00:00, 138.20it/s][A
Generating embeddings: 100%|██████████| 42/42 [00:00<00:00, 97.05it/s] [A
Parsing nodes:   2%|▏         | 3/166 [00:01<01:14,  2.20it/s]
Generating embeddings:   0%|          | 0/35 [00:00<?, ?it/s][A
Generating embeddings: 100%|██████████| 35/35 [0

In [19]:
# Preview the first ten semantic nodes: the text rendered as Markdown, followed
# by the metadata dict and a 100-character divider line.
for node in semantic_nodes[:10]:
    display(Markdown(node.text))
    print(node.metadata, "=" * 100, sep="\n")

Greetings, Llama Lovers!

Welcome to this week’s edition of the LlamaIndex newsletter! We’re excited to
share our latest updates including dynamic features like LlamaIndex Workflows
and retrieval capabilities in LlamaCloud. Check out our in-depth guides,
tutorials, and the upcoming webinars that will help you make the most of these
new developments.

##  **The highlights:**

  1. **LlamaIndex Workflows Launched:** LlamaIndex Workflows, a new event-driven architecture for building multi-agent applications, supports batching, async operations, and streaming. Agents subscribe to and emit events for complex, readable, Pythonic orchestration. [ Blogpost ](https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex) , [ Tweet ](https://x.com/llama_index/status/1819048068798616058) . 
  

{'filename': 'llamaindex-newsletter-2024-08-06.md', 'extension': '.md', 'title': 'LlamaIndex Newsletter 2024-08-06', 'date': 'Aug 6, 2024', 'url': 'https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-08-06'}


2. **Dynamic Retrieval Feature in LlamaCloud:** A new feature in LlamaCloud now supports dynamic retrieval for QA assistants, enabling both chunk-level and file-level document retrieval based on query similarity to intelligently route queries. [ Blogpost ](https://www.llamaindex.ai/blog/dynamic-retrieval-with-llamacloud) , [ Notebook ](https://github.com/run-llama/llamacloud-demo/blob/main/examples/10k_apple_tesla/demo_file_retrieval.ipynb) , [ Tweet ](https://x.com/llama_index/status/1818337133746360623) . 
  

{'filename': 'llamaindex-newsletter-2024-08-06.md', 'extension': '.md', 'title': 'LlamaIndex Newsletter 2024-08-06', 'date': 'Aug 6, 2024', 'url': 'https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-08-06'}


3. **LongRAG LlamaPack:** LongRAG is now available as a LlamaPack in LlamaIndex, utilizing larger document chunks and long-context LLMs for more effective synthesis. [ Notebook ](https://github.com/run-llama/llama_index/blob/main/llama-index-packs/llama-index-packs-longrag/examples/longrag.ipynb) , [ Tweet ](https://x.com/llama_index/status/1818802688274100578) . 

##  **Feature Releases and Enhancements:**

  * We have launched LlamaIndex Workflows, a new event-driven way to build multi-agent applications where each agent acts as a component that subscribes to and emits events, allowing for complex, readable, and Pythonic orchestration with enhanced support for batching, async operations, and streaming. [ Blogpost ](https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex) , [ Tweet ](https://x.com/llama_index/status/1819048068798616058) . 
  * We have introduced a new feature in LlamaCloud to improve your QA assistant with our latest capability for dynamic retrieval, allowing both chunk-level and file-level retrieval. This feature enables the retrieval of entire documents based on query similarity, which supports building agents that can intelligently route queries based on their content. [ Blogpost ](https://www.llamaindex.ai/blog/dynamic-retrieval-with-llamacloud) , [ Notebook ](https://github.com/run-llama/llamacloud-demo/blob/main/examples/10k_apple_tesla/demo_file_retrieval.ipynb) , [ Tweet ](https://x.com/llama_index/status/1818337133746360623) . 
  * We have launched LongRAG as a LlamaPack in LlamaIndex. LongRAG simplifies retrieval by using larger document chunks and leveraging long-context LLMs for synthesis. [ Notebook ](https://github.com/run-llama/llama_index/blob/main/llama-index-packs/llama-index-packs-longrag/examples/longrag.ipynb) , [ Tweet ](https://x.com/llama_index/status/1818802688274100578) . 

**Guides:**

  * [ Guide ](https://docs.llamaindex.ai/en/latest/examples/workflow/react_agent/) to building a ReAct agent from scratch using LlamaIndex workflows. 
  * [ Guide ](https://docs.llamaindex.ai/en/latest/examples/workflow/rag/) to Building an Event-Driven RAG Pipeline with LlamaIndex, featuring distinct event-driven steps for retrieval, reranking, and synthesis, enhanced with graph tracing and async processing. 
  * [ Guide ](https://docs.llamaindex.ai/en/latest/module_guides/observability/#mlflow) to MLflow in LlamaIndex to manage, deploy, and monitor your genAI applications with MLflow's tracking, packaging, evaluation, and tracing capabilities. 

**Tutorials:**

  * [ Pavan Kumar’s ](https://x.com/pavan_mantha1) [ tutorial ](https://blog.gopenai.com/building-smarter-agents-using-llamaindex-agents-and-qdrants-hybrid-search-50c0ecbbfb0d) on Building Smarter Agents using LlamaIndex Agents and Qdrant’s Hybrid Search. 
  * [ Farzad Sunavala’s ](https://www.linkedin.com/in/farzadsunavala) [ tutorial ](https://farzzy.hashnode.dev/rag-observability-and-evaluation-with-azure-ai-search-azure-openai-llamaindex-and-arize-phoenix) on RAG Observability and Evaluation with Azure AI Search, Azure OpenAI, LlamaIndex, and Arize Phoenix. 
  * [ Composio’s ](https://x.com/composiohq) [ tutorial ](https://github.com/ComposioHQ/composio/tree/master/python/examples/pr_agent/pr_agent_llama_index) on building a PR review agent using Composio's GitHub/Slack tools and LlamaIndex agent abstractions. 
  * [ Benito Martin’s ](https://medium.com/@benitomartin) [ tutorial ](https://medium.com/@benitomartin/find-your-code-scaling-a-llamaindex-and-qdrant-application-with-google-kubernetes-engine-2db126f16344) on Scaling a LlamaIndex and Qdrant Application with Google Kubernetes Engine. 
  * [ Chew Loong Nian’s ](https://medium.com/@chewloongnian) [ tutorial ](https://pub.towardsai.net/introducing-llamaextract-beta-transforming-metadata-extraction-for-enhanced-rag-queries-de3d74d34cd7) on Transforming Metadata Extraction for Enhanced RAG Queries using LlamaExtract. 
  * [ Pavan Kumar’s ](https://x.com/pavan_mantha1) [ tutorial ](https://medium.com/@manthapavankumar11/practical-implementation-of-agentic-rag-workflows-with-llama-index-and-qdrant-3b6622cd3124) on Practical Implementation of Agentic RAG Workflows with Llama-Index and Qdrant. 
  * AI21 Labs [ tutorial ](https://www.llamaindex.ai/blog/jamba-instruct-s-256k-context-window-on-llamaindex) on using Jamba-Instruct Model with LlamaIndex. 

**Webinars And Hackathons:**

  * [ Join us ](https://lu.ma/ka5xtyqo) for a webinar on August 8th with [ Dedy Kredo ](https://x.com/DedyKredo) from [ CodiumAI ](https://x.com/CodiumAI) on using RAG with LlamaIndex to help build a code generation solution that’s contextually aware of the right elements of source code. 
  * [ Join us ](https://lu.ma/p13pkknm?tk=SsniSt) on RAG Hack Night at GitHub with [ Weaviate ](https://x.com/weaviate_io) , [ Neosync ](https://x.com/neosynccloud) , [ Arize AI ](https://x.com/arizeai) on August 13th. 



{'filename': 'llamaindex-newsletter-2024-08-06.md', 'extension': '.md', 'title': 'LlamaIndex Newsletter 2024-08-06', 'date': 'Aug 6, 2024', 'url': 'https://www.llamaindex.ai/blog/llamaindex-newsletter-2024-08-06'}


We’re pleased to be introducing a brand-new beta feature of LlamaIndex:
workflows, a mechanism for orchestrating actions in the increasingly-complex
AI application we see our users building.

What started as a trend with the advent of LLMs is now a de-facto standard: AI
applications are made of multiple tasks implemented by different components.
Open source frameworks in the market strive to make the life of AI engineers
easier by providing easy-to-use abstractions for foundational components like
data loaders, LLMs, vector databases, and rerankers, all the way up to
external services. Meanwhile, all of those frameworks are also on a quest to
find what’s the best abstraction to orchestrate such components, researching
what’s most intuitive and efficient for an AI developer in order to implement
the logic that keeps together a compound AI system.

Two of those potential orchestration patterns are chains and pipelines, both
of which are implementations of the same Directed Acyclic Graph (DAG)
abstraction. We took a stab at this with our [ Query Pipelines
](https://www.llamaindex.ai/blog/introducing-query-pipelines-025dc2bb0537)
release at the beginning of the year - it was a declarative API that let you
orchestrate simple-to-advanced query workflows over your data for different
use cases, like QA, structured extraction, and agentic automation. But as we
tried to build upon it and experimented with adding cycles to better support
more complex workflows, we noticed several issues, causing us to reflect on
why a DAG may not be the right fit for an agentic landscape, and what
alternatives we could introduce in the framework.

##  Limitations of a Graph-based UX

A fundamental aspect of DAGs is the “A” in DAGs: they are acyclic, meaning
there are no loops. But in a world that’s more and more agentic, the inability
to perform loops in an AI application’s logic is simply unacceptable. For
example, if one component provides bad results, an AI developer should have a
way to tell the system to self-correct and try again.

Even without adding cycles and loops to a DAG, the query pipeline suffered
from a few noticeable issues:

  * hard to debug when things go wrong 
  * they obscure how components and modules are being executed 
  * our pipeline orchestrator became increasingly complex and had to handle a ton of different edge cases 
  * they were hard to read for complex pipelines 

Once we added cycles to query pipelines, these developer UX issues around
graphs were amplified. We experienced first-hand developer pain in areas like:

  * A lot of core orchestration logic like ` if-else ` statements and ` while ` loops get baked into the edges of the graph. Defining these edges becomes cumbersome and verbose. 
  * It became hard to handle edge cases around optional and default values. It was hard for us as a framework to figure out whether a parameter would get passed from upstream nodes. 
  * Defining graphs with cycles didn’t always feel as natural to developers building agents. An agent encapsulates a general LLM-powered entity that can take in observations and generate responses. Here the graph UX enforced that “agent” node had the incoming edges and outgoing edges explicitly defined, forcing users to define verbose communication patterns with other nodes. 

We asked: are graphs really the only abstraction we can use to orchestrate
components in a compound AI system?

##  From Graphs to EDA: go event-driven

A compound AI system can be implemented with a LlamaIndex _workflow_ . The
workflow dispatches events back and forth through a collection of Python
functions called _steps_ . Each step can be seen as one component of your
system: one to process a query, one to talk with an LLM, one to load data from
a vector database and so on. Every step receives one or more events to process
and can optionally send back events that will be relayed to other components
if needed.

Moving to an event-driven architecture causes a fundamental shift in design.
In many graph implementations the graph traversal algorithm is responsible for
determining what component should run next and what data should be passed. In
an event-driven architecture, the component subscribes to certain types of
events and it’s ultimately responsible for deciding what to do based on the
data it received.

In an event-driven system, concepts like optionality of inputs and default
values are sorted out at the component level, dramatically simplifying the
orchestration code.

##  A workflow primer

To help clarify this idea, let’s look at an example. A minimal LlamaIndex
workflow looks like this:

    
    
    from llama_index.core.workflow import (
        StartEvent,
        StopEvent,
        Workflow,
        step,
    )
    
    from llama_index.llms.openai import OpenAI
    
    class OpenAIGenerator(Workflow):
        @step()
        async def generate(self, ev: StartEvent) -> StopEvent:
            query = ev.get("query")
            llm = OpenAI()
            response = await llm.acomplete(query)
            return StopEvent(result=str(response))
    
    w = OpenAIGenerator(timeout=10, verbose=False)
    result = await w.run(query="What's LlamaIndex?")
    print(result)

The ` generate ` function is marked as a workflow step using the ` @step `
decorator and it declares which events it wants to receive and which events it
will send back using the method signature with proper typing annotations. In
order to run a workflow, we create an instance of the ` OpenAIGenerator `
class passing some configuration parameters like the desired timeout and we
then call the ` run ` method. Any keyword argument passed to ` run ` will be
packed into a special event of type ` StartEvent ` that will be relayed to the
steps that requested it (in this case, only the ` generate ` step). The `
generate ` step returns a special event of type ` StopEvent ` that will signal
the workflow to gracefully halt its execution. A ` StopEvent ` carries any
data that we want to return to the caller as the workflow result, in this case
the LLM response.

###  Workflows can loop

In event-driven architectures, loops have to do with communication rather than
topology. Any step can decide to call another step multiple times by crafting
and sending the proper event. Let’s see a self-correction loop for example
(check the [ notebook
](https://docs.llamaindex.ai/en/latest/examples/workflow/reflection/) for the
full code):

    
    
    class ExtractionDone(Event):
        output: str
        passage: str
    
    
    class ValidationErrorEvent(Event):
        error: str
        wrong_output: str
        passage: str
        
        
    class ReflectionWorkflow(Workflow):
        @step()
        async def extract(
            self, ev: StartEvent | ValidationErrorEvent
        ) -> StopEvent | ExtractionDone:
            if isinstance(ev, StartEvent):
                passage = ev.get("passage")
                if not passage:
                    return StopEvent(result="Please provide some text in input")
                reflection_prompt = ""
            elif isinstance(ev, ValidationErrorEvent):
                passage = ev.passage
                reflection_prompt = REFLECTION_PROMPT.format(
                    wrong_answer=ev.wrong_output, error=ev.error
                )
    
            llm = Ollama(model="llama3", request_timeout=30)
            prompt = EXTRACTION_PROMPT.format(
                passage=passage, schema=CarCollection.schema_json()
            )
            if reflection_prompt:
                prompt += reflection_prompt
    
            output = await llm.acomplete(prompt)
    
            return ExtractionDone(output=str(output), passage=passage)
    
        @step()
        async def validate(
            self, ev: ExtractionDone
        ) -> StopEvent | ValidationErrorEvent:
            try:
                json.loads(ev.output)
            except Exception as e:
                print("Validation failed, retrying...")
                return ValidationErrorEvent(
                    error=str(e), wrong_output=ev.output, passage=ev.passage
                )
    
            return StopEvent(result=ev.output)
    
    w = ReflectionWorkflow(timeout=60, verbose=True)
    result = await w.run(
        passage="There are two cars available: a Fiat Panda with 45Hp and a Honda Civic with 330Hp."
    )
    print(result)

In this example, the ` validate ` step receives the result of the tentative
schema extraction as an event and it can decide to try again by returning a `
ValidationErrorEvent ` that will be eventually delivered to the ` extract `
step which will perform another attempt. Note that in this example the
workflow might time out if this extract/validate loop keeps providing poor
results for too long, but another strategy might be giving up after a precise
number of attempts, just to give an example.



{'filename': 'introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex.md', 'extension': '.md', 'title': 'Introducing workflows beta: a new way to create complex AI applications with LlamaIndex', 'date': 'Aug 1, 2024', 'url': 'https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex'}


###  Workflows keep state

Workflows keep a global state during the execution, and this state can be
shared and propagated to its steps upon request. This shared state is
implemented as a ` Context ` object and can be used by steps to store data in
between iterations but also as an alternative form of communication among
different steps. Let’s see an excerpt from a more complex RAG example
showing how to use the global context (check [ notebook
](https://docs.llamaindex.ai/en/latest/examples/workflow/rag/) for full code):

    
    
    class RAGWorkflow(Workflow):
        @step(pass_context=True)
        async def ingest(self, ctx: Context, ev: StartEvent) -> Optional[StopEvent]:
            dataset_name = ev.get("dataset")
            _, documents = download_llama_dataset(dsname, "./data")
            ctx.data["INDEX"] = VectorStoreIndex.from_documents(documents=documents)
            return StopEvent(result=f"Indexed {len(documents)} documents.")
            
        ...

In this case the ` ingest ` step creates an index, and it wants to make it
available to any other step that might need it later during workflow
execution. The idiomatic way of doing that in a LlamaIndex workflow is to
declare the step requires an instance of the global context ( `
@step(pass_context=True) ` does the trick) and store the index in the context
itself with a predefined key that other steps might access later.

###  Workflows can be customized

Alongside Workflows, we’ll be releasing a set of predefined workflows so that
the most common use cases can be implemented with a single line of code. 

{'filename': 'introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex.md', 'extension': '.md', 'title': 'Introducing workflows beta: a new way to create complex AI applications with LlamaIndex', 'date': 'Aug 1, 2024', 'url': 'https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex'}


Using
these predefined flows, users still might want to just _slightly_ change a
predefined workflow to introduce some custom behavior without having to
rewrite a whole workflow from scratch. Let’s say you want to customize a RAG
workflow and use a custom re-ranking step, all you would need to do is
subclass a hypothetical built-in ` RAGWorkflow ` class and override the `
rerank ` step like this:

    
    
    class MyWorkflow(RAGWorkflow):
        @step(pass_context=True)
        def rerank(
            self, ctx: Context, ev: Union[RetrieverEvent, StartEvent]
        ) -> Optional[QueryResult]:
            # my custom reranking logic here
            
     
    w = MyWorkflow(timeout=60, verbose=True)
    result = await w.run(query="Who is Paul Graham?")

###  Workflows can be debugged

The complexity of your workflows will grow with the complexity of your
application logic, and sometimes it can be hard to understand how events will
flow during execution by just looking at the Python code. 

{'filename': 'introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex.md', 'extension': '.md', 'title': 'Introducing workflows beta: a new way to create complex AI applications with LlamaIndex', 'date': 'Aug 1, 2024', 'url': 'https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex'}


To ease the
understanding of complex workflows and to support the debugging of workflow
executions, LlamaIndex provides two functions:

  * ` draw_all_possible_flows ` produces a picture showing all the steps in a workflow and how events will possibly flow 
  * ` draw_most_recent_execution ` produces a similar picture, showing only the events that were actually sent during the last workflow execution 

On top of that, workflows can be executed manually, by calling ` run_step() `
multiple times until all the steps have completed. After each ` run_step `
call, the workflow can be inspected, examining any intermediate results or
debug logs.

##  Why you should use workflows today

Despite being at an early stage of development, LlamaIndex workflows already
represent a step forward compared to query pipelines, extending their
functionalities and adding more flexibility. On top of that, workflows come
with a set of features that you would normally expect from much more mature
software:

  * Fully async with streaming support 
  * Instrumented by default, providing one-click observability with the supported integrations 
  * Step-by-step execution for easier debugging 
  * Validation and visualization of the event-driven dependencies 
  * Events are implemented as pydantic models to ease customization and further developments of new features 

##  Resources

Check out our [ workflow documentation
](https://docs.llamaindex.ai/en/latest/module_guides/workflow/) and our [
examples ](https://github.com/run-
llama/llama_index/tree/main/docs/docs/examples/workflow) including:

  * [ RAG ](https://docs.llamaindex.ai/en/latest/examples/workflow/rag/)
  * [ Reflection ](https://docs.llamaindex.ai/en/latest/examples/workflow/reflection/)
  * [ Function calling ](https://docs.llamaindex.ai/en/latest/examples/workflow/function_calling_agent/)
  * [ ReAct agent ](https://docs.llamaindex.ai/en/latest/examples/workflow/react_agent/)



{'filename': 'introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex.md', 'extension': '.md', 'title': 'Introducing workflows beta: a new way to create complex AI applications with LlamaIndex', 'date': 'Aug 1, 2024', 'url': 'https://www.llamaindex.ai/blog/introducing-workflows-beta-a-new-way-to-create-complex-ai-applications-with-llamaindex'}


Build state-of-the-art RAG applications for the enterprise by leveraging
LlamaIndex’s market-leading RAG strategies with AI21 Labs’ long context
Foundation Model, Jamba-Instruct.

We at AI21 Labs are excited to announce that our groundbreaking Jamba-Instruct
foundation model is now available through leading data framework LlamaIndex.
With this integration, developers can now build powerful RAG enterprise
applications with enhanced accuracy and cost-efficiency due to Jamba-
Instruct’s impressive 256K context window and LlamaIndex’s sophisticated end-
to-end offerings for RAG.

While many models declare long context windows, researchers at NVIDIA found
that [ most falter under evaluation ](https://arxiv.org/pdf/2404.06654) ,
revealing a discrepancy between their claimed and effective context window
lengths. Jamba-Instruct is one of the few models on the market to not only
achieve parity between its declared and effective lengths, but to do so with a
much longer context window length than any other model in its size class.

By offering a context window of 256K—roughly equivalent to 800 pages of
text—Jamba-Instruct increases the number of retrieved chunks and can vastly
improve the entire RAG system, rather than trying to improve the search
mechanism or incorporating an additional reranking component. Using a long
context foundation model like Jamba-Instruct makes querying private enterprise
data with RAG both more reliable and easier.

In the following notebook ( [ also available directly on colab
](https://colab.research.google.com/drive/1ycpC1pfCty9bqCmHdrgvAtqQwP1o0lPg)
), we’ll walk through an example of querying a collection of financial
documents, showing how Jamba-Instruct’s 256K context window allows the RAG
pipeline to retrieve more chunks at once in order to deliver an accurate
answer.

###  RAG Q&A on financial documents

To get started, these are the packages you need to install. You will also need
API keys to set up OpenAI for embeddings and AI21 for Jamba-Instruct.

    
    
    !pip install llama-index
    !pip install -U ai21
    !pip install llama-index-llms-ai21
    
    import os
    from llama_index.core.llama_dataset import download_llama_dataset
    from llama_index.core.llama_pack import download_llama_pack
    from llama_index.core import VectorStoreIndex
    from llama_index.core import SimpleDirectoryReader
    from llama_index.llms.ai21 import AI21
    
    os.environ['OPENAI_API_KEY'] = 'YOUR_OPENAI_API_KEY' # For embeddings
    os.environ['AI21_API_KEY'] = 'YOUR_AI21_API_KEY' # For the generation
    
    # Setup jamba instruct as the llm
    llm = AI21(
        model='jamba-instruct',
        temperature=0,
        max_tokens=2000
    )

Next, download 5 10-K forms from Amazon from [ Amazon’s Investor Relations
page. ](https://ir.aboutamazon.com/sec-filings/default.aspx)

    
    
    # Get the data - download 10k forms from AMZN from the last five years
    os.mkdir("data")
    !wget 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/c7c14359-36fa-40c3-b3ca-5bf7f3fa0b96.pdf' -O 'data/amazon_2023.pdf'
    !wget 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/d2fde7ee-05f7-419d-9ce8-186de4c96e25.pdf' -O 'data/amazon_2022.pdf'
    !wget 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/f965e5c3-fded-45d3-bbdb-f750f156dcc9.pdf' -O 'data/amazon_2021.pdf'
    !wget 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/336d8745-ea82-40a5-9acc-1a89df23d0f3.pdf' -O 'data/amazon_2020.pdf'
    !wget 'https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/4d39f579-19d8-4119-b087-ee618abf82d6.pdf' -O 'data/amazon_2019.pdf'

Set up your index and query engine to create the retrieval and generation
components of your RAG system.

    
    
    

{'filename': 'jamba-instruct-s-256k-context-window-on-llamaindex.md', 'extension': '.md', 'title': "Jamba-Instruct's 256k context window on LlamaIndex", 'date': 'Jul 31, 2024', 'url': 'https://www.llamaindex.ai/blog/jamba-instruct-s-256k-context-window-on-llamaindex'}


# Setup the index
    file_list = [os.path.join("data", f) for f in os.listdir("data")]
    
    amzn_10k_docs = SimpleDirectoryReader(input_files=file_list).load_data()
    index = VectorStoreIndex.from_documents(documents=amzn_10k_docs)
    
    # Build a query engine
    default_query_engine = index.as_query_engine(llm)

Let’s enter a query to make sure our RAG system is working.

    
    
    answer = default_query_engine.query("What was the company's revenue in 2021?")
    print(answer.response)
    
    
    The company's revenue in 2021 was $469,822 million.

Great! 

{'filename': 'jamba-instruct-s-256k-context-window-on-llamaindex.md', 'extension': '.md', 'title': "Jamba-Instruct's 256k context window on LlamaIndex", 'date': 'Jul 31, 2024', 'url': 'https://www.llamaindex.ai/blog/jamba-instruct-s-256k-context-window-on-llamaindex'}


It works. Now let’s try a similar query to continue validating.

    
    
    answer = default_query_engine.query("What was the company's revenue in 2023?")
    print(answer.response)
    
    
    The company's revenue in 2023 was not explicitly mentioned in the provided context. However, it is mentioned that the company's operating income increased to $36.9 billion in 2023, compared to $12.2 billion in 2022.

We can see there’s a problem—we know that the answer to our question is most
definitely included in our documents, yet our RAG system is claiming that it
cannot find the answer. That’s because the default amount of retrieved chunks
is rather small (a few chunks). This makes the whole system prone to errors
and failing to capture information that is indeed located in the documents.

However, with Jamba-Instruct, a model which handles a 256K context window
effectively, we can increase the number of retrieved chunks from just a few
(default value) to 100 and vastly improve the entire RAG system.

Let’s build a new query engine on top of our existing index and try the query
that failed before.

    
    
    # Large amount of chunks in the retrieval process
    extended_query_engine = index.as_query_engine(llm,
                                                  similarity_top_k=100)
    
    answer = extended_query_engine.query("What was the company's revenue in 2023?")
    print(answer.response)
    
    
    The company's revenue in 2023 was $574.785 million.

We see that the RAG system, with the help of Jamba-Instruct’s 256K context
window, is now able to produce the accurate answer.

Let’s try one more question to validate our new RAG system.

    
    
    answer = default_query_engine.query("Was there a stock split in the last five years?")
    print(answer.response)
    
    
    No, there was no stock split in the last five years.
    
    
    answer = extended_query_engine.query("Was there a stock split in the last five years?")
    print(answer.response)
    
    
    Yes, there was a stock split in the last five years. On May 27, 2022, Amazon.com, Inc. effected a 20-for-1 stock split of its common stock.

###  Context is king

Often, the debate is framed as “RAG vs. long context.” We at AI21 Labs believe
that’s the wrong way to look at it. Rather, it’s long context _plus_ RAG. When
paired together in an AI system, a long context model enhances the quality and
accuracy of a RAG system, especially useful in enterprise contexts that
involve lengthy documents or vast databases of information.

Going forward, as RAG systems continue to scale, the number of documents and
lengths of chunks will drastically increase. Only a long context model—whose
context length truly delivers—can handle this amount of text.



{'filename': 'jamba-instruct-s-256k-context-window-on-llamaindex.md', 'extension': '.md', 'title': "Jamba-Instruct's 256k context window on LlamaIndex", 'date': 'Jul 31, 2024', 'url': 'https://www.llamaindex.ai/blog/jamba-instruct-s-256k-context-window-on-llamaindex'}


In [26]:
# Create a new, empty SimpleDocumentStore; later cells fill it with nodes and
# write it to disk.
semantic_docstore = SimpleDocumentStore()

In [27]:
# Register every node in `semantic_nodes` (built earlier in the notebook)
# with the document store.
semantic_docstore.add_documents(semantic_nodes)

In [28]:
# Write the populated document store to a JSON file so it can be reloaded
# without re-parsing the blog posts.
semantic_docstore.persist(
    persist_path="database/llama-blogs-nodes-semantic-parser.json"
)