In [1]:
import nest_asyncio
import os

# Patch asyncio so an already-running event loop can be re-entered.
# Presumably needed because later cells `await` coroutines and run a workflow
# from inside the notebook's own loop — confirm against the kernel in use.
nest_asyncio.apply()

In [None]:
import dotenv

# Load variables from a local `.env` file into the process environment.
# NOTE(review): presumably this supplies the API keys for the OpenAI and
# LlamaParse clients created in later cells — confirm.
dotenv.load_dotenv()

In [None]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Settings carries the library-wide defaults assigned at the bottom of this cell
from llama_index.core import Settings

# LLM handed to metadata extraction, query generation and the report agent below
llm = OpenAI(model="gpt-4o-mini")
# Embedding model, identified by its Hugging Face model name
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Register both as global defaults so components that are not given an explicit
# model fall back to these
Settings.llm = llm
Settings.embed_model = embed_model

In [5]:
# arXiv search queries; one search is issued per entry by `download_papers`
research_paper_topics = ["RAG", "Agent"]
# Upper bound on papers fetched for each topic (passed as `max_results`)
num_results_per_topic = 2

In [33]:
# Markdown outline that drives the report.
# - The first non-empty line is the report title (read by `extract_title`).
# - Lines starting with "## " are sections.
# - Lines starting with "N.M." are subsections; each one gets its own generated
#   query in `parse_outline_and_generate_queries`.
outline = """
# Research Paper Report on RAG - Retrieval Augmented Generation and Agentic World.

## 1. Introduction

## 2. Retrieval Augmented Generation (RAG) and Agents
2.1. Fundamentals of RAG and Agents.
2.2. Current State and Applications

## 3. Latest Papers:
3.1. HEALTH-PARIKSHA: Assessing RAG Models for Health Chatbots in Real-World Multilingual Settings
3.2. MIRAGE-Bench: Automatic Multilingual Benchmark Arena for Retrieval-Augmented Generation Systems
3.3. VLM-Grounder: A VLM Agent for Zero-Shot 3D Visual Grounding

## 4. Conclusion:
"""

In [22]:
# Working directory for this run, named after the topics joined with "_".
# PDFs, the persisted index and the final report are all written beneath it.
directory = f'./research_results/{"_".join(research_paper_topics)}'
# exist_ok=True keeps the cell re-runnable when the folder already exists
os.makedirs(directory, exist_ok=True)

In [23]:
import arxiv

from pathlib import Path

def download_papers(client, topics, num_results_per_topic, directory):
    """Download the most recently submitted arXiv PDFs for each topic.

    Args:
        client: Object exposing ``results(search)`` (an ``arxiv.Client`` in this notebook).
        topics: Iterable of query strings; one arXiv search is issued per entry.
        num_results_per_topic: Maximum number of papers fetched for each topic.
        directory: Base output directory; PDFs are written to ``<directory>/pdfs``.

    Returns:
        None. The PDFs on disk are the only result.
    """
    # The target folder does not depend on the topic, so create it once up
    # front. This also guarantees it exists when `topics` is empty.
    save_dir = directory+"/pdfs"
    os.makedirs(save_dir, exist_ok=True)

    for topic in topics:
        # Newest submissions first, capped at the requested count
        search = arxiv.Search(
            query = topic,
            max_results = num_results_per_topic,
            sort_by = arxiv.SortCriterion.SubmittedDate
        )

        for paper in client.results(search):
            paper.download_pdf(dirpath=save_dir)

def list_pdf_files(directory):
    """Return the file names (not full paths) of every ``*.pdf`` in ``<directory>/pdfs``."""
    pdf_dir = Path(directory+"/pdfs")

    names = []
    for entry in pdf_dir.glob('*.pdf'):
        names.append(entry.name)
    return names

In [24]:
# create a client
client = arxiv.Client()

download_papers(client, research_paper_topics, num_results_per_topic, directory)

In [25]:
from llama_parse import LlamaParse

def parse_files(directory, pdf_files):
    """Parse each PDF under ``<directory>/pdfs`` with LlamaParse (markdown output).

    Returns a list with one entry per input file, in input order; each entry is
    whatever ``parser.load_data`` returned for that file.
    """
    parser = LlamaParse(
        result_type="markdown",  # "markdown" and "text" are available
        num_workers=4,  # if multiple files passed, split in `num_workers` API calls
        verbose=True,
    )

    total = len(pdf_files)
    documents = []

    for position, file_name in enumerate(pdf_files, start=1):
        print(f"Processing file {position}/{total}: {file_name}")
        pdf_path = f"{directory}/pdfs/{file_name}"
        documents.append(parser.load_data(pdf_path))

    return documents

In [None]:

# Collect the names of the PDFs downloaded earlier
pdf_files = list_pdf_files(directory)
print(">>>>> found files: ", pdf_files)

# One entry per PDF; consumed later by `create_index`
documents = parse_files(directory, pdf_files)

In [27]:
from pydantic import BaseModel, Field
from typing import List
from llama_index.core.prompts import PromptTemplate
from llama_index.core.async_utils import run_jobs
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.node_parser import SemanticSplitterNodeParser

class Metadata(BaseModel):
    """Output containing the authors names, authors companies, and general AI tags."""
    # Structured-output schema passed as `output_cls` to `llm.astructured_predict`
    # in `get_papers_metadata`.
    # NOTE(review): the docstring and the Field descriptions are presumably sent
    # to the LLM as part of the schema, so treat them as prompt text — confirm
    # before rewording them.

    # Paper authors; required field, an empty list is the "not found" value
    author_names: List[str] = Field(..., description="List of author names of the paper. Give empty list if not available")

    # Author affiliations; required field, an empty list is the "not found" value
    author_companies: List[str] = Field(..., description="List of author companies of the paper. Give empty list if not available")

    # General AI topic tags; the extraction prompt asks for the top 3
    ai_tags: List[str] = Field(..., description="List of general AI tags related to the paper. Give empty list if not available")


async def get_papers_metadata(text, llm):
    """Ask the LLM for a paper's authors, affiliations and top AI tags.

    Args:
        text: Paper text to extract from (the caller passes the opening pages).
        llm: LLM exposing ``astructured_predict``.

    Returns:
        The ``Metadata`` instance produced by the structured prediction.
    """
    extraction_prompt = PromptTemplate("""Generate authors names, authors companies, and general top 3 AI tags for the given research paper.

    Research Paper:

    {text}""")

    # https://docs.llamaindex.ai/en/stable/api_reference/llms/#llama_index.core.llms.llm.LLM.astructured_predict
    return await llm.astructured_predict(
        output_cls=Metadata,
        prompt=extraction_prompt,
        text=text,
    )

async def get_document_with_metadata(document, llm):
    """Merge a parsed paper's pages into one ``Document`` tagged with LLM-extracted metadata.

    Args:
        document: Sequence of parsed page objects for one paper, each exposing ``.text``.
        llm: LLM forwarded to ``get_papers_metadata``.

    Returns:
        A ``Document`` whose text is all pages joined by blank lines and whose
        metadata holds ``author_names``, ``author_companies`` and ``ai_tags``.
    """
    # Only the opening pages are sent for metadata extraction. Slicing (rather
    # than indexing [0], [1], [2]) keeps papers with fewer than three pages
    # from raising IndexError.
    text_for_metadata_extraction = "".join(page.text for page in document[:3])
    full_text = "\n\n".join(page.text for page in document)
    metadata = await get_papers_metadata(text_for_metadata_extraction, llm)

    return Document(
        text=full_text,
        metadata={
            'author_names': metadata.author_names,
            'author_companies': metadata.author_companies,
            'ai_tags': metadata.ai_tags
        }
    )
                 
async def create_index(documents, llm, embed_model):
    """Build an in-memory vector index over the parsed papers.

    Args:
        documents: List of parsed papers (each a sequence of page objects), as
            returned by ``parse_files``.
        llm: LLM used to extract per-paper metadata.
        embed_model: Embedding model that drives the semantic splitter.

    Returns:
        A ``VectorStoreIndex`` built from one metadata-tagged ``Document`` per paper.
    """
    # Extract metadata for all papers concurrently (at most 4 in flight)
    extract_jobs = [get_document_with_metadata(document, llm) for document in documents]
    document_objects = await run_jobs(extract_jobs, workers=4)

    # Semantic splitter: chunk boundaries follow embedding-similarity breakpoints
    node_parser = SemanticSplitterNodeParser(
        buffer_size=1,
        breakpoint_percentile_threshold=95,
        embed_model=embed_model,
    )

    # The splitter must be supplied through `transformations`. `node_parser=` is
    # not a recognised argument of `from_documents`, so passing it that way
    # leaves the default splitter in effect.
    index = VectorStoreIndex.from_documents(
        documents=document_objects,
        transformations=[node_parser],
        show_progress=True,  # show indexing progress
    )

    return index

In [None]:
# Build the vector index from the parsed papers (top-level await in the notebook)
index = await create_index(documents, llm, embed_model)

# Persist the index to `<directory>/storage` so it can be reloaded without rebuilding
index.storage_context.persist(directory+"/storage")


In [29]:
# Later, you can load the index from disk
from llama_index.core import StorageContext, load_index_from_storage
storage_context = StorageContext.from_defaults(persist_dir=directory+"/storage")
index = load_index_from_storage(storage_context)

In [30]:
# Query engine used by the report agent for queries classified as "INDEX".
# NOTE(review): these keyword names (dense/sparse top-k, alpha, reranking,
# retrieval_mode) look like options for a managed/hybrid retriever. Confirm the
# index type loaded above actually consumes them rather than silently ignoring
# them; if they are ignored, retrieval runs with the library defaults.
query_engine = index.as_query_engine(
    dense_similarity_top_k=10,
    sparse_similarity_top_k=10,
    alpha=0.5,
    enable_reranking=True,
    rerank_top_n = 5,
    retrieval_mode="chunks"
)

In [34]:
import re

def extract_title(outline):
    """Return the report title taken from the first non-blank line of the outline.

    Leading markdown ``#`` markers and surrounding whitespace are removed.
    Characters at the end of the title are preserved, so a title such as
    "Report on C#" keeps its trailing ``#``.
    """
    first_line = outline.strip().split('\n')[0]
    # Strip '#' on the left only; stripping both ends would eat a trailing '#'
    return first_line.lstrip('#').strip()

def generate_query_with_llm(title, section, subsection, llm):
    """Have the LLM write a short research query for one outline subsection.

    Args:
        title: Report title.
        section: Heading of the enclosing section.
        subsection: Heading of the subsection the query is for.
        llm: Object exposing ``complete(prompt)``.

    Returns:
        The completion converted to ``str`` with surrounding whitespace removed.
    """
    instruction_parts = [
        f"Generate a research query for a report on {title}. ",
        f"The query should be for the subsection '{subsection}' under the main section '{section}'. ",
        "The query should guide the research to gather relevant information for this part of the report. The query should be clear, short and concise. ",
    ]

    completion = llm.complete("".join(instruction_parts))
    return str(completion).strip()

def classify_query(query, llm):
    """Classify a query as "LLM" (general knowledge) or "INDEX" (needs retrieval).

    Args:
        query: The research query to classify.
        llm: Object exposing ``complete(prompt)``.

    Returns:
        "LLM" or "INDEX". Any reply that is not one of those labels after
        normalisation falls back to "INDEX".
    """
    prompt = f"""Classify the following query as either "LLM" if it can be answered directly by a large language model with general knowledge, or "INDEX" if it likely requires querying an external index or database for specific or up-to-date information.

    Query: "{query}"

    Consider the following:
    1. If the query asks for general knowledge, concepts, or explanations, classify as "LLM".
    2. If the query asks for specific facts, recent events, or detailed information that might not be in the LLM's training data, classify as "INDEX".
    3. If unsure, err on the side of "INDEX".

    Classification:"""

    raw_reply = str(llm.complete(prompt))
    # The prompt shows the labels in double quotes, so the model often echoes
    # them quoted or with a trailing period. Peel that decoration off before
    # comparing, otherwise a valid "LLM" answer is misread as unclear.
    classification = raw_reply.strip().strip('"\'`.').strip().upper()

    if classification not in ["LLM", "INDEX"]:
        print("Classification is not clear, defaulting to INDEX")
        classification = "INDEX"  # Default to INDEX if the response is unclear

    return classification

def parse_outline_and_generate_queries(outline, llm):
    """Walk the outline and generate one classified query per subsection.

    Args:
        outline: Markdown outline. The first line is the title, "## " lines are
            sections, and "N.M." lines are subsections.
        llm: LLM used for query generation and classification.

    Returns:
        ``{section: {subsection: {"query": str, "classification": str}}}``.
        A section with no subsections gets a single "General" entry that is
        always classified "LLM".
    """
    lines = outline.strip().split('\n')
    title = extract_title(outline)
    current_section = None  # no section heading seen yet
    queries = {}

    for raw_line in lines[1:]:  # Skip the title line
        # Tolerate indented outlines: match on the left-stripped line
        line = raw_line.lstrip()

        if line.startswith('## '):
            # Section header, e.g. "## 2. Retrieval Augmented Generation (RAG) and Agents".
            # Only the leading '#' markers are removed so a trailing '#' survives.
            current_section = line.lstrip('#').strip()
            queries[current_section] = {}
        elif re.match(r'^\d+\.\d+\.', line):
            # Subsection header, e.g. "2.1. Fundamentals of RAG"
            subsection = raw_line.strip()
            if current_section is None:
                # A subsection before any "## " header has no section to attach
                # to; skip it instead of failing with a KeyError.
                print(f"Skipping subsection without a parent section: {subsection}")
                continue
            query = generate_query_with_llm(title, current_section, subsection, llm)
            classification = classify_query(query, llm)
            queries[current_section][subsection] = {"query": query, "classification": classification}

    # Handle sections without subsections
    for section, section_queries in queries.items():
        if not section_queries:
            query = generate_query_with_llm(title, section, "General overview", llm)
            section_queries["General"] = {"query": query, "classification": "LLM"}

    return queries

In [35]:
from typing import Any, List
from llama_index.core.llms.function_calling import FunctionCallingLLM
from llama_index.core.workflow import Workflow, StartEvent, StopEvent, Context, step
from llama_index.core.workflow import Event

class ReportGenerationEvent(Event):
    """Event emitted by the query-generation step and consumed by the report step.

    It is constructed with a ``queries`` keyword holding the nested
    section -> subsection -> query mapping.
    """
    pass


class ReportGenerationAgent(Workflow):
    """Two-step workflow that turns a markdown outline into a written report.

    Step 1 (``queries_generation_event``) derives one research query per outline
    subsection. Step 2 (``generate_report``) answers each query with either the
    LLM or the query engine and assembles the answers into markdown.
    """

    def __init__(
        self,
        query_engine: Any,
        llm: FunctionCallingLLM | None = None,
        **kwargs: Any,
    ) -> None:
        """Store the collaborators used by the steps.

        Args:
            query_engine: Object exposing ``query(str)``; answers "INDEX" queries.
            llm: LLM used for query generation, summaries and "LLM" queries.
                Defaults to ``OpenAI(model='gpt-4o-mini')``.
            **kwargs: Forwarded to ``Workflow.__init__`` (e.g. ``timeout``, ``verbose``).
        """
        super().__init__(**kwargs)
        self.query_engine = query_engine
        self.llm = llm or OpenAI(model='gpt-4o-mini')

    def format_report(self, section_contents, outline):
        """Assemble the final markdown report.

        Args:
            section_contents: ``{section: {subsection: answer}}`` as produced by
                ``generate_section_content``.
            outline: The original outline; only its title line is used here.

        Returns:
            The full report as a markdown string.

        Sections whose heading does not start with "N." are skipped. The
        introduction and conclusion are written last, from the body of the
        report, and are added only when the outline contains such sections.
        """
        report = ""
        # Filled in when the matching section is met. They stay None when the
        # outline has no such section, so the report is still produced.
        introduction_heading = None
        conclusion_heading = None

        for section, subsections in section_contents.items():
            section_match = re.match(r'^(\d+\.)\s*(.*)$', section)
            if not section_match:
                continue
            section_num, section_title = section_match.groups()

            if "introduction" in section.lower():
                introduction_heading = (section_num, section_title)
            elif "conclusion" in section.lower():
                conclusion_heading = (section_num, section_title)
            else:
                combined_content = "\n".join(subsections.values())
                summary_query = f"Provide a short summary for section '{section}':\n\n{combined_content}"
                # Use the agent's own LLM, not a notebook-level global
                section_summary = str(self.llm.complete(summary_query))
                report += f"# {section_num} {section_title}\n\n{section_summary}\n\n"

                report = self.get_subsections_content(subsections, report)

        # Add introduction (generated from the body sections)
        if introduction_heading is not None:
            introduction_num, introduction_title = introduction_heading
            introduction_query = f"Create an introduction for the report:\n\n{report}"
            introduction = str(self.llm.complete(introduction_query))
            report = f"# {introduction_num} {introduction_title}\n\n{introduction}\n\n" + report

        # Add conclusion (generated from everything written so far)
        if conclusion_heading is not None:
            conclusion_num, conclusion_title = conclusion_heading
            conclusion_query = f"Create a conclusion for the report:\n\n{report}"
            conclusion = str(self.llm.complete(conclusion_query))
            report += f"# {conclusion_num} {conclusion_title}\n\n{conclusion}"

        # Add title
        title = extract_title(outline)
        report = f"# {title}\n\n{report}"
        return report

    @staticmethod
    def _subsection_sort_key(subsection):
        """Sort key: numbered "N.M" headings in numeric order, unnumbered ones last."""
        number = re.search(r'(\d+)\.(\d+)', subsection)
        if number:
            # Compare as integers so "2.10" sorts after "2.9"
            return (0, int(number.group(1)), int(number.group(2)), "")
        return (1, 0, 0, subsection)

    def get_subsections_content(self, subsections, report):
        """Append each subsection's content to ``report`` and return the result.

        Args:
            subsections: ``{subsection heading: answer text}``.
            report: Markdown accumulated so far.

        Returns:
            ``report`` extended with one "## " block per subsection, ordered by
            subsection number.
        """
        for subsection in sorted(subsections.keys(), key=self._subsection_sort_key):
            content = subsections[subsection]
            subsection_match = re.search(r'(\d+\.\d+)\.\s*(.+)', subsection)
            if subsection_match:
                subsection_num, subsection_title = subsection_match.groups()
                report += f"## {subsection_num} {subsection_title}\n\n{content}\n\n"
            else:
                report += f"## {subsection}\n\n{content}\n\n"
        return report

    def generate_section_content(self, queries, reverse=False):
        """Answer every generated query.

        Args:
            queries: ``{section: {subsection: {"query", "classification"}}}``.
            reverse: When True, process each section's subsections in reverse
                sorted order.

        Returns:
            ``{section: {subsection: answer}}``. Queries classified "LLM" are
            answered by ``self.llm``; all others by ``self.query_engine``.
        """
        section_contents = {}
        for section, subsections in queries.items():
            section_contents[section] = {}
            subsection_keys = reversed(sorted(subsections.keys())) if reverse else sorted(subsections.keys())
            for subsection in subsection_keys:
                data = subsections[subsection]
                query = data['query']
                classification = data['classification']
                # Use the collaborators injected through __init__ so the agent
                # does not depend on notebook-level globals
                if classification == "LLM":
                    answer = str(self.llm.complete(query + " Give a short answer."))
                else:
                    answer = str(self.query_engine.query(query))
                section_contents[section][subsection] = answer
        return section_contents

    @step(pass_context=True)
    async def queries_generation_event(self, ctx: Context, ev: StartEvent) -> ReportGenerationEvent:
        """Step 1: store the outline in the context and generate the queries."""
        ctx.data["outline"] = ev.outline
        queries = parse_outline_and_generate_queries(ctx.data["outline"], self.llm)

        return ReportGenerationEvent(queries=queries)

    @step(pass_context=True)
    async def generate_report(
        self, ctx: Context, ev: ReportGenerationEvent
    ) -> StopEvent:
        """Step 2: answer the queries and compile the report.

        The workflow result is ``{"response": <markdown report>}``.
        """
        queries = ev.queries

        # Generate contents for sections in reverse order
        section_contents = self.generate_section_content(queries, reverse=True)
        # Format and compile the final report
        report = self.format_report(section_contents, ctx.data["outline"])

        return StopEvent(result={"response": report})

In [None]:
from llama_index.utils.workflow import (
    draw_all_possible_flows,
    draw_most_recent_execution,
)
# Write a diagram of the workflow's possible step/event paths to the given HTML file
draw_all_possible_flows(ReportGenerationAgent, filename="report_generation_agent.html")

In [None]:
# Instantiate the workflow with the query engine and LLM built above.
# `verbose` and `timeout` are forwarded to the Workflow base class through **kwargs.
agent = ReportGenerationAgent(
    query_engine=query_engine,
    llm=llm,
    verbose=True,
    timeout=1200.0,
)

In [None]:
# Run the workflow; `outline` reaches the first step as `ev.outline`.
# The result is the dict returned by the final step: {"response": <markdown>}.
report = await agent.run(outline=outline)

In [None]:
# Write a diagram of the run that just completed to the given HTML file
draw_most_recent_execution(agent, filename="report_generation_agent_execution.html")

In [None]:
# Show the generated markdown report
print(report['response'])

In [42]:
# Save the report next to the PDFs and the index. The encoding is set explicitly
# because LLM output regularly contains non-ASCII characters, which can raise
# UnicodeEncodeError under a non-UTF-8 platform default.
with open(directory + "/report.md", "w", encoding="utf-8") as f:
    f.write(report['response'])