# 100% local Agentic RAG

In [1]:
import os
import sys  # used below to raise the recursion limit
import warnings

from crewai import Agent, Crew, Task, LLM, Process
from crewai_tools import SerperDevTool
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

from src.agentic_rag.tool.custom_tool import DocumentSearchTool
from src.agentic_rag.tool.custom_tool import FireCrawlWebSearchTool

# Suppress every Python warning for the rest of the kernel session.
warnings.simplefilter("ignore")

# Raise the interpreter's recursion ceiling to 5000 so that deep call stacks
# (seen especially with verbose logging) do not abort with a RecursionError.
sys.setrecursionlimit(5000)

### Setup LLM


In [2]:
# Load environment variables from the .env file so OPENAI_API_KEY is available below.
# (`os`, `load_dotenv` and `ChatOpenAI` are imported once, in the imports cell at the top.)
load_dotenv()

# OpenAI chat model shared by both agents.
# Requires the `langchain-openai` package: pip install langchain-openai
llm = ChatOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),  # read from the environment; never hard-code the key
    model="gpt-4o",                       # alternatives: "gpt-4-turbo", "gpt-3.5-turbo"
    temperature=0.7,                      # balance between creativity and accuracy
)

# NOTE: if this notebook was previously configured for Groq, make sure no leftover
# OPENAI_API_BASE / OPENAI_MODEL_NAME / OPENAI_API_KEY overrides (via os.environ or .env)
# remain, as they can conflict with the OpenAI configuration above.

### Setup Tools

In [3]:
# Resolve the knowledge-base PDF relative to the kernel's working directory, which is
# assumed to be the project root (AGENTIC_RAG/).
# If the kernel was started from AGENTIC_RAG/src/agentic_rag/ instead, use:
#   pdf_file_path = os.path.join(project_root_dir, "..", "..", "knowledge", "dspy.pdf")
project_root_dir = os.getcwd()
pdf_file_path = os.path.join(project_root_dir, "knowledge", "dspy.pdf")

# Fail fast with an actionable message: a wrong working directory would otherwise
# only surface later, inside the tool, as a much less obvious error.
if not os.path.isfile(pdf_file_path):
    raise FileNotFoundError(
        f"Knowledge-base PDF not found at {pdf_file_path!r}. "
        f"Start the kernel from the project root or adjust pdf_file_path "
        f"(current working directory: {project_root_dir!r})."
    )

# Custom document-search tool over the PDF knowledge base.
pdf_tool = DocumentSearchTool(file_path=pdf_file_path)

Make sure the API keys for both Serper (`SERPER_API_KEY`) and FireCrawl are set in your environment variables (for example in the `.env` file loaded above).

In [4]:
# General-purpose web search tool; the retriever agent falls back to it when the
# PDF knowledge base does not contain (enough of) the answer.
# NOTE(review): SerperDevTool presumably reads its API key from the environment
# (SERPER_API_KEY) — confirm it is set before running this cell.
web_search_tool_serper = SerperDevTool()

In [5]:
# Custom FireCrawl-backed tool, intended for scraping one specific URL (e.g. a link
# surfaced by the Serper search) rather than for open-ended web searches.
# NOTE(review): the tool description in custom_tool.py should state clearly that it
# expects a URL as input — confirm.
firecrawl_web_search_tool = FireCrawlWebSearchTool()


### Agents

In [6]:
### Agents

# %%
# Retrieval agent: looks up the answer in the PDF first, then falls back to a Serper
# web search, and optionally scrapes a promising URL with FireCrawl.
# The {query} placeholders are filled from the `inputs` passed to crew.kickoff().
retriever_agent = Agent(
    role="""Retrieve relevant information to answer the user query: {query}""",
    # The goal spells out the tool priority: PDF -> Serper web search -> FireCrawl URL scrape.
    goal="""Retrieve the most relevant information from the available sources for the user query: {query}.
            First, **always** attempt to find information using the 'DocumentSearchTool' in the provided PDF knowledge base.
            If the information is not found or is incomplete in the PDF, then use the 'SerperDevTool' for a general web search
            to find up-to-date and relevant information.
            If the 'SerperDevTool' provides a specific URL that looks highly relevant, consider using the 'FireCrawlWebSearchTool'
            to scrape the content from that specific URL for detailed extraction.
            Your final output should be the most relevant information in text format.""",
    backstory="""You're a meticulous analyst with a keen eye for detail.
                You're known for your ability to understand the user query: {query}
                and retrieve knowledge from the most suitable knowledge base,
                prioritizing internal documents then web search,
                and then detailed web content extraction if a URL is available.""",
    verbose=False,
    tools=[
        pdf_tool,                   # PDF knowledge base (first choice)
        web_search_tool_serper,     # general web search
        firecrawl_web_search_tool,  # scraping of specific URLs found by Serper
    ],
    allow_delegation=False,  # prevent unintended delegation to other agents
    # Cap on the agent's reasoning/tool-use iterations. CrewAI's Agent field is named
    # `max_iter`; `max_iterations` is not a recognised field, so the cap was not applied.
    max_iter=20,
    llm=llm,  # explicitly use the LLM configured above
)

# Synthesis agent: turns whatever the retriever found into the final answer, or
# returns a fixed apology when nothing relevant was retrieved. It has no tools.
# The {query} placeholders are filled from the `inputs` passed to crew.kickoff().
response_synthesizer_agent = Agent(
    role="""Response synthesizer agent for the user query: {query}""",
    goal="""Synthesize the retrieved information into a concise and coherent response
            based on the user query: {query}. If you are not able to retrieve the
            information (e.g., from the previous agent), then respond with
            "I'm sorry, I couldn't find the information you're looking for." """,
    backstory="""You're a skilled communicator with a knack for turning complex
                information into clear and concise responses.""",
    verbose=False,
    allow_delegation=False,  # prevent unintended delegation to other agents
    # Cap on the agent's reasoning iterations. CrewAI's Agent field is named
    # `max_iter`; `max_iterations` is not a recognised field, so the cap was not applied.
    max_iter=10,
    llm=llm,  # explicitly use the LLM configured above
)


### Tasks

In [7]:
### Tasks

# %%
# Step 1 - retrieval: handled by the retriever agent, which owns all the search tools.
retrieval_task = Task(
    agent=retriever_agent,
    description="""Retrieve the most relevant information from the available
                   sources for the user query: {query}""",
    expected_output="""The most relevant information in form of text as retrieved
                       from the sources.""",
)

# Step 2 - synthesis: handled by the tool-less synthesizer agent, which produces
# the final answer (or the fixed apology when nothing was found).
response_task = Task(
    agent=response_synthesizer_agent,
    description="""Synthesize the final response for the user query: {query}""",
    expected_output="""A concise and coherent response based on the retrieved information
                       from the right source for the user query: {query}. If you are not
                       able to retrieve the information then respond with
                       I'm sorry, I couldn't find the information you're looking for.""",
)

### Initialize Crew

In [8]:
### Initialize Crew

# %%
# Assemble the crew. Tasks run one after another in the listed order.
# For a manager-driven flow, switch `process` to Process.hierarchical instead
# (see https://docs.crewai.com/how-to/Hierarchical/).
crew = Crew(
    tasks=[retrieval_task, response_task],
    agents=[retriever_agent, response_synthesizer_agent],
    verbose=False,  # set to True for detailed execution logs
    process=Process.sequential,
)

### Kickoff Crew

In [9]:
# Run the pipeline end to end; the "query" entry is what the {query} placeholders
# in the agent and task prompts refer to.
result = crew.kickoff(
    inputs=dict(query="When is Australian open 2025 happening?"),
)
print(result)

The 2025 Australian Open is scheduled to be a Grand Slam level tennis tournament held at Melbourne Park from January 12 to January 26, 2025. It will be the 113th edition of the Australian Open, the 57th in the Open Era, and the first major of the year.
