In [37]:
from dotenv import load_dotenv
load_dotenv()
import os
import tempfile
from crewai import Agent, Task, Crew, Process
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings   
from langchain.tools import Tool
from langchain_groq import ChatGroq 

In [38]:
groq_api_key = os.getenv("GROQ_API_KEY")
tavily_api_key = os.getenv("TAVILY_API_KEY")

In [39]:
os.environ["LITELLM_PROVIDER"] = "groq"

In [40]:
pdf_path = "../artifacts/data/machine-learning.pdf"

In [41]:
loader = PyPDFLoader(pdf_path)
docs = loader.load()

In [42]:
docs

[Document(metadata={'producer': 'Acrobat Distiller 10.0.0 (Windows)', 'creator': 'Arbortext Advanced Print Publisher 9.1.440/W Unicode', 'creationdate': '2021-04-08T13:34:38+08:00', 'author': 'Christian Janiesch', 'keywords': 'Machine learning,Deep learning,Artificial intelligence,Artificial neural networks,Analytical model building,C6,C8,M15,O3', 'moddate': '2021-04-08T13:35:16+08:00', 'subject': 'Electron Markets, doi:10.1007/s12525-021-00475-2', 'title': 'Machine learning and deep learning', 'source': '../artifacts/data/machine-learning.pdf', 'total_pages': 11, 'page': 0, 'page_label': '1'}, page_content='FUNDAMENTALS\nMachine learning and deep learning\nChristian Janiesch1 & Patrick Zschech2 & Kai Heinrich 3\nReceived: 7 October 2020 / Accepted: 19 March 2021\n# The Author(s) 2021\nAbstract\nToday, intelligent systems that offer artificial intelligence capabilities often rely on machine learning. Machine learning describes\nthe capacity of systems to learn from problem-specific tra

In [43]:
# splitting
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
final_documents = text_splitter.split_documents(docs)

In [44]:
# embedding
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.from_documents(final_documents, embedding=embeddings)

In [45]:
vector_store.similarity_search_with_score("what is machine learning")

[(Document(id='1a46e384-d268-44d0-b7f4-938da464ebf3', metadata={'producer': 'Acrobat Distiller 10.0.0 (Windows)', 'creator': 'Arbortext Advanced Print Publisher 9.1.440/W Unicode', 'creationdate': '2021-04-08T13:34:38+08:00', 'author': 'Christian Janiesch', 'keywords': 'Machine learning,Deep learning,Artificial intelligence,Artificial neural networks,Analytical model building,C6,C8,M15,O3', 'moddate': '2021-04-08T13:35:16+08:00', 'subject': 'Electron Markets, doi:10.1007/s12525-021-00475-2', 'title': 'Machine learning and deep learning', 'source': '../artifacts/data/machine-learning.pdf', 'total_pages': 11, 'page': 2, 'page_label': '3'}, page_content='Machine learning algorithmsFig. 1 Venn diagram of machine\nlearning concepts and classes\n(inspired by Goodfellow et al.\n2016,p .9 )\nTable 1 Overview of types of machine learning\nType Description\nSupervised learning Supervised learning requires a training dataset that covers examples for the input as well as labeled answers or target 

In [46]:
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

In [47]:
retriever.invoke("what is machine learning")

[Document(id='1a46e384-d268-44d0-b7f4-938da464ebf3', metadata={'producer': 'Acrobat Distiller 10.0.0 (Windows)', 'creator': 'Arbortext Advanced Print Publisher 9.1.440/W Unicode', 'creationdate': '2021-04-08T13:34:38+08:00', 'author': 'Christian Janiesch', 'keywords': 'Machine learning,Deep learning,Artificial intelligence,Artificial neural networks,Analytical model building,C6,C8,M15,O3', 'moddate': '2021-04-08T13:35:16+08:00', 'subject': 'Electron Markets, doi:10.1007/s12525-021-00475-2', 'title': 'Machine learning and deep learning', 'source': '../artifacts/data/machine-learning.pdf', 'total_pages': 11, 'page': 2, 'page_label': '3'}, page_content='Machine learning algorithmsFig. 1 Venn diagram of machine\nlearning concepts and classes\n(inspired by Goodfellow et al.\n2016,p .9 )\nTable 1 Overview of types of machine learning\nType Description\nSupervised learning Supervised learning requires a training dataset that covers examples for the input as well as labeled answers or target v

In [48]:
from crewai.tools import BaseTool
from crewai_tools import TavilySearchTool

In [49]:
# class VectorStoreRetrieverTool(BaseTool):
#     name: str = "Vector Store Retriever Tool"
#     description: str = "Searches for information in the vector store."
#     retriever: object # Add a class attribute for the retriever

#     def __init__(self, retriever: object):
#         super().__init__()
#         self.retriever = retriever

#     def _run(self, query: str) -> str:
#         if self.retriever:
#             results = self.retriever.get_relevant_documents(query)
#             return " ".join([doc.page_content for doc in results])
    
#         return "No relevant information found in the vector store"
    
class VectorStoreRetrieverTool(BaseTool):
    name: str = "Vector Store Retriever Tool"
    description: str = "Searches for information in the vector store."
    retriever: object  # Define the attribute here

    def _run(self, query: str) -> str:
        if self.retriever:
            results = self.retriever.get_relevant_documents(query)
            return " ".join([doc.page_content for doc in results])
        return "No relevant information found in the vector store."
    

In [50]:
# initialize wikipedia the tools
# wikipedia_wrapper = WikipediaAPIWrapper()
# wikipedia_tool = Tool(
#     name="wikipedia",
#     func=wikipedia_wrapper.run,
#     description="Search Wikipedia when the knowledge base does not contain the answer."
# )

In [71]:
# Initialize the web search tools
# class TavilyWebSearchTool(BaseTool):
#     name: str = "Tavily Search"
#     description: str = "Search the web for up-to-date and accurate information."

#     def _run(self, query: str) -> str:
#         # CrewAI will always pass a string query here
#         tavily = TavilySearchTool(tavily_api_key=tavily_api_key)
#         return tavily._run(query)
from typing import Any
class TavilyWebSearchTool(BaseTool):
    name: str = "Tavily Search"
    description: str = "Search the web for up-to-date and accurate information."

    def _run(self, query: Any) -> str:
        tavily = TavilySearchTool(tavily_api_key=tavily_api_key)

        # CrewAI sometimes passes a dict like {"description": "...", "type": "str"}
        if isinstance(query, dict):
            query = query.get("description") or query.get("query") or str(query)

        # Call Tavily properly (not _run, but using run or invoke)
        return tavily.run(query)


In [72]:
tb = TavilyWebSearchTool()
tb.run("tell me about iran and israel war")

Using Tool: Tavily Search
Using Tool: Tavily Search


'{\n  "query": "tell me about iran and israel war",\n  "follow_up_questions": null,\n  "answer": null,\n  "images": [],\n  "results": [\n    {\n      "url": "https://fsi.stanford.edu/news/understanding-war-between-israel-and-iran-qa-amichai-magen-and-abbas-milani",\n      "title": "Understanding the War between Israel and Iran: Q&A with Amichai ...",\n      "content": "# Understanding the War between Israel and Iran: Q&A with Amichai Magen and Abbas Milani # Understanding the War between Israel and Iran: Q&A with Amichai Magen and Abbas Milani ### **How do you assess the war between Israel and Iran in terms of its impact on regional stability in the Middle East? **Amichai Magen:** The 2025 Iran-Israel war is the most consequential conflict in the Middle East at least since the 2003 Iraq War and arguably since the Six Day War of June 1967. **Magen:** The overwhelming majority of Israelis perceive a nuclear-armed Iran as an unacceptable existential risk and view the Ayatollah regime as r

### Agent

In [53]:
llm = ChatGroq(model="groq/gemma2-9b-it", api_key=groq_api_key)

In [55]:
planner = Agent(
    role="Planner",
    goal="Decide whether a query can be answered using the knowledge base or requires external search.",
    backstory=(
        "Planner that evaluates the query, considers context, and routes it appropriately. "
        "If confident the answer is in the knowledge base, send to retriever; otherwise, "
        "use external tools. Consider query complexity, ambiguity, and completeness."
    ),
    llm=llm
)


In [56]:
response_data = {
    "answer": "",
    "source": ""
}


In [57]:
retriever_agent = Agent(
    role="Retriever",
    goal="Accurately fetch the most relevant answer from the knowledge base to support user queries.",
    backstory="Acts like a librarian who quickly searches and delivers the right information from internal documents.",
    tools=[VectorStoreRetrieverTool(retriever=retriever)],
    verbose=True,
    allow_delegation=False,
    llm=llm,
    on_complete=lambda output: response_data.update({"source": "vector_store", "answer": output})
)

In [58]:
external_agent = Agent(
    role="External Knowledge Seeker",
    goal="Retrieve accurate and up-to-date information from Wikipedia or web sources whenever the knowledge base lacks the answer.",
    backstory="An expert researcher specialized in finding reliable information outside the knowledge base. When internal data is insufficient, this agent consults trusted sources like Wikipedia and the web to ensure users always receive the most complete answer.",
    tools=[tb],
    verbose=True,
    allow_delegation=False,
    llm=llm, 
    on_complete=lambda output: response_data.update({"source": "external_search", "answer": output})
)

In [59]:
summarizer = Agent(
    role="Answer Composer",
    goal="Synthesize information from the knowledge base, tools, or external sources into a clear, concise, and well-structured final answer.",
    backstory="An expert communicator who takes complex or scattered information and transforms it into easy-to-understand, well-explained responses. Ensures that the user always receives a polished and insightful final answer.",
    llm=llm
)

In [60]:
# task
task1 = Task(
    description=(
        "Analyze the user's query to determine the best route: "
        "use internal knowledge base if confident, otherwise prepare to consult external sources."
    ),
    expected_output=(
        "A final determination of whether the query can be answered using "
        "internal knowledge or if an external search is required. The output should be a "
        "succinct decision, for example: 'Internal knowledge is sufficient' or 'External search is necessary'."
    ),
    agent=planner,
    context_variables=["query"]
)

In [61]:
task2 = Task(
    description=(
        "Search the knowledge base thoroughly and fetch the most relevant passages, "
        "ensuring high accuracy and context alignment with the user's query."
    ),
    expected_output=(
        "A summary of the most relevant information retrieved from the knowledge base, "
        "formatted as direct quotes or paraphrased key points. The response must be "
        "accurate, directly addressing the user's query and citing the source document "
        "or section if applicable."
    ),
    agent=retriever_agent,
    context_variables=["query"]
)

In [62]:
task3 = Task(
    description=(
        "If the knowledge base does not provide a sufficient answer, "
        "perform a targeted external search using trusted sources like Wikipedia and web search tools, "
        "prioritizing accuracy and recency."
    ),
    expected_output=(
        "A comprehensive and concise answer based on external web search results. "
        "The output must synthesize information from multiple sources if necessary, "
        "address the user's query directly, and include a clear statement "
        "indicating that the information was retrieved from external sources due to "
        "a lack of data in the internal knowledge base."
    ),
    agent=external_agent,
    context_variables=["query"]
)

In [63]:
task4 = Task(
    description=(
        "Integrate and synthesize all gathered information—whether from internal or external sources—"
        "into a clear, concise, and user-friendly final response, maintaining accuracy and readability."
    ),
    expected_output=(
        "A single, final answer that is a direct and complete response to the user's original query. "
        "The response must be easy to read and synthesize all gathered information from the previous steps. "
        "DO NOT mention the internal or external search process; just provide the final answer."
    ),
    agent=summarizer,
    context_variables=["query"]
)

In [64]:
crew = Crew(
    agents=[planner, retriever_agent, external_agent, summarizer],
    tasks=[task1, task2, task3, task4],
    process=Process.sequential,
    verbose=True
)

In [69]:
response = crew.kickoff(inputs={"query": "tell me about iran and israel war"})

In [70]:
final_answer = response.tasks_output[-1].raw
print(final_answer)

Thought: I need the user's query to search the knowledge base. 
Action:  Please provide the user's query so I can assist you. 



In [67]:
response_data

{'answer': '', 'source': ''}