In [4]:
from dotenv import load_dotenv
import os
from langgraph.graph import StateGraph, END
from typing import TypedDict, Annotated, Sequence
from langchain_core.messages import BaseMessage, SystemMessage, HumanMessage, ToolMessage
from operator import add as add_messages
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.tools import tool



Loading the PDF and the embedding model

In [9]:
# Embedding model, selected by name; reused below when the vector store is built.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# The path is relative, so the PDF is expected in the notebook's working directory.
loader = PyPDFLoader("Unit 3 Optoelectronic Properties of Semiconductors.pdf")
# Read the PDF into a list of Document objects (73 for this file, matching its page count).
docs = loader.load()

Checking the loaded PDF (number of documents returned by the loader)

In [10]:
# Sanity check: how many Document objects the loader produced.
len(docs)

73

Splitting the document into chunks

In [11]:
# Cut the loaded documents into overlapping pieces: a chunk size of 512 with an
# overlap of 64 between neighbouring chunks, so text that straddles a boundary
# still appears whole in at least one chunk.
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)

# `chunks` is the list of Document pieces that gets embedded and indexed later.
chunks = splitter.split_documents(docs)


Checking the split

In [12]:
# Show how many chunks were produced and preview only the first two.
# Printing the whole list floods the notebook with every chunk of the PDF
# (including contact details from the title slide) and bloats the saved file.
print(f"{len(chunks)} chunks")
print(chunks[:2])

[Document(metadata={'producer': 'Microsoft® Office PowerPoint® 2007', 'creator': 'Microsoft® Office PowerPoint® 2007', 'creationdate': '2023-03-27T10:20:18+05:30', 'title': 'PowerPoint Presentation', 'author': 'ganesh vattikondala', 'moddate': '2023-03-27T10:20:18+05:30', 'source': 'Unit 3 Optoelectronic Properties of Semiconductors.pdf', 'total_pages': 73, 'page': 0, 'page_label': '1'}, page_content='Semiconductor Physics and \nComputational Methods \n(21PYB102J) \n \nCourse Instructor: Dr. Venkata Ravindra A \n         Assistant Professor \n         Department of Physics and Nanotechnology \n         SRM Institute of Science and Technology \n         Email: venkatar1@srmist.edu.in \n         Phone: 8019448666 \n         Office: UB-709/A 27 March 2023 1'), Document(metadata={'producer': 'Microsoft® Office PowerPoint® 2007', 'creator': 'Microsoft® Office PowerPoint® 2007', 'creationdate': '2023-03-27T10:20:18+05:30', 'title': 'PowerPoint Presentation', 'author': 'ganesh vattikondala', 

Creating the vector database

In [13]:
# On-disk location of the Chroma store.
PERSIST_DIR = "./chromadb"

# Open the persisted store first instead of unconditionally calling
# Chroma.from_documents: with a persist_directory, from_documents ADDS the
# chunks to whatever is already stored, so every re-run of this cell would
# index the same PDF again and the retriever would return duplicate hits.
vectorDB = Chroma(
    embedding_function=embeddings,
    persist_directory=PERSIST_DIR,
)

# Index the chunks only when the store is still empty (first run).
# If the PDF or the splitter settings change, delete PERSIST_DIR to rebuild.
if not vectorDB.get(limit=1)["ids"]:
    vectorDB = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=PERSIST_DIR,
    )

# Retriever with default search settings; used by the search tool below.
retriever = vectorDB.as_retriever()

Checking the vector database with a sample query

In [15]:
# Smoke test: run one sample query through the retriever.
check = retriever.invoke("Absorption")
# Print the raw list of Document objects that came back.
print(check)

[Document(id='9b41f948-574d-4efc-b3de-cc22d5aa0ba0', metadata={'creationdate': '2023-03-27T10:20:18+05:30', 'total_pages': 73, 'source': 'Unit 3 Optoelectronic Properties of Semiconductors.pdf', 'page': 13, 'creator': 'Microsoft® Office PowerPoint® 2007', 'page_label': '14', 'author': 'ganesh vattikondala', 'moddate': '2023-03-27T10:20:18+05:30', 'title': 'PowerPoint Presentation', 'producer': 'Microsoft® Office PowerPoint® 2007'}, page_content='Overall picture of Absorption and emission processes \n1\n4'), Document(id='dc2b8416-75d1-4fa7-9463-8328a4b139fb', metadata={'title': 'PowerPoint Presentation', 'page_label': '8', 'source': 'Unit 3 Optoelectronic Properties of Semiconductors.pdf', 'moddate': '2023-03-27T10:20:18+05:30', 'creationdate': '2023-03-27T10:20:18+05:30', 'creator': 'Microsoft® Office PowerPoint® 2007', 'producer': 'Microsoft® Office PowerPoint® 2007', 'author': 'ganesh vattikondala', 'page': 7, 'total_pages': 73}, page_content='Absorption: \n\uf0d8Let us consider two 

Creating the tools

Tool 1: search the PDF

In [17]:
@tool
def search(query:str)->str:
    """   This tool searches and returns the information from the Unit 3 Optoelectronic Properties of Semiconductors pdf """
    # NOTE: the docstring above is kept verbatim on purpose; the @tool decorator
    # exposes it as the tool description, so it is runtime text, not just a comment.
    #
    # query:   free-text search string forwarded to the module-level `retriever`.
    # returns: the retrieved chunks formatted as "document<N> : \n<text>" and
    #          separated by blank lines, or a fixed "not found" message when the
    #          retriever returns nothing.
    matches = retriever.invoke(query)
    if not matches:
        return "content not found in the document submitted"

    # The list and the loop item get distinct names; previously the loop
    # variable reused the list's name and rebound it to a single Document.
    # enumerate(start=1) gives the 1-based label directly.
    return "\n\n".join(
        f"document{rank} : \n{match.page_content}"
        for rank, match in enumerate(matches, start=1)
    )
