In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
def load_pdf(data):
    loader = DirectoryLoader(data,
                             glob="*.pdf",
                             loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [3]:
%pwd

'c:\\Users\\Raju\\Downloads\\Medical_chat_bot\\research'

In [4]:
import os
os.chdir("../")

In [5]:
extracted_data = load_pdf(data="Data/")

In [6]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500,chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [7]:
text_chunks = text_split(extracted_data)

In [8]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [9]:
def download_google_embeddings():
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    return embeddings

In [10]:
from dotenv import load_dotenv

load_dotenv()  

api_key = os.getenv("GOOGLE_API_KEY") 

In [11]:
embeddings = download_google_embeddings()

In [12]:
len(embeddings.embed_query("Hi"))

768

In [13]:
from dotenv import load_dotenv

load_dotenv

<function dotenv.main.load_dotenv(dotenv_path: Union[str, ForwardRef('os.PathLike[str]'), NoneType] = None, stream: Optional[IO[str]] = None, verbose: bool = False, override: bool = False, interpolate: bool = True, encoding: Optional[str] = 'utf-8') -> bool>

In [14]:
PINECONE_API_KEY =  os.environ.get("PINECONE_API_KEY")

In [15]:
from pinecone import Pinecone, ServerlessSpec

# pc = Pinecone(api_key="********-****-****-****-************")

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
pc = Pinecone(api_key=PINECONE_API_KEY)

In [19]:
index_name = "test"

pc.create_index(
    name=index_name,
    dimension=768, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

{
    "name": "test",
    "metric": "cosine",
    "host": "test-gy38o8b.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null
}

In [21]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(documents=text_chunks,index_name = index_name, embedding=embeddings)

In [22]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x2522f73ced0>

In [23]:
docsearch = PineconeVectorStore.from_existing_index(index_name = index_name, embedding=embeddings)

In [24]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x2523213e3d0>

In [25]:
retriever = docsearch.as_retriever(search_type = "similarity",search_kwargs={"k":3})

In [26]:
retriever_docs = retriever.invoke("What is meant by caught out?")

In [27]:
retriever_docs

[Document(id='83792569-224e-4068-b036-de7e6b8bc763', metadata={'creationdate': '2019-10-29T12:42:46+00:00', 'creator': 'Adobe InDesign 14.0 (Macintosh)', 'moddate': '2019-10-30T09:08:55+04:00', 'page': 275.0, 'page_label': '276', 'producer': 'Adobe PDF Library 15.0', 'source': 'Data\\Cricket_Handbook.pdf', 'total_pages': 427.0, 'trapped': '/False'}, page_content='ICC WOMEN’S TWENTY20 INTERNATIONAL \nPLAYING CONDITIONS\nICC WOMEN’S TWENTY20 INTERNATIONAL \nPLAYING CONDITIONS09 09\n9.60 9.61\n 33 CAUGHT\n 33.1 Out Caught\n  The striker is out Caught if a ball delivered by the bowler, not being a No \nball, touches her bat without having previously been in contact with any \nfielder,\tand\tis\tsubsequently\theld\tby\ta\tfielder\tas\ta\tfair\tcatch,\tas\tdescribed\t\nin clauses 33.2 and 33.3, before it touches the ground.\n 33.2 A fair catch'),
 Document(id='5a1b9290-21a5-44be-9b25-511abcf3e781', metadata={'creationdate': '2019-10-29T12:42:46+00:00', 'creator': 'Adobe InDesign 14.0 (Macint

In [28]:
from langchain_google_genai import ChatGoogleGenerativeAI

In [29]:
llm = ChatGoogleGenerativeAI(model = "gemini-1.5-pro", temperature= 0.5)

In [30]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


In [38]:
System_prompt = (
    """You are assistant for question-answer tasks.
    Use the following retrieved context for answering the question.
    If you dont know the answer reply that you dont know
    \n\n
    {context}"""
)

In [39]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system",System_prompt),
        ("human","{input}"),
    ]
)

In [40]:
question_ans_chain = create_stuff_documents_chain(llm,prompt)
rag_chain = create_retrieval_chain(retriever,question_ans_chain)

In [50]:
# response = rag_chain.invoke({"input" : "What is caught out"})