In [1]:
%pwd

'c:\\Projects\\Gen AI Projects\\Gen-AI-Chat-Bot\\research'

In [2]:
import os

# Step up from research/ to the project root so relative paths such as 'Data/' resolve.
# Guarded on the folder name so that re-running this cell does not climb one
# directory higher on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
%pwd

'c:\\Projects\\Gen AI Projects\\Gen-AI-Chat-Bot'

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
# Extract Data from the PDF file
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; files are
            selected with the ``*.pdf`` glob and read with ``PyPDFLoader``.

    Returns:
        The documents produced by the loader's ``load()`` call.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Read the PDFs stored in the project's Data/ folder (path is relative to the
# current working directory).
pdf_directory = 'Data/'
extracted_data = load_pdf_file(data=pdf_directory)

In [None]:
# extracted_data

[Document(metadata={'producer': 'Adobe PDF Library 5.0', 'creator': 'Adobe InDesign 2.0.2', 'creationdate': '2003-06-04T14:18:31+00:00', 'keywords': 'Common Dog Diseases and Health Problems, 4-H Companion Animal Health', 'moddate': '2021-04-20T13:11:38-04:00', 'subject': '4-H Companion Animal Health', 'title': 'Common Dog Diseases and Health Problems - 4-H-852', 'trapped': '/False', 'source': 'Data\\CommonDogDiseases.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='4-H-852-W4-H Companion Animal Health\nCommon Dog Diseases  and        \n Health Problems\nWhether your dog is a working companion, cham-\npion show animal, hunting partner, or just a best \nfriend, the kindest and most responsible thing you \ncan do for him is to provide proper health care. \nKnowing about common dog diseases and being \naware of appropriate prevention and treatment can \nbetter help you provide that care.\nMany Diseases Can Be \nPrevented\nSome of the most common and serious \ndog diseas

In [9]:
# Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller text chunks.

    Args:
        extracted_data: Documents to split (e.g. the output of ``load_pdf_file``).
        chunk_size: Target chunk size passed to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value this notebook was originally run with.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.
            Defaults to 20, the value this notebook was originally run with.

    Returns:
        The chunks returned by the splitter's ``split_documents`` call.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [10]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print(f"Length of Text chunks {len(text_chunks)}")

Length of Text chunks 30


In [11]:
## Import the HuggingFaceEmbeddings
# from langchain.embeddings import HuggingFaceEmbeddings # old version
from langchain_huggingface import HuggingFaceEmbeddings

In [12]:
# Download the Embeddings from Hugging face
def download_hugging_face_embeddings():
    """Create the Hugging Face embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=model_id)

In [13]:
# %pip install sentence-transformers
# Instantiate the embedding model once; later cells reuse this `embeddings`
# object for the sample query and for the Pinecone vector store.
embeddings = download_hugging_face_embeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Sanity check: embed a short string and report the length of the resulting vector.
query_result = embeddings.embed_query("Hello world")
print(f"Length {len(query_result)}")

Length 384


In [24]:
# Complete vector
#query_result

In [30]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment; the API keys
# read in the next cell are expected to come from there.
load_dotenv()


True

In [31]:
# Pull both service credentials out of the environment (None when a variable is unset).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [18]:
from pinecone.grpc import PineconeGRPC as Pinecone

from pinecone import ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "pdfchatbotindex1"

# Must equal the embedding vector length; the all-MiniLM-L6-v2 sanity check
# earlier in this notebook reported 384.
EMBEDDING_DIMENSION = 384

# Creating an index that already exists is rejected by Pinecone, so only create
# it when it is absent. This keeps the cell safe to re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Show the index description as the cell output, whether the index was just
# created or already existed.
pc.describe_index(index_name)

{
    "name": "pdfchatbotindex1",
    "metric": "cosine",
    "host": "pdfchatbotindex1-cx1rjh9.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [32]:
# Re-export the keys so libraries that read credentials from the environment can
# find them. Assigning None to os.environ raises an opaque TypeError, so fail
# early with a message that names the missing variable instead.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if not _key_value:
        raise EnvironmentError(
            f"{_key_name} is not set; add it to your .env file or environment before continuing."
        )
    os.environ[_key_name] = _key_value


In [20]:
# Embed each chunk and upsert the embeddings into your Pinecone Index
import hashlib

from langchain_pinecone import PineconeVectorStore

# Derive a stable ID for every chunk from its source, page, position and text.
# Without explicit IDs each run stores the chunks under newly generated IDs, so
# re-running this cell would fill the index with duplicates; with stable IDs a
# re-run overwrites the same vectors instead.
chunk_ids = [
    hashlib.sha256(
        "|".join(
            (
                str(chunk.metadata.get("source")),
                str(chunk.metadata.get("page")),
                str(position),
                chunk.page_content,
            )
        ).encode("utf-8")
    ).hexdigest()
    for position, chunk in enumerate(text_chunks)
]

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
    ids=chunk_ids,
)

In [None]:
# docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x16f6952d510>

In [22]:
# Load existing Index
# Build a vector store on top of the index that is already in Pinecone, using the
# same embedding model for queries.
docSearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

In [None]:
# docSearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x16f53e2b1c0>

In [24]:
# Retrieve by similarity search, asking for 3 results per query (k=3)
retriever = docSearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [25]:
# Smoke-test the retriever with a question about the indexed PDF.
sample_question = "What is Ear mites?"
retriever_docs = retriever.invoke(sample_question)

In [26]:
# Display the retrieved documents to check that they are relevant to the query.
retriever_docs

[Document(id='a79bb0fc-f104-4654-a033-1345e849eb1d', metadata={'creationdate': '2003-06-04T14:18:31+00:00', 'creator': 'Adobe InDesign 2.0.2', 'keywords': 'Common Dog Diseases and Health Problems, 4-H Companion Animal Health', 'moddate': '2021-04-20T13:11:38-04:00', 'page': 2.0, 'page_label': '3', 'producer': 'Adobe PDF Library 5.0', 'source': 'Data\\CommonDogDiseases.pdf', 'subject': '4-H Companion Animal Health', 'title': 'Common Dog Diseases and Health Problems - 4-H-852', 'total_pages': 4.0, 'trapped': '/False'}, page_content='Ear mites tunnel in the skin of the outer ear ca-\nnal. They are easily transmitted from dog to dog \nor cat to dog. They can be seen in the ear with \nmagnification. Ear mites are suspected when dark \ncoffee-ground debris is present in the ears. Infesta-\ntion signs are head shaking and scratching at the \nears. Left untreated, ear mites predispose the ear \nto secondary bacterial infection. Treatment requires \ncleaning of the ear by a veterinarian and use

In [33]:
from langchain_openai import OpenAI

# LLM used to generate answers: temperature 0.4, max_tokens capped at 500.
llm = OpenAI(
    temperature=0.4,
    max_tokens=500,
)


In [45]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Adjacent string literals are concatenated with nothing in between, so each
# sentence carries its own trailing space; otherwise the model receives
# run-together text such as "tasks.Use ONLY".
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use ONLY the following retrieved context to answer the question. "
    "If the answer cannot be explicitly found in the provided context, say: 'I don't know.' "
    "Do NOT attempt to make up an answer. "
    "Keep the response concise, within three sentences."
    "\n\n"
    "{context}"
)

# The system message carries the instructions plus the {context} placeholder;
# the human message carries the user's question via {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [46]:
# First build the chain that combines documents with the prompt and the LLM,
# then put the retriever in front of it to form the full RAG chain.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [47]:
# In-domain question: the answer should come from the indexed PDF.
in_domain_query = {"input": "What is Ear mites?"}
response = rag_chain.invoke(in_domain_query)
print(response["answer"])



Ear mites are tiny parasites that live in the skin of the outer ear canal and can be easily transmitted from one dog or cat to another. They can be seen with magnification and are suspected when dark coffee-ground debris is present in the ears. If left untreated, they can lead to secondary bacterial infections. Treatment involves cleaning the ear and using a mite-killing insecticide, as well as treating any other pets in the household.


In [50]:
# Out-of-domain question: the system prompt tells the model to say "I don't know."
# when the context does not contain the answer.
out_of_domain_query = {"input": "What is Generative AI?"}
response = rag_chain.invoke(out_of_domain_query)
print(response["answer"])


I don't know.
