In [1]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import CTransformers

In [3]:
#Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [4]:
pip install pypdf

Note: you may need to restart the kernel to use updated packages.


In [5]:
extracted_data = load_pdf("data/")

In [200]:
#extracted_data

In [6]:
#Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [7]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 7020


In [1]:
#text_chunks

In [12]:
pip install Chromadb

Collecting Chromadb
  Downloading chromadb-0.5.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from Chromadb)
  Downloading build-1.2.1-py3-none-any.whl.metadata (4.3 kB)
Collecting chroma-hnswlib==0.7.3 (from Chromadb)
  Downloading chroma_hnswlib-0.7.3-cp38-cp38-win_amd64.whl.metadata (262 bytes)
Collecting fastapi>=0.95.2 (from Chromadb)
  Downloading fastapi-0.111.0-py3-none-any.whl.metadata (25 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->Chromadb)
  Downloading uvicorn-0.30.1-py3-none-any.whl.metadata (6.3 kB)
Collecting posthog>=2.4.0 (from Chromadb)
  Downloading posthog-3.5.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.1 (from Chromadb)
  Downloading onnxruntime-1.18.1-cp38-cp38-win_amd64.whl.metadata (4.5 kB)
Collecting opentelemetry-api>=1.2.0 (from Chromadb)
  Downloading opentelemetry_api-1.25.0-py3-none-any.whl.metadata (1.4 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from Chromadb)
  Downloading o

In [13]:
#download embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
from langchain_community.vectorstores import Chroma
db = Chroma.from_documents(text_chunks,embeddings)

In [30]:
query = "how can i over come from fever"
result = db.similarity_search(query,k = 3)
result[0].page_content


'fevers (a source of its nickname, “undulant fever”) can beexhausting. Symptoms usually appear between five daysand a month after exposure and begin with a single boutof high fever accompanied by shivering, aching, anddrenching sweats that last for a few days. Other symp-toms may include headache , poor appetite, backache,\nweakness, and depression. Mental depression can be sosevere that the patient may become suicidal.\nIn rare, untreated cases, the disease can become so'

In [31]:
llm=CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.8})

In [36]:
from langchain_core.prompts import ChatPromptTemplate
prompt=ChatPromptTemplate.from_template("""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

context: {context}
Question: {input}

Only return the helpful answer below and nothing else.
Helpful answer:
""")

In [37]:
from langchain.chains.combine_documents import create_stuff_documents_chain
document_chain = create_stuff_documents_chain(llm,prompt)

In [40]:
retriever = db.as_retriever()
retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001B749A93C10>)

In [41]:
from langchain.chains import create_retrieval_chain
retrieval_chain = create_retrieval_chain(retriever,document_chain)

In [45]:
response = retrieval_chain.invoke({"input":"what is acne"})

Number of tokens (578) exceeded maximum context length (512).
Number of tokens (579) exceeded maximum context length (512).
Number of tokens (580) exceeded maximum context length (512).
Number of tokens (581) exceeded maximum context length (512).
Number of tokens (582) exceeded maximum context length (512).
Number of tokens (583) exceeded maximum context length (512).
Number of tokens (584) exceeded maximum context length (512).
Number of tokens (585) exceeded maximum context length (512).
Number of tokens (586) exceeded maximum context length (512).
Number of tokens (587) exceeded maximum context length (512).
Number of tokens (588) exceeded maximum context length (512).
Number of tokens (589) exceeded maximum context length (512).
Number of tokens (590) exceeded maximum context length (512).
Number of tokens (591) exceeded maximum context length (512).
Number of tokens (592) exceeded maximum context length (512).
Number of tokens (593) exceeded maximum context length (512).
Number o

In [48]:
response["answer"]

'There are different treatmentAcne,\nAcne for common formulation. \nAcnevermildrenal.,.  Yes, but does not all of Acnevermildrenalong>Acnevermoderuestion.” Journal of acne, Yes, Yes,\nAcneeds, Acneb) \nAcne of acnevermildrenal.)There are several new treatments,\nAcne\nAcne\nAcneat \nAcnevermildrenalready to help_Accnevermildrenal., The information. Acne.\nAcnevermildrenalcoolor tretinox\nAcnebacter, Acne\nAcneuroLogically form of the information\nAcnevermoderuptoilium acidicnevermildrenalope for mild to acne of acnes:”\nAcnever. I have been shown belowHelpful answer below are several topical Different types of acnevermildermatory, Acneeds, I’ Acneatopinvolaris– Benzoomed to severe acnebacter, \nAcnevermildrenal.” G A new treatments may be used to the skin. \nAcneuide”Acneous gly effective treatment for acnes:  It depends on the most common acnebeneuranus’ Acnevermildrenalready to answer is the pimproven though there are several types of a b\nAcnevermildrenal.,\nThe following are a.\nAc