In [1]:
import os 
import sys

In [2]:
data_path = "../data/"

In [3]:
from langchain_community.document_loaders import PyPDFLoader,DirectoryLoader

def load_pdf_files(folder_path):
    loader = DirectoryLoader(
        path=folder_path,
        glob="**/*.pdf",
        loader_cls=PyPDFLoader,
    )
    
    documents = loader.load()
    return documents 

In [4]:
extracted_data = load_pdf_files(data_path)

In [5]:
extracted_data[:5]

[Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': '..\\data\\Medical_book.pdf', 'total_pages': 637, 'page': 0, 'page_label': '1'}, page_content=''),
 Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': '..\\data\\Medical_book.pdf', 'total_pages': 637, 'page': 1, 'page_label': '2'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': '..\\data\\Medical_book.pdf', 'total_pages': 637, 'page': 2, 'page_label': '3'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\

In [6]:
print(f"Loaded {len(extracted_data)} pages")

Loaded 637 pages


In [7]:
from typing import List
from langchain_core.documents import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    '''
    Given a list of document objs, this returns a new list of objs
    containing only src in metadata and original page_countent
    ''' 

    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source":src}
            )
        )
    return minimal_docs


In [8]:
minimal_docs = filter_to_minimal_docs(extracted_data)
minimal_docs[:5]

[Document(metadata={'source': '..\\data\\Medical_book.pdf'}, page_content=''),
 Document(metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1'),
 Document(metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow,Manager, Imaging and Multimedia\nContent\nRobyn V . Young,Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imagi

In [9]:
# Split documents into smaller chunks 
from langchain_text_splitters import RecursiveCharacterTextSplitter

def text_split(minimal_docs):
    # chunk_size means each chunk will contain upto 500 chars
    # chunk_overlap shares the last n chars with the next chunk
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=20)
    texts_chunk = text_splitter.split_documents(minimal_docs)
    return texts_chunk 

In [10]:
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks:{len(texts_chunk)}")

Number of chunks:5859


In [11]:
texts_chunk[:10]

[Document(metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B\n1'),
 Document(metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow,Manager, Imaging and Multimedia\nContent\nRobyn V . Young,Project Manager, Imaging and\nMultimedia Content\nDean Dauphinais, Senior Editor, Imaging and'),
 Document(metadata={'source': '..\\data\\Medical_book.pdf'}, page_con

In [12]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import torch

def download_embeddings():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs = {"device":torch.device("cuda" if torch.cuda.is_available() else "cpu") }
    )

    return embeddings

embeddings = download_embeddings()

In [13]:
vector = embeddings.embed_query("Hello world")
print(type(vector))
print("Length of vector:",len(vector))

<class 'list'>
Length of vector: 384


In [14]:
from langchain_chroma import Chroma

docsearch = Chroma.from_documents(
    documents=texts_chunk,
    embedding=embeddings,
    persist_directory="./chroma_db"  # saved on disk
)


In [15]:
docsearch = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_db"
)

print(docsearch)

<langchain_chroma.vectorstores.Chroma object at 0x000001BA62ADB590>


In [16]:
from langchain_core.documents import Document

dswith = Document(
    page_content="dswithbappy is a youtube channel that provides tutorials on various topics.",
    metadata={"source": "Youtube"}
)

docsearch.add_documents([dswith])


['e79bd510-3abe-4d01-8801-034f9d6ff5af']

In [17]:
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)


In [18]:
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs


[Document(id='d4adf0e1-69c6-476c-998d-c95f5b9e20e2', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='7ca3c44a-3f64-41b9-9483-d20a99a5200f', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='737cac5c-d8a4-4811-88f8-dab1699d6a57', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26')]

In [19]:
from transformers import AutoTokenizer,AutoModelForCausalLM,pipeline 
from langchain_huggingface import HuggingFacePipeline
from transformers import BitsAndBytesConfig
import torch

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    dtype=torch.float16,
    quantization_config=bnb_config
)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.2,
    do_sample=True,
    return_full_text=False
)

chatModel = HuggingFacePipeline(pipeline=pipe)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


In [20]:
from langchain_core.documents import Document

def format_docs(docs: list[Document]) -> str:
    return "\n\n".join(doc.page_content for doc in docs)


In [21]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    ("system",
     """
You are a medical assistant for question-answering tasks.
Use ONLY the provided context to answer the question.
If the answer is not in the context, say "I don't know".
Use at most three sentences and keep the answer concise.

Context:
{context}
     """
    ),
    ("human", "{question}")
])


In [22]:
from langchain_core.output_parsers import StrOutputParser

output_parser = StrOutputParser()


In [23]:
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | chatModel
    | output_parser
)


In [24]:
response = rag_chain.invoke("What is Acne?")


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
print(response)



Assistant: Acne is a common skin condition characterized by the appearance of pimples on the skin, particularly on the face, chest, and back. It is caused by the blockage of pores in the skin by oil, dead skin cells, and bacteria. This results in the formation of whiteheads, blackheads, or pus-filled pimples.
<class 'str'>


In [26]:
# clean_answer = response.split("Assistant:")[-1].strip()
# print(clean_answer)
