In [7]:
import langchain
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS, pinecone
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.storage import LocalFileStore
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
from tqdm.autonotebook import tqdm
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import CacheBackedEmbeddings

load_dotenv()

True

In [8]:
llm_gemini = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-latest", google_api_key=os.getenv("GOOGLE_API_KEY")
)

llm_gemini

ChatGoogleGenerativeAI(model='models/gemini-1.5-flash-latest', google_api_key=SecretStr('**********'), client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x72bf95f4ab90>, async_client=<google.ai.generativelanguage_v1beta.services.generative_service.async_client.GenerativeServiceAsyncClient object at 0x72bf95f65600>, default_metadata=())

In [3]:
underlying_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", )

store = LocalFileStore("./cache/")

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    underlying_embeddings, store, namespace=underlying_embeddings.model
)

In [9]:
pdf_pages = []
docs_path = "pdfs/PMB271"

for file in tqdm(os.listdir(docs_path)):
    file_path = os.path.join(docs_path, file)
    pdf_loader = PyPDFLoader(file_path, extract_images=True)
    pdf_pages.extend(pdf_loader.load_and_split())


100%|██████████| 14/14 [00:11<00:00,  1.25it/s]


In [None]:
import concurrent.futures
# from tqdm import tqdm


# def load_pdf(file_path):
#     pdf_loader = PyPDFLoader(file_path, extract_images=True)
#     return pdf_loader.load_and_split()


# pdf_pages = []
# docs_path = "pdfs/PMB271"

# with concurrent.futures.ThreadPoolExecutor() as executor:
#     futures = [
#         executor.submit(load_pdf, os.path.join(docs_path, file))
#         for file in os.listdir(docs_path)
#     ]
#     for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
#         pdf_pages.extend(future.result())

 36%|███▌      | 5/14 [05:36<10:06, 67.36s/it]


In [10]:
print(len(pdf_pages))

91


In [14]:
pdf_file = "pdfs/PMB271/PMB 271-A Brief History Of Microbiology.pdf"

pdf_loader = PyPDFLoader(pdf_file)
pages = pdf_loader.load_and_split()

pages[0]


Document(page_content="HISTORICAL DEVELOPMENT OF MICROBIOLOGY AND THE EFFECTS ON HEALTH  \nA Brief History of Microbiology . Microbiology has had a long, rich history, initially centered in the causes of \ninfectious diseases but now including practical applications of the science. Many individuals have made significant \ncontributions to the development of microbiology.  \nEarly history of microbiology.  Historians are unsure who made the first observations of microorganisms, but the \nmicroscope was available during the mid‐1600s, and an E nglish scientist named  Robert Hooke  made key \nobservations. He is reputed to have observed strands of fungi among the specimens of cells he viewed. In the 1670s \nand the decades thereafter, a Dutch merchant named  Anton van Leeuwenhoek  made careful observatio ns of \nmicroscopic organisms, which he called  animalcules.  Until his death in 1723, van Leeuwenhoek revealed the \nmicroscopic world to scientists of the day and is regarded as one of 

In [17]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

documents = text_splitter.split_documents(pdf_pages)

In [16]:

faiss_index_db = FAISS.from_documents(
    documents, underlying_embeddings
)

GoogleGenerativeAIError: Error embedding content: 400 * BatchEmbedContentsRequest.requests: at most 100 requests can be in one batch


In [None]:
retreiver = faiss_index_db.as_retriever()

# retreiver.invoke('Who is the dutch merchannt?')

In [None]:
prompt = ChatPromptTemplate.from_template(
"""Answer the question based only on the following context:

{context}

Question: {question}
"""
)

chain = (
    {"context": retreiver, "question": RunnablePassthrough()}
    | prompt
    | llm_gemini
    | StrOutputParser()
)

In [None]:
response = chain.invoke("Who are the top contributors in the field")

print(response)

The text highlights the following individuals as major contributors to the field of microbiology:

* **Robert Hooke:** Made key observations of microorganisms in the mid-1600s.
* **Anton van Leeuwenhoek:** Made careful observations of microscopic organisms (animalcules) in the 1670s, providing accurate descriptions of protozoa, fungi, and bacteria.
* **Francesco Redi:** Disputed the theory of spontaneous generation by showing that fly maggots do not arise from decaying meat if the meat is covered.
* **John Needham:** Advanced the theory of spontaneous generation.
* **Lazzaro Spallanzani:** Disputed spontaneous generation by showing that boiled broth would not give rise to microscopic forms of life.
* **Louis Pasteur:** Worked in the mid to late 1800s, disproved spontaneous generation and proposed the germ theory of disease.
* **Robert Koch:** Provided proof for the germ theory by cultivating anthrax bacteria and injecting them into mice, causing anthrax. He developed Koch's postulates,

In [None]:
from IPython.display import Markdown

Markdown(response)

The text highlights the following individuals as major contributors to the field of microbiology:

* **Robert Hooke:** Made key observations of microorganisms in the mid-1600s.
* **Anton van Leeuwenhoek:** Made careful observations of microscopic organisms (animalcules) in the 1670s, providing accurate descriptions of protozoa, fungi, and bacteria.
* **Francesco Redi:** Disputed the theory of spontaneous generation by showing that fly maggots do not arise from decaying meat if the meat is covered.
* **John Needham:** Advanced the theory of spontaneous generation.
* **Lazzaro Spallanzani:** Disputed spontaneous generation by showing that boiled broth would not give rise to microscopic forms of life.
* **Louis Pasteur:** Worked in the mid to late 1800s, disproved spontaneous generation and proposed the germ theory of disease.
* **Robert Koch:** Provided proof for the germ theory by cultivating anthrax bacteria and injecting them into mice, causing anthrax. He developed Koch's postulates, a set of principles for relating microorganisms to diseases. 
