In [None]:
!pip install langchain_google_genai langchain-community huggingface_hub PyPDF2 langchain-huggingface faiss-cpu

Collecting langchain_google_genai
  Downloading langchain_google_genai-2.1.6-py3-none-any.whl.metadata (7.0 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.26-py3-none-any.whl.metadata (2.9 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.0-py3-none-any.whl.metadata (996 bytes)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting filetype<2.0.0,>=1.2.0 (from langchain_google_genai)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting google-ai-generativelanguage<0.7.0,>=0.6.18 (from langchain_google_genai)
  Downloading google_ai_generativelanguage-0.6.18-py3-none-any.whl.metadata (9.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4

In [2]:
from dotenv import load_dotenv
from PyPDF2 import PdfReader

from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings # r
from langchain.vectorstores import FAISS

from langchain_google_genai import ChatGoogleGenerativeAI
from google.colab import userdata

In [3]:
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

from huggingface_hub import login

import os

In [4]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = userdata.get('HUGGINGFACEHUB_API_TOKEN')
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

load_dotenv()
login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))

TimeoutException: Requesting secret HUGGINGFACEHUB_API_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
pdf_paths = [f"/content/{filename}" for filename in uploaded.keys()]
print(f"Selected PDFs: {pdf_paths}")

In [None]:
def get_pdf_text(pdf_docs):
  text = ""
  for pdf in pdf_docs:
    pdf_reader = PdfReader(pdf)
    for page in pdf_reader.pages:
      page_text = page.extract_text()
      if page_text:
        text += page_text
  return text

raw_text = get_pdf_text(pdf_paths)
print(f"Length of text: {len(raw_text)}")

In [None]:
def get_text_chunks(raw_text):
  text_splitter = CharacterTextSplitter(
      separator="\n",
      chunk_size=1000,
      chunk_overlap=200,
      length_function=len
  )

  chunks = text_splitter.split_text(raw_text)
  return chunks

text_chunks = get_text_chunks(raw_text)
print(f"Number of text chunks: {len(text_chunks)}")

In [None]:
def get_vectorstore(text_chunks):
  embeddings = HuggingFaceEmbeddings(
      model_name="hkunlp/instructor-xl",
      # model_kwargs={"device": "cuda"}
      model_kwargs={"device": "cpu"}
  )

  vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
  return vectorstore

get_vectorstore(text_chunks)


In [None]:
def get_conversation_chain(vectorstore):
  llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.7)

  return ConversationalRetrievalChain.from_llm(
      llm=llm,
      retriever=vectorstore.as_retriever(),
      memory=ConversationBufferMemory(memory_key="chat_history", return_messages=True)
  )

# conversation = get_conversation_chain(get_vectorstore(text_chunks))
chain = get_conversation_chain(get_vectorstore(text_chunks))

In [None]:
def chat_with_pdf(question):
  response = chain({"question": question})

  for i, msg in enumerate(response["chat_history"]):
    sender = "User" if i % 2 == 0 else "Bot"
    print(f"{sender}: {msg.content}\n")

In [None]:
chat_with_pdf("Give me a summary of the document.")

In [None]:
chat_with_pdf("How does diet help in managing diabetes?")

In [None]:
chat_with_pdf("How does medication help with diabetes?")