In [50]:
import pymupdf  # PyMuPDF
import os

def get_pdf_text(pdf_docs):
    text = ""

    # Check if the input is a directory
    if os.path.isdir(pdf_docs):
        # Iterate over all PDF files in the directory
        for filename in os.listdir(pdf_docs):
            if filename.endswith(".pdf"):
                pdf_path = os.path.join(pdf_docs, filename)
                doc = pymupdf.open(pdf_path)
                for page in doc:
                    text += page.get_text()
    elif os.path.isfile(pdf_docs):
        # Handle a single PDF file
        doc = pymupdf.open(pdf_docs)
        for page in doc:
            text += page.get_text()
    else:
        raise ValueError(f"The path {pdf_docs} is neither a file nor a directory.")

    return text

In [51]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

In [52]:
chunks = text_splitter.split_text(get_pdf_text("pdf"))
print(len(chunks))

73


In [53]:
import google.generativeai as genai
from dotenv import load_dotenv
import os
load_dotenv()
# genai.configure(api_key=userdata.get("GOOGLE_API_KEY"))
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

In [54]:
from langchain_google_genai import (
    GoogleGenerativeAIEmbeddings,
    ChatGoogleGenerativeAI
)
from langchain_chroma import Chroma
from langchain_core.documents import Document

In [55]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
documents = [Document(page_content=chunk) for chunk in chunks]
# idx = [str(i) for i in range(1, len(documents)+1)]
vectors = Chroma(embedding_function=embeddings, persist_directory="./db")
# vectors.add_documents(documents=documents, ids=idx)
vectors.add_documents(documents=documents)

['7f4f19d4-9209-44fd-a7c0-090bf7b8a857',
 '551d6ecd-2acc-4946-91dc-efa7100dd67a',
 'cc84ac43-f217-4f1b-8fd7-fc6a0d03d7ec',
 '62281ece-7500-42c6-a127-2bf6d560e9ee',
 '9995a9b0-678c-42ab-89c2-85cc1199c0ba',
 'da9e6d22-eb62-40c0-a877-c638ef32ac5d',
 'a1357ea3-1184-4653-adeb-72063b7b3e9d',
 'bdf1df04-a54d-4dfb-bdf4-92e7b077ce4e',
 '6f8e023c-1230-4b60-8e42-0078d7dc4b52',
 'b8128093-42c4-426e-ba9a-a0004a59cff7',
 '7aceea91-3241-4d21-aae1-2c09071743c9',
 '669f840e-0756-4490-b89a-14c16f8d68b4',
 '8edda0a4-ed33-4aa9-8f69-4d8238bb2211',
 '4f1cf18c-af05-4a9e-983d-07824f2a3c73',
 'd424e9ea-f125-4162-bce6-4f0bce41729d',
 '81890504-d0f1-4ce6-99a6-ab63404bfc80',
 'cc2a66f7-0169-4bf1-9e1c-c164ceeba667',
 '481f9415-3376-4a59-a636-03a09adb9930',
 'ca7a867c-172b-4789-932a-6e2bf94fdf8e',
 'd6d3d28e-ebd1-4bf9-a8f2-0c4185d8aa15',
 '053079bd-7e74-4dce-ae11-498aa4f4a057',
 '3053f44c-dba4-4f3d-a6f2-922f7e702725',
 '5945e501-b57a-4862-9321-4c6f6307058c',
 '1c1ad0a4-819e-42da-8d4b-7e844e651642',
 '9fd6fcc4-e7b8-

In [22]:
retriever = vectors.as_retriever(
    # search_type="similarity_score_threshold",
    search_kwargs={"k": 3}
)

docs = retriever.invoke("tell me about Google Cloud’s AI Adoption Framework")
docs[0].page_content.split("\n")

['With the framework, you can assess your organization’s AI maturity and determine what ',
 'you’ll need to bridge the gap to where you’d like to be. While we touch on the Google Cloud ',
 'products, you can use this information however you would like: the framework is technology ',
 'agnostic. We’re here to offer further guidance, if that alignment dovetails with your vision. ',
 'We’ve worked hard to make AI accessible to all, not only ML researchers and engineers, but ',
 'to a vast array of customers across industries as well. And our ongoing work in tooling, ',
 'frameworks, datasets, and models is well documented in the open source community. AI and ',
 'ML are central to who we are.',
 'Whether or not we accompany you on the journey, however, our framework can help you find ',
 'your way, from your initial changes all the way to becoming fully AI-powered. ',
 '12',
 'Next steps',
 'Find out more',
 'To dive more deeply into the details of Google Cloud’s AI Adoption Framework, se

In [56]:
from langchain_core.prompts import (
    ChatPromptTemplate,
    PromptTemplate
)
from langchain.retrievers.multi_query import MultiQueryRetriever

In [57]:
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

In [69]:
llm = GoogleGenerativeAI(model="gemini-1.5-flash-001")
type(llm)

langchain_google_genai.llms.GoogleGenerativeAI

In [64]:
retriever = MultiQueryRetriever.from_llm(
    vectors.as_retriever(
        search_kwargs={"k": 5}
    ),
    llm,
    prompt=QUERY_PROMPT
)

In [65]:
type(retriever)

langchain.retrievers.multi_query.MultiQueryRetriever

In [60]:
#RAG prompt

template = """Answer the questions based ONLY on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

In [61]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [62]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [63]:
chain.invoke(input())

 tell me about google's role in ai


'Google Cloud offers a range of products and services to help organizations accelerate their AI journey. They offer prebuilt APIs, Cloud AutoML services, AI Platform, and data management tools. Google Cloud also provides a framework for AI adoption, which helps organizations build an effective AI capability. They offer professional consulting services and workshops to help organizations discover, assess, deploy, and upskill in ML.  \n'

In [48]:
from langchain.chains.combine_documents import create_stuff_documents_chain
document_chain = create_stuff_documents_chain(llm,prompt)

In [66]:
type(vectors)

langchain_chroma.vectorstores.Chroma

In [67]:
def process_pdf(pdf_path: str) -> Chroma:
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
    )
    chunks = text_splitter.split_text(get_pdf_text(pdf_path))
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    documents = [Document(page_content=chunk) for chunk in chunks]
    vectors = Chroma(embedding_function=embeddings, persist_directory="./db")
    vectors.add_documents(documents=documents)
    return vectors


In [68]:
process_pdf("pdf")

<langchain_chroma.vectorstores.Chroma at 0x7f99e35cf0e0>