In [None]:
! pip install langchain unstructured openai Cython tiktoken

In [None]:
! pip install --upgrade langchain-astradb

In [None]:
! pip install langchain-openai datasets pypdf

In [None]:
! pip install unstructured-pytesseract tesseract-ocr

In [None]:
! pip install "unstructured[pptx]"

In [None]:
! pip install pdf2image pdfminer.six unstructured[pdf]

In [None]:
import os
from getpass import getpass
from datasets import (
    load_dataset,
)
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from google.colab import userdata
from langchain_astradb import AstraDBVectorStore
from langchain_community.document_loaders import DirectoryLoader

In [None]:
# Read the OpenAI key from Colab's user data and export it as an environment
# variable so code that looks up OPENAI_API_KEY in the environment can find it.
OPENAI_API_KEY = userdata.get("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
# Embedding model with the library's default settings; it is handed to the
# Astra DB vector store further down.
embedding = OpenAIEmbeddings()

Using Unstructured to load multiple PDFs

In [None]:
# Base directory from which the document folder path is built.
root_dir = "/content/"

In [None]:
# Folder holding the documents. os.path.join copes with root_dir ending in a
# slash, so the result has no doubled separator.
pdf_folder_path = os.path.join(root_dir, "docs")

In [None]:
# Display the folder contents to confirm the files are in place.
os.listdir(pdf_folder_path)

In [None]:
# One UnstructuredPDFLoader per PDF in the folder. Each loader needs the full
# path of its file, so the file name is joined to the folder inside
# os.path.join. Entries that are not PDFs (other document types, checkpoint
# folders) are skipped because this loader only handles PDFs.
loaders = [
    UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn))
    for fn in os.listdir(pdf_folder_path)
    if fn.lower().endswith(".pdf")
]

In [None]:
# Build a vector index over everything the loaders return. The embedding model
# is passed explicitly because recent LangChain releases no longer supply a
# default one.
index = VectorstoreIndexCreator(embedding=embedding).from_loaders(loaders)

In [None]:
# Ask the index a question. Recent LangChain releases require the answering
# model to be passed in explicitly.
index.query("What is the tokenization in RAG?", llm=ChatOpenAI())

In [None]:
# Same question, but the result also reports which sources were used. Recent
# LangChain releases require the answering model to be passed in explicitly.
index.query_with_sources("What is the tokenization in RAG?", llm=ChatOpenAI())

PyPDF loader with multiple PDFs

In [None]:
# Astra DB connection settings, passed to AstraDBVectorStore further down.
# NOTE(review): each value is just its own variable name, i.e. a placeholder;
# real values must be supplied before connecting. Consider reading them from a
# secret store instead of pasting them into the notebook.
ASTRA_DB_API_ENDPOINT = "ASTRA_DB_API_ENDPOINT"
ASTRA_DB_APPLICATION_TOKEN = "ASTRA_DB_APPLICATION_TOKEN"
ASTRA_DB_KEYSPACE = "ASTRA_DB_KEYSPACE"

In [None]:
# Names of the entries in the document folder (bare names, not full paths).
pdfs = os.listdir(pdf_folder_path)

In [None]:
# Loader for one specific PDF, used below to try out the splitter on a single file.
# NOTE(review): this path is under /content/data, whereas the folder listing
# comes from pdf_folder_path - confirm which location actually holds the PDFs.
data = PyPDFLoader("/content/data/MachineTranslationwithAttention.pdf")

In [None]:
# Text splitter configured with a chunk size of 512 and an overlap of 64
# between consecutive chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)

In [None]:
# Load the single PDF and split it; the result is only displayed, not stored.
data.load_and_split(text_splitter=splitter)

In [None]:
# One PyPDFLoader per PDF. `pdfs` holds bare file names listed from
# pdf_folder_path, so each full path is built from that same folder; entries
# that are not PDFs are skipped because PyPDFLoader cannot read them.
docs = []
for pdf in pdfs:
    if not pdf.lower().endswith(".pdf"):
        continue
    data = PyPDFLoader(os.path.join(pdf_folder_path, pdf))
    docs.append(data)

In [None]:
# `docs` is a plain list of loaders and has no load_and_split of its own, so
# every loader is loaded and split individually and the resulting chunks are
# gathered into one flat list.
docs_from_pdf = [
    chunk
    for pdf_loader in docs
    for chunk in pdf_loader.load_and_split(text_splitter=splitter)
]

In [None]:
# Vector store backed by the Astra DB collection "multidoc_vector"; texts are
# embedded with the shared embedding model. The credential keyword is `token`.
vstore = AstraDBVectorStore(
    embedding=embedding,
    collection_name="multidoc_vector",
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE,
)

In [None]:
# Write the PDF chunks to the vector store and report how many IDs came back.
print(f"Documents from PDF : {len(docs_from_pdf)}")
inserted_ids_from_pdf = vstore.add_documents(docs_from_pdf)
print(f"Inserted {len(inserted_ids_from_pdf)} documents")

In [None]:
# Retriever over the vector store, configured to return 3 results per query (k=3).
retriever = vstore.as_retriever(search_kwargs={"k":3})

In [None]:
# Prompt text for the retrieval chain. {context} receives the retrieved
# passages and {question} the user's question. The third and fourth sentences
# are written out in full so the model receives complete instructions.
prompt_template = """

    You are a philosopher that draws inspiration from great thinkers of the past
    to craft well-thought answers to user questions. Use the provided context as the basis
    for your answers and do not make up new reasoning paths - just mix-and-match what you are given.
    Your answers must be concise and to the point, and refrain from answering about other topics than philosophy.

    CONTEXT:
    {context}

    QUESTION: {question}

    YOUR_ANSWER:

"""

In [None]:
# Turn the template text into a chat prompt; its placeholders are {context}
# and {question}.
philo_prompt = ChatPromptTemplate.from_template(prompt_template)

In [None]:
# Chat model created with the library's default settings.
llm = ChatOpenAI()

# Retrieval chain: the incoming question goes to the retriever to fill
# "context" and is passed through unchanged as "question"; the filled prompt is
# sent to the model and the reply is parsed into a plain string.
chain = (
    {"context": retriever, "question":RunnablePassthrough()}
    | philo_prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the chain on a sample question; the output parser at the end of the chain
# makes the result a string.
chain.invoke("How does Russel elaborate on Peirce's idea of the security blanket?")

Directory loaders (chat with multiple documents)

In [None]:
# Delete the notebook checkpoint folder inside /content/docs - presumably so it
# is not picked up when that directory is loaded below.
! rm -rf "/content/docs/.ipynb_checkpoints"

In [None]:
# Refresh the apt package lists before installing system packages.
! sudo apt-get update

In [None]:
# Install the poppler command-line PDF tools.
# -y answers apt's confirmation prompt, which a notebook cell cannot answer
# interactively.
! sudo apt-get install -y poppler-utils

In [None]:
# Install the Tesseract OCR engine, its development libraries and the English
# and Latin-script language data.
# -y answers apt's confirmation prompt, which a notebook cell cannot answer
# interactively.
! sudo apt-get install -y libleptonica-dev tesseract-dev tesseract-ocr libtesseract-dev python3-pil tesseract-ocr-eng tesseract-ocr-script-latn

In [None]:
# Loader that reads the files found in /content/docs.
loader = DirectoryLoader("/content/docs")

In [None]:
# Splitter for the directory-loaded documents (chunk size 512, overlap 64 -
# the same settings as `splitter` above).
splitter_1 = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)

In [None]:
# Load every file in the directory and split it into chunks. The result is
# bound to `docs`, the name the following cells count and insert.
docs = loader.load_and_split(text_splitter=splitter_1)

In [None]:
# Sanity check: how many items `docs` holds before they are inserted.
len(docs)

In [None]:
# Write the contents of `docs` to the vector store and report how many IDs
# came back.
inserted_ids = vstore.add_documents(docs)
print(f"\nInserted {len(inserted_ids)} documents.")

In [None]:
# Prompt text for the roadmap questions. {context} receives the retrieved
# passages and {question} the user's question. The string opens with exactly
# three quotes so that no stray quote character ends up in the prompt.
prompt_template_1 = """

    You are an AI philosopher drawing insights from the roadmap of 'rag', 'llama3', and 'genai'.
    Craft thoughtful answers based on this roadmap, mixing and matching existing paths.
    Your responses should be concise and strictly related to the provided context.

    ROADMAP CONTEXT:
    {context}

    QUESTION: {question}

    YOUR ANSWER:

"""

In [None]:
# Turn the template text into a chat prompt. The name is rebound from the raw
# string to the prompt object, so re-running this cell on its own would hand
# from_template the prompt object rather than the text - re-run the cell that
# defines the string first.
prompt_template_1 = ChatPromptTemplate.from_template(prompt_template_1)

In [None]:
# Retrieval chain for the roadmap questions: the incoming question goes to the
# retriever to fill "context" and is passed through unchanged as "question".
# The prompt step is prompt_template_1, the chat prompt built in the previous
# cell; `prompt_template` is a plain string and cannot be piped into a chain.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt_template_1
    | llm
    | StrOutputParser()
)

In [None]:
# Run the chain on a sample question; the output parser at the end of the chain
# makes the result a string.
chain.invoke("Can you tell me the roadmap of Generative AI?")