In [None]:
!pip install "unstructured[all-docs]"
!pip install libmagic-dev poppler-utils tesseract-ocr

In [None]:
# Import necessary modules
from dotenv import load_dotenv
import os

# Load environment variables from the .env file so that later cells can read
# them with os.getenv (COHERE_API_KEY and GROQ_API_KEY are looked up below).
load_dotenv()

# To access a value, pass the variable name explicitly, e.g.
#   os.getenv("COHERE_API_KEY")
# os.getenv() without a name raises TypeError.

# Avoid printing secret values: notebook outputs are saved with the file
# and are easily shared by accident.

In [None]:
# deps
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

In [None]:
# Path of the local PDF to load
pdf_path = 'transformers.pdf'

In [None]:
# Processing PDF
# Fail fast when no path is configured. Merely printing a message would let the
# preview below (and every later cell) either crash with NameError or silently
# reuse a stale `data` left over from a previous run.
if not pdf_path:
    raise ValueError("Upload a PDF")

loader = UnstructuredPDFLoader(file_path=pdf_path)
data = loader.load()

# An empty result would otherwise surface as an opaque IndexError on data[0].
if not data:
    raise ValueError(f"No content could be extracted from {pdf_path!r}")

# Viewing Processed pdf
data[0].page_content

## Vector Embeddings


In [None]:
# Imported in this cell because the notebook's embeddings/splitter import cell
# comes after this one; without it a fresh "Restart & Run All" stops here with
# NameError: name 'CohereEmbeddings' is not defined.
from langchain_cohere import CohereEmbeddings

# Read the Cohere API key from the environment
COHERE_API_KEY = os.getenv('COHERE_API_KEY')
# Now you can use the API key to initialize the CohereEmbeddings model
embedding_model = CohereEmbeddings(cohere_api_key = COHERE_API_KEY)

In [None]:
# vector embeddings, text splitter & vector store deps
from langchain_cohere import CohereEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [None]:
# Break the loaded PDF document(s) into overlapping chunks that will be embedded
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=8000,
    chunk_overlap=100,
)
chunks = text_splitter.split_documents(data)

In [None]:
# Build a Chroma vector store from the chunks, embedding them with the Cohere
# model. The embeddings object defined in this notebook is named
# `embedding_model`, so that is the name that must be passed here.
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="rag-pdf",
)

In [None]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [None]:
from langchain_groq import ChatGroq  # Groq chat-model integration

# Groq credentials are read from the environment
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Mixtral chat model served through Groq, with temperature 0
# NOTE(review): confirm "mixtral-8x7b-32768" is still offered by Groq
llm = ChatGroq(
    model_name="mixtral-8x7b-32768",
    groq_api_key=GROQ_API_KEY,
    temperature=0,
)

In [None]:
# Custom prompt for the multi-query retriever: for a given user question, ask
# the LLM for five alternative phrasings, one per line. {question} is the only
# template variable.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

In [None]:
# Custom retriever: wraps the vector store's retriever so the LLM first rewrites
# the user question with QUERY_PROMPT; the generated variants are used as search
# queries against the vector DB (see the MultiQueryRetriever docs for how the
# per-query results are combined).
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

# RAG prompt: {context} is filled with the retriever's output and {question}
# with the user's question when the chain is invoked. The last line of the
# template asks the model to flag answers that did not come from the PDF.
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
if not found generate generic answer also along with it mention "NOT FROM PDF"
"""

prompt = ChatPromptTemplate.from_template(template)

In [None]:
# Now lets chain everything
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string
            (the documents returned by the retriever).

    Returns:
        The page contents separated by blank lines; an empty string when
        nothing was retrieved.
    """
    return "\n\n".join(doc.page_content for doc in docs)


chain = (
    # Pipe the retrieved documents through _format_docs so the prompt's
    # {context} is plain text rather than the repr of a list of Document
    # objects (which adds metadata noise and escaped newlines).
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Ask for a question interactively and run it through the RAG chain.
question = input("Ask a question about the PDF: ").strip()
if not question:
    # A blank question would still trigger query generation, retrieval and an
    # LLM call, so stop before spending those requests.
    raise ValueError("Please enter a non-empty question")
chain.invoke(question)