In [2]:
pip install langchain


Note: you may need to restart the kernel to use updated packages.


In [None]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_community.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import MessagesPlaceholder  # Import missing dependency
import fitz  # PyMuPDF
import os
from dotenv import load_dotenv


load_dotenv()  # Load environment variables

# Langsmith Tracking
# os.getenv() reads from os.environ, so writing a value back under the same key
# is a no-op when the variable is set and raises TypeError (None is not a str)
# when it is missing. Enable tracing only when an API key is actually available
# and leave LANGCHAIN_PROJECT exactly as the environment provides it.
if os.getenv("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_TRACING_V2"] = "true"

# None when GROQ_API_KEY is not configured.
groq_api_key = os.getenv("GROQ_API_KEY")

# Function to extract text from PDFs
def extract_text_from_pdf(uploaded_files):
    """Extract and concatenate the text of every page of every uploaded PDF.

    Args:
        uploaded_files: Iterable of binary file-like objects; each must
            provide ``read()`` returning the raw bytes of one PDF.

    Returns:
        str: For each document, its page texts joined by newlines followed by
        one trailing newline; documents are concatenated in input order.
        An empty string when no files are given.
    """
    document_texts = []
    for uploaded_file in uploaded_files:
        # Open from the in-memory bytes. The context manager closes the
        # document once its pages have been read instead of leaking it.
        with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
            document_texts.append("\n".join(page.get_text() for page in doc) + "\n")
    # A single join avoids repeated string concatenation inside the loop.
    return "".join(document_texts)

# Function to process PDF files and answer queries
def process_pdfs_and_query(uploaded_files, query):
    """Answer a question about the uploaded PDFs with a history-aware RAG chain.

    The PDF text is extracted, split into chunks, embedded into a Chroma
    vector store, and queried through a retrieval chain backed by a Groq chat
    model. The vector store and chain are rebuilt on every call, and the chat
    history passed to the chain is always empty.

    Args:
        uploaded_files: Sequence of binary file-like PDF objects, forwarded to
            ``extract_text_from_pdf``.
        query: The user's question.

    Returns:
        The ``'answer'`` entry of the chain response, or ``None`` when
        ``uploaded_files`` is empty or ``None``.
    """
    if not uploaded_files:
        # Nothing to index; callers receive None for empty input.
        return None

    pdf_contents = extract_text_from_pdf(uploaded_files)

    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    split_texts = text_splitter.split_text(pdf_contents)

    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma.from_texts(split_texts, embeddings)

    retriever = db.as_retriever()
    model = ChatGroq(model="Gemma2-9b-It", groq_api_key=groq_api_key)

    # Contextualization prompt: rewrites the latest question so that it can be
    # understood without the chat history before it is used for retrieval.
    contextualize_q_system_prompt = (
        "Given a chat history and the latest user question "
        "which might reference context in the chat history, "
        "formulate a standalone question which can be understood "
        "without the chat history. Do NOT answer the question, "
        "just reformulate it if needed and otherwise return it as is."
    )
    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{input}"),
        ]
    )

    history_aware_retriever = create_history_aware_retriever(model, retriever, contextualize_q_prompt)

    # System prompt for answering questions. The retrieved chunks are injected
    # through the {context} variable, which create_stuff_documents_chain
    # requires the prompt to expose; placing it in the system message presents
    # the documents as reference material rather than as a chat turn.
    system_prompt = (
        "You are an AI assistant that helps summarize and answer questions from documents."
        " Use the following retrieved context to answer the question."
        "\n\n"
        "{context}"
    )

    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{input}"),
        ]
    )

    question_answer_chain = create_stuff_documents_chain(model, qa_prompt)
    rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

    chat_history = []
    response = rag_chain.invoke({"input": query, "chat_history": chat_history})

    return response['answer']

# Example usage
if __name__ == "__main__":
    # Demo run: ask a single question about a PDF in the working directory.
    query = "What is the main point of the document?"
    with open("Attention.pdf", "rb") as file:
        uploaded_files = [file]
        answer = process_pdfs_and_query(uploaded_files, query)
    print("Answer:", answer)



  


ImportError: cannot import name 'MessagesPlaceholder' from 'langchain.schema' (f:\CONVERSION QandA model with pdf\venv\lib\site-packages\langchain\schema\__init__.py)

In [15]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_community.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import MessagesPlaceholder  # Correct import
import fitz  # PyMuPDF
import os
from dotenv import load_dotenv

load_dotenv()  # Load environment variables

# Langsmith Tracking
# os.getenv() reads from os.environ, so writing a value back under the same key
# is a no-op when the variable is set and raises TypeError (None is not a str)
# when it is missing. Enable tracing only when an API key is actually available
# and leave LANGCHAIN_PROJECT exactly as the environment provides it.
if os.getenv("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_TRACING_V2"] = "true"

# None when GROQ_API_KEY is not configured.
groq_api_key = os.getenv("GROQ_API_KEY")

# Function to extract text from PDFs
def extract_text_from_pdf(uploaded_files):
    """Extract and concatenate the text of every page of every uploaded PDF.

    Args:
        uploaded_files: Iterable of binary file-like objects; each must
            provide ``read()`` returning the raw bytes of one PDF.

    Returns:
        str: For each document, its page texts joined by newlines followed by
        one trailing newline; documents are concatenated in input order.
        An empty string when no files are given.
    """
    document_texts = []
    for uploaded_file in uploaded_files:
        # Open from the in-memory bytes. The context manager closes the
        # document once its pages have been read instead of leaking it.
        with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
            document_texts.append("\n".join(page.get_text() for page in doc) + "\n")
    # A single join avoids repeated string concatenation inside the loop.
    return "".join(document_texts)

# Function to process PDF files and answer queries
def process_pdfs_and_query(uploaded_files, query):
    """Answer a question about the uploaded PDFs with a history-aware RAG chain.

    The PDF text is extracted, split into chunks, embedded into a Chroma
    vector store, and queried through a retrieval chain backed by a Groq chat
    model. The vector store and chain are rebuilt on every call, and the chat
    history passed to the chain is always empty.

    Args:
        uploaded_files: Sequence of binary file-like PDF objects, forwarded to
            ``extract_text_from_pdf``.
        query: The user's question.

    Returns:
        The ``'answer'`` entry of the chain response, or ``None`` when
        ``uploaded_files`` is empty or ``None``.
    """
    if not uploaded_files:
        # Nothing to index; callers receive None for empty input.
        return None

    pdf_contents = extract_text_from_pdf(uploaded_files)

    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    split_texts = text_splitter.split_text(pdf_contents)

    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma.from_texts(split_texts, embeddings)

    retriever = db.as_retriever()
    model = ChatGroq(model="Gemma2-9b-It", groq_api_key=groq_api_key)

    # Contextualization prompt: rewrites the latest question so that it can be
    # understood without the chat history before it is used for retrieval.
    contextualize_q_system_prompt = (
        "Given a chat history and the latest user question "
        "which might reference context in the chat history, "
        "formulate a standalone question which can be understood "
        "without the chat history. Do NOT answer the question, "
        "just reformulate it if needed and otherwise return it as is."
    )
    contextualize_q_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{input}"),
        ]
    )

    history_aware_retriever = create_history_aware_retriever(model, retriever, contextualize_q_prompt)

    # System prompt for answering questions. The retrieved chunks are injected
    # through the {context} variable, which create_stuff_documents_chain
    # requires the prompt to expose. Placing it in the system message presents
    # the documents as reference material; an "assistant" message after the
    # question would present them to the model as its own previous reply.
    system_prompt = (
        "You are an AI assistant that helps summarize and answer questions from documents."
        " Use the following retrieved context to answer the question."
        "\n\n"
        "{context}"
    )

    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{input}"),
        ]
    )

    question_answer_chain = create_stuff_documents_chain(model, qa_prompt)
    rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

    chat_history = []
    response = rag_chain.invoke({"input": query, "chat_history": chat_history})

    return response['answer']

# Example usage
if __name__ == "__main__":
    # Demo run: ask a single question about a PDF in the working directory.
    query = "What is the main point of the document?"
    with open("Attention.pdf", "rb") as file:
        uploaded_files = [file]
        answer = process_pdfs_and_query(uploaded_files, query)
    print("Answer:", answer)


Answer: 

This excerpt appears to be a list of references for a paper or document. Each reference includes:

* **Authors:** The names of the researchers who wrote the paper.
* **Title:** A brief description of the paper's topic.
* **Publication details:** Information about where the paper was published, including the journal name, volume, issue, pages, and year.

**Therefore, the main point of this document excerpt is to provide a list of cited sources for further reading.**




