### Importing libraries

In [None]:
import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Set your Google Gemini API key.
# The key is read from the GOOGLE_API_KEY environment variable when it is set,
# so the real secret does not have to be stored in the notebook itself. The
# string literal is only a placeholder fallback; replace it with your actual
# key if you are not using the environment variable.
import os

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or "GOOGLE_API_KEY"
genai.configure(api_key=GOOGLE_API_KEY)

### Extract Text from PDF

In [None]:
def extract_text_from_pdf(pdf_path) -> str:
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        One string containing the text of each page, in page order, with a
        newline appended after every page. An empty string is returned when
        the document yields no pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with += on every page.
        return "".join(page.get_text("text") + "\n" for page in doc)

# Upload your PDF file manually in Jupyter and provide the file path.
# pdf_path and pdf_text are module-level names that the later cells reuse
# (chunking and the question-answering example).
pdf_path = "FILE.pdf"  # Change to your actual file path
pdf_text = extract_text_from_pdf(pdf_path)

# Quick sanity check: show only the start of the extracted text.
print("✅ PDF Text Extracted! Showing first 1000 characters:\n")
print(pdf_text[:1000])  # Display only first 1000 characters


### Split Extracted Text into Meaningful Chunks

In [None]:
def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Break ``text`` into overlapping pieces with a recursive character splitter.

    Args:
        text: The string to split.
        chunk_size: Size limit handed to the splitter; measured with ``len``.
        chunk_overlap: Overlap between neighbouring chunks handed to the splitter.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for ``text``.
    """
    # Separator candidates, listed from paragraph breaks down to the empty string.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n", "\n", " ", ""],
    }
    text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = text_splitter.split_text(text)
    return pieces

# Split the extracted PDF text into chunks using chunk_text's default
# chunk_size and chunk_overlap; `chunks` is reused when building the index.
chunks = chunk_text(pdf_text)
print(f"✅ Text Chunked! Total Chunks: {len(chunks)}")


### Generate Embeddings with Google Gemini and Store in FAISS for Retrieval

In [None]:
# Use Google Gemini for embeddings (the "models/embedding-001" model),
# authenticated with the same API key configured earlier.
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)

# Convert text chunks into embeddings and store them in a FAISS vector store
vector_store = FAISS.from_texts(chunks, embedding_model)

# Save FAISS index (optional) under the local name "faiss_index" so it can be
# reloaded later without re-embedding the chunks.
vector_store.save_local("faiss_index")
print("✅ FAISS Index Created & Saved!")


### Load FAISS Index and Set Up Conversational Memory

In [None]:
# Load the FAISS index stored under "faiss_index", using the same embedding model.
# NOTE: allow_dangerous_deserialization=True is an explicit opt-in, not a safety
# feature: it permits loading pickled data, which can execute arbitrary code.
# Only load index directories that you created yourself and trust.
faiss_index = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)

# Initialize conversational memory; history is kept under the "chat_history" key
# and returned as message objects (return_messages=True).
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Create conversational retrieval chain: Gemini LLM + FAISS retriever + memory.
# NOTE(review): qa_chain is not referenced by any other code visible in this
# file; retrieve_and_answer calls the LLM directly — confirm whether the chain
# is meant to be used.
qa_chain = ConversationalRetrievalChain.from_llm(
    GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY),
    retriever=faiss_index.as_retriever(),
    memory=memory
)


### Retrieve Relevant Information and Answer Questions with Google Gemini

In [None]:
def retrieve_and_answer(query, vector_store, full_text, *, k=3):
    """Answer ``query`` with Gemini, using retrieved chunks or the whole document.

    When the query contains one of the summary keywords (case-insensitive
    substring match), the entire ``full_text`` is used as context. Otherwise
    the ``k`` most similar chunks from ``vector_store`` are joined with
    newlines and used as context.

    Args:
        query: The user's question.
        vector_store: Object exposing ``similarity_search(query, k=...)`` whose
            results have a ``page_content`` attribute.
        full_text: Complete document text, used only for summary-style queries.
        k: Number of chunks to retrieve for non-summary queries. Defaults to 3.

    Returns:
        The value returned by ``llm.invoke(prompt)``.
    """
    summary_keywords = ("summary", "summarize", "brief", "overview")
    # Lower-case once rather than once per keyword.
    lowered_query = query.lower()

    if any(keyword in lowered_query for keyword in summary_keywords):
        # NOTE(review): the whole document is sent as context here; very long
        # PDFs may exceed the model's input limit — confirm against usage.
        context = full_text
    else:
        # Retrieve the top-k most relevant chunks and stitch them together.
        relevant_docs = vector_store.similarity_search(query, k=k)
        context = "\n".join(doc.page_content for doc in relevant_docs)

    # Generate response using Gemini
    llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GOOGLE_API_KEY)
    prompt = f"Based on the following document, answer the question:\n\n{context}\n\nQuestion: {query}"
    return llm.invoke(prompt)

# Example usage: read one question from the user, answer it against the
# loaded FAISS index (falling back to the full PDF text for summary-style
# questions), and print the result.
user_query = input("Ask a question: ")
answer = retrieve_and_answer(user_query, faiss_index, pdf_text)
print("\n💡 Answer:", answer)
