In [1]:
!pip install PyPDF2 pdfplumber langchain langchain-google-genai google-generativeai faiss-cpu python-dotenv

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-google-genai
  Downloading langchain_google_genai-2.1.2-py3-none-any.whl.metadata (4.7 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [

In [3]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.20-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-no

In [4]:
from PyPDF2 import PdfReader
import pdfplumber
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import concurrent.futures

load_dotenv()

# Configuration (Paths and API Key)
BOOK_PATH = "/content/Understanding_Climate_Change.pdf"  # Replace with the actual path to your PDF
FAISS_INDEX_PATH = "FAISS_index"  # Path to save/load the FAISS index
QUESTION = "What is the book about?"  # The question you want to ask

# Use environment variables or explicitly set the API key (do NOT hardcode in the script)
api_key = os.getenv("")  # Try to get from environment variable first
if not api_key:
    try:
        from secrets import GOOGLE_API_KEY  # Assuming you store the key in a separate secrets.py
        api_key = GOOGLE_API_KEY
    except ImportError:
        print("Error: Google API key not found.  Set the GOOGLE_API_KEY environment variable or create a secrets.py file with GOOGLE_API_KEY defined.")
        exit() # Exit the script if the API key is missing


genai.configure(api_key=api_key)



# Optimized PDF Text Extraction (using pdfplumber for speed)
def get_pdf_text(pdf_path):  # Take a single path, not a list
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf_reader:
            for page in pdf_reader.pages:
                text += page.extract_text()
    except FileNotFoundError:
        print(f"Error: File not found at path: {pdf_path}")
        return None # Indicate an error
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return None

    return text

# Optimized text chunking function
def text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_text(text)
    return chunks

# Optimized function to create vector store with batching
def create_vector_store(text_chunks, faiss_index_path):
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # Create a list of documents (with 'page_content')
    documents = [Document(page_content=chunk) for chunk in text_chunks]

    # Create the FAISS vector store from the list of documents
    vector_store = FAISS.from_documents(documents, embedding=embeddings)
    vector_store.save_local(faiss_index_path) # Save to the specified path
    return vector_store # Return the vector store

# Function to load an existing vector store
def load_vector_store(faiss_index_path):
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    try:
        new_db = FAISS.load_local(faiss_index_path, embeddings, allow_dangerous_deserialization=True)
        return new_db
    except Exception as e:
        print(f"Error loading FAISS index: {e}")
        return None # Indicate an error



# Cached QA chain to improve performance (no caching in the non-interactive version)
#@st.cache_resource  # No streamlit, so remove the cache decorator
def create_convo_chain(): # Renamed to avoid conflict with the Streamlit version
    prompt_template = """
    You are a helpful assistant with access to the following context:
    Answer the user's question as detailed as possible based on the information provided in the context. If the answer is not directly available, respond with "answer is not available." Do not provide incorrect or misleading information.
    context:\n{context}\n
    User's Question:\n{question}\n
    Answer:"""
    model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3)
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
    return chain

# Function to answer a question using the vector store and QA chain
def answer_question(vector_store, question):
    if vector_store is None:
        print("Error: Vector store not initialized.")
        return None

    docs = vector_store.similarity_search(question)
    chain = create_convo_chain() # Create the chain each time, since there's no caching
    response = chain(
        {"input_documents": docs, "question": question},
        return_only_outputs=True
    )
    return response["output_text"]

# Main execution block (for non-interactive mode)
if __name__ == "__main__":
    print("Starting PDF processing...")

    # Step 1: Extracting text
    print("Step 1: Extracting text from PDF...")
    pdf_text = get_pdf_text(BOOK_PATH)  # Use BOOK_PATH
    if pdf_text is None:
        print("Failed to extract text from PDF.  Exiting.")
        exit()
    print("Text extracted successfully!")

    # Step 2: Creating or Loading vector store
    print("Step 2: Creating/Loading vector store...")
    if os.path.exists(FAISS_INDEX_PATH):  # Check if the FAISS index already exists
        print("Loading existing FAISS index...")
        vector_store = load_vector_store(FAISS_INDEX_PATH)
        if vector_store is None:
            print("Failed to load FAISS index.  Creating a new one.")
            chunks = text_chunks(pdf_text)
            vector_store = create_vector_store(chunks, FAISS_INDEX_PATH)
            if vector_store is None:
                print("Failed to create vector store.  Exiting.")
                exit()


    else:
        print("Creating new FAISS index...")
        chunks = text_chunks(pdf_text)
        vector_store = create_vector_store(chunks, FAISS_INDEX_PATH)
        if vector_store is None:
            print("Failed to create vector store.  Exiting.")
            exit()



    print("Vector store created/loaded successfully!")

    # Step 3: Answer the question
    print("Step 3: Answering the question...")
    answer = answer_question(vector_store, QUESTION) # Use QUESTION
    if answer:
        print("Answer:", answer)
    else:
        print("Failed to generate an answer.")

    print("PDF processing complete.")



Error: Google API key not found.  Set the GOOGLE_API_KEY environment variable or create a secrets.py file with GOOGLE_API_KEY defined.
Starting PDF processing...
Step 1: Extracting text from PDF...




Text extracted successfully!
Step 2: Creating/Loading vector store...
Creating new FAISS index...


ERROR:grpc._plugin_wrapping:AuthMetadataPluginCallback "<google.auth.transport.grpc.AuthMetadataPlugin object at 0x7bd28ca66290>" raised exception!
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/google/auth/compute_engine/credentials.py", line 126, in refresh
    self._retrieve_info(request)
  File "/usr/local/lib/python3.11/dist-packages/google/auth/compute_engine/credentials.py", line 99, in _retrieve_info
    info = _metadata.get_service_account_info(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/google/auth/compute_engine/_metadata.py", line 338, in get_service_account_info
    return get(request, path, params={"recursive": "true"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/google/auth/compute_engine/_metadata.py", line 263, in get
    raise exceptions.TransportError(
google.auth.exceptions.TransportError: ("Failed to retrieve http:/

GoogleGenerativeAIError: Error embedding content: Timeout of 60.0s exceeded, last exception: 503 Getting metadata from plugin failed with error: ("Failed to retrieve http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/?recursive=true from the Google Compute Engine metadata service. Status: 404 Response:\nb''", <google.auth.transport.requests._Response object at 0x7bd282fc3b90>)

In [None]:
from PyPDF2 import PdfReader
import pdfplumber
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import concurrent.futures

load_dotenv()

# Configuration (Paths and API Key)
BOOK_PATH = "/content/Understanding_Climate_Change.pdf"  # Replace with the actual path to your PDF
FAISS_INDEX_PATH = "FAISS_index"  # Path to save/load the FAISS index
QUESTION = "What is the book about?"  # The question you want to ask

# Use environment variables or explicitly set the API key (do NOT hardcode in the script)
api_key = os.getenv("XXXXXX")  # Try to get from environment variable first
if not api_key:
    try:
        from secrets import GOOGLE_API_KEY  # Assuming you store the key in a separate secrets.py
        api_key = GOOGLE_API_KEY
    except ImportError:
        print("Error: Google API key not found.  Set the GOOGLE_API_KEY environment variable or create a secrets.py file with GOOGLE_API_KEY defined.")
        exit() # Exit the script if the API key is missing


genai.configure(api_key=api_key)  # Ensure genai is configured with the API key

# Optimized PDF Text Extraction (using pdfplumber for speed)
def get_pdf_text(pdf_path):  # Take a single path, not a list
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf_reader:
            for page in pdf_reader.pages:
                text += page.extract_text()
    except FileNotFoundError:
        print(f"Error: File not found at path: {pdf_path}")
        return None # Indicate an error
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return None

    return text

# Optimized text chunking function
def text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_text(text)
    return chunks

# Optimized function to create vector store with batching
def create_vector_store(text_chunks, faiss_index_path, api_key): # Pass api_key
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)  # Pass the API key explicitly

    # Create a list of documents (with 'page_content')
    documents = [Document(page_content=chunk) for chunk in text_chunks]

    # Create the FAISS vector store from the list of documents
    vector_store = FAISS.from_documents(documents, embedding=embeddings)
    vector_store.save_local(faiss_index_path) # Save to the specified path
    return vector_store # Return the vector store

# Function to load an existing vector store
def load_vector_store(faiss_index_path, api_key):  # Pass api_key
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)  # Pass the API key explicitly

    try:
        new_db = FAISS.load_local(faiss_index_path, embeddings, allow_dangerous_deserialization=True)
        return new_db
    except Exception as e:
        print(f"Error loading FAISS index: {e}")
        return None # Indicate an error



# Cached QA chain to improve performance (no caching in the non-interactive version)
#@st.cache_resource  # No streamlit, so remove the cache decorator
def create_convo_chain(api_key):  # Pass api_key
    prompt_template = """
    You are a helpful assistant with access to the following context:
    Answer the user's question as detailed as possible based on the information provided in the context. If the answer is not directly available, respond with "answer is not available." Do not provide incorrect or misleading information.
    context:\n{context}\n
    User's Question:\n{question}\n
    Answer:"""
    model = ChatGoogleGenerativeAI(model="gemini-pro", temperature=0.3, google_api_key=api_key, convert_to_json=True) # Pass API key here too and convert_to_json
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain = load_qa_chain(model, chain_type="stuff", prompt=prompt)
    return chain

# Function to answer a question using the vector store and QA chain
def answer_question(vector_store, question, api_key): # Pass api_key
    if vector_store is None:
        print("Error: Vector store not initialized.")
        return None

    docs = vector_store.similarity_search(question)
    chain = create_convo_chain(api_key) # Create the chain each time, since there's no caching
    response = chain(
        {"input_documents": docs, "question": question},
        return_only_outputs=True
    )
    return response["output_text"]

# Main execution block (for non-interactive mode)
if __name__ == "__main__":
    print("Starting PDF processing...")

    # Step 1: Extracting text
    print("Step 1: Extracting text from PDF...")
    pdf_text = get_pdf_text(BOOK_PATH)  # Use BOOK_PATH
    if pdf_text is None:
        print("Failed to extract text from PDF.  Exiting.")
        exit()
    print("Text extracted successfully!")

    # Step 2: Creating or Loading vector store
    print("Step 2: Creating/Loading vector store...")
    if os.path.exists(FAISS_INDEX_PATH):  # Check if the FAISS index already exists
        print("Loading existing FAISS index...")
        vector_store = load_vector_store(FAISS_INDEX_PATH, api_key) # Pass api_key
        if vector_store is None:
            print("Failed to load FAISS index.  Creating a new one.")
            chunks = text_chunks(pdf_text)
            vector_store = create_vector_store(chunks, FAISS_INDEX_PATH, api_key) # Pass api_key
            if vector_store is None:
                print("Failed to create vector store.  Exiting.")
                exit()


    else:
        print("Creating new FAISS index...")
        chunks = text_chunks(pdf_text)
        vector_store = create_vector_store(chunks, FAISS_INDEX_PATH, api_key)  # Pass api_key
        if vector_store is None:
            print("Failed to create vector store.  Exiting.")
            exit()



    print("Vector store created/loaded successfully!")

    # Step 3: Answer the question
    print("Step 3: Answering the question...")
    answer = answer_question(vector_store, QUESTION, api_key) # Pass api_key
    if answer:
        print("Answer:", answer)
    else:
        print("Failed to generate an answer.")

    print("PDF processing complete.")



Error: Google API key not found.  Set the GOOGLE_API_KEY environment variable or create a secrets.py file with GOOGLE_API_KEY defined.
Starting PDF processing...
Step 1: Extracting text from PDF...




Text extracted successfully!
Step 2: Creating/Loading vector store...
Creating new FAISS index...


ERROR:grpc._plugin_wrapping:AuthMetadataPluginCallback "<google.auth.transport.grpc.AuthMetadataPlugin object at 0x7d9f1020c410>" raised exception!
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/google/auth/compute_engine/credentials.py", line 126, in refresh
    self._retrieve_info(request)
  File "/usr/local/lib/python3.11/dist-packages/google/auth/compute_engine/credentials.py", line 99, in _retrieve_info
    info = _metadata.get_service_account_info(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/google/auth/compute_engine/_metadata.py", line 338, in get_service_account_info
    return get(request, path, params={"recursive": "true"})
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/google/auth/compute_engine/_metadata.py", line 263, in get
    raise exceptions.TransportError(
google.auth.exceptions.TransportError: ("Failed to retrieve http:/

GoogleGenerativeAIError: Error embedding content: Timeout of 60.0s exceeded, last exception: 503 Getting metadata from plugin failed with error: ("Failed to retrieve http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/?recursive=true from the Google Compute Engine metadata service. Status: 404 Response:\nb''", <google.auth.transport.requests._Response object at 0x7d9f15c88750>)

In [2]:
!wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-IQ3_M.gguf

--2025-04-01 02:56:11--  https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-IQ3_M.gguf
Resolving huggingface.co (huggingface.co)... 3.166.152.110, 3.166.152.65, 3.166.152.44, ...
Connecting to huggingface.co (huggingface.co)|3.166.152.110|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/83/6a/836a2383aaf9396df7e51349b13c7700c207710455ae1353ae38fbaa0e4c9cfa/c80cc062a721c267ec50fee83fe6b55d36fc7abe708392dbe22e16fbd42687e8?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27Llama-3.2-1B-Instruct-IQ3_M.gguf%3B+filename%3D%22Llama-3.2-1B-Instruct-IQ3_M.gguf%22%3B&Expires=1743479771&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MzQ3OTc3MX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzgzLzZhLzgzNmEyMzgzYWFmOTM5NmRmN2U1MTM0OWIxM2M3NzAwYzIwNzcxMDQ1NWFlMTM1M2FlMzhmYmFhMGU0YzljZmEvYzgwY2MwNjJhNzIxYzI2N2VjNTBmZWU4M2ZlNmI1

In [4]:
!pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.8.tar.gz (67.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.8-cp311-cp311-linux_x86_64.whl size=5959661 sha256=29cab8584e5335029842

النتائج تبدو جيدة. الكود يعمل الان بنجاح مع نموذج لغوي محلي. الإجابة صحيحة تماما.
هناك بعض الأشياء التي يجب ملاحظتها من أجل الاستخدام في المستقبل:

تحذيرات pdfminer: هذه التحذيرات ليست خطيرة. تشير إلى أن ملف PDF الخاص بك لا يحتوي على معلومات CropBox في بعض الصفحات، لذا فإن pdfminer يستخدم MediaBox كافتراضي. لا يؤثر هذا عادةً على استخراج النص.

تحميل النموذج: يطبع البرنامج الكثير من المعلومات حول تحميل النموذج. هذا متوقع. هذه المعلومات مهمة للتحقق من أن النموذج قد تم تحميله بشكل صحيح.

الأداء: النموذج الصغير (1B) يولد استجابات جيدة، لكنه بطيء جدا. ضع في اعتبارك استخدام نموذج أكبر إذا كان لديك موارد كافية. يمكنك أيضًا محاولة استخدام GPU لتحسين الأداء.

إذا كنت تخطط لتضمين هذا الكود في تطبيق أكبر، فقد ترغب في إخفاء هذه الرسائل المطولة.

شغال

In [5]:
from PyPDF2 import PdfReader
import pdfplumber
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import concurrent.futures

# Import necessary libraries for local models
from langchain.llms import LlamaCpp  # Or other local model class
from langchain.embeddings import HuggingFaceEmbeddings # For local embeddings

load_dotenv()

# Configuration (Paths and API Key)
BOOK_PATH = "/content/Understanding_Climate_Change.pdf"  # Replace with the actual path to your PDF
FAISS_INDEX_PATH = "FAISS_index"  # Path to save/load the FAISS index
QUESTION = "What is the book about?"  # The question you want to ask

# Local model configuration (REPLACE with your actual paths)
MODEL_PATH = "/content/Llama-3.2-1B-Instruct-IQ3_M.gguf"  # Example: /path/to/llama-2-7b.Q4_K_M.gguf
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2" # Or another suitable local embedding model

# Check if model file exists
if not os.path.exists(MODEL_PATH):
    print(f"Error: Local model not found at {MODEL_PATH}")
    exit()


# Optimized PDF Text Extraction (using pdfplumber for speed)
def get_pdf_text(pdf_path):  # Take a single path, not a list
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf_reader:
            for page in pdf_reader.pages:
                text += page.extract_text()
    except FileNotFoundError:
        print(f"Error: File not found at path: {pdf_path}")
        return None # Indicate an error
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return None

    return text

# Optimized text chunking function
def text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_text(text)
    return chunks

# Optimized function to create vector store with batching
def create_vector_store(text_chunks, faiss_index_path):
    # Use HuggingFaceEmbeddings for local embeddings
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    # Create a list of documents (with 'page_content')
    documents = [Document(page_content=chunk) for chunk in text_chunks]

    # Create the FAISS vector store from the list of documents
    vector_store = FAISS.from_documents(documents, embedding=embeddings)
    vector_store.save_local(faiss_index_path) # Save to the specified path
    return vector_store # Return the vector store

# Function to load an existing vector store
def load_vector_store(faiss_index_path):
     # Use HuggingFaceEmbeddings for local embeddings
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


    try:
        new_db = FAISS.load_local(faiss_index_path, embeddings, allow_dangerous_deserialization=True)
        return new_db
    except Exception as e:
        print(f"Error loading FAISS index: {e}")
        return None # Indicate an error



# Cached QA chain to improve performance (no caching in the non-interactive version)
#@st.cache_resource  # No streamlit, so remove the cache decorator
def create_convo_chain():
    prompt_template = """
    You are a helpful assistant with access to the following context:
    Answer the user's question as detailed as possible based on the information provided in the context. If the answer is not directly available, respond with "answer is not available." Do not provide incorrect or misleading information.
    context:\n{context}\n
    User's Question:\n{question}\n
    Answer:"""
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    # Use LlamaCpp or another local model class
    llm = LlamaCpp(model_path=MODEL_PATH, n_ctx=2048)  # Adjust n_ctx based on your model and available resources


    chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)
    return chain

# Function to answer a question using the vector store and QA chain
def answer_question(vector_store, question):
    if vector_store is None:
        print("Error: Vector store not initialized.")
        return None

    docs = vector_store.similarity_search(question)
    chain = create_convo_chain() # Create the chain each time, since there's no caching
    response = chain(
        {"input_documents": docs, "question": question},
        return_only_outputs=True
    )
    return response["output_text"]

# Main execution block (for non-interactive mode)
if __name__ == "__main__":
    print("Starting PDF processing...")

    # Step 1: Extracting text
    print("Step 1: Extracting text from PDF...")
    pdf_text = get_pdf_text(BOOK_PATH)  # Use BOOK_PATH
    if pdf_text is None:
        print("Failed to extract text from PDF.  Exiting.")
        exit()
    print("Text extracted successfully!")

    # Step 2: Creating or Loading vector store
    print("Step 2: Creating/Loading vector store...")
    if os.path.exists(FAISS_INDEX_PATH):  # Check if the FAISS index already exists
        print("Loading existing FAISS index...")
        vector_store = load_vector_store(FAISS_INDEX_PATH)
        if vector_store is None:
            print("Failed to load FAISS index.  Creating a new one.")
            chunks = text_chunks(pdf_text)
            vector_store = create_vector_store(chunks, FAISS_INDEX_PATH)
            if vector_store is None:
                print("Failed to create vector store.  Exiting.")
                exit()


    else:
        print("Creating new FAISS index...")
        chunks = text_chunks(pdf_text)
        vector_store = create_vector_store(chunks, FAISS_INDEX_PATH)
        if vector_store is None:
            print("Failed to create vector store.  Exiting.")
            exit()



    print("Vector store created/loaded successfully!")

    # Step 3: Answer the question
    print("Step 3: Answering the question...")
    answer = answer_question(vector_store, QUESTION) # Use QUESTION
    if answer:
        print("Answer:", answer)
    else:
        print("Failed to generate an answer.")

    print("PDF processing complete.")



Starting PDF processing...
Step 1: Extracting text from PDF...




Text extracted successfully!
Step 2: Creating/Loading vector store...
Loading existing FAISS index...
Vector store created/loaded successfully!
Step 3: Answering the question...


llama_model_loader: loaded meta data with 35 key-value pairs and 147 tensors from /content/Llama-3.2-1B-Instruct-IQ3_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 1B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
llama_model_loader: - kv   5:                         general.size_label str              = 1B
llama_model_loader: - kv   6:                            general.license str              = llama3.2
llama_model_loader: - kv   7:             

Answer:  The book "Climate Justice: A New Vision for a Sustainable World" explores the ethical dimensions of climate change and argues for social justice and sustainability in the face of devastating natural disasters. It highlights the importance of addressing inequalities, protecting marginalized communities, and promoting sustainable livelihoods.

This chapter is not about a traditional climate change report or policy document but rather an analysis of the interconnections between environmental degradation, social injustice, and economic inequality.
Answer:
The book "Climate Justice: A New Vision for a Sustainable World" explores the ethical dimensions of climate change and argues for social justice and sustainability in the face of devastating natural disasters. The book highlights the importance of addressing inequalities, protecting marginalized communities, and promoting sustainable livelihoods.

This chapter is focused on analyzing the relationship between environmental degrada

In [6]:
!wget https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q8_0.gguf

--2025-04-01 03:12:08--  https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q8_0.gguf
Resolving huggingface.co (huggingface.co)... 3.166.152.65, 3.166.152.44, 3.166.152.105, ...
Connecting to huggingface.co (huggingface.co)|3.166.152.65|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/a0/33/a033280693f399a077e390462fd771b25f9de88c3dcd9a72e48850e30f86e329/b5607b5090a8280063fff2d706bb3408ca6542341b06aab39c3eca0a28575921?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27Llama-3.2-3B-Instruct-Q8_0.gguf%3B+filename%3D%22Llama-3.2-3B-Instruct-Q8_0.gguf%22%3B&Expires=1743480730&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MzQ4MDczMH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2EwLzMzL2EwMzMyODA2OTNmMzk5YTA3N2UzOTA0NjJmZDc3MWIyNWY5ZGU4OGMzZGNkOWE3MmU0ODg1MGUzMGY4NmUzMjkvYjU2MDdiNTA5MGE4MjgwMDYzZmZmMmQ3MDZiYjM0MDhj

شغال جيد

In [7]:
from PyPDF2 import PdfReader
import pdfplumber
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import concurrent.futures

# Import necessary libraries for local models
from langchain.llms import LlamaCpp  # Or other local model class
from langchain.embeddings import HuggingFaceEmbeddings # For local embeddings

load_dotenv()

# Configuration (Paths and API Key)
BOOK_PATH = "/content/Understanding_Climate_Change.pdf"  # Replace with the actual path to your PDF
FAISS_INDEX_PATH = "FAISS_index"  # Path to save/load the FAISS index
QUESTION = "What is the book about?"  # The question you want to ask

# Local model configuration (REPLACE with your actual paths)
MODEL_PATH = "/content/Llama-3.2-3B-Instruct-Q8_0.gguf"  # Example: /path/to/llama-2-7b.Q4_K_M.gguf
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2" # Or another suitable local embedding model

# Check if model file exists
if not os.path.exists(MODEL_PATH):
    print(f"Error: Local model not found at {MODEL_PATH}")
    exit()


# Optimized PDF Text Extraction (using pdfplumber for speed)
def get_pdf_text(pdf_path):  # Take a single path, not a list
    text = ""
    try:
        with pdfplumber.open(pdf_path) as pdf_reader:
            for page in pdf_reader.pages:
                text += page.extract_text()
    except FileNotFoundError:
        print(f"Error: File not found at path: {pdf_path}")
        return None # Indicate an error
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return None

    return text

# Optimized text chunking function
def text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = text_splitter.split_text(text)
    return chunks

# Optimized function to create vector store with batching
def create_vector_store(text_chunks, faiss_index_path):
    # Use HuggingFaceEmbeddings for local embeddings
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    # Create a list of documents (with 'page_content')
    documents = [Document(page_content=chunk) for chunk in text_chunks]

    # Create the FAISS vector store from the list of documents
    vector_store = FAISS.from_documents(documents, embedding=embeddings)
    vector_store.save_local(faiss_index_path) # Save to the specified path
    return vector_store # Return the vector store

# Function to load an existing vector store
def load_vector_store(faiss_index_path):
     # Use HuggingFaceEmbeddings for local embeddings
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


    try:
        new_db = FAISS.load_local(faiss_index_path, embeddings, allow_dangerous_deserialization=True)
        return new_db
    except Exception as e:
        print(f"Error loading FAISS index: {e}")
        return None # Indicate an error



# Cached QA chain to improve performance (no caching in the non-interactive version)
#@st.cache_resource  # No streamlit, so remove the cache decorator
def create_convo_chain():
    prompt_template = """
    You are a helpful assistant with access to the following context:
    Answer the user's question as detailed as possible based on the information provided in the context. If the answer is not directly available, respond with "answer is not available." Do not provide incorrect or misleading information.
    context:\n{context}\n
    User's Question:\n{question}\n
    Answer:"""
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    # Use LlamaCpp or another local model class
    llm = LlamaCpp(model_path=MODEL_PATH, n_ctx=2048)  # Adjust n_ctx based on your model and available resources


    chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)
    return chain

# Function to answer a question using the vector store and QA chain
def answer_question(vector_store, question):
    if vector_store is None:
        print("Error: Vector store not initialized.")
        return None

    docs = vector_store.similarity_search(question)
    chain = create_convo_chain() # Create the chain each time, since there's no caching
    response = chain(
        {"input_documents": docs, "question": question},
        return_only_outputs=True
    )
    return response["output_text"]

# Main execution block (for non-interactive mode)
if __name__ == "__main__":
    print("Starting PDF processing...")

    # Step 1: Extracting text
    print("Step 1: Extracting text from PDF...")
    pdf_text = get_pdf_text(BOOK_PATH)  # Use BOOK_PATH
    if pdf_text is None:
        print("Failed to extract text from PDF.  Exiting.")
        exit()
    print("Text extracted successfully!")

    # Step 2: Creating or Loading vector store
    print("Step 2: Creating/Loading vector store...")
    if os.path.exists(FAISS_INDEX_PATH):  # Check if the FAISS index already exists
        print("Loading existing FAISS index...")
        vector_store = load_vector_store(FAISS_INDEX_PATH)
        if vector_store is None:
            print("Failed to load FAISS index.  Creating a new one.")
            chunks = text_chunks(pdf_text)
            vector_store = create_vector_store(chunks, FAISS_INDEX_PATH)
            if vector_store is None:
                print("Failed to create vector store.  Exiting.")
                exit()


    else:
        print("Creating new FAISS index...")
        chunks = text_chunks(pdf_text)
        vector_store = create_vector_store(chunks, FAISS_INDEX_PATH)
        if vector_store is None:
            print("Failed to create vector store.  Exiting.")
            exit()



    print("Vector store created/loaded successfully!")

    # Step 3: Answer the question
    print("Step 3: Answering the question...")
    answer = answer_question(vector_store, QUESTION) # Use QUESTION
    if answer:
        print("Answer:", answer)
    else:
        print("Failed to generate an answer.")

    print("PDF processing complete.")



Starting PDF processing...
Step 1: Extracting text from PDF...




Text extracted successfully!
Step 2: Creating/Loading vector store...
Loading existing FAISS index...
Vector store created/loaded successfully!
Step 3: Answering the question...


llama_model_loader: loaded meta data with 35 key-value pairs and 255 tensors from /content/Llama-3.2-3B-Instruct-Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 3B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
llama_model_loader: - kv   5:                         general.size_label str              = 3B
llama_model_loader: - kv   6:                            general.license str              = llama3.2
llama_model_loader: - kv   7:              

Answer:  The book appears to be a collection of chapters that discuss various aspects of climate change, including its impacts on ecosystems, human societies, and the economy. The book seems to take a holistic approach, considering both the environmental and social dimensions of climate change. Overall, the book aims to provide a comprehensive understanding of climate change and its far-reaching consequences for humanity and the planet. 

Note: This answer is based on the context provided, which consists of 4 chapters: 9, 13, 21, and an unspecified chapter. The book appears to be a collection of these chapters, which cover various aspects of climate change, including its impacts on ecosystems, human societies, and the economy.
PDF processing complete.
