In [1]:
# %pip installs into the environment of the running kernel; a plain !pip
# shell call may resolve to a different Python installation.
%pip install transformers --upgrade

Collecting transformers
  Downloading transformers-4.42.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.42.3-py3-none-any.whl (9.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.41.2
    Uninstalling transformers-4.41.2:
      Successfully uninstalled transformers-4.41.2
Successfully installed transformers-4.42.3


In [2]:
# %pip installs into the environment of the running kernel; a plain !pip
# shell call may resolve to a different Python installation.
%pip install pypdf2

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pypdf2
Successfully installed pypdf2-3.0.1


In [3]:
# Everything the retrieval + generation pipeline imports, installed with %pip
# so the packages land in the environment of the running kernel.
%pip install PyPDF2 pymupdf nltk sentence-transformers scikit-learn faiss-cpu transformers langchain bitsandbytes


Collecting pymupdf
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting langchain
  Downloading langchain-0.2.6-py3-none-any.whl.metadata (7.0 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting PyMuPDFb==1.24.6 (from pymupdf)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Collecting langchain-core<0.3.0,>=0.2.10 (from langchain)
  Downloading langchain_core-0.2.10-py3-none-any.whl.metadata (6.0 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17

In [9]:
import os
import requests
import re
from PyPDF2 import PdfMerger
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import faiss
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available

# Source documents: Wikipedia page titles, followed by one direct PDF URL.
wiki_pages = [
    "Elections_in_India",
    "2024_Indian_general_election",
    "2024_elections_in_India",
    "Lok_Sabha",
    "Rajya_Sabha",
    "State_Legislative_Assemblies",
    "President_and_Vice_President",
    "Election_Commission_of_India",
    # One article per general election, newest first.
    *(
        f"{election}_Indian_general_election"
        for election in (
            "2019", "2014", "2009", "2004", "1999", "1998",
            "1996", "1991", "1989", "1984", "1980", "1977",
            "1971", "1967", "1962", "1957", "1951–52",
        )
    ),
    "https://cdn.downtoearth.org.in/library/0.61706000_1558592806_first-general-elections-in-india,-vol.pdf",
]

# Folder holding the individually downloaded PDFs; created if it is missing.
pdf_dir = "pdfs"
os.makedirs(pdf_dir, exist_ok=True)

def safe_filename(page):
    """Turn a page title or URL into a ``.pdf`` file name.

    Each of the characters ``\\ / * ? : " < > |`` is replaced by an
    underscore, then ``.pdf`` is appended.
    """
    unsafe_to_underscore = str.maketrans({char: "_" for char in '\\/*?:"<>|'})
    sanitized = page.translate(unsafe_to_underscore)
    return f"{sanitized}.pdf"

def download_pdfs(wiki_pages, timeout=30):
    """Download every entry of ``wiki_pages`` into ``pdf_dir`` unless it is already there.

    Args:
        wiki_pages: Iterable of Wikipedia page titles and/or direct
            ``http://`` / ``https://`` URLs. Titles are requested from the
            Wikipedia REST ``page/pdf`` endpoint; URLs are requested as given.
        timeout: Seconds handed to ``requests.get``. Without a timeout a
            stalled server would block the notebook indefinitely.

    Returns:
        None. Progress and failures are reported with ``print``; a failed
        download is logged as ``[ERROR]`` and the loop moves on to the next
        entry.
    """
    for page in wiki_pages:
        pdf_path = os.path.join(pdf_dir, safe_filename(page))
        if os.path.exists(pdf_path):
            print(f"[INFO] File {pdf_path} already exists.")
            continue

        print(f"[INFO] {pdf_path} doesn't exist, downloading...")
        # Anything that is not already a URL is treated as a Wikipedia title.
        if page.startswith(("http://", "https://")):
            url = page
        else:
            url = f"https://en.wikipedia.org/api/rest_v1/page/pdf/{page}"
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            # The file is only created after the request succeeded, so a failed
            # download never leaves an empty PDF behind for later steps.
            with open(pdf_path, "wb") as file:
                file.write(response.content)
            print(f"[INFO] The file has been downloaded and saved as {pdf_path}")
        except requests.RequestException as e:
            print(f"[ERROR] Failed to download the file {page}. Error: {e}")

def combine_pdfs(wiki_pages, combined_pdf_path):
    """Merge the downloaded PDFs of ``wiki_pages`` into one file.

    Args:
        wiki_pages: Page titles / URLs; each is mapped to its file inside
            ``pdf_dir`` through ``safe_filename``.
        combined_pdf_path: Path of the merged PDF to write.

    Entries whose PDF is missing on disk are reported with a ``[WARN]`` line
    and left out. The merger is closed even when appending or writing raises,
    so the source files it holds open are always released.
    """
    pdf_merger = PdfMerger()
    try:
        for page in wiki_pages:
            pdf_path = os.path.join(pdf_dir, safe_filename(page))
            if os.path.exists(pdf_path):
                pdf_merger.append(pdf_path)
            else:
                print(f"[WARN] {pdf_path} does not exist and will not be included in the combined PDF.")
        with open(combined_pdf_path, "wb") as output_file:
            pdf_merger.write(output_file)
    finally:
        pdf_merger.close()
    print(f"[INFO] All PDFs have been combined into {combined_pdf_path}")

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of ``pdf_path`` concatenated in page order.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        str: The ``get_text()`` output of all pages joined without separator.
    """
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as pdf_document:
        # A single join avoids re-copying the growing string for every page.
        return "".join(page.get_text() for page in pdf_document)

def split_text_into_chunks(text, chunk_size=256, chunk_overlap=10):
    """Cut ``text`` into chunks with a ``RecursiveCharacterTextSplitter``.

    ``chunk_size`` and ``chunk_overlap`` are forwarded to the splitter
    unchanged. Returns the ``page_content`` of each produced document as a
    list.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = []
    for document in splitter.create_documents([text]):
        pieces.append(document.page_content)
    return pieces

def embed_chunks(chunks, model_name='all-MiniLM-L6-v2'):
    """Load the Sentence Transformer ``model_name`` and return its encoding of ``chunks``."""
    encoder = SentenceTransformer(model_name)
    embeddings = encoder.encode(chunks)
    return embeddings

def build_faiss_index(embeddings):
    """Create a ``faiss.IndexFlatL2`` sized to ``embeddings`` and add them to it.

    The index dimension is taken from ``embeddings.shape[1]``.
    """
    flat_l2_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_l2_index.add(embeddings)
    return flat_l2_index

def retrieve_similar_documents(query, index, model, chunks, k=10):
    """Return the chunks closest to ``query`` together with their distances.

    Args:
        query: Query string to embed.
        index: Object with a FAISS-style ``search(vectors, k)`` method that
            returns ``(distances, indices)`` as 2-D NumPy arrays.
        model: Encoder with an ``encode(list_of_str)`` method.
        chunks: Sequence of text chunks addressed by the positions the index
            returns.
        k: Maximum number of neighbours to request.

    Returns:
        tuple: ``(similar_docs, distances)``, where ``similar_docs`` is a list
        of chunks and ``distances`` the matching slice of the distance row.
        Both may hold fewer than ``k`` entries.
    """
    query_embedding = model.encode([query])  # Batch of one query
    D, I = index.search(query_embedding, k)
    # FAISS fills missing neighbours with index -1 (fewer than k vectors
    # stored). Indexing chunks with -1 would silently return the last chunk,
    # so those slots are dropped from both outputs.
    found = I[0] >= 0
    similar_docs = [chunks[i] for i in I[0][found]]
    distances = D[0][found]
    return similar_docs, distances

def get_gpu_memory_info():
    """Return the total memory of CUDA device 0, rounded to whole GiB.

    This is the device's ``total_memory`` property, not the amount that is
    currently free.
    """
    device_properties = torch.cuda.get_device_properties(0)
    bytes_per_gib = 2**30
    return round(device_properties.total_memory / bytes_per_gib)

def configure_model(gpu_memory_gb):
    """Pick a Gemma checkpoint and quantization setting for the given GPU memory.

    Args:
        gpu_memory_gb: GPU memory in GB.

    Returns:
        tuple: ``(use_quantization_config, model_id)``.

        * below 5.1 GB: Gemma 2B, quantized (with a warning that memory may
          still be too small)
        * below 8.1 GB: Gemma 2B, quantized
        * below 19 GB: Gemma 2B, not quantized
        * 19 GB and above: Gemma 7B, not quantized
    """
    if gpu_memory_gb < 5.1:
        print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
        # This branch used to leave both settings undefined and crashed on the
        # prints below; fall back to the smallest model with quantization.
        use_quantization_config = True
        model_id = "google/gemma-2b-it"
    elif gpu_memory_gb < 8.1:
        print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
        use_quantization_config = True
        model_id = "google/gemma-2b-it"
    elif gpu_memory_gb < 19.0:
        print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
        use_quantization_config = False
        model_id = "google/gemma-2b-it"
    else:
        # A plain else also covers exactly 19 GB, which matched neither
        # "< 19.0" nor "> 19.0" before.
        print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
        use_quantization_config = False
        model_id = "google/gemma-7b-it"
    print(f"use_quantization_config set to: {use_quantization_config}")
    print(f"model_id set to: {model_id}")
    return use_quantization_config, model_id

def setup_model(model_id, use_quantization_config):
    """Load the tokenizer and causal language model for ``model_id``.

    Args:
        model_id: Hugging Face model identifier to load.
        use_quantization_config: When true the model is loaded with a 4-bit
            ``BitsAndBytesConfig``; otherwise it is loaded in float16 and
            moved to ``"cuda"``.

    Returns:
        tuple: ``(tokenizer, llm_model)``.

    The Hugging Face access token is read from the ``HF_TOKEN`` environment
    variable and may be unset (``None`` is passed on in that case).
    """
    # SECURITY: the token must come from the environment, never from the
    # notebook source. A token that was ever committed here should be revoked.
    hf_token = os.getenv('HF_TOKEN')

    # `token` replaces the deprecated `use_auth_token` argument.
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id, token=hf_token)
    quantization_config = (
        BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
        if use_quantization_config
        else None
    )
    # Flash Attention 2 is only selected when the package is available and
    # the device reports compute capability 8 or higher.
    can_use_flash_attn = is_flash_attn_2_available() and torch.cuda.get_device_capability(0)[0] >= 8
    attn_implementation = "flash_attention_2" if can_use_flash_attn else "sdpa"
    print(f"Using attention implementation: {attn_implementation}")
    llm_model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_id,
        torch_dtype=torch.float16,
        quantization_config=quantization_config,
        low_cpu_mem_usage=False,
        attn_implementation=attn_implementation,
        token=hf_token
    )
    # Only the non-quantized model is moved explicitly.
    if not use_quantization_config:
        llm_model.to("cuda")
    return tokenizer, llm_model

def format_answer(similar_docs, answer_text):
    """Prefix ``answer_text`` with a numbered listing of ``similar_docs``.

    The result is a "### Retrieved Documents:" section, with one
    "Document N:" entry per document, followed by an "### Answer:" section.
    """
    document_sections = "".join(
        f"Document {position}:\n{doc}\n\n"
        for position, doc in enumerate(similar_docs, start=1)
    )
    return "### Retrieved Documents:\n" + document_sections + "### Answer:\n" + answer_text

def prompt_formatter(query, context_items):
    """
    Assemble the LLM prompt: user query, role description, instructions,
    the retrieved context items (one per line) and a trailing "Answer:" cue.
    """
    # Role description restricting the assistant to Indian elections
    introduction = """You are an AI model specializing in Indian elections. When answering questions, first retrieve relevant context from the provided database or source documents to ensure accurate and comprehensive responses. Only provide information and answer questions related to Indian elections, including historical election data, political parties, election processes, significant events, and notable figures. If a query is unrelated to Indian elections, politely decline to answer and remind the user to ask questions within the specified domain."""

    instructions = """When a query is received, retrieve relevant context from the provided database or source documents.
Use the retrieved context to generate a response.
If no relevant context is found, rely on your knowledge base but ensure the response is still within the scope of Indian elections.
If the query is unrelated to Indian elections, politely decline and remind the user to ask questions within the specified domain."""

    # One context item per line
    formatted_context = "\n".join(context_items)

    # Only the first line of each section carries the four-space indent;
    # the empty entries produce the blank separator lines.
    prompt_sections = [
        "",
        f"    User Query: {query}",
        "",
        f"    {introduction}",
        f"    {instructions}",
        "",
        f"    {formatted_context}",
        "",
        "    Answer:",
    ]
    return "\n".join(prompt_sections)

def ask(query, index, model, tokenizer, llm_model, chunks, temperature=0.7, max_new_tokens=200, format_answer_text=False, return_answer_only=True):
    """
    Retrieve the chunks most similar to ``query`` and let the LLM answer with them as context.

    Args:
        query: User question.
        index: Vector index passed to ``retrieve_similar_documents``.
        model: Embedding model used to encode the query.
        tokenizer: Tokenizer belonging to ``llm_model``.
        llm_model: Causal language model exposing ``generate`` and ``device``.
        chunks: Text chunks the index positions refer to.
        temperature: Sampling temperature. Values above 0 enable sampling;
            ``0`` or ``None`` selects greedy decoding.
        max_new_tokens: Upper bound on generated tokens.
        format_answer_text: When true, wrap the result with the retrieved
            documents via ``format_answer``.
        return_answer_only: When true (default) return only the newly
            generated text; otherwise return the decoded prompt plus answer.

    Returns:
        str: The answer text, optionally wrapped with the retrieved documents.
    """
    # Retrieve similar documents
    similar_docs, _ = retrieve_similar_documents(query, index, model, chunks)

    # Format prompt
    context_items = [f"Document {i + 1}: {doc}" for i, doc in enumerate(similar_docs)]
    prompt = prompt_formatter(query, context_items)

    # Put the inputs on whatever device the model lives on instead of
    # assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(llm_model.device)

    # `temperature` only has an effect when sampling is switched on.
    generation_kwargs = {"max_new_tokens": max_new_tokens}
    if temperature is not None and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature)
    else:
        generation_kwargs["do_sample"] = False
    outputs = llm_model.generate(**inputs, **generation_kwargs)

    if return_answer_only:
        # The output sequence starts with the prompt tokens; slicing them off
        # is more reliable than searching the decoded text for "Answer:\n",
        # which raised IndexError when the marker was missing and could match
        # text inside the retrieved documents.
        prompt_length = inputs["input_ids"].shape[1]
        answer_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    else:
        answer_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Optionally wrap the answer with the retrieved context
    if format_answer_text:
        answer_text = format_answer(similar_docs, answer_text)

    return answer_text

# Main workflow: fetch the source PDFs, merge them into one file, extract its
# text, split the text into chunks and embed the chunks.
download_pdfs(wiki_pages)
combined_pdf_path = "combined_election_documents.pdf"
combine_pdfs(wiki_pages, combined_pdf_path)
text = extract_text_from_pdf(combined_pdf_path)
chunks = split_text_into_chunks(text)
chunk_embeddings = embed_chunks(chunks)

# Build the FAISS index over the chunk embeddings
index = build_faiss_index(chunk_embeddings)

# Choose the model id and quantization flag from the GPU memory of device 0
gpu_memory_gb = get_gpu_memory_info()
use_quantization_config, model_id = configure_model(gpu_memory_gb)

# Load the tokenizer and the language model
tokenizer, llm_model = setup_model(model_id, use_quantization_config)

# Sentence Transformer used to encode queries. embed_chunks() loads its own
# instance of the same model name internally, so the model is loaded twice.
sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')




[INFO] File pdfs/Elections_in_India.pdf already exists.
[INFO] File pdfs/2024_Indian_general_election.pdf already exists.
[INFO] File pdfs/2024_elections_in_India.pdf already exists.
[INFO] File pdfs/Lok_Sabha.pdf already exists.
[INFO] File pdfs/Rajya_Sabha.pdf already exists.
[INFO] File pdfs/State_Legislative_Assemblies.pdf already exists.
[INFO] pdfs/President_and_Vice_President.pdf doesn't exist, downloading...
[ERROR] Failed to download the file President_and_Vice_President. Error: 404 Client Error: Not Found for url: https://en.wikipedia.org/api/rest_v1/page/pdf/President_and_Vice_President
[INFO] File pdfs/Election_Commission_of_India.pdf already exists.
[INFO] File pdfs/2019_Indian_general_election.pdf already exists.
[INFO] File pdfs/2014_Indian_general_election.pdf already exists.
[INFO] File pdfs/2009_Indian_general_election.pdf already exists.
[INFO] File pdfs/2004_Indian_general_election.pdf already exists.
[INFO] File pdfs/1999_Indian_general_election.pdf already exists.

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

GPU memory: 15 | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.
use_quantization_config set to: False
model_id set to: google/gemma-2b-it
Using attention implementation: sdpa


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
# Run one query through the retrieval + generation pipeline and show the result
query = "How to cook eggs?"
print(f"Query: {query}")
answer = ask(
    query=query,
    index=index,
    model=sentence_transformer_model,
    tokenizer=tokenizer,
    llm_model=llm_model,
    chunks=chunks,
)
print(f"Answer: {answer}")

Query: How to cook eggs?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer: 
**How to Cook Eggs in India**

**Step 1: Gather Your Ingredients**

* 2-3 eggs
* 1 tablespoon of oil
* Salt and pepper to taste

**Step 2: Prepare the Eggs**

* Separate the eggs into individual containers.
* Beat the eggs with a fork or whisk until they are smooth and creamy.

**Step 3: Heat the Oil**

* Heat a pan or griddle over medium heat.
* Add a few drops of oil to the pan.

**Step 4: Cook the Eggs**

* Pour the beaten eggs into the pan.
* Cook for 5-7 minutes, or until the eggs are set to your desired doneness.
* Season with salt and pepper to taste.

**Step 5: Serve**

* Serve the cooked eggs immediately.

**Tips:**

* For a firmer egg, cook for longer.
* For a softer egg,


In [6]:
# Split the model output at the "Query" marker and keep the part after it
answer_parts = answer.split("Query")
marker_found = len(answer_parts) > 1
if marker_found:
    # Text following the first "Query" marker, without surrounding whitespace
    answer_only = answer_parts[1].strip()
else:
    # No marker present: use the whole output. This branch used to read the
    # undefined name `answer_text` and raised NameError.
    answer_only = answer.strip()

# Re-add the "Query" prefix only when it was actually split off
if marker_found:
    print("Query", answer_only)
else:
    print(answer_only)

Query : Give me a brief overview of 2014 Indian general elections
Answer:

Sure, here's a brief overview of the 2014 Indian general elections:

- 543 of the 545 seats in the Lok Sabha were contested in the 2014 Indian general elections.
- The elections were held from April 12 to May 12, 2014.
- The results were declared on May 16, 2014.
- The Bharatiya Janata Party (BJP) emerged victorious, securing 303 seats in the Lok Sabha, the most by any party at the time.
- The Congress Party finished second with 288 seats, while the Indian National Congress (INC) secured 108 seats.
- The BJP's victory marked a significant shift in the political landscape of India, as it had won a majority of the Lok Sabha seats for the first time since 2004.
