In [None]:
!pip install transformers --upgrade

In [None]:
!pip install pypdf2

In [None]:
!pip install PyPDF2 pymupdf nltk sentence-transformers scikit-learn faiss-cpu transformers langchain bitsandbytes


In [17]:
import os
import requests
import re
from PyPDF2 import PdfMerger
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import faiss
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available

# Sources for the corpus. Each entry is either a Wikipedia page title or a
# direct PDF URL: download_pdfs() fetches entries starting with "https://"
# verbatim and treats everything else as a title for the Wikipedia REST
# "page/pdf" endpoint.
# NOTE(review): the recorded run output shows a 404 for
# "President_and_Vice_President", so that entry never makes it into the
# combined PDF — confirm the intended article title.
wiki_pages = [
    "Elections_in_India",
    "2024_Indian_general_election",
    "2024_elections_in_India",
    "Lok_Sabha",
    "Rajya_Sabha",
    "State_Legislative_Assemblies",
    "President_and_Vice_President",
    "Election_Commission_of_India",
    "2019_Indian_general_election",
    "2014_Indian_general_election",
    "2009_Indian_general_election",
    "2004_Indian_general_election",
    "1999_Indian_general_election",
    "1998_Indian_general_election",
    "1996_Indian_general_election",
    "1991_Indian_general_election",
    "1989_Indian_general_election",
    "1984_Indian_general_election",
    "1980_Indian_general_election",
    "1977_Indian_general_election",
    "1971_Indian_general_election",
    "1967_Indian_general_election",
    "1962_Indian_general_election",
    "1957_Indian_general_election",
    "1951–52_Indian_general_election",
    "https://cdn.downtoearth.org.in/library/0.61706000_1558592806_first-general-elections-in-india,-vol.pdf"
]

# Directory to save individual PDFs (relative to the notebook's working
# directory). Created eagerly so the download step can write into it;
# exist_ok=True makes re-running this cell harmless.
pdf_dir = "pdfs"
os.makedirs(pdf_dir, exist_ok=True)

def safe_filename(page):
    """Turn a page title or URL into a filesystem-friendly ``.pdf`` name.

    Every character from the set ``\\ / * ? : " < > |`` is replaced by an
    underscore, and ``.pdf`` is always appended (even if the input already
    ends with that suffix).
    """
    forbidden_chars = r'[\\/*?:"<>|]'
    sanitized = re.sub(forbidden_chars, "_", page)
    return f"{sanitized}.pdf"

def download_pdfs(wiki_pages, timeout=30):
    """Download every source in ``wiki_pages`` that is not cached on disk yet.

    Args:
        wiki_pages: Iterable of strings. Entries starting with ``https://``
            are fetched as-is; anything else is treated as a Wikipedia page
            title and fetched through the REST ``page/pdf`` endpoint.
        timeout: Seconds to wait for the server before giving up on a single
            request. Without a timeout ``requests`` can block indefinitely on
            a stalled connection, which would hang the whole notebook.

    Files are written to the module-level ``pdf_dir`` using the name produced
    by ``safe_filename``. Request failures (HTTP errors, timeouts, connection
    problems) are reported on stdout and the source is skipped; nothing is
    raised for them.
    """
    for page in wiki_pages:
        pdf_path = os.path.join(pdf_dir, safe_filename(page))

        # Already cached from a previous run: nothing to do.
        if os.path.exists(pdf_path):
            print(f"[INFO] File {pdf_path} already exists.")
            continue

        print(f"[INFO] {pdf_path} doesn't exist, downloading...")
        url = page if page.startswith("https://") else f"https://en.wikipedia.org/api/rest_v1/page/pdf/{page}"
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"[ERROR] Failed to download the file {page}. Error: {e}")
            continue

        # The body is fully received before the file is opened, so a failed
        # request never leaves an empty or partial file behind.
        with open(pdf_path, "wb") as file:
            file.write(response.content)
        print(f"[INFO] The file has been downloaded and saved as {pdf_path}")

def combine_pdfs(wiki_pages, combined_pdf_path):
    """Merge the downloaded PDFs into a single file.

    Args:
        wiki_pages: Iterable of page titles / URLs; each is mapped to its
            cached file inside the module-level ``pdf_dir`` via
            ``safe_filename``. The merge order follows this iterable.
        combined_pdf_path: Destination path of the merged PDF (overwritten if
            it exists).

    Sources whose PDF is missing on disk are skipped with a warning. Errors
    raised while appending or writing propagate to the caller.
    """
    pdf_merger = PdfMerger()
    try:
        for page in wiki_pages:
            pdf_path = os.path.join(pdf_dir, safe_filename(page))
            if os.path.exists(pdf_path):
                pdf_merger.append(pdf_path)
            else:
                print(f"[WARN] {pdf_path} does not exist and will not be included in the combined PDF.")
        with open(combined_pdf_path, "wb") as output_file:
            pdf_merger.write(output_file)
    finally:
        # Always release the merger, even when one of the inputs cannot be
        # appended or the output cannot be written.
        pdf_merger.close()
    print(f"[INFO] All PDFs have been combined into {combined_pdf_path}")

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of ``pdf_path`` concatenated in page order.

    Args:
        pdf_path: Path of the PDF to read with PyMuPDF (``fitz``).

    Returns:
        A single string: the ``get_text()`` output of each page, joined
        without any separator.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(pdf_path) as pdf_document:
        # join() avoids rebuilding an ever-growing string on every page.
        return "".join(page.get_text() for page in pdf_document)

def split_text_into_chunks(text, chunk_size=256, chunk_overlap=10):
    """Break ``text`` into pieces with a ``RecursiveCharacterTextSplitter``.

    ``chunk_size`` and ``chunk_overlap`` are forwarded unchanged to the
    splitter. Returns a list holding the ``page_content`` of each document
    the splitter produces, in order.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = []
    for document in splitter.create_documents([text]):
        pieces.append(document.page_content)
    return pieces

def embed_chunks(chunks, model_name='all-MiniLM-L6-v2'):
    """Encode ``chunks`` with a freshly loaded Sentence Transformer.

    A new ``SentenceTransformer(model_name)`` is constructed on every call and
    whatever its ``encode`` method returns for ``chunks`` is handed back.
    """
    encoder = SentenceTransformer(model_name)
    embeddings = encoder.encode(chunks)
    return embeddings

def build_faiss_index(embeddings):
    """Create a flat L2 FAISS index and load ``embeddings`` into it.

    The index dimension is taken from ``embeddings.shape[1]``; the populated
    index is returned.
    """
    flat_index = faiss.IndexFlatL2(embeddings.shape[1])
    flat_index.add(embeddings)
    return flat_index

def retrieve_similar_documents(query, index, model, chunks, k=10):
    """Return the chunks closest to ``query`` together with their distances.

    Args:
        query: Query string; it is encoded with ``model.encode([query])``.
        index: Object whose ``search(embedding, k)`` returns ``(D, I)``, i.e.
            distances and chunk positions, one row per query.
        model: Encoder used for the query.
        chunks: Sequence of texts, addressed by the positions found in ``I``.
        k: Maximum number of neighbours to request.

    Returns:
        ``(similar_docs, distances)`` for the single query row. When the index
        holds fewer than ``k`` vectors, the padded placeholder results are
        dropped, so both sequences may be shorter than ``k``.
    """
    query_embedding = model.encode([query])  # Encode the query
    D, I = index.search(query_embedding, k)  # Search the index
    hit_ids = I[0]
    distances = D[0]

    # FAISS pads missing neighbours with the label -1. Indexing ``chunks``
    # with -1 would silently return the *last* chunk as a bogus hit, so only
    # keep positions that carry a real label.
    valid_positions = [pos for pos, chunk_id in enumerate(hit_ids) if chunk_id >= 0]
    if len(valid_positions) != len(hit_ids):
        hit_ids = [hit_ids[pos] for pos in valid_positions]
        distances = distances[valid_positions]

    similar_docs = [chunks[i] for i in hit_ids]
    return similar_docs, distances

def get_gpu_memory_info():
    """Report the total memory of CUDA device 0, rounded to whole GiB.

    Note that this is the device's total memory as reported by
    ``torch.cuda.get_device_properties(0)``, not the amount currently free.
    """
    device_props = torch.cuda.get_device_properties(0)
    bytes_per_gib = 2**30
    return round(device_props.total_memory / bytes_per_gib)

def configure_model(gpu_memory_gb):
    """Pick a Gemma checkpoint and quantization setting for the given GPU size.

    Args:
        gpu_memory_gb: GPU memory in GB (any real number).

    Returns:
        ``(use_quantization_config, model_id)``:

        * below 5.1 GB  -> 4-bit ``google/gemma-2b-it`` (with a warning that
          even this may not fit);
        * below 8.1 GB  -> 4-bit ``google/gemma-2b-it``;
        * below 19.0 GB -> unquantized ``google/gemma-2b-it``;
        * 19.0 GB and up -> unquantized ``google/gemma-7b-it``.
    """
    if gpu_memory_gb < 5.1:
        print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
        # Still return a usable configuration (the smallest model, quantized)
        # instead of failing on unassigned variables below.
        use_quantization_config = True
        model_id = "google/gemma-2b-it"
    elif gpu_memory_gb < 8.1:
        print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
        use_quantization_config = True
        model_id = "google/gemma-2b-it"
    elif gpu_memory_gb < 19.0:
        print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
        use_quantization_config = False
        model_id = "google/gemma-2b-it"
    else:
        # Plain ``else`` so that exactly 19 GB is covered too; a strict
        # ``> 19.0`` test left that value without any configuration.
        print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
        use_quantization_config = False
        model_id = "google/gemma-7b-it"
    print(f"use_quantization_config set to: {use_quantization_config}")
    print(f"model_id set to: {model_id}")
    return use_quantization_config, model_id

def setup_model(model_id, use_quantization_config):
    """Load the tokenizer and causal language model for ``model_id``.

    Args:
        model_id: Hugging Face model identifier to load.
        use_quantization_config: When true, load the model in 4-bit via
            ``BitsAndBytesConfig``; otherwise load it in float16 and move it
            to the ``"cuda"`` device.

    Returns:
        ``(tokenizer, llm_model)``.

    The Hugging Face access token is read from the ``HF_TOKEN`` environment
    variable. Set it outside the notebook (or via ``getpass``) before calling
    this function; if it is unset, ``None`` is passed on to ``from_pretrained``.
    """
    # Never write a token into the notebook: the previous hardcoded
    # placeholder also overwrote any real HF_TOKEN already in the environment.
    hf_token = os.getenv('HF_TOKEN')

    # ``token`` replaces the deprecated ``use_auth_token`` argument.
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id, token=hf_token)
    quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) if use_quantization_config else None
    # Flash Attention 2 is only selected when the package is available and the
    # GPU reports compute capability major version >= 8; otherwise use SDPA.
    attn_implementation = "flash_attention_2" if (is_flash_attn_2_available() and torch.cuda.get_device_capability(0)[0] >= 8) else "sdpa"
    print(f"Using attention implementation: {attn_implementation}")
    llm_model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_id,
        torch_dtype=torch.float16,
        quantization_config=quantization_config,
        low_cpu_mem_usage=False,
        attn_implementation=attn_implementation,
        token=hf_token
    )
    # The explicit move to the GPU is only done for the unquantized model.
    if not use_quantization_config:
        llm_model.to("cuda")
    return tokenizer, llm_model

def format_answer(similar_docs, answer_text):
    """Render the retrieved documents followed by the answer as one string.

    The result starts with a ``### Retrieved Documents:`` header, lists each
    document as ``Document <n>:`` (numbered from 1) followed by its text and
    a blank line, and ends with a ``### Answer:`` header and ``answer_text``.
    """
    sections = ["### Retrieved Documents:\n"]
    sections.extend(
        f"Document {number}:\n{doc}\n\n"
        for number, doc in enumerate(similar_docs, start=1)
    )
    sections.append("### Answer:\n")
    return "".join(sections) + answer_text

def ask(query, index, model, tokenizer, llm_model, chunks, temperature=0.7, max_new_tokens=200, format_answer_text=True, return_answer_only=False):
    """Retrieve context for ``query`` and generate an answer with the LLM.

    Args:
        query: The user question.
        index: Vector index passed to ``retrieve_similar_documents``.
        model: Encoder used to embed the query for retrieval.
        tokenizer: Tokenizer matching ``llm_model``.
        llm_model: Causal language model exposing ``generate`` and ``device``.
        chunks: Text chunks addressed by the index.
        temperature: Sampling temperature. Values greater than 0 enable
            sampling; ``0`` or ``None`` selects greedy decoding.
        max_new_tokens: Upper bound on the number of generated tokens.
        format_answer_text: When true, the result is wrapped by
            ``format_answer`` (retrieved documents followed by the answer).
        return_answer_only: When true, the decoded model output is returned
            without the retrieved-documents wrapper, regardless of
            ``format_answer_text``.

    Returns:
        A string. The decoded model output is the decoding of the full
        ``outputs[0]`` sequence returned by ``generate``.
    """
    # Retrieve similar documents
    similar_docs, _ = retrieve_similar_documents(query, index, model, chunks)

    # Format prompt (context items are concatenated without a separator)
    context_items = [f"Document {i + 1}: {doc}" for i, doc in enumerate(similar_docs)]
    prompt = f"Context:\n{''.join(context_items)}\n\nQuery: {query}\nAnswer:"

    # Put the inputs on whatever device the model lives on instead of
    # assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(llm_model.device)

    # ``temperature`` only has an effect when sampling is switched on, so
    # enable it explicitly; a non-positive temperature means greedy decoding.
    generation_kwargs = {"max_new_tokens": max_new_tokens}
    if temperature and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature)
    else:
        generation_kwargs["do_sample"] = False

    outputs = llm_model.generate(**inputs, **generation_kwargs)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Return the raw text directly rather than formatting it and splitting the
    # header off again: that round trip raised IndexError whenever
    # format_answer_text was False.
    if return_answer_only:
        return generated_text

    # Optionally format the answer text with context
    if format_answer_text:
        return format_answer(similar_docs, generated_text)

    return generated_text

# Main workflow
# 1. Fetch any source PDFs that are not cached yet, merge them into one file,
#    extract its text, split it into chunks and embed every chunk.
#    Each step consumes the previous step's result, so the order matters.
download_pdfs(wiki_pages)
combined_pdf_path = "combined_election_documents.pdf"
combine_pdfs(wiki_pages, combined_pdf_path)
text = extract_text_from_pdf(combined_pdf_path)
chunks = split_text_into_chunks(text)
chunk_embeddings = embed_chunks(chunks)

# Build FAISS index
index = build_faiss_index(chunk_embeddings)

# Configure model based on GPU memory
# (get_gpu_memory_info queries CUDA device 0, so a GPU is required here.)
gpu_memory_gb = get_gpu_memory_info()
use_quantization_config, model_id = configure_model(gpu_memory_gb)

# Set up the model
tokenizer, llm_model = setup_model(model_id, use_quantization_config)

# Initialize the SentenceTransformer model
# This instance is the encoder handed to ask() for embedding queries. It uses
# the same model name as embed_chunks()'s default, which keeps query vectors
# comparable with the indexed chunk vectors; note that embed_chunks() builds
# its own instance, so the model is loaded twice in this notebook.
sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')




[INFO] File pdfs/Elections_in_India.pdf already exists.
[INFO] File pdfs/2024_Indian_general_election.pdf already exists.
[INFO] File pdfs/2024_elections_in_India.pdf already exists.
[INFO] File pdfs/Lok_Sabha.pdf already exists.
[INFO] File pdfs/Rajya_Sabha.pdf already exists.
[INFO] File pdfs/State_Legislative_Assemblies.pdf already exists.
[INFO] pdfs/President_and_Vice_President.pdf doesn't exist, downloading...
[ERROR] Failed to download the file President_and_Vice_President. Error: 404 Client Error: Not Found for url: https://en.wikipedia.org/api/rest_v1/page/pdf/President_and_Vice_President
[INFO] File pdfs/Election_Commission_of_India.pdf already exists.
[INFO] File pdfs/2019_Indian_general_election.pdf already exists.
[INFO] File pdfs/2014_Indian_general_election.pdf already exists.
[INFO] File pdfs/2009_Indian_general_election.pdf already exists.
[INFO] File pdfs/2004_Indian_general_election.pdf already exists.
[INFO] File pdfs/1999_Indian_general_election.pdf already exists.

Batches:   0%|          | 0/229 [00:00<?, ?it/s]

GPU memory: 15 | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.
use_quantization_config set to: False
model_id set to: google/gemma-2b-it
Using attention implementation: sdpa


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
# Ask a query
# ask() is called with its default options, so the returned string is the
# retrieved-documents listing followed by the decoded model output.
query = "Give me a brief overview of 2014 Indian general elections"
print(f"Query: {query}")
answer = ask(query, index, sentence_transformer_model, tokenizer, llm_model, chunks)
print(f"Answer: {answer}")

Query: Give me a brief overview of 2014 Indian general elections


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer: ### Retrieved Documents:
Document 1:
Information About India Elections (http://www.indiaelections.co.in)
News Coverages of General Elections – Wionews (https://www.wionews.com/elections)
Archived websites

Document 2:
9. "Final Results 2014 General Elections" (https://web.archive.org/web/20141027170410/http://www.pib.gov.
in/elections2014/). Press Information Bureau, Government of India. Archived from the original (http://www.

Document 3:
pib.gov.in/elections2014/) on 27 October 2014.
10. "General Election to Loksabha Trend and Result 2014" (https://web.archive.org/web/20140519040751/htt
p://www.eciresults.ap.nic.in/PartyWiseResult.htm). Election Commission of India. 16 May 2014. Archived

Document 4:
Election Commission of India website (http://eci.nic.in/eci_main1/index.aspx)
List of Members Elected General Election of the 16th Lok Sabha, 2014 (http://eci.nic.in/eci_main1/current/
ListofElectedMembers_%20fromE-gazette.pdf)

Document 5:
Indian General Elections 2009 Web Archi

In [21]:
# Trim the leading context from `answer`: keep everything that follows the
# first "Query" marker (the question itself plus the generated answer).
# maxsplit=1 ensures a later occurrence of "Query" inside the answer does not
# cut the text short.
answer_parts = answer.split("Query", 1)
if len(answer_parts) > 1:
    # Text after the first "Query" marker, without surrounding whitespace
    answer_only = answer_parts[1].strip()
    # Re-attach the marker that split() consumed
    print("Query", answer_only)
else:
    # No "Query" marker present: fall back to the whole string
    answer_only = answer.strip()
    print(answer_only)

Query : Give me a brief overview of 2014 Indian general elections
Answer:

Sure, here's a brief overview of the 2014 Indian general elections:

- 543 of the 545 seats in the Lok Sabha were contested in the 2014 Indian general elections.
- The elections were held from April 12 to May 12, 2014.
- The results were declared on May 16, 2014.
- The Bharatiya Janata Party (BJP) emerged victorious, securing 303 seats in the Lok Sabha, the most by any party at the time.
- The Congress Party finished second with 288 seats, while the Indian National Congress (INC) secured 108 seats.
- The BJP's victory marked a significant shift in the political landscape of India, as it had won a majority of the Lok Sabha seats for the first time since 2004.
