<a href="https://colab.research.google.com/github/kundana22bce8985/knowledge-based-search-engine/blob/main/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# -------------------------------
# Install required packages
# -------------------------------
!pip install sentence-transformers faiss-cpu PyPDF2 transformers torch --quiet

# -------------------------------
# Import libraries
# -------------------------------
import os
import PyPDF2
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# -------------------------------
# Path to your PDF(s)
# -------------------------------
# Make sure your PDFs are uploaded to /content/ or a specific folder
pdf_folder = "/content/"  # Change if needed
pdf_files = ["Resume.pdf"]  # Add more PDFs if needed

# -------------------------------
# Extract text from PDFs
# -------------------------------
documents = []

for file in pdf_files:
    path = os.path.join(pdf_folder, file)
    reader = PyPDF2.PdfReader(path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + " "
    documents.append(text)

print(f"Loaded {len(documents)} documents")

# -------------------------------
# Chunk text
# -------------------------------
def chunk_text(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

all_chunks = []
for doc in documents:
    all_chunks.extend(chunk_text(doc))

print(f"Total chunks: {len(all_chunks)}")

# -------------------------------
# Generate embeddings
# -------------------------------
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(all_chunks, show_progress_bar=True)

# -------------------------------
# Create FAISS index
# -------------------------------
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))
print(f"FAISS index has {index.ntotal} vectors")

# -------------------------------
# Define search and QA functions
# -------------------------------
qa_model = pipeline("question-answering")  # defaults to DistilBERT QA model

def search(query, k=5):
    query_vec = model.encode([query])
    distances, indices = index.search(np.array(query_vec), k)
    return [all_chunks[i] for i in indices[0]]

def generate_answer(query):
    context = " ".join(search(query))
    answer = qa_model(question=query, context=context, max_length=200)['answer']
    return answer

# -------------------------------
# Interactive plain input
# -------------------------------
while True:
    query = input("Enter your question (or 'exit' to quit): ")
    if query.lower() == "exit":
        break
    answer = generate_answer(query)
    print("Answer:", answer)
    print("-" * 50)


Loaded 1 documents
Total chunks: 1


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


FAISS index has 1 vectors


Device set to use cpu


Enter your question (or 'exit' to quit): What are Projects?
Answer: Eclipse Projects Plant Disease Detection
--------------------------------------------------
Enter your question (or 'exit' to quit): exit
