<a href="https://colab.research.google.com/github/kundana22bce8985/knowledge-based-search-engine/blob/main/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -------------------------------
# Install required packages
# -------------------------------
!pip install sentence-transformers faiss-cpu PyPDF2 transformers torch --quiet

# -------------------------------
# Import libraries
# -------------------------------
import PyPDF2
import os
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

# -------------------------------
# Path to your PDF(s)
# -------------------------------
# Make sure your PDFs are uploaded to /content/ or a specific folder
pdf_folder = "/content/"  # Directory holding the uploaded PDFs; change if needed
pdf_files = ["Resume.pdf"]  # File names inside pdf_folder to process


# -------------------------------
# Extract text from PDFs
# -------------------------------
# One string per PDF: the text of each page followed by a single space.
documents = []

for pdf_name in pdf_files:
    pdf_reader = PyPDF2.PdfReader(os.path.join(pdf_folder, pdf_name))
    extracted_pages = (page.extract_text() for page in pdf_reader.pages)
    # Pages whose extracted text is empty/None are skipped.
    documents.append(
        "".join(f"{page_text} " for page_text in extracted_pages if page_text)
    )

print(f"Loaded {len(documents)} documents")

# -------------------------------
# Chunk text
# -------------------------------
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: The text to split. Words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty
        or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would either crash range() or silently yield
        # nothing; a negative overlap would silently skip words.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the end of the text, any further chunk would
        # only repeat words already contained in this one.
        if start + chunk_size >= len(words):
            break
    return chunks

# Flatten the per-document chunk lists into one list, preserving document order.
all_chunks = [piece for document in documents for piece in chunk_text(document)]

print(f"Total chunks: {len(all_chunks)}")

# -------------------------------
# Generate embeddings
# -------------------------------
# Encode every chunk with the sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(all_chunks, show_progress_bar=True)

# -------------------------------
# Create FAISS index
# -------------------------------
# Flat L2 index whose width is taken from the embedding matrix's second axis.
chunk_vectors = np.array(embeddings)
dimension = chunk_vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_vectors)
print(f"FAISS index has {index.ntotal} vectors")

# -------------------------------
# Define search and QA functions
# -------------------------------
# Pin the checkpoint and revision explicitly (the ones the library otherwise
# picks by default for this task) so results stay reproducible and the
# "No model was supplied" warning is not emitted.
qa_model = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
    revision="564e9b5",
)

def search(query, k=5):
    """Return the indexed text chunks closest to ``query``.

    The query is embedded with the module-level ``model`` and looked up in
    the module-level FAISS ``index``; hits are mapped back to ``all_chunks``.

    Args:
        query: The search string.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` chunk strings in the order FAISS returns them. Fewer are
        returned when the index holds fewer than ``k`` vectors; an empty list
        when nothing can be retrieved.
    """
    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with index -1, which Python would resolve to the LAST chunk and
    # duplicate it in the output.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_vec = model.encode([query])
    _, indices = index.search(np.array(query_vec), k)
    # Defensive filter in case any -1 placeholder is still present.
    return [all_chunks[i] for i in indices[0] if i >= 0]

def generate_answer(query):
    """Answer ``query`` from the chunks retrieved by ``search``.

    The retrieved chunks are joined with spaces into one context string that
    is passed, together with the question, to the module-level ``qa_model``.

    Args:
        query: The question to answer.

    Returns:
        The ``'answer'`` field of the QA pipeline result, or an empty string
        when no context could be retrieved.
    """
    context = " ".join(search(query))
    if not context.strip():
        # Nothing to read from; avoid calling the pipeline with no context.
        return ""

    # The question-answering pipeline limits answer length via
    # ``max_answer_len``; ``max_length`` is not one of its parameters, so the
    # intended cap of 200 never took effect under that name.
    result = qa_model(question=query, context=context, max_answer_len=200)
    return result['answer']

# -------------------------------
# Example query
# -------------------------------
# Run one sample question through the retrieval + QA flow and show the result.
query = "What are the projects mentioned?"
answer = generate_answer(query)
for label, value in (("🔍 Query:", query), ("🧠 Answer:", answer)):
    print(label, value)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hLoaded 1 documents
Total chunks: 1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


FAISS index has 1 vectors


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


🔍 Query: What are the projects mentioned?
🧠 Answer: Eclipse Projects Plant Disease Detection
