# RAG Chatbot Project

### 1. Convert the PDFs to text

In [10]:
# pdf to  text converter
import pdfplumber

# PDFs to ingest; paths are relative to the current working directory.
pdf_files = [
    "content/admission_advertisement_25_batch.pdf",
    "content/ProspectusBatch_2514.05.2025.pdf"
]

# One record per page that yields text: {"text", "source", "page"}.
documents = []

for pdf_path in pdf_files:
    with pdfplumber.open(pdf_path) as pdf:
        # Page numbers are stored 1-based so they match what a reader sees.
        for page_number, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text()
            if not page_text:
                # Nothing extractable on this page (None or empty string): skip it.
                continue
            documents.append(
                dict(text=page_text, source=pdf_path, page=page_number)
            )


### 2. Text Chunking

In [11]:
def chunk_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks share ``overlap`` characters, so each window starts
    ``chunk_size - overlap`` characters after the previous one. The final
    chunk may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of substrings of ``text`` in order. Empty for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap that is not smaller than the chunk size would never
            advance the window and loop forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a window reaches the end of the text, stop: stepping back by
        # `overlap` would only emit a tail already contained in this chunk.
        if end >= len(text):
            break
        start += step

    return chunks


# Flatten every page into chunk records; each chunk carries the source file
# and page number of the page it was cut from.
chunks = []

for doc in documents:
    chunks.extend(
        {"text": piece, "source": doc["source"], "page": doc["page"]}
        for piece in chunk_text(doc["text"])
    )

### 3. Embeddings + FAISS

In [14]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

In [15]:
# Sentence-embedding model used to vectorise the chunks. The same model name is
# loaded again after the saved index is reloaded, and retrieve() encodes
# queries with `model`, so both sides must keep using the same model.
model = SentenceTransformer("all-MiniLM-L6-v2")

#### Create embeddings

In [16]:
# Raw chunk strings, in the same order as `chunks`.
texts = [entry["text"] for entry in chunks]

# Encode in batches of 32 and convert the result to a float32 array,
# the dtype FAISS indexes work with.
embeddings = np.array(
    model.encode(texts, batch_size=32, show_progress_bar=True)
).astype("float32")

Batches: 100%|██████████| 20/20 [00:08<00:00,  2.30it/s]


#### Build FAISS index

In [17]:
# Vector width is read from the second axis of the embedding matrix.
dim = embeddings.shape[1]
# Flat index that ranks neighbours by L2 distance.
index = faiss.IndexFlatL2(dim)
# Vectors are added in chunk order, so index position i corresponds to chunks[i].
index.add(embeddings)

#### Keep metadata aligned with vectors

In [18]:
# metadata[i] describes the chunk whose embedding sits at position i in the
# index (both were built from `chunks` in the same order). This binds the same
# list object rather than copying it.
metadata = chunks

### 4. Save the FAISS index and metadata

In [None]:
# Persist the vector index; the path is relative to the current working directory.
faiss.write_index(index, "faiss.index")

# Persist the chunk metadata alongside the index. The two files belong
# together: metadata entries are looked up by index position at query time.
import pickle
with open("metadata.pkl", "wb") as f:
    pickle.dump(metadata, f)

## Let's load the index and metadata

In [21]:
import faiss
import pickle
from sentence_transformers import SentenceTransformer
import numpy as np

# Reload the vector index saved earlier with faiss.write_index.
index = faiss.read_index("faiss.index")

# NOTE(review): pickle.load can execute arbitrary code embedded in the file.
# Only load a metadata.pkl that this notebook itself produced.
with open("metadata.pkl", "rb") as f:
    metadata = pickle.load(f)

# Same model name as the one used to embed the chunks; retrieve() encodes
# queries with this object.
model = SentenceTransformer("all-MiniLM-L6-v2")

## Retrieval function

In [None]:
def retrieve(query, k=5):
    """Return the metadata records of the chunks nearest to ``query``.

    The query is embedded with the module-level ``model`` and searched
    against the module-level FAISS ``index``; hits are mapped back to chunk
    records through the module-level ``metadata`` list.

    Args:
        query: The question or search text.
        k: Maximum number of neighbours to request from the index.

    Returns:
        A list of at most ``k`` metadata dicts, nearest first. It is shorter
        than ``k`` when the index holds fewer than ``k`` vectors.
    """
    q_embedding = model.encode([query]).astype("float32")
    # Distances are not needed here; only the neighbour positions are used.
    _, indices = index.search(q_embedding, k)

    # FAISS fills unused result slots with -1 when fewer than k neighbours
    # exist. Python would read metadata[-1] as "the last chunk" and silently
    # return an unrelated record, so those placeholder ids are dropped.
    return [metadata[idx] for idx in indices[0] if idx >= 0]

## Example test without LLM

In [23]:
# Smoke-test retrieval on its own, before any LLM is involved.
query = "What is the admission deadline?"
docs = retrieve(query)

for hit in docs:
    # Header line, first 200 characters of the chunk, then a separator.
    header = f'{hit["source"]} page {hit["page"]}'
    print(header, hit["text"][:200], "----", sep="\n")


content/ProspectusBatch_2514.05.2025.pdf page 127
RULES AND PROCEDURE FOR ADMISSION
126
----
content/ProspectusBatch_2514.05.2025.pdf page 17
2.02.2026 28.07.2026
Pre-Admission Test (26-Batch) on 27.05.2026 Start of a new session (26-Batch) on 07.09.2026
Summer Vaca on
Including Make-up
semester
16
----
content/ProspectusBatch_2514.05.2025.pdf page 3
 supersede the old
ones.
Enquiries concerning admissions should be addressed to:
The Registrar or Chairman Admission Commi ee
Quaid-e-Awam University of Engineering, Science and Technology, Nawabshah,
----
content/ProspectusBatch_2514.05.2025.pdf page 17
ACADEMIC CALENDAR
SSSEEEMMMEEESSSTTTEEERRR SSSYYYSSSTTTEEEMMM (((OOOBBBEEE SSSYYYSSSTTTEEEMMM)))
1st Semester 2nd Semester Winter Summer Vaca on
Batch & Semester First Year First Year with
Vaca on
(25
----
content/admission_advertisement_25_batch.pdf page 3
PRE-ADMISSION TEST
01. All eligible candidates shall be required to appear in Pre-Admission Test date
the computer-based Pre-Admission

# App: answer generation with an LLM

In [None]:
import getpass
import os

from langchain_google_genai import ChatGoogleGenerativeAI

# Never hardcode API keys in a notebook: they end up in version control and in
# every shared copy. The key is read from the environment, with an interactive
# prompt as fallback. The key that was previously embedded in this cell has
# been exposed and should be revoked and rotated.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

# temperature=0 keeps answers as deterministic as the model allows.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0
)

In [59]:
import getpass
import os

from langchain_groq import ChatGroq

# Never hardcode API keys in a notebook: they end up in version control and in
# every shared copy. The key is read from the environment, with an interactive
# prompt as fallback. The key that was previously embedded in this cell has
# been exposed and should be revoked and rotated.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

# Running this cell rebinds `llm`, replacing any client assigned to that name
# earlier in the notebook. temperature=0 keeps answers as deterministic as the
# model allows.
llm = ChatGroq(
    model="llama-3.3-70b-versatile",
    temperature=0
)

In [None]:
def rag_answer(query, k=5):
    """Answer ``query`` from retrieved document chunks only.

    Retrieves up to ``k`` chunks with ``retrieve``, formats them (with their
    source file and page number) into a context block, and asks the
    module-level ``llm`` to answer from that context.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve for the context.

    Returns:
        The ``content`` of the LLM's response.
    """
    docs = retrieve(query, k)

    # Each chunk is prefixed with its provenance so the model can cite it.
    context = "\n\n".join(
        f"Source: {d['source']} Page: {d['page']}\n{d['text']}" for d in docs
    )

    # The instructions must not contradict each other: telling the model to
    # use only the context and also to "try to find the answer" elsewhere
    # invites ungrounded output. The fallback is a fixed refusal instead.
    prompt = f"""
You are answering questions about QUEST (Quaid-e-Awam University of Engineering, Science and Technology, Nawabshah).
Use only the context below to answer the question.
If the answer is not present in the context, reply exactly: Not found in documents.

Context:
{context}

Question:
{query}
"""

    return llm.invoke(prompt).content

In [64]:
print(rag_answer("What are the eligibility criteria for Data Science?"))

duction of research projects. The university has a well-equipped computer lab with internet facility. The university has a well-equipped library with a collection of more than 50,000 books, journals, and other study materials. The university has a well-equipped gymnasium for the students. The university has a well-equipped auditorium for seminars, workshops, and other academic activities. The university has a well-equipped cafeteria for the students. The university has a well-equipped first aid center for the students. The university has a well-equipped transport system for the students. The university has a well-equipped sports complex for the students. The university has a well-equipped mosque for the students. The university has a well-equipped bank facility for the students. The university has a well-equipped shopping center for the students. The university has a well-equipped post office for the students. The university has a well-equipped telephone exchange for the students. The 

In [53]:
print(rag_answer("What is the admission deadline?"))

on for 26 Batch from 02.06.2026 to 28.06.2026
Pre-Admission Test (27-Batch) on 29.07.2026
Start of a new session (27-Batch) on 09.11.2026
Autumn Break for 26 Batch from 04.10.2026 to 11.10.2026
Winter Vacaon for 26 Batch from 27.12.2026 to 02.01.2027
Summer Vacaon for 27 Batch from 01.06.2027 to 27.06.2027

Source: content/ProspectusBatch_2514.05.2025.pdf Page: 127
RULES AND PROCEDURE FOR ADMISSION
127
4.2.2. Selection Criteria
The selection of candidates will be based on the Pre-Admission Test (PAT) 
scores. The test will assess the aptitude and knowledge of the candidates. 
The weightage of the test scores will be as follows:
- Pre-Admission Test (PAT): 100%
Merit list will be prepared based on the PAT scores. Candidates will be 
shortlisted according to their performance in the PAT.

Selection will be based solely on the Pre-Admission Test (PAT) scores. 
Candidates must meet the minimum eligibility criteria to be considered for 
admission.

Source: content/ProspectusBatch_2514.05.20