In [1]:
import os
import PyPDF2
import docx
import openpyxl
from pathlib import Path
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

DOC_FOLDER = "Enter your folder path"
os.makedirs(DOC_FOLDER, exist_ok=True)


In [2]:
from IPython.display import display
import ipywidgets as widgets

upload_widget = widgets.FileUpload(accept='.pdf,.docx,.xlsx', multiple=True)
display(upload_widget)

for name, file_info in upload_widget.value:
    with open(os.path.join(DOC_FOLDER, name), "wb") as f:
        f.write(file_info['content'])

FileUpload(value=(), accept='.pdf,.docx,.xlsx', description='Upload', multiple=True)

In [3]:
def extract_text_pdf(path):
    text = ""
    with open(path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text

def extract_text_docx(path):
    doc = docx.Document(path)
    return "\n".join([para.text for para in doc.paragraphs])

def extract_text_xlsx(path):
    wb = openpyxl.load_workbook(path)
    text = ""
    for sheet in wb.sheetnames:
        ws = wb[sheet]
        for row in ws.iter_rows(values_only=True):
            row_text = " ".join([str(cell) for cell in row if cell is not None])
            text += row_text + "\n"
    return text

def extract_text(path):
    ext = path.suffix.lower()
    if ext == ".pdf":
        return extract_text_pdf(path)
    elif ext == ".docx":
        return extract_text_docx(path)
    elif ext == ".xlsx":
        return extract_text_xlsx(path)

    elif ext == ".txt":
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    else:
        return ""


In [4]:
all_docs_text = {}
for file in os.listdir(DOC_FOLDER):
    path = Path(DOC_FOLDER) / file
    if path.is_file():
        print("Extracting:", file)
        text = extract_text(path)
        all_docs_text[file] = text
print("Extraction complete.")

Extracting: Test.txt
Extracting: 4th.pdf
Extraction complete.


In [5]:
def create_chunks(text, max_chars=500):
    chunks = []
    start = 0
    while start < len(text):
        end = start + max_chars
        chunks.append(text[start:end])
        start = end
    return chunks

all_chunks = []
metadata = []

for filename, text in all_docs_text.items():
    chunks = create_chunks(text)
    for i, chunk in enumerate(chunks):
        all_chunks.append(chunk)
        metadata.append({
            "content": chunk,
            "source": filename,
            "chunk_id": i
        })

In [6]:
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embed_model.encode(all_chunks, convert_to_numpy=True).astype('float32')
dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings)

# Save index & metadata
faiss.write_index(faiss_index, "faiss_index.bin")
with open("metadata.json", "w") as f:
    json.dump(metadata, f)


In [7]:
faiss_index = faiss.read_index("faiss_index.bin")
with open("metadata.json", "r") as f:
    metadata = json.load(f)


In [8]:
def get_embedding(text):
    return embed_model.encode([text])[0].astype('float32')

def retrieve_documents(query, top_k=3):
    query_vec = np.array([get_embedding(query)])
    distances, indices = faiss_index.search(query_vec, top_k)
    results = []
    for idx in indices[0]:
        if idx < len(metadata):
            results.append(metadata[idx])
    return results

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map="cpu"
)

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="cpu"  # you can go for gpu
)

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cpu


In [10]:
def generate_answer_local(query, docs, max_tokens=300):
    context = "\n\n".join([f"{d['content']} (Source: {d['source']})" for d in docs])
    prompt = f"Answer the query based on the context below:\n\n{context}\n\nQuery: {query}\nAnswer:"
    output = generator(prompt, max_new_tokens=max_tokens, do_sample=True)
    return output[0]['generated_text']


In [11]:
def answer_query(query, top_k=3):
    docs = retrieve_documents(query, top_k)
    answer = generate_answer_local(query, docs)
    return answer, docs


In [13]:
query = input("Enter the query: ")
answer, docs = answer_query(query)

print("Answer:\n", answer)
print("\nSources:")
for d in docs:
    print("-", d['source'])

Enter the query:  cat


Answer:
 Answer the query based on the context below:

notification is sent
with voice guidance whenever medicines are taken or
refilling happens.
• [14] Various functions, this device contains pills which
patients need to take. Monitoring dosage time for
patients/caregivers by send notification through LED,
alarm alerts. Here just LEDs and alarm are used for
reminding about prescription.
• A medicine box with preserving and reminding features
of medicines that reminds at the time of recording is
should be at open position.
• Devices with advanced f (Source: 4th.pdf)

d pin to
Ground. The analog pin of Arduino A0 is used to analog pin
of GSM module in order to interface it. Arduino uses AT
commands to communicate with the GSM module and send a
message to the programmed mobile phone number. The AT +
CMGS at command is used to send SMS.
Fig. 3. Photo of the developed medicine box
Authorized licensed use limited to: Dr. D. Y. Patil Institute of Technology Pimpri Pune. Downloaded on Februa