In [1]:
import fitz  # PyMuPDF לקריאת PDF
import nltk
from sentence_transformers import SentenceTransformer, util

# Ensure the NLTK sentence-tokenizer data is available.
# "punkt" is the resource older NLTK releases load; newer releases
# (NLTK >= 3.8.2) load "punkt_tab" in nltk.sent_tokenize instead, so both are
# fetched to keep sentence splitting working on either version.
nltk.download("punkt")
nltk.download("punkt_tab")

# 🔹 Load the sentence-embedding model used for all encoding below
model = SentenceTransformer('all-MiniLM-L6-v2')

# 🔹 **Load a text document (here we use sample text)**
# The sample is four short customer-support paragraphs (billing, internet
# troubleshooting, subscription changes, refunds) separated by blank lines.
document_text = """If you have a billing issue, please check your latest statement in the 'My Account' section of our website.
If there is an unexpected charge, ensure that it is not a pending transaction or a recurring subscription.
If you still have concerns, you can contact customer support via email at support@example.com or call 123-456-7890.
Our team is available 24/7 to help resolve your issue.

If your internet is not working, first try restarting your modem and router.
Ensure that all cables are properly connected and that your service is active.
If the problem persists, check for any service outages in your area by visiting our website.
You can also contact our technical support team at 123-456-7891 for further troubleshooting.

To change your subscription plan, log into your account and navigate to the 'Plans & Pricing' section.
You can upgrade or downgrade your plan at any time without penalty.
If you need assistance, you can reach out to our customer service team through live chat or call us during business hours.

Our refund policy allows returns within 30 days of purchase.
The product must be in its original condition with all packaging intact.
Refunds are typically processed within 5-7 business days.
If you have questions about eligibility or the process, please contact our support team.
"""


# 🔹 **Function for splitting the text into chunks of variable size**
def split_text(text, chunk_size):
    """Group the sentences of *text* into space-joined chunks.

    Sentences come from ``nltk.sent_tokenize``. Each sentence is added to the
    pending chunk first, and the chunk is closed only once the summed
    sentence lengths exceed ``chunk_size``; a finished chunk is therefore
    always longer than ``chunk_size`` (only the trailing chunk may be
    shorter). The spaces inserted when joining are not counted.

    Args:
        text: The text to split.
        chunk_size: Character threshold that triggers closing a chunk.

    Returns:
        A list of chunk strings in document order.
    """
    chunks = []
    pending = []
    pending_chars = 0

    for sent in nltk.sent_tokenize(text):
        pending.append(sent)
        pending_chars += len(sent)
        # Flush only after the threshold has been crossed, so the sentence
        # that crosses it stays in the chunk being closed.
        if pending_chars > chunk_size:
            chunks.append(" ".join(pending))
            pending, pending_chars = [], 0

    # Whatever is left did not reach the threshold; keep it as a final chunk.
    if pending:
        chunks.append(" ".join(pending))

    return chunks


# 🔹 **Create two kinds of chunking**
# split_text closes a chunk once its accumulated sentence length exceeds
# chunk_size, so the two thresholds give differently sized chunks of the
# same document.
small_chunks = split_text(document_text, chunk_size=50)  # too small
large_chunks = split_text(document_text, chunk_size=300)  # appropriate size

# 🔹 **Queries for testing the search**
queries = [
    "How do I check unexpected charges on my bill?",
    "My internet is slow, what should I do?",
    "Can I downgrade my subscription?",
    "What is your refund process?"
]


# 🔹 **Function for searching for answers in the database**
def search_chunks(chunks, query):
    """Return the chunk that scores highest against *query*.

    All chunks and the query are encoded with the module-level ``model``,
    scored with ``util.cos_sim``, and the chunk at the index of the highest
    score is returned.

    Args:
        chunks: List of chunk strings to search.
        query: The query string.

    Returns:
        The element of ``chunks`` with the highest similarity score.
    """
    chunk_vectors = model.encode(chunks, convert_to_tensor=True)
    query_vector = model.encode([query], convert_to_tensor=True)
    # The query is encoded as a one-element batch, so row 0 of the result
    # holds its scores against every chunk.
    scores = util.cos_sim(query_vector, chunk_vectors)[0]
    return chunks[scores.argmax().item()]


# 🔹 **Run search tests on both kinds of chunks**
# For each query, print the best-matching chunk from each chunking so the two
# results can be compared side by side.
for query in queries:
    print(f"\n🔹 **Query:** {query}")

    # Search with the large chunks
    best_large_chunk = search_chunks(large_chunks, query)
    print(f"✅ **Best Answer (Large Chunks):** {best_large_chunk}\n")

    # Search with the chunks that are too small
    best_small_chunk = search_chunks(small_chunks, query)
    print(f"⚠ **Best Answer (Small Chunks - Lost Context):** {best_small_chunk}\n")
    print("-" * 100)



[nltk_data] Downloading package punkt to /Users/grembek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



🔹 **Query:** How do I check unexpected charges on my bill?
✅ **Best Answer (Large Chunks):** If you have a billing issue, please check your latest statement in the 'My Account' section of our website. If there is an unexpected charge, ensure that it is not a pending transaction or a recurring subscription. If you still have concerns, you can contact customer support via email at support@example.com or call 123-456-7890.

⚠ **Best Answer (Small Chunks - Lost Context):** If there is an unexpected charge, ensure that it is not a pending transaction or a recurring subscription.

----------------------------------------------------------------------------------------------------

🔹 **Query:** My internet is slow, what should I do?
✅ **Best Answer (Large Chunks):** Our team is available 24/7 to help resolve your issue. If your internet is not working, first try restarting your modem and router. Ensure that all cables are properly connected and that your service is active. If the problem per