In [None]:
import pymupdf  
import nltk
import torch
import spacy
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline, AutoTokenizer

# Sentence-embedding model used for chunk retrieval and entity sanity checks.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# spaCy pipeline and Hugging Face NER pipeline used for entity extraction.
nlp = spacy.load("en_core_web_sm")
# `aggregation_strategy="simple"` is the documented replacement for the
# deprecated `grouped_entities=True` flag; results still expose the
# `entity_group` and `word` keys read by extract_entities_combined.
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

# Sentiment model for the user's text; uses GPU 0 when CUDA is available, else CPU.
sentiment_pipeline = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment", device=0 if torch.cuda.is_available() else -1)

# Zero-shot classification model used to classify the intent of a question.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Candidate labels offered to the zero-shot classifier as possible intents.
INTENT_CATEGORIES = [
    "Clarification/Definition",
    "Opinions/Discussion",
    "Instructions",
    "Troubleshooting",
    "Facts",
]

def classify_intent_nlp(question):
    """Return the intent label for the question the user asked.

    The question is passed to the zero-shot `classifier` together with
    INTENT_CATEGORIES, and the first entry of the result's "labels" list
    is returned.
    """
    ranked_labels = classifier(question, INTENT_CATEGORIES)["labels"]
    return ranked_labels[0]

def load_pdf(file_path):
    """Read the text of a PDF file.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.

    Returns:
        The text of every page, in page order, each followed by a newline.
    """
    # The `with` block closes the document once extraction is done (or
    # fails); previously the handle was left open.
    with pymupdf.open(file_path) as doc:
        return "".join(page.get_text("text") + "\n" for page in doc)

def chunk_text(text, chunk_size=512):
    """Split `text` into chunks made of whole sentences.

    Sentences (from `nltk.sent_tokenize`) are collected until the sum of
    their character lengths exceeds `chunk_size`; the collected sentences
    are then joined with single spaces and emitted as one chunk. The limit
    is checked after a sentence has been added, so a chunk can be longer
    than `chunk_size`. Leftover sentences form the last chunk.

    Returns:
        A list of chunk strings (empty when no sentences are found).
    """
    chunks = []
    pending = []
    pending_chars = 0

    for sentence in nltk.sent_tokenize(text):
        pending.append(sentence)
        pending_chars += len(sentence)

        if pending_chars <= chunk_size:
            continue

        # Limit crossed: flush what has been collected and start over.
        chunks.append(" ".join(pending))
        pending, pending_chars = [], 0

    if pending:
        chunks.append(" ".join(pending))

    return chunks

def appears_in_tech_context(word, sentence):
    """Check whether a word appears in a technological context.

    NOTE(review): the checks below are purely syntactic. For every token of
    `sentence` whose text equals `word` (case-insensitive), True is returned
    when any of these holds:
      * the token is a NOUN with a child tagged VERB, ADJ or ADV;
      * one of the token's ancestors is a VERB;
      * the token's dependency label is dobj, pobj, attr or nsubj.
    Otherwise False is returned.
    """
    target = word.lower()
    modifier_tags = {"VERB", "ADJ", "ADV"}
    argument_deps = {"dobj", "pobj", "attr", "nsubj"}

    for token in nlp(sentence):
        if token.text.lower() != target:
            continue

        noun_with_modifier = token.pos_ == "NOUN" and any(
            child.pos_ in modifier_tags for child in token.children
        )
        if (
            noun_with_modifier
            or any(ancestor.pos_ == "VERB" for ancestor in token.ancestors)
            or token.dep_ in argument_deps
        ):
            return True

    return False

def extract_entities_combined(text):
    """Extract entities from `text` with both the Hugging Face and spaCy models.

    Entities rejected by `is_misclassified_entity` are skipped. The rest are
    sorted into buckets by label:
      * Hugging Face: LOC -> locations, PER -> people, ORG -> organizations,
        MISC -> numbers when the text contains a digit, else organizations.
      * spaCy: GPE -> locations, PERSON -> people, ORG/FAC -> organizations,
        DATE -> dates, CARDINAL/QUANTITY -> numbers.
    Any other label is ignored. Organization names are normalised through
    `clean_organizations`, and every bucket is de-duplicated via a set.

    Returns:
        A dict with the keys "locations", "people", "organizations",
        "dates" and "numbers", each mapping to a list of strings.
    """
    hf_buckets = {"LOC": "locations", "PER": "people", "ORG": "organizations"}
    spacy_buckets = {
        "GPE": "locations",
        "PERSON": "people",
        "ORG": "organizations",
        "FAC": "organizations",
        "DATE": "dates",
        "CARDINAL": "numbers",
        "QUANTITY": "numbers",
    }
    found = {
        bucket: []
        for bucket in ("locations", "people", "organizations", "dates", "numbers")
    }

    # Run both models up front: Hugging Face NER first, then spaCy.
    hf_entities = ner_pipeline(text)
    spacy_doc = nlp(text)

    # Hugging Face entities.
    for item in hf_entities:
        surface = item['word'].strip()
        label = item['entity_group']

        if is_misclassified_entity(surface, label, text):
            continue

        if label == "MISC":
            # MISC is split on whether the entity text contains a digit.
            has_digit = any(ch.isdigit() for ch in surface)
            bucket = "numbers" if has_digit else "organizations"
        else:
            bucket = hf_buckets.get(label)

        if bucket is not None:
            found[bucket].append(surface)

    # Additional entities from spaCy.
    for span in spacy_doc.ents:
        surface = span.text.strip()
        label = span.label_

        if is_misclassified_entity(surface, label, text):
            continue

        bucket = spacy_buckets.get(label)
        if bucket is not None:
            found[bucket].append(surface)

    # Normalise organization names, then drop duplicates in every bucket.
    found["organizations"] = clean_organizations(found["organizations"])
    return {bucket: list(set(values)) for bucket, values in found.items()}

def is_misclassified_entity(entity_text, entity_label, context):
    """Tell whether an entity looks like it was given the wrong label.

    Only two labels are checked: an "ORG" entity is rejected when
    `is_tech_term` accepts it, and a "GPE" entity is rejected when
    `is_not_real_location` accepts it. The check applies only if
    `entity_text` equals (case-insensitively) the text of a single spaCy
    token of `context`.

    NOTE(review): because the comparison is against individual token texts,
    multi-word entities presumably never match and are therefore never
    rejected - confirm whether that is intended.

    Returns:
        True when the entity should be discarded, False otherwise.
    """
    # Pick the check for this label first, so that labels without a check
    # skip the spaCy parse of `context` altogether.
    if entity_label == "ORG":
        looks_wrong = is_tech_term
    elif entity_label == "GPE":
        looks_wrong = is_not_real_location
    else:
        return False

    target = entity_text.lower()
    if not any(token.text.lower() == target for token in nlp(context)):
        return False

    # The check depends only on the entity text, so run it once instead of
    # once per matching token.
    return bool(looks_wrong(entity_text))

def is_tech_term(word):
    """Decide from embeddings whether `word` is a technology term.

    Returns:
        True (a plain bool) when the model's similarity between `word` and
        the anchor text "technology" is greater than 0.6.
    """
    # Encode both texts in one call with convert_to_tensor=True so the two
    # operands are torch tensors produced the same way, instead of mixing a
    # tensor with a NumPy array.
    word_vector, anchor_vector = embedding_model.encode(
        [word, "technology"], convert_to_tensor=True
    )
    similarity_score = embedding_model.similarity(word_vector, anchor_vector)
    # .item() unwraps the single-element result so callers get a real bool.
    return similarity_score.item() > 0.6

def is_not_real_location(word):
    """Decide from embeddings whether `word` is not really a location.

    Returns:
        True (a plain bool) when the model's similarity between `word` and
        the anchor text "geographic place" is below 0.5.
    """
    # Encode both texts in one call with convert_to_tensor=True so the two
    # operands are torch tensors produced the same way, instead of mixing a
    # tensor with a NumPy array.
    word_vector, anchor_vector = embedding_model.encode(
        [word, "geographic place"], convert_to_tensor=True
    )
    similarity_score = embedding_model.similarity(word_vector, anchor_vector)
    # .item() unwraps the single-element result so callers get a real bool.
    return similarity_score.item() < 0.5

def normalize_entity_name(entity_name):
    """Clean an entity name by dropping its determiner tokens.

    The name is parsed with spaCy, tokens whose dependency label is "det"
    are removed, and the remaining token texts are joined with single
    spaces and stripped.
    """
    kept_words = (token.text for token in nlp(entity_name) if token.dep_ != "det")
    return " ".join(kept_words).strip()

def clean_organizations(organization_list):
    """Normalise organization names and remove duplicates.

    Each name goes through `normalize_entity_name`; the results are
    collected in a set, so names that normalise to the same string appear
    only once in the returned list.
    """
    unique_names = {normalize_entity_name(name) for name in organization_list}
    return list(unique_names)

def merge_multiword_entities(entity_list):
    """Join neighbouring entities that belong to one multi-word name.

    The list is walked as adjacent pairs:
      * two single-word entities are treated as separate names (e.g. two
        brands): both are kept as they are and the pair is consumed;
      * otherwise, if both are title-cased, they are joined with a space
        into one entity (added once) and the pair is consumed;
      * otherwise the first entity is kept (added once) and the walk moves
        on by one position.
    An entity left over at the end is added unless it is already present.

    Args:
        entity_list: Entity strings in their original order.

    Returns:
        A new list of merged entities; an empty list for empty input
        (which used to raise IndexError).
    """
    if not entity_list:
        return []

    merged_entities = []
    total = len(entity_list)
    position = 0

    while position < total - 1:
        current_entity = entity_list[position]
        next_entity = entity_list[position + 1]

        # Do not glue two separate single-word names together.
        if len(current_entity.split()) == 1 and len(next_entity.split()) == 1:
            merged_entities.append(current_entity)
            merged_entities.append(next_entity)
            position += 2
            continue

        # Join the parts of a multi-word entity.
        if current_entity.istitle() and next_entity.istitle():
            combined_entity = f"{current_entity} {next_entity}"
            if combined_entity not in merged_entities:
                merged_entities.append(combined_entity)
            position += 2
        else:
            if current_entity not in merged_entities:
                merged_entities.append(current_entity)
            position += 1

    # position == total - 1 means the last entity was not consumed by a pair.
    if position == total - 1 and entity_list[-1] not in merged_entities:
        merged_entities.append(entity_list[-1])

    return merged_entities

def is_ambiguous_location(word, sentence):
    """Check whether a location given in the question may not be a real one.

    For every token of `sentence` whose text equals `word`
    (case-insensitive), the token's sentence is searched for technology
    vocabulary.

    Returns:
        True when such vocabulary is found next to `word` (the location is
        ambiguous), False otherwise.
    """
    # Kept lowercase because they are compared with lowercased token texts;
    # the previous uppercase "AI" entry could never match.
    single_word_terms = {"ai", "data", "software", "algorithm", "platform"}
    # A multi-word term can never equal a single token, so these are looked
    # up as substrings of the lowercased sentence text instead.
    phrase_terms = ("machine learning", "deep learning")

    target = word.lower()

    for token in nlp(sentence):
        if token.text.lower() != target:
            continue

        sentence_span = token.sent
        surrounding_words = {t.text.lower() for t in sentence_span}
        sentence_text = sentence_span.text.lower()

        if single_word_terms & surrounding_words or any(
            phrase in sentence_text for phrase in phrase_terms
        ):
            return True

    return False

# Load the text of the two built-in PDF documents (paths are relative to the
# working directory).
ai_text = load_pdf("Artificial Intelligence and Its Applications.pdf")
gamification_text = load_pdf("History of Gamification and Its Role in the Educational Process.pdf")

# Optional external file chosen by the user; an empty answer skips it.
external_file = input("Enter the path for an external file (or press Enter to skip): ")
external_uploaded = bool(external_file)

if external_uploaded:
    external_text = load_pdf(external_file)
    external_chunks = chunk_text(external_text)
    external_embeddings = embedding_model.encode(external_chunks, convert_to_tensor=True)
else:
    external_chunks = []
    external_embeddings = None

# Split each built-in document into chunks.
ai_chunks = chunk_text(ai_text)
gamification_chunks = chunk_text(gamification_text)

# Compute one embedding per chunk.
ai_embeddings = embedding_model.encode(ai_chunks, convert_to_tensor=True)
gamification_embeddings = embedding_model.encode(gamification_chunks, convert_to_tensor=True)

def search_relevant_text(question, embeddings, chunks, doc_name="Document", similarity_threshold=0.4):
    """Find the chunk of a document that is most similar to a question.

    Args:
        question: The user's question.
        embeddings: Embeddings of `chunks`, or None when the document has
            not been embedded.
        chunks: The document's text chunks, aligned with `embeddings`.
        doc_name: Name used in the "not found" message.
        similarity_threshold: Minimum cosine similarity the best chunk must
            reach to be returned (defaults to the previously fixed 0.4).

    Returns:
        The best-matching chunk, or a "No relevant information found ..."
        message when there is nothing to search or the best score is below
        the threshold.
    """
    not_found_message = f"No relevant information found in the {doc_name}."

    if embeddings is None or not chunks:
        return not_found_message

    question_embedding = embedding_model.encode([question], convert_to_tensor=True)
    # Row 0 holds the similarity of the single question to every chunk.
    similarities = util.cos_sim(question_embedding, embeddings)[0]

    best_idx = similarities.argmax().item()
    best_score = similarities[best_idx].item()

    if best_score < similarity_threshold:
        return not_found_message

    return chunks[best_idx]

# Question loop: keeps answering until the user types "exit".
while True:
    # Strip surrounding whitespace so input such as " exit " is recognised.
    user_question = input('What can I help you with? (Type "exit" to quit) ').strip()
    if not user_question:
        # An empty string makes the zero-shot classifier raise; ask again.
        continue
    if user_question.lower() == 'exit':
        print("Exiting...")
        break

    print('\nProcessing your question...\n')

    # Intent classification via facebook/bart-large-mnli.
    question_intent = classify_intent_nlp(user_question)

    # Entity extraction from the question.
    entities = extract_entities_combined(user_question)

    # Sentiment analysis (only the first 512 characters are passed on).
    sentiment_result = sentiment_pipeline(user_question[:512])

    # Search the built-in documents for relevant text.
    ai_answer = search_relevant_text(user_question, ai_embeddings, ai_chunks, "AI Document")
    gamification_answer = search_relevant_text(user_question, gamification_embeddings, gamification_chunks, "Gamification Document")

    # Search the external file as well, when one was provided.
    if external_uploaded:
        external_answer = search_relevant_text(user_question, external_embeddings, external_chunks, "External Document")
    else:
        external_answer = "No external file provided."

    # Print the results.
    print(f'Q: {user_question}')
    print(f'📌 Intent: {question_intent}')
    print(f'📍 Entities: {entities}')
    print(f'📊 Sentiment Analysis: {sentiment_result}')
    print(f'🔎 AI Document:\n{ai_answer}\n')
    print(f'🎮 Gamification Document:\n{gamification_answer}\n')
    print(f'📂 External Document:\n{external_answer}\n')

print("\n---------------------------")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Device set to use cpu
Device set to use cpu



Processing your question...

Q: what types of AI there are?
📌 Intent: Clarification/Definition
📍 Entities: {'locations': [], 'people': [], 'organizations': [], 'dates': [], 'numbers': []}
📊 Sentiment Analysis: [{'label': '3 stars', 'score': 0.2728259563446045}]
🔎 AI Document:
Today, 
AI is integrated into numerous industries, transforming the way we interact with technology. AI can be categorized into three main types: 
1. Narrow AI (Weak AI) – AI systems designed for specific tasks, such as virtual 
assistants and recommendation systems. 2. General AI (Strong AI) – Hypothetical AI that possesses human-like cognitive abilities 
and can perform any intellectual task that a human can do. 3. Super AI – A theoretical concept where AI surpasses human intelligence in all aspects. Despite its rapid advancements, AI still faces challenges, such as ethical considerations, bias in 
algorithms, and the need for large datasets for training.

🎮 Gamification Document:
No relevant information found 