In [None]:
import re
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
import torch
import spacy
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# Runtime configuration: use the GPU when torch reports one, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Shared models, loaded once at import time (lighter variants for demonstration):
# - sbert_model: sentence-embedding model placed on DEVICE
# - nlp: small English spaCy pipeline
sbert_model = SentenceTransformer("paraphrase-MiniLM-L6-v2", device=DEVICE)
nlp = spacy.load("en_core_web_sm")

# Group 1: smart-city application domains.
# The clusters below appear to follow the usual smart-city dimensions
# (economy, governance, living, mobility, people, environment); the labels are
# for readability only and have no effect at runtime.
group_1 = {
    # Economy
    "Economy",
    "Business",
    "Economic Management",
    "Innovation Policy",
    "Socioeconomics",
    # Governance
    "Governance",
    "Public Services",
    "Public Policies",
    "Urban Planning",
    "Social Equity",
    "Cybersecurity",
    # Living
    "Living",
    "Housing",
    "Tourism",
    "Culture",
    "Buildings",
    "Education",
    "Healthcare",
    "Emergency Safety",
    # Mobility
    "Mobility",
    "Traffic Management",
    "Transportation Systems",
    "Electric Vehicles",
    "Public Transit",
    # People
    "People",
    "Citizens",
    "Community Engagement",
    "Learning and Teaching",
    # Environment / energy
    "Waste Management",
    "Pollution Control",
    "Resource Conservation",
    "Energy Management",
    "Smart Grids",
    # Was "Lightning": in this energy/environment cluster the intended domain
    # is presumably (street) lighting, not the weather phenomenon.
    "Lighting",
    "Air Quality",
    "Water Quality",
    "Green Spaces",
}

# Group 2: generative-AI / language-model vocabulary (all lower-case).
# Entries are grouped by theme for readability; as a set, order is irrelevant.
group_2 = {
    # Named models and families
    "gpt", "bert", "llama", "dall", "dall-e",
    # Language models
    "llm", "large language model",
    "slm", "small language model",
    "language model", "neural language model",
    # Architectures
    "transformers", "transformer model",
    "gan", "generative adversarial network",
    "vae", "diffusion model",
    # Broad categories
    "foundation model", "multimodal model",
    "generative ai", "generative model", "ai model",
    # Agents
    "agent", "agents",
}


class EnhancedSemanticSearch:
    """Relate free text to two term groups via embeddings and keyword matching.

    Every text is scored against each group in two ways:

    * the maximum cosine similarity between the text's sentence embedding and
      the embedding of any term in the group, and
    * the list of group terms detected in the text by ``_find_key_terms``.

    The class relies on the module-level ``sbert_model`` (sentence encoder) and
    ``nlp`` (spaCy pipeline).
    """

    # Minimum cosine similarity for the entity-based fuzzy fallback to accept
    # a term (deliberately high to limit false positives).
    FUZZY_THRESHOLD = 0.7

    def __init__(self, group1_terms=None, group2_terms=None):
        """Store the term groups and pre-compute their embeddings.

        Args:
            group1_terms: Iterable of terms for group 1; defaults to the
                module-level ``group_1``.
            group2_terms: Iterable of terms for group 2; defaults to the
                module-level ``group_2``.
        """
        self.group1_terms = group_1 if group1_terms is None else group1_terms
        self.group2_terms = group_2 if group2_terms is None else group2_terms
        self.group1_embeddings = self._precompute_embeddings(self.group1_terms)
        self.group2_embeddings = self._precompute_embeddings(self.group2_terms)

    def _precompute_embeddings(self, terms):
        """Return a tensor with one embedding per term in ``terms``."""
        return sbert_model.encode(list(terms), convert_to_tensor=True)

    @staticmethod
    def _max_cosine(text_embedding, group_embeddings):
        """Return the highest cosine similarity between one embedding and a group."""
        cos_scores = util.pytorch_cos_sim(text_embedding, group_embeddings)[0]
        return torch.max(cos_scores).item()

    def _calculate_similarity(self, text, group_embeddings):
        """Encode ``text`` and return its best cosine similarity to the group.

        Args:
            text: String to embed.
            group_embeddings: Tensor of term embeddings to compare against.

        Returns:
            The maximum cosine similarity as a Python float.
        """
        text_embedding = sbert_model.encode(text, convert_to_tensor=True)
        return self._max_cosine(text_embedding, group_embeddings)

    @staticmethod
    def _term_regex(term):
        """Build a regex source matching ``term`` as a whole word or phrase.

        The pattern is lower-case, allows any whitespace run between the words
        of a multi-word term, tolerates a trailing simple plural ("s"/"es"),
        and refuses matches embedded in a longer word. Returns ``None`` for a
        blank term, which cannot match anything meaningful.
        """
        parts = term.lower().split()
        if not parts:
            return None
        phrase = r"\s+".join(re.escape(part) for part in parts)
        return r"(?<!\w)" + phrase + r"(?:e?s)?(?!\w)"

    def _find_key_terms(self, text, group_terms):
        """Return the terms of ``group_terms`` detected in ``text``.

        Three strategies are tried in order; a later one runs only when the
        earlier ones found nothing:

        1. Whole-word / whole-phrase match (case-insensitive, simple plurals
           allowed). Plain substring tests are avoided because short terms
           such as "gan" would otherwise match inside "organization".
        2. Bag-of-words match: every word of the term occurs somewhere in the
           text, in any order.
        3. Fuzzy match: some spaCy named entity of the text has cosine
           similarity above ``FUZZY_THRESHOLD`` with the lower-cased term.

        Args:
            text: The text to search.
            group_terms: Iterable of candidate terms.

        Returns:
            List of matching terms (original casing), in iteration order of
            ``group_terms``.
        """
        terms = list(group_terms)  # materialise once; iterated up to three times
        text_lower = text.lower()

        # 1. Whole-word / whole-phrase matches.
        found_terms = []
        for term in terms:
            pattern = self._term_regex(term)
            if pattern is not None and re.search(pattern, text_lower):
                found_terms.append(term)

        # 2. All words of the term present somewhere in the text.
        if not found_terms:
            text_tokens = set(re.findall(r"\b\w+\b", text_lower))
            for term in terms:
                term_tokens = set(re.findall(r"\b\w+\b", term.lower()))
                if term_tokens and term_tokens.issubset(text_tokens):
                    found_terms.append(term)

        # 3. Embedding similarity between named entities and terms.
        if not found_terms and terms:
            entity_texts = [ent.text for ent in nlp(text).ents]
            if entity_texts:
                # Encode each side once in a batch instead of once per
                # (term, entity) pair.
                term_embeddings = sbert_model.encode(
                    [term.lower() for term in terms], convert_to_tensor=True
                )
                entity_embeddings = sbert_model.encode(
                    entity_texts, convert_to_tensor=True
                )
                # scores[i][j] = similarity of term i to entity j.
                scores = util.pytorch_cos_sim(term_embeddings, entity_embeddings)
                best_per_term = torch.max(scores, dim=1).values.tolist()
                found_terms = [
                    term
                    for term, best in zip(terms, best_per_term)
                    if best > self.FUZZY_THRESHOLD
                ]

        return found_terms

    def analyze_abstracts(self, abstract_data, similarity_threshold=0.5):
        """Analyze records with a hybrid approach (semantic similarity + terms).

        Each record's ``"introduction"`` text is compared with both groups. A
        record counts for a group when its similarity reaches
        ``similarity_threshold`` or at least one group term is found. Records
        that count for both groups are collected and their nouns, verbs and
        adjectives (excluding tokens equal to a group term) are tallied as
        "bridges".

        Side effect: every analyzed record gets a ``"semantic_analysis"`` key
        holding its similarities and matched terms.

        Args:
            abstract_data: Iterable of dicts with an ``"introduction"`` string
                and optionally a ``"doi"``. Records whose introduction is
                missing, blank or not a string are skipped.
            similarity_threshold: Minimum similarity for a semantic hit.

        Returns:
            Dict with keys ``"high_similarity"`` (both groups hit
            semantically), ``"term_matches"`` (both groups hit, at least one
            only via terms), ``"group1_count"``, ``"group2_count"``,
            ``"bridges"`` (all collected tokens) and, only when bridges exist,
            ``"top_bridges"`` (the ten most common ``(token, count)`` pairs).
        """
        results = {
            "high_similarity": [],
            "term_matches": [],
            "group1_count": 0,
            "group2_count": 0,
            "bridges": [],
        }

        # Lower-cased group terms, built once for the bridge-token filter.
        known_terms = {term.lower() for term in self.group1_terms}
        known_terms.update(term.lower() for term in self.group2_terms)

        for item in tqdm(abstract_data, desc="Analyzing abstracts"):
            text = item.get("introduction")
            doi = item.get("doi", "Unknown")

            # Skip missing, non-string or blank introductions.
            if not isinstance(text, str) or not text.strip():
                continue

            # Semantic analysis: embed the text once and reuse it for both groups.
            text_embedding = sbert_model.encode(text, convert_to_tensor=True)
            sim1 = self._max_cosine(text_embedding, self.group1_embeddings)
            sim2 = self._max_cosine(text_embedding, self.group2_embeddings)

            # Direct term matching.
            found_group1 = self._find_key_terms(text, self.group1_terms)
            found_group2 = self._find_key_terms(text, self.group2_terms)

            # Store analysis results on the record itself.
            item["semantic_analysis"] = {
                "group1_similarity": sim1,
                "group2_similarity": sim2,
                "group1_terms": found_group1,
                "group2_terms": found_group2,
            }

            semantic1 = sim1 >= similarity_threshold
            semantic2 = sim2 >= similarity_threshold
            hit1 = semantic1 or bool(found_group1)
            hit2 = semantic2 or bool(found_group2)

            if hit1:
                results["group1_count"] += 1
            if hit2:
                results["group2_count"] += 1
            if not (hit1 and hit2):
                continue

            entry = {
                "text": text,
                "doi": doi,
                "group1_terms": found_group1,
                "group2_terms": found_group2,
                "group1_sim": sim1,
                "group2_sim": sim2,
            }
            if semantic1 and semantic2:
                results["high_similarity"].append(entry)
            else:
                results["term_matches"].append(entry)

            # Collect content words that are not themselves group terms.
            for token in nlp(text):
                if token.pos_ not in ("NOUN", "VERB", "ADJ"):
                    continue
                word = token.text.lower()
                if word not in known_terms:
                    results["bridges"].append(word)

        # Most common connecting terms; the key is absent when none were found.
        if results["bridges"]:
            results["top_bridges"] = Counter(results["bridges"]).most_common(10)

        return results


# Keep only records whose "introduction" is a non-blank string. Records with a
# missing or non-string introduction are dropped here rather than raising,
# matching what the analyzer itself skips.
abstracts_with_intros = [
    item
    for item in cleaned_abstract_data
    if isinstance(item.get("introduction"), str) and item["introduction"].strip()
]

print(
    f"\nAnalyzing {len(abstracts_with_intros)} abstracts with non-empty introductions..."
)

# Create the analyzer (embeds both term groups) and run it with a similarity
# threshold of 0.3 instead of the method's default of 0.5.
analyzer = EnhancedSemanticSearch()

analysis = analyzer.analyze_abstracts(abstracts_with_intros, similarity_threshold=0.3)

# Summary counts.
print("\n📊 Enhanced Results:")
print(f"Texts with high semantic similarity: {len(analysis['high_similarity'])}")
print(f"Texts with term matching: {len(analysis['term_matches'])}")
print(f"Mentions of Group 1 (Smart City domains): {analysis['group1_count']}")
print(f"Mentions of Group 2 (AI/ML technologies): {analysis['group2_count']}")

# "top_bridges" is only present when at least one connecting term was found.
if "top_bridges" in analysis:
    print("\n🌉 Most Frequent Connecting Terms:")
    for term, count in analysis["top_bridges"]:
        print(f"- {term} (x{count})")

# Show up to five records that matched both groups semantically.
print("\n🔍 Relevant Examples:")
for i, item in enumerate(analysis["high_similarity"][:5], 1):
    print(f"\n📌 Example {i} (DOI: {item['doi']}):")
    print(f"Text: {item['text'][:200]}...")
    print(f"Group 1 Terms: {', '.join(item['group1_terms'])}")
    print(f"Group 2 Terms: {', '.join(item['group2_terms'])}")
    print(f"Similarity G1: {item['group1_sim']:.2f}, G2: {item['group2_sim']:.2f}")