# Generative AI Applications Analysis by Smart City Domain

In [1]:
# Generative AI Model Classification in Smart City Research
# Using semantic search and multi-strategy classification

import os
import re
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from collections import Counter, defaultdict
from sentence_transformers import SentenceTransformer, util
import torch
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
import warnings

warnings.filterwarnings("ignore")

# Set visualization style
sns.set(style="whitegrid")
plt.rcParams.update({"font.size": 12})
plt.rcParams.update({"font.family": "Times New Roman"})



## Load the Classified Abstracts

In [2]:
# Load the previously classified abstracts
with open("../data/06_classified_abstracts_smart_domains.json", "r") as f:
    classified_abstracts = json.load(f)

print(f"Loaded {len(classified_abstracts)} classified abstracts")

# Verify we have both domain classifications and contribution text
abstracts_with_both = [
    abstract
    for abstract in classified_abstracts
    if "macro_domains" in abstract
    and "contribution" in abstract
    and abstract["contribution"].strip()
]

print(
    f"Found {len(abstracts_with_both)} abstracts with both domain classifications and contribution text"
)

# Display a sample contribution
print("\nSample contribution:")
print(abstracts_with_both[0]["contribution"][:300] + "...")

Loaded 455 classified abstracts
Found 401 abstracts with both domain classifications and contribution text

Sample contribution:
To address these issues, this study developed a multi-scale global perceptron network based on Transformer and CNN using novel encoder-decoders for enhancing contextual representation of buildings. Specifically, an improved multi-head-attention encoder is employed by constructing multi-scale tokens ...


## Define Generative AI Model Categories

In [3]:
# Define model-focused generative AI categories
genai_model_categories = {
    "Transformer-Based Models": [
        "transformer",
        "gpt",
        "bert",
        "t5",
        "llama",
        "palm",
        "chatgpt",
        "gpt-2",
        "gpt-3",
        "gpt-4",
        "large language model",
        "llm",
        "small language model",
        "slm",
        "foundation model",
        "pretrained language model",
        "encoder-decoder",
        "attention mechanism",
        "self-attention",
        "generative pretrained transformer",
        "claude",
        "bard",
        "google bard",
        "mistral",
        "gemini",
        "genai",
        "large flow model",
        "lfm",
        "flow model",
        "foundation framework",
        "foundational framework",
        "gen ai",
        "generative capability",
        "generative capabilities",
        "designed with ai",
        "genai",
        "ai capability",
    ],
    "Generative Adversarial Networks": [
        "gan",
        "generative adversarial network",
        "wgan",
        "wgan-gp",
        "conditional gan",
        "cgan",
        "cycle gan",
        "cyclegan",
        "pix2pix",
        "pix2pixhd",
        "sr-gan",
        "srgan",
        "progressive gan",
        "biggan",
        "stylegan",
        "5gt-gan",
        "tabular gan",
        "ctgan",
        "stargan",
        "adv-gan",
    ],
    "Diffusion Models": [
        "diffusion model",
        "ddpm",
        "stable diffusion",
        "latent diffusion",
        "score-based model",
        "noise prediction",
        "denoising diffusion",
        "ddim",
        "guided diffusion",
        "classifier-free guidance",
        "ddpo",
        "text-to-image diffusion",
    ],
    "Variational Autoencoders": [
        "vae",
        "variational autoencoder",
        "beta-vae",
        "conditional vae",
        "cvae",
        "vq-vae",
        "vector quantized vae",
        "hierarchical vae",
        "disentangled vae",
        "autoregressive models",
        "autoencoder",
        "latent space modeling",
    ],
    "Neural Radiance Fields & 3D Models": [
        "nerf",
        "neural radiance field",
        "3d gan",
        "3d generative model",
        "neural rendering",
        "implicit representation",
        "point cloud generation",
        "mesh generation",
        "shape generation",
        "3d reconstruction",
        "neural implicit surface",
        "occupancy network",
        "signed distance function",
    ],
    "Hybrid & Multimodal Architectures": [
        "multimodal model",
        "vision-language model",
        "clip",
        "dall-e",
        "imagen",
        "flamingo",
        "multimodal transformer",
        "cross-attention",
        "contrastive learning",
        "multimodal embedding",
        "cross-modal generation",
        "multimodal fusion",
        "multimodal alignment",
    ],
}

# Add application context to help interpret the models in smart city context
model_application_context = {
    "Transformer-Based Models": "Text generation, policy analysis, citizen service automation, urban planning documents",
    "Generative Adversarial Networks": "Synthetic urban imagery, simulated traffic patterns, building facades, urban scene completion",
    "Diffusion Models": "High-fidelity urban visualization, satellite imagery enhancement, urban design generation",
    "Variational Autoencoders": "Urban pattern modeling, anomaly detection, compressed representations of city data",
    "Neural Radiance Fields & 3D Models": "Digital twins, virtual urban environments, building modeling, urban scene synthesis",
    "Hybrid & Multimodal Architectures": "Integrating visual and textual urban data, cross-modal urban information processing"
}

## Build the Semantic Classifier

In [4]:
# Initial configuration
DEVICE = "cuda" if torch.cuda.is_available() else "mps"
print(f"Using device: {DEVICE}")

# Check and download spaCy model if needed
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import subprocess

    print("Downloading spaCy model...")
    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")

Using device: mps


In [11]:
class EnhancedModelClassifier:
    def __init__(self, model_categories, use_context_validation=True, use_adaptive_threshold=True):
        """
        Initialize the classifier with options to enable/disable features
        
        Args:
            model_categories (dict): Dictionary of model categories and their terms
            use_context_validation (bool): Whether to use context validation
            use_adaptive_threshold (bool): Whether to use adaptive thresholding
        """
        # Store configuration flags
        self.use_context_validation = use_context_validation
        self.use_adaptive_threshold = use_adaptive_threshold

        print(f"Configuration: context validation: {use_context_validation}, adaptive threshold: {use_adaptive_threshold}")

        # Use multiple embedding models to reduce single-model bias
        print("Initializing embeddings models...")
        self.models = {
            # "general": SentenceTransformer(
            #     "sentence-transformers/all-MiniLM-L12-v2", device=DEVICE
            # ),
            # Using a more scientific model for technical text
            "scientific": SentenceTransformer(
                "pritamdeka/S-PubMedBert-MS-MARCO", device=DEVICE
            ),
        }

        self.model_categories = model_categories
        self.category_embeddings = {}
        self.term_embeddings = {}

        print("Precomputing embeddings for model terms...")
        # Precompute embeddings with both models
        for category, terms in model_categories.items():
            self.category_embeddings[category] = {}
            for model_name, model in self.models.items():
                self.category_embeddings[category][model_name] = model.encode(
                    category, convert_to_tensor=True
                )

            self.term_embeddings[category] = {}
            for term in terms:
                self.term_embeddings[category][term] = {
                    model_name: model.encode(term, convert_to_tensor=True)
                    for model_name, model in self.models.items()
                }

        # Create lemma patterns for domain-specific term detection
        self.lemma_patterns = {}
        self.nlp = nlp

        print("Creating linguistic patterns for domain terminology...")
        for category, terms in self.model_categories.items():
            category_patterns = set()
            for term in terms:
                term_doc = self.nlp(term)
                key_lemmas = sorted(
                    [
                        token.lemma_
                        for token in term_doc
                        if token.pos_ in ["NOUN", "ADJ", "VERB"]
                    ]
                )
                if key_lemmas:
                    category_patterns.add(tuple(key_lemmas))
            self.lemma_patterns[category] = category_patterns

    def get_best_similarity(self, text_embed, term_embeds):
        """Calculate the best similarity score across multiple embedding models"""
        similarities = [
            util.pytorch_cos_sim(text_embed[model_name], term_embeds[model_name])[
                0
            ].item()
            for model_name in self.models.keys()
        ]
        return max(similarities)

    def adaptive_threshold(self, term, base_threshold=0.75):
        """Calculate adaptive threshold based on term complexity"""
        if not self.use_adaptive_threshold:
            return base_threshold

        term_length = len(term.split())
        if term_length >= 4:  # Very specific terms
            return base_threshold - 0.15
        elif term_length >= 2:  # Multi-word terms
            return base_threshold - 0.05
        return base_threshold  # Single words need higher threshold

    def validate_term_context(self, text, term):
        """
        Check if term appears in a context indicating actual usage
        Returns True if context validation is disabled
        """
        # Skip validation if disabled
        if not self.use_context_validation:
            return True

        positive_contexts = [
            f"using {term}",
            f"use {term}",
            f"based on {term}",
            f"implement {term}",
            f"our {term}",
            f"proposed {term}",
            f"novel {term}",
            f"develop {term}",
            f"train {term}",
            f"fine-tune {term}",
            f"employ {term}",
            f"utilize {term}",
            f"apply {term}",
            f"architecture {term}",
            f"framework {term}",
            f"{term} approach",
            f"{term} method",
            f"{term} technique",
            f"{term} model",
            f"leveraging {term}",
            f"powered by {term}",
        ]

        negative_contexts = [
            f"unlike {term}",
            f"compared to {term}",
            f"in contrast to {term}",
            f"outperform {term}",
            f"better than {term}",
            f"future work",
            f"alternative to {term}",
            f"instead of {term}",
            f"limitation of {term}",
            f"beyond {term}",
            f"previous {term}",
            f"conventional {term}",
            f"traditional {term}",
            f"standard {term}",
            f"baseline {term}",
            f"other {term}",
            f"existing {term}",
        ]

        # Check for positive contexts
        positive_score = sum(
            1 for context in positive_contexts if context.lower() in text.lower()
        )

        # Check for negative contexts
        negative_score = sum(
            1 for context in negative_contexts if context.lower() in text.lower()
        )

        # Calculate a context score (-1 to 1)
        if positive_score + negative_score == 0:
            return False  # No context clues at all

        context_score = (positive_score - negative_score) / max(
            1, positive_score + negative_score
        )

        return context_score > 0  # Return True if positive context outweighs negative

    def classify_contribution(self, text, base_threshold=0.75):
        """
        Classify text using a balanced ensemble of matching strategies
        with bias reduction techniques and stricter filtering
        """
        if not text or not isinstance(text, str):
            return {}

        # Define match weights with stronger preference for exact matches
        match_weights = {
            "exact": 1.0,
            "token": 0.7,
            "semantic": 0.7,
            "domain_pattern": 0.5,
        }

        text = text.lower()
        results = {}

        # Track potential biases and configuration
        bias_metadata = {
            "text_truncated": len(text) > 10000,
            "language": "english",
            "embedding_models": list(self.models.keys()),
            "base_threshold": base_threshold,
            "tech_terms_analyzed": 0,
            "named_entities_found": 0,
            "use_context_validation": self.use_context_validation,
            "use_adaptive_threshold": self.use_adaptive_threshold
        }

        # Process text with spaCy for advanced linguistic analysis
        doc = self.nlp(text[:10000])  # Limit length to avoid memory issues

        # Extract technical terms and phrases that might be model references
        tech_keywords = {
            # Basic ML/AI terms
            "model",
            "network",
            "gan",
            "transformer",
            "ai",
            "algorithm",
            "vae",
            # Additional model architectures
            "encoder",
            "diffusion",
            "lstm",
            "cnn",
            "rnn",
            "neural",
            "deep",
            # Model characteristics
            "generative",
            "predictive",
            "adversarial",
            "supervised",
            "classifier",
            "unsupervised",
            # Domain-specific terms
            "synthesis",
            "recognition",
            "generation",
            "training",
            "inference",
        }

        # Extract tech phrases
        tech_phrases = []
        for chunk in doc.noun_chunks:
            # Extract more comprehensively
            if any(token.lemma_ in tech_keywords for token in chunk):
                tech_phrases.append(chunk.text)
            # Also check for tech bigrams/trigrams within longer chunks
            elif len(chunk) > 3:
                for i in range(len(chunk) - 1):
                    if (
                        chunk[i].lemma_ in tech_keywords
                        or chunk[i + 1].lemma_ in tech_keywords
                    ):
                        tech_phrases.append(
                            " ".join([token.text for token in chunk[i : i + 2]])
                        )

        # Add named entities that might be model names
        named_entities = [
            ent.text
            for ent in doc.ents
            if ent.label_ in ["ORG", "PRODUCT", "WORK_OF_ART"]
        ]
        tech_phrases.extend(named_entities)

        # Update metadata
        bias_metadata["tech_terms_analyzed"] = len(tech_phrases)
        bias_metadata["named_entities_found"] = len(named_entities)

        # Examine both the full text and extracted technical phrases
        texts_to_analyze = [text] + tech_phrases

        # Look for domain-specific patterns in sentences with stricter criteria
        domain_patterns = {}
        for sent in doc.sents:
            sent_lemmas = [
                token.lemma_ for token in sent if token.pos_ in ["NOUN", "ADJ", "VERB"]
            ]
            for i in range(len(sent_lemmas)):
                for j in range(i + 1, min(i + 5, len(sent_lemmas))):
                    ngram = tuple(sorted(sent_lemmas[i:j]))
                    if len(ngram) < 2:  # Skip single words
                        continue

                    for category, patterns in self.lemma_patterns.items():
                        for pattern in patterns:
                            overlap = len(set(ngram).intersection(pattern)) / len(
                                pattern
                            )
                            if overlap > 0.8:
                                phrase_text = " ".join(
                                    [token.text for token in sent[i:j]]
                                )

                                # Only consider if there's contextual evidence
                                if self.validate_term_context(text, phrase_text):
                                    if category not in domain_patterns:
                                        domain_patterns[category] = []

                                    domain_patterns[category].append(
                                        {
                                            "term": phrase_text,
                                            "match_type": "domain_pattern",
                                            "confidence": overlap * 0.7,
                                            "pattern_overlap": overlap,
                                        }
                                    )

        for category, terms in self.model_categories.items():
            all_matches = []

            # Level 1: Direct exact matching
            for term in terms:
                pattern = r"\b" + re.escape(term) + r"\b"
                if re.search(pattern, text):
                    # Validate the context for exact matches too
                    if self.validate_term_context(text, term):
                        all_matches.append(
                            {"term": term, "match_type": "exact", "confidence": 1.0}
                        )

            # Level 2: Token-based matching for multi-word terms
            text_tokens = set(re.findall(r"\b\w+\b", text))
            for term in terms:
                term_tokens = set(re.findall(r"\b\w+\b", term))
                if (
                    term_tokens
                    and term_tokens.issubset(text_tokens)
                    and len(term_tokens) > 1
                ):
                    # Only consider multi-word token matches with positive context
                    if self.validate_term_context(text, term):
                        all_matches.append(
                            {
                                "term": term,
                                "match_type": "token",
                                "confidence": 0.7,
                            }
                        )

            # Level 3: Semantic similarity using embeddings (more selective)
            for analyze_text in texts_to_analyze:
                # Skip if too short
                if len(analyze_text.split()) < 2:
                    continue

                # Encode with all models
                text_embeddings = {
                    model_name: model.encode(analyze_text, convert_to_tensor=True)
                    for model_name, model in self.models.items()
                }

                # Check similarity to each term across models
                for term in terms:
                    term_threshold = self.adaptive_threshold(term, base_threshold)
                    similarity = self.get_best_similarity(
                        text_embeddings, self.term_embeddings[category][term]
                    )

                    if similarity > term_threshold:
                        # Only add semantic matches with proper validation
                        if similarity > 0.9 or self.validate_term_context(text, term):
                            all_matches.append(
                                {
                                    "term": term,
                                    "match_type": "semantic",
                                    "confidence": similarity,
                                    "threshold_used": term_threshold,
                                }
                            )

            # Add domain pattern matches if any
            if category in domain_patterns:
                all_matches.extend(domain_patterns[category])

            # If we have any matches, calculate ensemble score and include in results
            if all_matches:
                # Remove duplicates, keeping highest confidence for each term
                unique_matches = {}
                for match in all_matches:
                    term = match["term"]
                    if (
                        term not in unique_matches
                        or match["confidence"] > unique_matches[term]["confidence"]
                    ):
                        unique_matches[term] = match

                # Extract final matches list
                final_matches = list(unique_matches.values())

                # Calculate weighted ensemble confidence score
                weighted_scores = [
                    match_weights[match["match_type"]] * match["confidence"]
                    for match in final_matches
                ]
                ensemble_confidence = sum(weighted_scores) / len(weighted_scores)

                # Store the results
                results[category] = {
                    "matches": final_matches,
                    "confidence": ensemble_confidence,
                    "match_count": len(final_matches),
                    "match_types": Counter(
                        [match["match_type"] for match in final_matches]
                    ),
                }

        # Add bias metadata to results
        results["_metadata"] = bias_metadata

        # Apply appropriate filtering criteria
        filtered_results = {}
        for category, result in results.items():
            # Keep metadata
            if category == "_metadata":
                filtered_results[category] = result
                continue

            # Adjust filtering criteria based on whether validation is enabled
            if self.use_context_validation:
                # Stricter criteria when using validation
                if (
                    result["confidence"] > 0.55
                    and (
                        "exact" in result["match_types"]
                        and result["match_types"]["exact"] > 0
                    )
                ) or (
                    result["confidence"] > 0.6 and result["match_count"] >= 2
                ) or (
                    result["confidence"] > 0.7
                ):
                    filtered_results[category] = result
            else:
                # More lenient criteria when not using validation
                # Use a staggered approach based on match types
                if (
                    "exact" in result["match_types"] and result["match_types"]["exact"] > 0
                ) or (
                    result["confidence"] > 0.5 and result["match_count"] >= 2
                ) or (
                    "semantic" in result["match_types"] and 
                    result["match_types"]["semantic"] > 0 and
                    result["confidence"] > 0.7
                ) or (
                    "domain_pattern" in result["match_types"] and
                    result["match_types"]["domain_pattern"] > 1  # Require multiple domain pattern matches
                ):
                    filtered_results[category] = result

        return filtered_results

    def analyze_abstract(self, abstract_data, base_threshold=0.75):
        """Analyze an abstract with weighted sections, prioritizing contribution"""
        if (
            "contribution" not in abstract_data
            or not abstract_data["contribution"].strip()
        ):
            return {}

        # Analyze the contribution with higher weight
        contribution_results = self.classify_contribution(
            abstract_data["contribution"], base_threshold=base_threshold
        )

        # If available, analyze introduction with lower weight
        introduction_results = {}
        if "introduction" in abstract_data and abstract_data["introduction"].strip():
            introduction_results = self.classify_contribution(
                abstract_data["introduction"], base_threshold=base_threshold+0.1
            )

        # Combine results, prioritizing contribution
        final_results = contribution_results.copy()

        # Only incorporate introduction models that have strong evidence
        for category, intro_data in introduction_results.items():
            if category == "_metadata":
                continue

            if category not in final_results and intro_data["confidence"] > 0.7:
                # Only add high-confidence introduction matches
                intro_data["confidence"] *= 0.7  # Discount introduction confidence
                final_results[category] = intro_data

        return final_results


# Initialize the classifier
# classifier = EnhancedModelClassifier(genai_model_categories)
print("\n--- Testing without context validation ---")
classifier_no_context = EnhancedModelClassifier(
    genai_model_categories, use_context_validation=False, use_adaptive_threshold=True
)


--- Testing without context validation ---
Configuration: context validation: False, adaptive threshold: True
Initializing embeddings models...
Precomputing embeddings for model terms...
Creating linguistic patterns for domain terminology...


## Process and Classify the Abstracts

In [15]:
# Process each abstract
print("Classifying model architectures in contributions...")
for abstract in tqdm(abstracts_with_both[:30], desc="Analyzing abstracts"):
    # Use the analyze_abstract method
    model_classification = classifier_no_context.analyze_abstract(abstract, base_threshold=0.95)

    # Store results
    abstract["model_architectures"] = model_classification

# Count abstracts with ACTUAL identified model architectures (excluding _metadata only)
abstracts_with_models = [
    abstract
    for abstract in abstracts_with_both[:30]
    if "model_architectures" in abstract
    and any(key != "_metadata" for key in abstract["model_architectures"].keys())
]

print(
    f"Found {len(abstracts_with_models)} abstracts mentioning generative model architectures"
)
print(
    f"This represents {len(abstracts_with_models)/len(abstracts_with_both[:30])*100:.2f}% of all abstracts with contributions"
)

# Save the results as a checkpoint
output_dir = "../data/"
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, "genai_model_classifications.json"), "w") as f:
    json.dump(abstracts_with_both, f, indent=4)

Classifying model architectures in contributions...


Analyzing abstracts: 100%|██████████| 30/30 [00:30<00:00,  1.01s/it]

Found 29 abstracts mentioning generative model architectures
This represents 96.67% of all abstracts with contributions





In [None]:
# Test different base thresholds to find the optimal setting
thresholds = [0.65, 0.55, 0.45]
results = {}

for threshold in thresholds:
    print(f"\nTesting with threshold {threshold}:")

    # Process abstracts
    abstracts_test = copy.deepcopy(
        abstracts_with_both[:15]
    )  # Create a copy to avoid interference

    for abstract in tqdm(abstracts_test, desc=f"Threshold {threshold}"):
        model_classification = classifier_no_context.analyze_abstract(
            abstract, base_threshold=threshold
        )
        abstract["model_architectures"] = model_classification

    # Count matches
    abstracts_with_models = [
        abstract
        for abstract in abstracts_test
        if "model_architectures" in abstract
        and any(key != "_metadata" for key in abstract["model_architectures"].keys())
    ]

    # Count match types
    match_type_counts = {"exact": 0, "token": 0, "semantic": 0, "domain_pattern": 0}
    model_counts = Counter()

    for abstract in abstracts_with_models:
        for category, data in abstract["model_architectures"].items():
            if category == "_metadata":
                continue

            model_counts[category] += 1

            if "match_types" in data:
                for match_type, count in data["match_types"].items():
                    match_type_counts[match_type] += count

    # Print results
    print(f"Found {len(abstracts_with_models)} abstracts with models")
    print(f"Model distribution: {dict(model_counts)}")
    print(f"Match types: {match_type_counts}")

    # Store for comparison
    results[threshold] = {
        "count": len(abstracts_with_models),
        "models": dict(model_counts),
        "match_types": match_type_counts,
    }

# Compare and recommend the best threshold
print("\nThreshold Comparison:")
for threshold, data in results.items():
    semantic_ratio = (
        data["match_types"]["semantic"] / sum(data["match_types"].values())
        if sum(data["match_types"].values()) > 0
        else 0
    )
    print(
        f"Threshold {threshold}: {data['count']} abstracts, {semantic_ratio:.2%} semantic matches"
    )

# Recommend the best threshold
best_threshold = min(
    results.items(),
    key=lambda x: (
        abs(x[1]["match_types"]["semantic"] / sum(x[1]["match_types"].values()) - 0.4)
        if sum(x[1]["match_types"].values()) > 0
        else 1
    ),
)[0]
print(f"\nRecommended threshold: {best_threshold} (aims for ~40% semantic matches)")

## Analyze Model Distribution

In [14]:
# Create dictionary to store model frequency by category
model_category_counts = Counter()
match_type_counts = {"exact": 0, "token": 0, "semantic": 0, "domain_pattern": 0}
model_term_counts = Counter()

# Collect data from the abstracts
for abstract in abstracts_with_models:
    for category, data in abstract["model_architectures"].items():
        # Skip the metadata field when counting model categories
        if category == "_metadata":
            continue

        model_category_counts[category] += 1

        # Count each matching term and match type
        if "matches" in data:
            for match in data["matches"]:
                model_term_counts[match["term"]] += 1
                match_type_counts[match["match_type"]] += 1

# Display category counts
print("\nModel Category Distribution:")
for category, count in model_category_counts.most_common():
    print(f"  - {category}: {count} abstracts")

# Display match type distribution
print("\nMatch Method Distribution:")
total_matches = sum(match_type_counts.values())
for match_type, count in match_type_counts.items():
    if total_matches > 0:  # Avoid division by zero
        print(f"  - {match_type}: {count} ({count/total_matches*100:.2f}%)")
    else:
        print(f"  - {match_type}: {count} (0.00%)")

# Display top model terms
print("\nTop 10 Model Terms:")
for term, count in model_term_counts.most_common(10):
    print(f"  - {term}: {count}")


Model Category Distribution:
  - Transformer-Based Models: 30 abstracts
  - Generative Adversarial Networks: 30 abstracts
  - Diffusion Models: 30 abstracts
  - Variational Autoencoders: 30 abstracts
  - Neural Radiance Fields & 3D Models: 30 abstracts
  - Hybrid & Multimodal Architectures: 30 abstracts

Match Method Distribution:
  - exact: 36 (1.15%)
  - token: 1 (0.03%)
  - semantic: 1979 (63.43%)
  - domain_pattern: 1104 (35.38%)

Top 10 Model Terms:
  - large language model: 30
  - foundation model: 30
  - pretrained language model: 30
  - attention mechanism: 30
  - generative pretrained transformer: 30
  - large flow model: 30
  - flow model: 30
  - foundation framework: 30
  - foundational framework: 30
  - generative capability: 30


# Analyze Domain-Model Relationships

In [None]:
# Create a mapping of domains to model architectures
domain_to_models = defaultdict(Counter)

for abstract in abstracts_with_models:
    domains = [d["domain"] for d in abstract.get("macro_domains", [])]
    models = list(abstract["model_architectures"].keys())

    for domain in domains:
        domain_to_models[domain].update(models)

# Create matrix for domain-model relationships
domains = sorted(domain_to_models.keys())
model_categories = sorted(genai_model_categories.keys())

# Create matrix data for heatmap
matrix_data = []
for domain in domains:
    domain_data = []
    for model in model_categories:
        count = domain_to_models[domain][model]
        domain_data.append(count)
    matrix_data.append(domain_data)

# Create a DataFrame
heatmap_df = pd.DataFrame(matrix_data, index=domains, columns=model_categories)

# Print the top domains for each model category
print("\nTop Domains for Each Model Category:")
for model in model_categories:
    print(f"\n{model}:")
    top_domains = sorted(
        [
            (domain, domain_to_models[domain][model])
            for domain in domains
            if domain_to_models[domain][model] > 0
        ],
        key=lambda x: x[1],
        reverse=True,
    )[:3]

    for domain, count in top_domains:
        print(f"  - {domain}: {count}")

## Visualize the Results

In [None]:
# 1. Model Category Distribution Bar Chart
plt.figure(figsize=(12, 6))
categories = [cat for cat, _ in model_category_counts.most_common()]
counts = [count for _, count in model_category_counts.most_common()]

sns.barplot(x=counts, y=categories, palette="viridis")
plt.xlabel("Number of Abstracts")
plt.ylabel("Generative AI Model Category")
plt.title("Distribution of Generative AI Model Categories in Smart City Research")

# Add value labels to the end of each bar
for i, count in enumerate(counts):
    plt.text(count + 0.5, i, str(count), va="center")

plt.tight_layout()
plt.savefig(os.path.join(output_dir, "genai_model_distribution.png"), dpi=300)
plt.show()

# 2. Match Type Distribution Pie Chart
plt.figure(figsize=(8, 8))
match_labels = [
    f"{match_type} ({count})" for match_type, count in match_type_counts.items()
]
match_counts = list(match_type_counts.values())

plt.pie(
    match_counts,
    labels=match_labels,
    autopct="%1.1f%%",
    colors=sns.color_palette("Set2"),
    startangle=90,
    explode=[0.05] * len(match_counts),
)
plt.title("Distribution of Match Types in Model Detection")
plt.savefig(os.path.join(output_dir, "match_type_distribution.png"), dpi=300)
plt.show()

# 3. Domain-Model Heatmap
plt.figure(figsize=(14, 10))
ax = sns.heatmap(heatmap_df, annot=True, fmt="d", cmap="YlGnBu", linewidths=0.5)
plt.xlabel("Generative AI Model Category")
plt.ylabel("Smart City Domain")
plt.title("Heatmap of Generative AI Models Across Smart City Domains")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "domain_model_heatmap.png"), dpi=300)
plt.show()

# 4. Top Models in Top Domains
# Get top 5 domains by total model mentions
top_domain_counts = {
    domain: sum(counts.values()) for domain, counts in domain_to_models.items()
}
top_domains = sorted(top_domain_counts.items(), key=lambda x: x[1], reverse=True)[:5]
top_domain_names = [d for d, _ in top_domains]

# Create a subset dataframe for these domains
top_domain_df = heatmap_df.loc[top_domain_names]

plt.figure(figsize=(14, 8))
top_domain_df.plot(kind="bar", stacked=False, colormap="tab10")
plt.xlabel("Smart City Domain")
plt.ylabel("Number of Abstracts")
plt.title("Distribution of GenAI Model Categories in Top Smart City Domains")
plt.legend(title="Model Category", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "top_domains_model_distribution.png"), dpi=300)
plt.show()

# 5. Model term cloud (horizontal bar chart of top terms)
plt.figure(figsize=(12, 8))
top_terms = [term for term, _ in model_term_counts.most_common(15)]
term_counts = [count for _, count in model_term_counts.most_common(15)]

sns.barplot(x=term_counts, y=top_terms, palette="rocket")
plt.xlabel("Number of Mentions")
plt.ylabel("Model Term")
plt.title("Top 15 Generative AI Model Terms in Smart City Research")

# Add value labels to the end of each bar
for i, count in enumerate(term_counts):
    plt.text(count + 0.3, i, str(count), va="center")

plt.tight_layout()
plt.savefig(os.path.join(output_dir, "top_model_terms.png"), dpi=300)
plt.show()

## Find Representatative Examples

In [None]:
# Find representative examples for each model category
examples = {}

for abstract in abstracts_with_models:
    for category in abstract["model_architectures"].keys():
        if category not in examples and "doi" in abstract:
            if "matches" in abstract["model_architectures"][category]:
                terms = [
                    match["term"]
                    for match in abstract["model_architectures"][category]["matches"]
                ]
                confidence = abstract["model_architectures"][category]["confidence"]
            else:
                terms = []
                confidence = 0.0

            # Store this example
            examples[category] = {
                "doi": abstract.get("doi", "Unknown"),
                "contribution": abstract["contribution"][:300]
                + "...",  # Truncate for readability
                "matched_terms": terms,
                "confidence": confidence,
                "domains": [d["domain"] for d in abstract.get("macro_domains", [])],
            }

# Display examples
print("\n📚 Representative Examples for Each Model Category:")

for category in genai_model_categories.keys():
    print(f"\n## {category}")
    if category in examples:
        example = examples[category]
        print(f"DOI: {example['doi']}")
        print(f"Smart City Domains: {', '.join(example['domains'])}")
        print(f"Matched Terms: {', '.join(example['matched_terms'])}")
        print(f"Confidence: {example['confidence']:.2f}")
        print(f"Contribution: {example['contribution']}")
    else:
        print("No example found for this category")
    print("-" * 80)

## Summary and Insights

In [None]:
# Calculate key statistics
total_abstracts = len(abstracts_with_both)
total_with_models = len(abstracts_with_models)
percentage_with_models = (total_with_models / total_abstracts) * 100

# Domains with highest model diversity
domain_model_diversity = {
    domain: len(counts) for domain, counts in domain_to_models.items()
}
diverse_domains = sorted(
    domain_model_diversity.items(), key=lambda x: x[1], reverse=True
)[:5]

# Most frequently co-occurring models
model_co_occurrence = defaultdict(int)
for abstract in abstracts_with_models:
    models = list(abstract["model_architectures"].keys())
    if len(models) >= 2:
        for i in range(len(models)):
            for j in range(i + 1, len(models)):
                pair = tuple(sorted([models[i], models[j]]))
                model_co_occurrence[pair] += 1

top_pairs = sorted(model_co_occurrence.items(), key=lambda x: x[1], reverse=True)[:5]

# Print summary insights
print("\n📊 Summary Insights:")
print(f"Total abstracts analyzed: {total_abstracts}")
print(
    f"Abstracts mentioning generative AI models: {total_with_models} ({percentage_with_models:.2f}%)"
)
print(f"Total model mentions detected: {sum(model_category_counts.values())}")

print("\nSmart City Domains with Highest Model Diversity:")
for domain, count in diverse_domains:
    print(f"  - {domain}: {count} different model categories")

print("\nMost Frequently Co-occurring Model Categories:")
for pair, count in top_pairs:
    print(f"  - {pair[0]} + {pair[1]}: {count} abstracts")

print("\nKey Applications by Model Category:")
for category, context in model_application_context.items():
    if category in model_category_counts:
        print(
            f"  - {category} ({model_category_counts[category]} abstracts): {context}"
        )

# Create a summary chart
plt.figure(figsize=(10, 6))
summary_data = [
    total_with_models,  # Abstracts with models
    total_abstracts - total_with_models,  # Abstracts without models
]
labels = [
    f"With GenAI Models\n({percentage_with_models:.1f}%)",
    f"Without GenAI Models\n({100-percentage_with_models:.1f}%)",
]
colors = ["#2ecc71", "#e74c3c"]

plt.pie(
    summary_data,
    labels=labels,
    autopct="%1.1f%%",
    colors=colors,
    startangle=90,
    explode=[0.1, 0],
)
plt.title("Proportion of Smart City Research Using Generative AI Models")
plt.savefig(os.path.join(output_dir, "genai_adoption_summary.png"), dpi=300)
plt.show()

# Save the Results

In [None]:
# Create output structure for the final analysis results
analysis_results = {
    "metadata": {
        "total_abstracts": total_abstracts,
        "abstracts_with_models": total_with_models,
        "percentage_with_models": percentage_with_models,
        "model_categories": genai_model_categories,
    },
    "model_counts": {
        category: count for category, count in model_category_counts.most_common()
    },
    "match_types": match_type_counts,
    "top_model_terms": {
        term: count for term, count in model_term_counts.most_common(20)
    },
    "domain_model_matrix": heatmap_df.to_dict(),
    "model_co_occurrence": {
        f"{pair[0]}__{pair[1]}": count for pair, count in top_pairs
    },
    "examples": examples,
}

# Save results to file
final_output_path = os.path.join(output_dir, "08_genai_model_analysis_results.json")
with open(final_output_path, "w") as f:
    json.dump(analysis_results, f, indent=2)

print(f"\n✅ Analysis complete! Final results saved to {final_output_path}")