In [23]:
# Import Required Libraries
import json
import re
import subprocess
import sys
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm.notebook import tqdm

# Notebook-wide display configuration.
# Widen pandas text columns so long fields stay readable in previews.
pd.set_option("display.max_colwidth", 100)

# Apply the seaborn grid style first; the matplotlib font overrides come
# afterwards so that they are the last values written to rcParams.
sns.set(style="whitegrid")
plt.rcParams.update({"font.size": 12, "font.family": "Times New Roman"})

# Load and Preprocess the Data

In [24]:
# Read the macro-domain classification results into a DataFrame
df = pd.read_json(path_or_buf="../data/06_classified_macro_smart_city_domains.json")

# Preview three records to check the column layout
df.head(n=3)

Unnamed: 0,title,authors,journal,doi,publication_date,document_type,prism:url,scopus_id,abstract,author_keywords,subject_areas,introduction,contribution,genai_classification,is_genai_application,classification_score,classification_labels,classification_scores,macro_domains
0,GeoAvatar: A big mobile phone positioning data-driven method for individualized pseudo personal ...,Li P.,"Computers, Environment and Urban Systems",10.1016/j.compenvurbsys.2025.102252,2025,Article,https://api.elsevier.com/content/abstract/scopus_id/86000553754,86000553754,"The importance of personal mobility data is widely recognized in various fields. However, the ut...","[Big mobility data, Generative model, GIS, Mahince learning, Smart City]","[Geography, Planning and Development, Ecological Modeling, Environmental Science (all), Urban St...","The importance of personal mobility data is widely recognized in various fields. However, the ut...","Our method utilizes a deep generative model to generate heterogeneous individual life patterns, ...","{'is_genai_application': True, 'top_label': 'GenAI used for smart city application', 'score': 0....",True,0.870384,"[Mobility, Human, People, Transportation Systems, Logistics, Living, Business, Economic Manageme...","[0.9896059632, 0.8164116144, 0.8087953925, 0.5389818549000001, 0.3266232014, 0.2026114315, 0.063...","[{'domain': 'Smart Mobility', 'score': 0.9896059632}, {'domain': 'Smart People', 'score': 0.8164..."
1,Demystifying SAR with attention,Patnaik N.,Expert Systems with Applications,10.1016/j.eswa.2025.127182,2025,Article,https://api.elsevier.com/content/abstract/scopus_id/86000797212,86000797212,"Synthetic Aperture Radar (SAR) imagery is indispensable for earth observation, offering the abil...","[Attention, Deep learning, Generative adversarial networks, Image colorization, Image restoratio...","[Engineering (all), Computer Science Applications, Artificial Intelligence]","Synthetic Aperture Radar (SAR) imagery is indispensable for earth observation, offering the abil...","This study introduces an innovative framework for SAR image colorization, leveraging an Attentio...","{'is_genai_application': True, 'top_label': 'GenAI used for smart city application', 'score': 0....",True,0.901335,"[Environment, Public Services, Business, Climate Change, Living, Buildings, Construction, Human,...","[0.6324490309, 0.028749804900000002, 0.0051401048, 0.0038975298, 0.0037373602, 0.0037206223, 0.0...","[{'domain': 'Smart Environment', 'score': 0.6324490309}]"
2,MiM-UNet: An efficient building image segmentation network integrating state space models,Liu D.,Alexandria Engineering Journal,10.1016/j.aej.2025.02.035,2025,Article,https://api.elsevier.com/content/abstract/scopus_id/85218637730,85218637730,"With the advancement of remote sensing technology, the analysis of complex terrain images has be...","[Building segmentation, Complex terrain, Deep learning, Remote sensing images, State space models]",[Engineering (all)],"With the advancement of remote sensing technology, the analysis of complex terrain images has be...","To address these limitations, we propose a novel architecture, Mamba-in-Mamba U-Net (MiM-UNet), ...","{'is_genai_application': True, 'top_label': 'GenAI used for smart city application', 'score': 0....",True,0.815972,"[Environment, Urban Planning, Buildings, Urban Management, Housing, Business, Construction, Indu...","[0.8005703092, 0.7748115659, 0.5123480558, 0.26313909890000003, 0.017528373700000002, 0.00406645...","[{'domain': 'Smart Environment', 'score': 0.8005703092}, {'domain': 'Smart Governance', 'score':..."


# Define GenAI Technologies and Keywords

In [25]:
# Read the technology -> keyword-list mapping from the JSON config file
with open("../data/config/genai_technology_domains.json", "r", encoding="utf8") as config_file:
    genai_technologies = json.load(config_file)

# Pool the keywords of every technology into one de-duplicated set for screening
all_genai_keywords = {
    keyword
    for keyword_group in genai_technologies.values()
    for keyword in keyword_group
}

print(
    f"Using {len(all_genai_keywords)} keywords to identify Generative AI applications"
)

Using 107 keywords to identify Generative AI applications


# Set Up Semantic Matching Environment

In [48]:
# Pick the compute device: Apple MPS first, then CUDA, otherwise CPU
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

# Load sentence transformer model
print("Loading sentence embedding model...")
sbert_model = SentenceTransformer("sentence-transformers/paraphrase-MiniLM-L6-v2", device=DEVICE)

# Load the spaCy model, downloading it once if it is not installed.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Only the "model not found" failure is handled here; any other error
    # should surface instead of silently triggering a download.
    print("Installing spaCy model...")
    # sys.executable targets the interpreter running this kernel, and
    # check=True raises if the download fails instead of continuing.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")
print("Loaded spaCy model")

# Pre-compute embeddings for all keywords.
# Sorting fixes the keyword order (set iteration order can differ between
# kernel sessions), so row i of keyword_embeddings always maps to the same keyword.
print("Computing embeddings for GenAI keywords...")
keyword_list = sorted(all_genai_keywords)
keyword_embeddings = sbert_model.encode(keyword_list, convert_to_tensor=True)

# Map each technology to the positions of its keywords in keyword_list.
# A lookup dict replaces the repeated linear list.index() scans.
keyword_positions = {keyword: position for position, keyword in enumerate(keyword_list)}
tech_keyword_indices = {
    tech: [keyword_positions[kw] for kw in tech_keywords if kw in keyword_positions]
    for tech, tech_keywords in genai_technologies.items()
}

Using device: cuda
Loading sentence embedding model...
Loaded spaCy model
Computing embeddings for GenAI keywords...


# Implement the Keyword Detection Function

In [None]:
def detect_genai_semantic(text, similarity_threshold=0.4, top_k=5, n_bridge_terms=5):
    """
    Detect Generative AI content in a text via embedding similarity to known keywords.

    The text is split into sentence-like segments, every segment is embedded with
    the notebook-level ``sbert_model`` and compared (cosine similarity) against the
    pre-computed ``keyword_embeddings``. Keywords whose similarity reaches the
    threshold are reported, together with the technologies in ``genai_technologies``
    that list them and the most frequent content words found by the spaCy ``nlp``
    pipeline.

    Args:
        text (str): The text to analyze. ``None``, non-string values and blank
            strings produce an empty (non-GenAI) result.
        similarity_threshold (float): Minimum similarity score to consider a match.
        top_k (int): Number of best-scoring keywords inspected per segment.
        n_bridge_terms (int): Number of most frequent terms returned as bridge terms.

    Returns:
        dict: Detection results with the keys
            - ``is_genai`` (bool): True when at least one keyword matched.
            - ``confidence`` (float): Highest match score, or 0.0 without matches.
            - ``matched_keywords`` (list[str]): Matched keywords, best score first.
            - ``technology_categories`` (list[str]): Technologies owning a matched keyword.
            - ``bridge_terms`` (list[tuple[str, int]]): (lemma, count) pairs.
            - ``semantic_matches`` (list[tuple[str, float]]): (keyword, score) pairs,
              best score first.

    Errors raised during matching or term extraction are printed and leave the
    corresponding part of the result empty; they are not propagated.
    """
    # Anything that is not a non-blank string yields the empty result
    if not isinstance(text, str) or not text.strip():
        return {
            "is_genai": False,
            "confidence": 0.0,
            "matched_keywords": [],
            "technology_categories": [],
            "bridge_terms": [],
            "semantic_matches": []
        }

    # SEMANTIC MATCHING: best score seen per keyword, in first-seen order
    best_scores = {}
    try:
        # Split on sentence punctuation and keep only fragments longer than 20 chars
        fragments = (piece.strip() for piece in re.split(r'[.!?]', text))
        sentences = [piece for piece in fragments if len(piece) > 20]

        # With more than three usable sentences match per sentence,
        # otherwise match the whole text as a single segment
        text_segments = sentences if len(sentences) > 3 else [text]

        # Encode all segments in one batched call instead of one call per segment
        segment_embeddings = sbert_model.encode(text_segments, convert_to_tensor=True)

        # Rows are segments, columns are keywords
        similarities = util.pytorch_cos_sim(segment_embeddings, keyword_embeddings)

        # Best keywords per segment; k is capped by the number of keywords
        k = min(top_k, similarities.shape[1])
        top = torch.topk(similarities, k=k, dim=1)

        # tolist() yields plain floats/ints, so the keyword list is indexed with an int
        for row_scores, row_indices in zip(top.values.tolist(), top.indices.tolist()):
            for score, keyword_idx in zip(row_scores, row_indices):
                if score < similarity_threshold:
                    continue
                keyword = keyword_list[keyword_idx]
                # Keep only the highest score observed for each keyword
                if keyword not in best_scores or score > best_scores[keyword]:
                    best_scores[keyword] = score
    except Exception as e:
        # Best effort: report the problem and continue without semantic matches
        print(f"Error in semantic matching: {e}")
        best_scores = {}

    # Sort matches by score in descending order (stable for equal scores)
    semantic_matches = sorted(best_scores.items(), key=lambda match: match[1], reverse=True)

    # A text counts as GenAI as soon as one keyword matched
    is_genai = len(semantic_matches) > 0

    # Confidence is the highest match score, or 0.0 without matches
    confidence = semantic_matches[0][1] if semantic_matches else 0.0

    # Matched keywords without scores for easier processing
    matched_keywords = [keyword for keyword, _ in semantic_matches]

    # A technology is reported when any of its keywords was matched
    matched_set = set(matched_keywords)
    technology_categories = [
        tech
        for tech, tech_keywords in genai_technologies.items()
        if any(keyword in matched_set for keyword in tech_keywords)
    ]

    # Most frequent content words of the text, used for bridge analysis
    bridge_terms = []
    if len(text) > 10:
        try:
            # Only the first 5000 characters are passed to spaCy to bound the work
            doc = nlp(text[:5000])

            # Count lemmas of alphabetic, non-stop-word nouns, verbs, adjectives
            # and proper nouns that are longer than two characters
            term_counter = Counter(
                token.lemma_.lower()
                for token in doc
                if token.is_alpha
                and not token.is_stop
                and len(token.text) > 2
                and token.pos_ in ("NOUN", "VERB", "ADJ", "PROPN")
            )

            bridge_terms = term_counter.most_common(n_bridge_terms)
        except Exception as e:
            print(f"Error extracting bridge terms: {e}")

    # Return results in expected format
    return {
        "is_genai": is_genai,
        "confidence": confidence,
        "matched_keywords": matched_keywords,
        "technology_categories": technology_categories,
        "bridge_terms": bridge_terms,
        "semantic_matches": semantic_matches
    }

# Apply Keyword Detection Functions to the Data

In [55]:
def _build_detection_text(row):
    """Join a row's contribution text and author keywords into one detection input.

    A missing (non-string) contribution is skipped instead of raising on string
    concatenation, and author keywords are appended only when they are a list.
    Non-string entries inside the keyword list are ignored.
    """
    contribution = row["contribution"]
    parts = [contribution] if isinstance(contribution, str) else []

    author_keywords = row["author_keywords"]
    if isinstance(author_keywords, list):
        parts.append(" ".join(kw for kw in author_keywords if isinstance(kw, str)))

    return " ".join(parts)


# Apply semantic detection to contribution text plus author keywords (when available)
tqdm.pandas(desc="Applying semantic detection")
df["semantic_detection"] = df.progress_apply(
    lambda row: detect_genai_semantic(_build_detection_text(row)),
    axis=1
)

# Unpack the result dict into one column per field (new column -> result key)
SEMANTIC_RESULT_COLUMNS = {
    "semantic_is_genai": "is_genai",
    "semantic_confidence": "confidence",
    "semantic_keywords": "matched_keywords",
    "semantic_categories": "technology_categories",
    "bridge_terms": "bridge_terms",
    "semantic_matches": "semantic_matches",
}
for column_name, result_key in SEMANTIC_RESULT_COLUMNS.items():
    # result_key is bound as a default so each lambda keeps its own key
    df[column_name] = df["semantic_detection"].apply(
        lambda result, result_key=result_key: result[result_key]
    )

Applying semantic detection:   0%|          | 0/238 [00:00<?, ?it/s]

In [56]:
# Report how many records were flagged and their share of all records
summary_line = f"Abstracts with semantic GenAI detection: {df['semantic_is_genai'].sum()} ({df['semantic_is_genai'].mean()*100:.2f}%)"
print(summary_line)

Abstracts with semantic GenAI detection: 232 (97.48%)


In [57]:
# Display the DataFrame (pandas truncates the rendering to the first and last rows)
df

Unnamed: 0,title,authors,journal,doi,publication_date,document_type,prism:url,scopus_id,abstract,author_keywords,...,classification_labels,classification_scores,macro_domains,semantic_detection,semantic_is_genai,semantic_confidence,semantic_keywords,semantic_categories,bridge_terms,semantic_matches
0,GeoAvatar: A big mobile phone positioning data-driven method for individualized pseudo personal ...,Li P.,"Computers, Environment and Urban Systems",10.1016/j.compenvurbsys.2025.102252,2025,Article,https://api.elsevier.com/content/abstract/scopus_id/86000553754,86000553754,"The importance of personal mobility data is widely recognized in various fields. However, the ut...","[Big mobility data, Generative model, GIS, Mahince learning, Smart City]",...,"[Mobility, Human, People, Transportation Systems, Logistics, Living, Business, Economic Manageme...","[0.9896059632, 0.8164116144, 0.8087953925, 0.5389818549000001, 0.3266232014, 0.2026114315, 0.063...","[{'domain': 'Smart Mobility', 'score': 0.9896059632}, {'domain': 'Smart People', 'score': 0.8164...","{'is_genai': True, 'confidence': 0.48666954040527344, 'matched_keywords': ['generative model', '...",True,0.486670,"[generative model, 3d generative model]","[Transformer-Based Models, Neural Radiance Fields & 3D Models]","[(method, 4), (generate, 4), (mobility, 4), (model, 3), (individual, 3)]","[(generative model, 0.48666954040527344), (3d generative model, 0.41732609272003174)]"
1,Demystifying SAR with attention,Patnaik N.,Expert Systems with Applications,10.1016/j.eswa.2025.127182,2025,Article,https://api.elsevier.com/content/abstract/scopus_id/86000797212,86000797212,"Synthetic Aperture Radar (SAR) imagery is indispensable for earth observation, offering the abil...","[Attention, Deep learning, Generative adversarial networks, Image colorization, Image restoratio...",...,"[Environment, Public Services, Business, Climate Change, Living, Buildings, Construction, Human,...","[0.6324490309, 0.028749804900000002, 0.0051401048, 0.0038975298, 0.0037373602, 0.0037206223, 0.0...","[{'domain': 'Smart Environment', 'score': 0.6324490309}]","{'is_genai': True, 'confidence': 0.6297489404678345, 'matched_keywords': ['attention mechanism',...",True,0.629749,"[attention mechanism, wgan-gp, self-attention, cross-attention, sr-gan, generative adversarial n...","[Transformer-Based Models, Generative Adversarial Networks, Diffusion Models, Neural Radiance Fi...","[(attention, 5), (image, 4), (sar, 3), (colorization, 2), (base, 2)]","[(attention mechanism, 0.6297489404678345), (wgan-gp, 0.524022102355957), (self-attention, 0.480..."
2,MiM-UNet: An efficient building image segmentation network integrating state space models,Liu D.,Alexandria Engineering Journal,10.1016/j.aej.2025.02.035,2025,Article,https://api.elsevier.com/content/abstract/scopus_id/85218637730,85218637730,"With the advancement of remote sensing technology, the analysis of complex terrain images has be...","[Building segmentation, Complex terrain, Deep learning, Remote sensing images, State space models]",...,"[Environment, Urban Planning, Buildings, Urban Management, Housing, Business, Construction, Indu...","[0.8005703092, 0.7748115659, 0.5123480558, 0.26313909890000003, 0.017528373700000002, 0.00406645...","[{'domain': 'Smart Environment', 'score': 0.8005703092}, {'domain': 'Smart Governance', 'score':...","{'is_genai': True, 'confidence': 0.632642388343811, 'matched_keywords': ['encoder-decoder', 'lat...",True,0.632642,"[encoder-decoder, latent space modeling, autoencoder, shape generation, neural rendering, vision...","[Transformer-Based Models, Variational Autoencoders, Neural Radiance Fields & 3D Models, Hybrid ...","[(mamba, 4), (mim, 3), (unet, 3), (state, 3), (model, 3)]","[(encoder-decoder, 0.632642388343811), (latent space modeling, 0.44094717502593994), (autoencode..."
3,Building Change Detection in Aerial Imagery Using End-to-End Deep Learning Semantic Segmentation...,Teo T.A.,Buildings,10.3390/buildings15050695,2025,Article,https://api.elsevier.com/content/abstract/scopus_id/86000578375,86000578375,"Automatic building change detection is essential for updating geospatial data, urban planning, a...","[buildings, change detection, deep learning, map updating]",...,"[Buildings, Urban Planning, Construction, Urban Management, Public Services, Environment, Housin...","[0.9861167073, 0.898650229, 0.6620252132000001, 0.6395537853000001, 0.2649227679, 0.2593763471, ...","[{'domain': 'Smart Living', 'score': 0.9861167073}, {'domain': 'Smart Governance', 'score': 0.89...","{'is_genai': True, 'confidence': 0.5988814830780029, 'matched_keywords': ['transformer', 'genera...",True,0.598881,"[transformer, generative pretrained transformer, multimodal transformer, imagen, multimodal fusi...","[Transformer-Based Models, Neural Radiance Fields & 3D Models, Hybrid & Multimodal Architectures]","[(change, 5), (base, 4), (end, 4), (building, 4), (detection, 4)]","[(transformer, 0.5988814830780029), (generative pretrained transformer, 0.5384024381637573), (mu..."
4,Generative spatial artificial intelligence for sustainable smart cities: A pioneering large flow...,Huang J.,Environmental Science and Ecotechnology,10.1016/j.ese.2025.100526,2025,Article,https://api.elsevier.com/content/abstract/scopus_id/85216848180,85216848180,"Rapid urbanization, alongside escalating resource depletion and ecological degradation, undersco...","[Foundation models, Generative artificial intelligence, Generative spatial artificial intelligen...",...,"[Urban Planning, Urban Management, Sustainability, Multimodal Transport, Construction, Environme...","[0.9958809614, 0.8571100831, 0.40149432420000003, 0.036717657, 0.0271584447, 0.0257922839, 0.023...","[{'domain': 'Smart Governance', 'score': 0.9958809614}, {'domain': 'Smart Environment', 'score':...","{'is_genai': True, 'confidence': 0.6537469625473022, 'matched_keywords': ['large flow model', 'f...",True,0.653747,"[large flow model, flow model, foundational framework, foundation model, foundation framework, l...","[Transformer-Based Models, Neural Radiance Fields & 3D Models]","[(urban, 9), (flow, 5), (model, 5), (city, 5), (lfm, 4)]","[(large flow model, 0.6537469625473022), (flow model, 0.6234424114227295), (foundational framewo..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,Demonstration of Electrically Injected Semipolar Laser Diodes Grown on Low-Cost and Scalable Sap...,Khoury M.,ACS Applied Materials and Interfaces,10.1021/acsami.9b17525,2019,Article,https://api.elsevier.com/content/abstract/scopus_id/85076790759,85076790759,"The last two decades have shown an increasing need for GaN-based laser diodes (LDs), which are c...","[GaN, laser diodes, scalable, semipolar, templates]",...,"[Industry, Business, Economy, Socioeconomics, Economic Management, Marketing, Environment, Mobil...","[0.5686096549, 0.5395241976, 0.0077464948, 0.0021609876000000003, 0.001608802, 0.0012988048, 0.0...","[{'domain': 'Smart Economy', 'score': 0.5686096549}]","{'is_genai': True, 'confidence': 0.6290763020515442, 'matched_keywords': ['3d gan', '5gt-gan', '...",True,0.629076,"[3d gan, 5gt-gan, sr-gan, conditional gan, gan, progressive gan, adv-gan, cycle gan]","[Generative Adversarial Networks, Neural Radiance Fields & 3D Models]","[(semipolar, 3), (low, 3), (gan, 3), (density, 2), (template, 2)]","[(3d gan, 0.6290763020515442), (5gt-gan, 0.5917875170707703), (sr-gan, 0.58832848072052), (condi..."
234,Phytogan: Unpaired dead-to-live phytoplankton translation,Han S.,"Proceedings - 2019 IEEE SmartWorld, Ubiquitous Intelligence and Computing, Advanced and Trusted ...",10.1109/SmartWorld-UIC-ATC-SCALCOM-IOP-SCI.2019.00109,2019,Conference Paper,https://api.elsevier.com/content/abstract/scopus_id/85083593256,85083593256,"Detecting phytoplankton that causes red tide is an urgent task. However, the live phytoplankton ...","[Contour, PCALoss, PhytoGAN, Phytoplankton]",...,"[Environment, Water Quality, Living, Emergency Safety, Public Services, Pollution Control, Cultu...","[0.9808947444, 0.2901338637, 0.0916244462, 0.031032999999999998, 0.0146472873, 0.0032697413, 0.0...","[{'domain': 'Smart Environment', 'score': 0.9808947444}]","{'is_genai': True, 'confidence': 0.48289957642555237, 'matched_keywords': ['generative adversari...",True,0.482900,"[generative adversarial network, neural rendering, imagen, diffusion model]","[Generative Adversarial Networks, Diffusion Models, Neural Radiance Fields & 3D Models, Hybrid &...","[(phytoplankton, 5), (image, 4), (phytogan, 3), (pcaloss, 3), (paper, 2)]","[(generative adversarial network, 0.48289957642555237), (neural rendering, 0.45412594079971313),..."
235,A GAN-based active terrain mapping for collaborative air-ground robotic system,Chen J.,"2019 4th IEEE International Conference on Advanced Robotics and Mechatronics, ICARM 2019",10.1109/ICARM.2019.8833919,2019,Conference Paper,https://api.elsevier.com/content/abstract/scopus_id/85073261923,85073261923,Collaborative air-ground robotic system has recently emerged as an important research area and s...,"[Active Learning, Collaborative Air-Ground Robotic System, Convolutional Neural Networks (CNN), ...",...,"[Mobility, Multimodal Transport, Transportation Systems, Industry, Logistics, Business, Urban Ma...","[0.9317734838, 0.9130275249, 0.40025082230000003, 0.0498190336, 0.0333033353, 0.0136193736, 0.01...","[{'domain': 'Smart Mobility', 'score': 0.9317734838}]","{'is_genai': True, 'confidence': 0.5831387639045715, 'matched_keywords': ['adv-gan', '3d gan', '...",True,0.583139,"[adv-gan, 3d gan, gan, generative adversarial network, progressive gan, sr-gan, conditional gan,...","[Transformer-Based Models, Generative Adversarial Networks, Neural Radiance Fields & 3D Models]","[(gan, 5), (terrain, 4), (map, 3), (active, 3), (system, 2)]","[(adv-gan, 0.5831387639045715), (3d gan, 0.531076967716217), (gan, 0.5111932754516602), (generat..."
236,"Macro-level traffic safety analysis in Shanghai, China",Wang X.,Accident Analysis and Prevention,10.1016/j.aap.2019.02.014,2019,Article,https://api.elsevier.com/content/abstract/scopus_id/85061832713,85061832713,"Continuing rapid growth in Shanghai, China, requires traffic safety to be considered at the earl...","[Bayesian conditional autoregressive model, Macro-level safety modeling, Traffic analysis zone, ...",...,"[Transportation Systems, Mobility, Traffic Management, Urban Management, Urban Planning, Economy...","[0.9042782784000001, 0.8140229583, 0.8014292121000001, 0.4872630835, 0.2150092274, 0.0802951008,...","[{'domain': 'Smart Mobility', 'score': 0.9042782784000001}, {'domain': 'Smart Governance', 'scor...","{'is_genai': False, 'confidence': 0.0, 'matched_keywords': [], 'technology_categories': [], 'bri...",False,0.000000,[],[],"[(safety, 4), (model, 4), (traffic, 4), (crash, 4), (frequency, 4)]",[]


In [59]:
# Keep only the rows whose semantic_matches list holds at least one entry
has_matches = df["semantic_matches"].map(len).gt(0)
df = df.loc[has_matches]

# Save the Results

In [60]:
# Write the frame as an indented JSON array with one object per row
df.to_json(
    path_or_buf="../data/07_semantic_kw_genai_detection.json",
    orient="records",
    indent=4,
)