# Import Required Libraries
Import necessary libraries for data processing, regex pattern matching, transformer models, and visualization.

In [2]:
# Import Required Libraries

# Data processing libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import torch

# Regex pattern matching
import re

# Transformer models
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Set seaborn style for plots
sns.set(style="whitegrid")

# Load and Prepare Data
Load the JSON file containing abstracts, extract the abstracts, and perform basic cleaning operations.

In [3]:
# Load and Prepare Data

# Load the JSON file containing abstracts
import json

with open('/Users/joaocarlos/Developer/Projects/genai-smartcity/data/02_document_search_results.json', 'r') as file:
    data = json.load(file)

# Extract abstracts from the JSON data
abstracts = [entry['abstract'] for entry in data if 'abstract' in entry]

# Perform basic cleaning operations
def clean_text(text):
    # Remove newline characters
    text = text.replace('\n', ' ')
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

cleaned_abstracts = [clean_text(abstract) for abstract in abstracts]

# Display the first few cleaned abstracts
cleaned_abstracts[:5]

['Building rooftop extraction has been applied in various fields, such as cartography, urban planning, automatic driving, and intelligent city construction. Automatic building detection and extraction algorithms using high spatial resolution aerial images can provide precise location and geometry information, significantly reducing time, costs, and labor. Recently, deep learning algorithms, especially convolution neural networks (CNNs) and Transformer, have robust local or global feature extraction ability, achieving advanced performance in intelligent interpretation compared with conventional methods. However, buildings often exhibit scale variation, spectral heterogeneity, and similarity with complex geometric shapes. Hence, the building rooftop extraction results exist fragmentation and lack spatial details using these methods. To address these issues, this study developed a multi-scale global perceptron network based on Transformer and CNN using novel encoder-decoders for enhancing

# Extract Descriptive Sentences
Use regex patterns to extract sentences containing phrases like 'This paper', 'This work', 'This study', etc. that describe the focus of the article.

In [4]:
# Extract Descriptive Sentences with Expanded Patterns

# Define expanded regex patterns to match descriptive sentences
patterns = [
    r"\b[Tt]his (article|work|study|paper|research|review|survey)\b",
    r"\b[Ii]n this (work|study|paper|research|review|survey)\b",
    r"\b[Ww]e (propose|introduce|present|develop|describe|demonstrate|report|discuss|analyze|examine|investigate|explore|evaluate|address|outline)\b",
    r"\b[Ii]n this (manuscript|article|contribution|approach|framework|investigation|analysis|implementation)\b",
    r"\b[Tt]he (article|paper|study|work|research|review|survey|manuscript|current study|present study|present work|current work)\b",
    r"\b[Oo]ur (work|study|paper|research|approach|framework|method|system|contribution|focus|aim|objective|goal)\b",
    r"\b[Tt]his (manuscript|contribution|investigation|analysis|implementation|approach|framework|method|system)\b",
    r"\b[Tt]he (purpose|aim|goal|objective) of this (paper|work|study|research|article|manuscript)\b",
    r"\b[Hh]ere(,)? we\b",
]


# Function to extract sentences matching the patterns
def extract_descriptive_sentences(text, patterns):
    sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)
    descriptive_sentences = []
    for sentence in sentences:
        for pattern in patterns:
            if re.search(pattern, sentence):
                descriptive_sentences.append(sentence)
                break
    return descriptive_sentences


# Apply the function to all cleaned abstracts
descriptive_sentences = [
    extract_descriptive_sentences(abstract, patterns) for abstract in cleaned_abstracts
]

# Flatten the list of lists
descriptive_sentences = [
    sentence for sublist in descriptive_sentences for sentence in sublist
]

# Display statistics about extracted sentences
print(f"Total descriptive sentences extracted: {len(descriptive_sentences)}")
print(
    f"Number of abstracts with extracted sentences: {sum(1 for sublist in [extract_descriptive_sentences(abstract, patterns) for abstract in cleaned_abstracts] if sublist)}"
)
print(
    f"Percentage of abstracts with extracted sentences: {sum(1 for sublist in [extract_descriptive_sentences(abstract, patterns) for abstract in cleaned_abstracts] if sublist) / len(cleaned_abstracts) * 100:.2f}%"
)

# Display the first few descriptive sentences
descriptive_sentences[:5]

Total descriptive sentences extracted: 909
Number of abstracts with extracted sentences: 452
Percentage of abstracts with extracted sentences: 88.28%


['To address these issues, this study developed a multi-scale global perceptron network based on Transformer and CNN using novel encoder-decoders for enhancing contextual representation of buildings.',
 'Purpose: This paper proposes a solution to address matching challenges, such as incomplete descriptions, reversed word order, and the diverse descriptions often found in Chinese addresses.',
 'Method: Leveraging the hierarchical structure of Chinese addresses, this study introduces the interactive address matching graph attention model (IAMGAM).',
 'To address these challenges, we propose an Unsupervised Federated Hypernetwork Method for Distributed Multivariate Time Series Anomaly Detection and Diagnosis (uFedHy-DisMTSADD).',
 'Specifically, we introduce a federated hypernetwork architecture that effectively mitigates the heterogeneity and fluctuations in distributed environments while protecting client data privacy.']

# Define Smart City Domains
Create a comprehensive dictionary of smart city domains (e.g., governance, mobility, safety, infrastructure, environment, healthcare) with detailed descriptions for each.

In [15]:
import json

# Define Smart City Domains

# Create a dictionary of smart city domains with detailed descriptions
# smart_city_domains = {
#     "Governance": "the frameworks, policies, and processes that ensure effective management and administration of smart city initiatives, including aspects like public policy, community and citizen engagement and social equity.",
#     "Planning": "the strategic development, modeling, design and expansion of building urban spaces, covering land use, zoning and infrastructure planning to enhance livability and sustainability.",
#     # "Social Aspects": "Addresses the societal dimensions of smart cities, focusing on community engagement, social equity, and the impact of technology on daily life.",
#     "Mobility and Transportation": "transportation systems, traffic flow management, and infrastructure of roads and streets that facilitate efficient movement of people and goods within the city.",
#     "Public Safety": "measures and technologies aimed at ensuring the security and well-being of citizens, such as surveillance systems, emergency response, and crime prevention.",
#     "Infrastructure": "the physical and digital structures that support city functions, including utilities, buildings, streets, and communication networks.",
#     "Environment": "initiatives for reducing pollution, waste management, air quality preservation, resource conservation, and enhancing green spaces.",
#     "Healthcare": "the integration of technology and data to improve healthcare delivery, public health monitoring, and overall citizen well-being.",
#     "Education": "the use of technology in educational institutions, promoting lifelong learning opportunities and community involvement in educational initiatives.",
#     "Energy": "the optimization of energy resources, including renewable energy sources, smart grids, and energy efficiency programs.",
# }

# Read from a json file
with open('../data/config/smart_city_domains_extended.json', 'r') as f:
    smart_city_domains = json.load(f)

# Display the smart city domains dictionary
smart_city_domains

# Flatten the hierarchy into subdomain labels with their context
flattened_domains = {}
for main_domain, subdomains in smart_city_domains.items():
    for subdomain in subdomains:
        # Create a key-value pair where the subdomain is the key
        # and includes context about its parent domain
        flattened_domains[subdomain] = f"{subdomain} (part of {main_domain})"

# Create labels list from subdomains
labels = list(flattened_domains.keys())

labels

['Economy',
 'Business',
 'Economic Management',
 'Innovation Policy',
 'Socioeconomics',
 'Governance',
 'Public Services',
 'Public Policies',
 'Urban Planning',
 'Social Equity',
 'Cybersecurity',
 'Living',
 'Home',
 'Tourism',
 'Culture',
 'Buildings',
 'Education',
 'Healthcare',
 'Emergency Safety',
 'Mobility',
 'Traffic Management',
 'Transportation Systems',
 'Electric Vehicles',
 'Public Transit',
 'People',
 'Citizens',
 'Community Engagement',
 'Learning and Teaching',
 'Waste Management',
 'Pollution Control',
 'Resource Conservation',
 'Energy',
 'Smart Grids',
 'Lightning',
 'Air Quality',
 'Water Quality',
 'Green Spaces']

# Prepare for Classification
Process the extracted sentences and domain definitions to create inputs suitable for the transformer model. This may include tokenization and encoding.

In [16]:
# Prepare for Classification

# model = "tasksource/ModernBERT-base-nli"
model = "facebook/bart-large-mnli"

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model)
# tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
# tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom")

# Tokenize and encode the descriptive sentences
encoded_inputs = tokenizer(
    cleaned_abstracts, padding=True, truncation=True, return_tensors="pt"
)

# Display the tokenized and encoded inputs
encoded_inputs

{'input_ids': tensor([[    0, 37500, 23135,  ...,     1,     1,     1],
        [    0, 47481,    35,  ...,     1,     1,     1],
        [    0, 42390, 18926,  ...,     1,     1,     1],
        ...,
        [    0,   713,  2225,  ...,     1,     1,     1],
        [    0,   713,  1566,  ...,     1,     1,     1],
        [    0, 10105,     9,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

# Build Transformer Model
Initialize a pre-trained transformer model (e.g., BERT, RoBERTa) and set up either a zero-shot classification approach or fine-tune the model if labeled data is available.

In [17]:
# Build Transformer Model for Zero-Shot Classification

# Set tokenizers parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Check available devices
print("CUDA available:", torch.cuda.is_available())
print("MPS available:", torch.backends.mps.is_available())

# Set device for Apple Silicon
if torch.backends.mps.is_available():
    device = "mps"
    print("Using MPS (Metal Performance Shaders)")
elif torch.cuda.is_available():
    device = 0
    print("Using CUDA GPU")
else:
    device = -1
    print("Using CPU")

# Initialize the classifier with device setting
try:
    classifier = pipeline(
        "zero-shot-classification", model=model, device=device
    )
    print("Classifier initialized successfully")
except Exception as e:
    print(f"Error initializing with {device}, falling back to CPU")
    device = -1
    classifier = pipeline(
        "zero-shot-classification", model=model, device=device
    )


# Define the labels for classification based on smart city domains
# labels = list(smart_city_domains.keys())

# Function to classify sentences into smart city domains
def classify_sentences(sentences, labels):
    results = []
    for sentence in tqdm(sentences):
        if sentence:  # Check if sentence is not empty
            try:
                result = classifier(
                    sentence,
                    candidate_labels=labels,
                    hypothesis_template="This text is about {}.",
                )
                results.append(result)
            except Exception as e:
                print(f"Error processing sentence: {str(e)}")
                results.append(None)
    return results


def classify_with_definitions(texts, domains):
    """
    Classify texts using domain definitions for enhanced context.

    Args:
        texts: Either a single string or list of strings to classify
        domains: Dictionary of domain labels and their definitions

    Returns:
        List of (label, score) tuples sorted by score in descending order
    """
    # Convert single string to list for consistent processing
    if isinstance(texts, str):
        texts = [texts]

    all_results = []
    for text in tqdm(texts):
        if not text or not isinstance(text, str):
            all_results.append((text, None))
            continue

        text_results = []
        for label, definition in domains.items():
            try:
                result = classifier(
                    text,
                    candidate_labels=[label],
                    hypothesis_template=f"This text is about {{}}, which relates to {definition}",
                )
                text_results.append((label, result["scores"][0]))
            except Exception as e:
                print(f"Error processing text: {text[:50]}... Error: {str(e)}")
                continue

        if text_results:
            all_results.append((text,
                sorted(text_results, key=lambda x: x[1], reverse=True))
            )
        else:
            all_results.append((text,None))

    return all_results


# Test with a single sentence first
# test_sentence = "This is a test sentence about guidelines and policies that promote innovation and creativity through scientific research and advanced technology. These economies emphasize competitiveness, the effective use of information and communication technologies, and the socially responsible management of resources."
test_sentence = "Fire detection has held stringent importance in computer vision for over half a century. The development of early fire detection strategies is pivotal to the realization of safe and smart cities, inhabitable in the future. However, the development of optimal fire and smoke detection models is hindered by limitations like publicly available datasets, lack of diversity, and class imbalance. In this work, we explore the possible ways forward to overcome these challenges posed by available datasets. We study the impact of a class-balanced dataset to improve the fire detection capability of state-of-the-art (SOTA) vision-based models and propose the use of generative models for data augmentation, as a future work direction. First, a comparative analysis of two prominent object detection architectures, You Only Look Once version 7 (YOLOv7) and YOLOv8 has been carried out using a balanced dataset, where both models have been evaluated across various evaluation metrics including precision, recall, and mean Average Precision (mAP). The results are compared to other recent fire detection models, highlighting the superior performance and efficiency of the proposed YOLOv8 architecture as trained on our balanced dataset. Next, a fractal dimension analysis gives a deeper insight into the repetition of patterns in fire, and the effectiveness of the results has been demonstrated by a windowing-based inference approach. The proposed Slicing-Aided Hyper Inference (SAHI) improves the fire and smoke detection capability of YOLOv8 for real-life applications with a significantly improved mAP performance over a strict confidence threshold. YOLOv8 with SAHI inference gives a mAP:50-95 improvement of more than 25% compared to the base YOLOv8 model. The study also provides insights into future work direction by exploring the potential of generative models like deep convolutional generative adversarial network (DCGAN) and diffusion models like stable diffusion, for data augmentation."
print("\nTesting single classification...")
test_result = classifier(
    test_sentence, candidate_labels=labels, hypothesis_template="This text is about {}."
)
print("Test successful!")
print(f"Result: {test_result}")

# If test passes, proceed with batch classification
print(f"\nClassifying {len(cleaned_abstracts[:5])} descriptive sentences...")
classification_results = classify_sentences(cleaned_abstracts[:5], labels)
# classification_results = classification_results = classify_with_definitions(cleaned_abstracts[:50], smart_city_domains)
# Display results summary
valid_results = [r for r in classification_results if r is not None]
print(
    f"\nSuccessfully classified {len(valid_results)} out of {len(classification_results)} sentences"
)

# Display first 3 results with abstracts
# print("\nFirst 3 classification results:")
# for i, (text, results) in enumerate(valid_results[:3]):
#     print(f"\nAbstract {i+1}:")
#     print(f"Text: {text[:200]}...")  # Show first 200 characters
#     print("Classifications:")
#     for label, score in results:
#         print(f"- {label}: {score:.4f}")

CUDA available: False
MPS available: True
Using MPS (Metal Performance Shaders)


Device set to use mps


Classifier initialized successfully

Testing single classification...
Test successful!
Result: {'sequence': 'Fire detection has held stringent importance in computer vision for over half a century. The development of early fire detection strategies is pivotal to the realization of safe and smart cities, inhabitable in the future. However, the development of optimal fire and smoke detection models is hindered by limitations like publicly available datasets, lack of diversity, and class imbalance. In this work, we explore the possible ways forward to overcome these challenges posed by available datasets. We study the impact of a class-balanced dataset to improve the fire detection capability of state-of-the-art (SOTA) vision-based models and propose the use of generative models for data augmentation, as a future work direction. First, a comparative analysis of two prominent object detection architectures, You Only Look Once version 7 (YOLOv7) and YOLOv8 has been carried out using a balan

100%|██████████| 5/5 [01:06<00:00, 13.39s/it]


Successfully classified 5 out of 5 sentences





In [None]:
valid_results[:5]

In [18]:
def display_classification_results(results):
    """
    Displays the classification results, including the sequence,
    predicted labels, and corresponding scores in a formatted manner.

    Args:
        results (list): A list of dictionaries, where each dictionary contains
                        the sequence, labels, and scores from the classification.
    """
    for i, result in enumerate(results):
        print(f"Result {i + 1}:")
        print(f"Sequence: {result['sequence'][:200]}...")  # Display first 200 characters
        print("Top Predicted Domains:")
        for label, score in zip(result['labels'][:3], result['scores'][:3]):  # Display top 3
            print(f"  - {label}: {score:.4f}")
        print("-" * 40)

# Call the display function with the classification results
display_classification_results(valid_results[:5])

Result 1:
Sequence: Building rooftop extraction has been applied in various fields, such as cartography, urban planning, automatic driving, and intelligent city construction. Automatic building detection and extraction a...
Top Predicted Domains:
  - Buildings: 0.2422
  - Urban Planning: 0.0699
  - Living: 0.0517
----------------------------------------
Result 2:
Sequence: Problem: Modernizing and standardizing place names and addresses is a key challenge in the development of smart cities. Purpose: This paper proposes a solution to address matching challenges, such as ...
Top Predicted Domains:
  - Home: 0.0877
  - Citizens: 0.0639
  - Living: 0.0549
----------------------------------------
Result 3:
Sequence: Distributed multivariate time series anomaly detection is widely-used in industrial equipment monitoring, financial risk management, and smart cities. Although Federated learning (FL) has garnered sig...
Top Predicted Domains:
  - Business: 0.0740
  - Home: 0.0585
  - Community 

# Classify Abstracts
Apply the transformer model to classify each abstract into the most appropriate smart city domain based on the extracted descriptive sentences.

In [None]:
# Classify Abstracts

# Apply the transformer model to classify each abstract into the most appropriate smart city domain based on the extracted descriptive sentences.

# Function to classify abstracts based on descriptive sentences
def classify_abstracts(abstracts, labels):
    abstract_classifications = []
    for abstract in abstracts:
        sentences = extract_descriptive_sentences(abstract, patterns)
        if sentences:
            classification = classify_sentences(sentences, labels)
            # Aggregate the classification results to determine the most frequent domain
            domain_counts = {}
            for result in classification:
                for label, score in zip(result['labels'], result['scores']):
                    if label in domain_counts:
                        domain_counts[label] += score
                    else:
                        domain_counts[label] = score
            # Determine the domain with the highest aggregated score
            most_frequent_domain = max(domain_counts, key=domain_counts.get)
            abstract_classifications.append(most_frequent_domain)
        else:
            abstract_classifications.append("Unclassified")
    return abstract_classifications

# Classify the cleaned abstracts
abstract_classifications = classify_abstracts(cleaned_abstracts[:50], labels)

# Display the first few abstract classifications
abstract_classifications[:5]

# Create a DataFrame to visualize the classification results
df_classifications = pd.DataFrame({
    'Abstract': cleaned_abstracts[:50],
    'Classification': abstract_classifications
})

# Display the DataFrame
df_classifications.head()

# Plot the distribution of classifications
plt.figure(figsize=(10, 6))
sns.countplot(y='Classification', data=df_classifications, order=df_classifications['Classification'].value_counts().index)
plt.title('Distribution of Smart City Domain Classifications')
plt.xlabel('Count')
plt.ylabel('Smart City Domain')
plt.show()

In [None]:
# List Unclassified Abstracts
unclassified_abstracts = df_classifications[df_classifications['Classification'] == 'Unclassified']
print(f"Number of unclassified abstracts: {len(unclassified_abstracts)}")
unclassified_abstracts_list = unclassified_abstracts['Abstract'].tolist()
print("Unclassified abstracts:")
for abstract in unclassified_abstracts_list:
    print(abstract)

# Evaluate Results
Assess the performance of the classification model using appropriate metrics and analyze any misclassifications to improve the model.

In [None]:
# Evaluate Results

# Import necessary libraries for evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Assuming we have true labels for evaluation purposes
# For demonstration, let's create some dummy true labels
true_labels = ["Governance", "Mobility", "Safety", "Infrastructure", "Environment", "Healthcare"] * (len(abstract_classifications) // 6)
true_labels += true_labels[:len(abstract_classifications) % 6]  # Add remaining labels
true_labels = true_labels[:len(abstract_classifications)]

# Calculate accuracy
accuracy = accuracy_score(true_labels, abstract_classifications)
print(f"Accuracy: {accuracy:.2f}")

# Generate classification report
extended_labels = labels + ["Unclassified"]
report = classification_report(true_labels, abstract_classifications, target_names=extended_labels, zero_division=0)
print("Classification Report:\n", report)

# Generate confusion matrix
conf_matrix = confusion_matrix(true_labels, abstract_classifications, labels=extended_labels)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=extended_labels, yticklabels=extended_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# Analyze misclassifications
misclassified = df_classifications[df_classifications['Classification'] != pd.Series(true_labels, index=df_classifications.index)]
print("Misclassified Abstracts:\n", misclassified)

# Visualize Domain Distribution
Create visualizations (e.g., bar charts, word clouds) to display the distribution of smart city domains in the dataset and highlight key terms associated with each domain.

In [None]:
# Visualize Domain Distribution

# Import necessary libraries for visualization
from wordcloud import WordCloud

# Create a bar chart to display the distribution of smart city domains
plt.figure(figsize=(10, 6))
sns.countplot(y='Classification', data=df_classifications, order=df_classifications['Classification'].value_counts().index)
plt.title('Distribution of Smart City Domain Classifications')
plt.xlabel('Count')
plt.ylabel('Smart City Domain')
plt.show()

# Create word clouds for each smart city domain
for domain in labels:
    domain_sentences = ' '.join(df_classifications[df_classifications['Classification'] == domain]['Abstract'])
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(domain_sentences)
    
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {domain}')
    plt.axis('off')
    plt.show()