In [4]:
import json
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
from collections import defaultdict

# Download NLTK resources
# "punkt"/"punkt_tab" are fetched for word_tokenize, "wordnet"/"omw-1.4" for
# the WordNet lemmatizer. nltk.download reports when a package is already
# up to date, so re-running this cell is harmless.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("wordnet")
nltk.download("omw-1.4")

# Load JSON data
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale encoding, which can mis-decode (or fail on) non-ASCII
# keywords on systems where that default is not UTF-8.
# The loaded object is iterated later with .items() as keyword -> count.
with open("../data/04_author_keywords.json", encoding="utf-8") as file:
    data = json.load(file)

# Initialize Lemmatizer
# A single shared instance, reused by normalize_keyword for every token.
lemmatizer = WordNetLemmatizer()


# Normalization function
def normalize_keyword(keyword):
    """Return a canonical form of *keyword* so that variants can be merged.

    The keyword is lowercased, every character that is not an ASCII letter,
    a digit, or whitespace is replaced by a space, runs of whitespace are
    collapsed to a single space, and the result is tokenized. Each token is
    then lemmatized with the module-level ``lemmatizer`` (default part of
    speech) and the lemmas are joined back with single spaces.
    """
    # Punctuation becomes a space (rather than being dropped) so that
    # e.g. hyphenated terms split into separate tokens.
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", " ", keyword.lower())
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return " ".join(map(lemmatizer.lemmatize, word_tokenize(cleaned)))


# Aggregate keywords
# Raw keywords that share a normalized form have their counts summed.
aggregated_keywords = defaultdict(int)
for raw_keyword, occurrences in data.items():
    aggregated_keywords[normalize_keyword(raw_keyword)] += occurrences

# Convert results to DataFrame, ordered from most to least frequent
df_aggregated = pd.DataFrame(
    list(aggregated_keywords.items()),
    columns=["Normalized Keyword", "Count"],
).sort_values(by="Count", ascending=False)

# Display results (top 40 rows)
print(df_aggregated.head(40))

                     Normalized Keyword  Count
439                          smart city     56
210      generative adversarial network     31
138                       deep learning     26
209                       generative ai     20
295                large language model     18
198                                 gan     17
211  generative adversarial network gan     17
23              artificial intelligence     16
310                    machine learning     11
509                     urban computing     10
540             variational autoencoder     10
290                                lstm      9
274                   internet of thing      9
253                                 iot      8
149                     diffusion model      8
513                      urban planning      7
400                      remote sensing      7
151                        digital twin      6
125                   data augmentation      6
49                   autonomous driving      6
39           

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joaocarlos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/joaocarlos/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joaocarlos/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/joaocarlos/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
