In [None]:
# To ensure that all required libraries were installed correctly

import pandas as pd
import numpy as np
import requests
import langchain
import reportlab
import qrcode
import redis
import celery

In [None]:
import os
from bs4 import BeautifulSoup

# Define the directory containing the HTML files
# NOTE(review): absolute, machine-specific path; the listing and preview cells
# that follow read from it, so adjust it before running on another machine.
data_path = r"C:\Users\anduj\Desktop\landmarks"

In [None]:
# Collect the names of all entries in the landmarks directory and report the count
files = os.listdir(data_path)
file_count = len(files)
print(f"Found {file_count} files.")

In [None]:
# Show a numbered listing (starting at 1) of the directory entries
print("List of files:")
for position, entry in enumerate(files, 1):
    line = f"{position}. {entry}"
    print(line)

In [None]:
# Preview the first three files to get a feel for how the raw HTML is structured

for file in files[:3]:
    # Read and parse the HTML file
    with open(os.path.join(data_path, file), "r", encoding="utf-8") as handle:
        soup = BeautifulSoup(handle.read(), "html.parser")

    # Tag-free text, one fragment per line
    plain_text = soup.get_text(separator="\n", strip=True)

    print(f"\n--- {file} ---\n")
    print(plain_text[:1000])  # only the first 1000 characters
    print("\n" + "-" * 50)

# **Cleaning Test No. 01 - Relevant content extracted but relevant links were eliminated**
import os
from bs4 import BeautifulSoup

# Define the directories
data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks\data_test"
output_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks\cleaned_data"

# Create the output directory; exist_ok=True makes this a no-op when it is
# already there and avoids the race between an exists() check and makedirs().
os.makedirs(output_path, exist_ok=True)

# Function to clean and extract meaningful content
def clean_html(file_path):
    """Return the text of every <h1>, <h2> and <p> element of an HTML file.

    The file is read as UTF-8 and parsed with BeautifulSoup's "html.parser".
    Each matching element contributes its stripped text; the pieces are
    joined with newlines into a single string.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        soup = BeautifulSoup(source.read(), "html.parser")

    text_parts = []
    for element in soup.find_all(['h1', 'h2', 'p']):
        text_parts.append(element.get_text(strip=True))

    return "\n".join(text_parts)

# Clean every entry of data_path and store the result under the same name in output_path
for file in os.listdir(data_path):
    cleaned_content = clean_html(os.path.join(data_path, file))

    # Write the cleaned text as UTF-8
    destination = os.path.join(output_path, file)
    with open(destination, "w", encoding="utf-8") as out_file:
        out_file.write(cleaned_content)

    print(f"Cleaned and saved: {file}")

In [None]:
# **Cleaning Test No. 02 - Relevant content was extracted and relevant links were included, but it was found that the geolocation info was eliminated**
import os
import requests
from bs4 import BeautifulSoup

# Define the directories
data_path = r"C:\Users\anduj\Desktop\landmarks"
# The output goes to a sibling folder. Pointing it at data_path itself (as
# before) made the processing loop in this cell overwrite each raw HTML file
# with its cleaned text, destroying the input.
output_path = r"C:\Users\anduj\Desktop\landmarks_cleaned"

# Create the output directory; exist_ok=True makes this a no-op when it is
# already there and avoids the race between an exists() check and makedirs().
os.makedirs(output_path, exist_ok=True)

# Function to validate links
def validate_link(url):
    """Return True when ``url`` answers an HTTP HEAD request with status 200.

    Redirects are followed explicitly: ``requests.head`` does not follow them
    by default, so a link that merely redirects (for example http -> https)
    would otherwise be reported as broken.

    Any ``requests.RequestException`` raised while making the request is
    treated as "not valid" and yields False.
    """
    try:
        response = requests.head(url, timeout=5, allow_redirects=True)
    except requests.RequestException:
        return False
    return response.status_code == 200

# Function to clean and extract meaningful content and links
def clean_html(file_path):
    """Build a plain-text report of an HTML file's main text and its valid links.

    The file is read as UTF-8 and parsed with "html.parser". The stripped text
    of all <h1>, <h2> and <p> elements is joined with newlines. The href of
    every <a> element that has one is kept only if validate_link() accepts it.
    The result is one string with a "Main Content:" section followed by a
    "Relevant Links:" section holding the list of accepted links.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        soup = BeautifulSoup(source.read(), "html.parser")

    # Headline and paragraph text
    text_parts = [element.get_text(strip=True) for element in soup.find_all(['h1', 'h2', 'p'])]
    cleaned_text = "\n".join(text_parts)

    # Hyperlinks that pass validation, in the order they are found
    valid_links = []
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if validate_link(href):
            valid_links.append(href)

    return f"Main Content:\n{cleaned_text}\n\nRelevant Links:\n{valid_links}"

# Clean every entry of data_path and store text + links under the same name in output_path
for file in os.listdir(data_path):
    cleaned_content = clean_html(os.path.join(data_path, file))

    # Write the combined report as UTF-8
    destination = os.path.join(output_path, file)
    with open(destination, "w", encoding="utf-8") as out_file:
        out_file.write(cleaned_content)

    print(f"Cleaned and saved: {file}")

# **Defining and testing the geo_metadata extraction process**

In [None]:
import re
import json

# Define the path to the single test file
# NOTE(review): both variables hold the same path, so the write at the end of
# this cell targets the very file that was read — confirm the intended output
# location. The path also has no file name/extension; verify it is a file.
test_file_path = r"C:\Users\anduj\Desktop\landmarks"
geo_extract_output = r"C:\Users\anduj\Desktop\landmarks"

# Function to extract geospatial metadata without strict JSON parsing
def extract_geo_metadata_loose(content):
    """Pull coordinates, title and marker type out of raw text with regexes.

    The text is searched (first occurrence only) for the JSON-looking
    fragments ``"coordinates": [...]``, ``"title": "..."`` and
    ``"marker-symbol": "..."``; no full JSON parsing of ``content`` is done.

    Returns a dict that may contain:
      - "coordinates": the parsed list, or the string "Invalid format" when
        the bracketed fragment is not valid JSON;
      - "title": the text between the quotes;
      - "type": the marker-symbol text between the quotes.
    Keys whose fragment is not found are omitted, so the dict can be empty.
    """
    geo_metadata = {}

    # Capture groups isolate the value directly. Splitting the match on ":"
    # would cut off any value that itself contains a colon.
    coordinates_match = re.search(r'"coordinates":\s*(\[.*?\])', content)
    if coordinates_match:
        try:
            geo_metadata["coordinates"] = json.loads(coordinates_match.group(1))
        except json.JSONDecodeError:
            geo_metadata["coordinates"] = "Invalid format"

    title_match = re.search(r'"title":\s*"(.*?)"', content)
    if title_match:
        geo_metadata["title"] = title_match.group(1)

    marker_match = re.search(r'"marker-symbol":\s*"(.*?)"', content)
    if marker_match:
        geo_metadata["type"] = marker_match.group(1)

    return geo_metadata

# Load the test file, run the extraction, and persist the result as indented JSON
with open(test_file_path, "r", encoding="utf-8") as source:
    raw_content = source.read()

geo_metadata = extract_geo_metadata_loose(raw_content)

with open(geo_extract_output, "w", encoding="utf-8") as out_file:
    json.dump(geo_metadata, out_file, indent=4)

print(f"Geo metadata extracted and saved to: {geo_extract_output}")
print("Extracted Geo Metadata:", geo_metadata)

# **Cleaning Test No. 03 - Relevant content extracted, including relevant links and geolocation info.**

In [None]:
import os
import json
import re
import requests
from bs4 import BeautifulSoup

# Define the directories
data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks\data_test"
output_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks\cleaned_data"

# Create the output directory; exist_ok=True makes this a no-op when it is
# already there and avoids the race between an exists() check and makedirs().
os.makedirs(output_path, exist_ok=True)

# Function to validate links
def validate_link(url):
    """Return True when ``url`` answers an HTTP HEAD request with status 200.

    Redirects are followed explicitly: ``requests.head`` does not follow them
    by default, so a link that merely redirects (for example http -> https)
    would otherwise be reported as broken.

    Any ``requests.RequestException`` raised while making the request is
    treated as "not valid" and yields False.
    """
    try:
        response = requests.head(url, timeout=5, allow_redirects=True)
    except requests.RequestException:
        return False
    return response.status_code == 200

# Function to extract geospatial metadata without strict JSON parsing
def extract_geo_metadata_loose(content):
    """Pull coordinates, title and marker type out of raw text with regexes.

    The text is searched (first occurrence only) for the JSON-looking
    fragments ``"coordinates": [...]``, ``"title": "..."`` and
    ``"marker-symbol": "..."``; no full JSON parsing of ``content`` is done.

    Returns a dict that may contain:
      - "coordinates": the parsed list, or the string "Invalid format" when
        the bracketed fragment is not valid JSON;
      - "title": the text between the quotes;
      - "type": the marker-symbol text between the quotes.
    Returns None when none of the three fragments is found.
    """
    geo_metadata = {}

    # Capture groups isolate the value directly. Splitting the match on ":"
    # would cut off any value that itself contains a colon.
    coordinates_match = re.search(r'"coordinates":\s*(\[.*?\])', content)
    if coordinates_match:
        try:
            geo_metadata["coordinates"] = json.loads(coordinates_match.group(1))
        except json.JSONDecodeError:
            geo_metadata["coordinates"] = "Invalid format"

    title_match = re.search(r'"title":\s*"(.*?)"', content)
    if title_match:
        geo_metadata["title"] = title_match.group(1)

    marker_match = re.search(r'"marker-symbol":\s*"(.*?)"', content)
    if marker_match:
        geo_metadata["type"] = marker_match.group(1)

    return geo_metadata if geo_metadata else None

# Function to clean and extract meaningful content, links, and geospatial data
def clean_html(file_path):
    """Extract main text, validated links and geo metadata from an HTML file.

    The file is read as UTF-8 and parsed with "html.parser". Returns a dict:
      - "main_content": stripped text of all <h1>, <h2> and <p> elements,
        joined with newlines;
      - "relevant_links": the href of every <a> element that has one and
        that validate_link() accepts;
      - "geo_metadata": whatever extract_geo_metadata_loose() returns for the
        raw file text.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        raw_content = source.read()

    soup = BeautifulSoup(raw_content, "html.parser")

    # Headline and paragraph text
    text_parts = [element.get_text(strip=True) for element in soup.find_all(['h1', 'h2', 'p'])]

    # Hyperlinks that pass validation, in the order they are found
    valid_links = []
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if validate_link(href):
            valid_links.append(href)

    return {
        "main_content": "\n".join(text_parts),
        "relevant_links": valid_links,
        "geo_metadata": extract_geo_metadata_loose(raw_content)
    }

# Clean every entry of data_path and store the combined result as indented JSON in output_path
for file in os.listdir(data_path):
    cleaned_data = clean_html(os.path.join(data_path, file))

    destination = os.path.join(output_path, file)
    with open(destination, "w", encoding="utf-8") as out_file:
        json.dump(cleaned_data, out_file, indent=4)

    print(f"Cleaned and saved: {file}")

In [None]:
!pip install ftfy

In [None]:
!pip install unidecode

# **Cleaning Test No. 04 - Testing for special characters handling**
# Define the directories
cleaned_data_path = r"C:\\Users\\larry\\OneDrive\\Documents\\GitHub\\project-aieng-interactive-travel-planner\\data\\landmarks\\cleaned_data"
final_output_path = r"C:\\Users\\larry\\OneDrive\\Documents\\GitHub\\project-aieng-interactive-travel-planner\\data\\landmarks\\final_data"

# Create the final output directory; exist_ok=True makes this a no-op when it
# is already there and avoids the race between an exists() check and makedirs().
os.makedirs(final_output_path, exist_ok=True)

# Escape-sequence text (a literal backslash followed by "x" and hex digits)
# mapped to the character it should be replaced with
special_chars_mapping = {
    # accented vowels and ñ
    "\\xc3\\xa1": "á",
    "\\xc3\\xa9": "é",
    "\\xc3\\xad": "í",
    "\\xc3\\xb3": "ó",
    "\\xc3\\xba": "ú",
    "\\xc3\\xb1": "ñ",
    # u with diaeresis, upper and lower case
    "\\xc3\\x9c": "Ü",
    "\\xc3\\xbc": "ü",
    # typographic quotes, replaced by plain ASCII quotes
    "\\xe2\\x80\\x99": "'",
    "\\xe2\\x80\\x9c": "\"",
    "\\xe2\\x80\\x9d": "\""
}


def fix_special_characters(text):
    """Return ``text`` with every key of special_chars_mapping replaced by its value.

    Replacements are applied one after another in the mapping's order.
    """
    fixed = text
    for escaped in special_chars_mapping:
        fixed = fixed.replace(escaped, special_chars_mapping[escaped])
    return fixed

# Apply the character mapping to every entry of cleaned_data_path and save it in final_output_path
for file in os.listdir(cleaned_data_path):
    with open(os.path.join(cleaned_data_path, file), "r", encoding="utf-8") as source:
        cleaned_content = fix_special_characters(source.read())

    # Save the updated file under the same name
    with open(os.path.join(final_output_path, file), "w", encoding="utf-8") as out_file:
        out_file.write(cleaned_content)

    print(f"Processed special characters for: {file}")

# **Step 1: Removal of html codes, manage spaces issues, management of geo_metadata and special characters handling**

In [None]:
import os
import json
import re
import requests
from bs4 import BeautifulSoup

# Define the directories
data_path = r"C:\\Users\\larry\\OneDrive\\Documents\\GitHub\\project-aieng-interactive-travel-planner\\data\\landmarks"#\\data_test"
output_path = r"C:\\Users\\larry\\OneDrive\\Documents\\GitHub\\project-aieng-interactive-travel-planner\\data\\landmarks_cleanng\\step1_cleaned_data"

# Create the output directory; exist_ok=True makes this a no-op when it is
# already there and avoids the race between an exists() check and makedirs().
os.makedirs(output_path, exist_ok=True)

# Step 1: Mapping from escape-sequence text (a literal backslash followed by
# "x" and hex digits) to the character it should be replaced with
special_chars_mapping = {
    # accented vowels and ñ
    "\\xc3\\xa1": "á",
    "\\xc3\\xa9": "é",
    "\\xc3\\xad": "í",
    "\\xc3\\xb3": "ó",
    "\\xc3\\xba": "ú",
    "\\xc3\\xb1": "ñ",
    # u with diaeresis, upper and lower case
    "\\xc3\\x9c": "Ü",
    "\\xc3\\xbc": "ü",
    # typographic quotes, replaced by plain ASCII quotes
    "\\xe2\\x80\\x99": "'",
    "\\xe2\\x80\\x9c": "\"",
    "\\xe2\\x80\\x9d": "\""
}


# Step 2: Function for fixing special characters
def fix_special_characters(text):
    """Return ``text`` with every key of special_chars_mapping replaced by its value.

    Replacements are applied one after another in the mapping's order.
    """
    fixed = text
    for escaped in special_chars_mapping:
        fixed = fixed.replace(escaped, special_chars_mapping[escaped])
    return fixed

# Step 3: Clean Function
def clean_html(file_path):
    """Extract main text, de-duplicated links and geo metadata from an HTML file.

    The file is read as UTF-8 and parsed with "html.parser". Returns a dict:
      - "main_content": text of all <h1>, <h2> and <p> elements (inner
        fragments joined with a space, elements joined with newlines), passed
        through fix_special_characters();
      - "relevant_links": the distinct href values of <a> elements, in no
        particular order. Links are NOT validated here;
      - "geo_metadata": whatever extract_geo_metadata_loose() returns for the
        raw file text.

    NOTE(review): extract_geo_metadata_loose is not defined in this cell; it
    must already exist in the kernel namespace from an earlier cell.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        raw_content = source.read()

    soup = BeautifulSoup(raw_content, "html.parser")

    # Put a space after every anchor — presumably so link text does not run
    # into the text that follows it once the tags are dropped
    for anchor in soup.find_all("a"):
        anchor.insert_after(" ")

    text_parts = []
    for element in soup.find_all(['h1', 'h2', 'p']):
        text_parts.append(element.get_text(" ", strip=True))
    cleaned_text = fix_special_characters("\n".join(text_parts))

    # Distinct link targets only
    unique_hrefs = list({anchor['href'] for anchor in soup.find_all('a', href=True)})

    return {
        "main_content": cleaned_text,
        "relevant_links": unique_hrefs,
        "geo_metadata": extract_geo_metadata_loose(raw_content)
    }

# Apply to All Files
for file in os.listdir(data_path):
    file_path = os.path.join(data_path, file)

    # data_path is the landmarks root, and earlier cells of this notebook use
    # folders beneath it (e.g. data_test, cleaned_data, final_data). open()
    # fails on a directory, so only regular files are processed.
    if not os.path.isfile(file_path):
        print(f"Skipped (not a file): {file}")
        continue

    cleaned_data = clean_html(file_path)
    output_file_path = os.path.join(output_path, file)

    # ensure_ascii=False keeps accented characters readable in the JSON output
    with open(output_file_path, "w", encoding="utf-8") as out_file:
        json.dump(cleaned_data, out_file, indent=4, ensure_ascii=False)

    print(f"Processed: {file}")

# **Step 2: Removal of duplicate and broken links**

In [None]:
import os
import json
import requests

# Define directories
cleaned_data_path = r"C:\\Users\\larry\\OneDrive\\Documents\\GitHub\\project-aieng-interactive-travel-planner\\data\\landmarks_cleanng\\step1_cleaned_data"#cleaned_data"
final_data_path = r"C:\\Users\\larry\\OneDrive\\Documents\\GitHub\\project-aieng-interactive-travel-planner\\data\\landmarks_cleanng\\step2_cleaned_data"#final_data"

# Create the final output directory; exist_ok=True makes this a no-op when it
# is already there and avoids the race between an exists() check and makedirs().
os.makedirs(final_data_path, exist_ok=True)

# Function to validate and deduplicate links
def process_relevant_links(links):
    """
    Validates and deduplicates a list of relevant links.

    Only links that start with "http" are considered. Duplicates are dropped
    while keeping the first occurrence, so the result order is stable between
    runs. Each remaining link is checked with an HTTP HEAD request (5 second
    timeout) and kept when the final response status is 200.

    Redirects are followed explicitly: ``requests.head`` does not follow them
    by default, so links that merely redirect would otherwise be discarded.
    Links whose request raises ``requests.RequestException`` are skipped.

    Returns the list of links that passed validation.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order
    unique_links = list(dict.fromkeys(link for link in links if link.startswith("http")))

    valid_links = []
    for link in unique_links:
        try:
            response = requests.head(link, timeout=5, allow_redirects=True)
        except requests.RequestException:
            continue
        if response.status_code == 200:
            valid_links.append(link)

    return valid_links

# Validate the links of every JSON file in cleaned_data_path and save the result in final_data_path
for file_name in os.listdir(cleaned_data_path):
    with open(os.path.join(cleaned_data_path, file_name), "r", encoding="utf-8") as source:
        data = json.load(source)

    # Only touch the link list when the record has one
    if "relevant_links" in data:
        data["relevant_links"] = process_relevant_links(data["relevant_links"])

    destination = os.path.join(final_data_path, file_name)
    with open(destination, "w", encoding="utf-8") as final_file:
        json.dump(data, final_file, indent=4, ensure_ascii=False)

    print(f"Processed and saved: {file_name}")

# **Step 3: Removal of links references and not relevant links**
Note: Removal of link references does not work within this code. LWD 06Feb25

In [None]:
import os
import json
import re

# Define input and output directories
final_data_path = r"C:\\Users\\larry\\OneDrive\\Documents\\GitHub\\project-aieng-interactive-travel-planner\\data\\landmarks_cleanng\\step2_cleaned_data"#final_data"
final_cleaned_data_path = r"C:\\Users\\larry\\OneDrive\\Documents\\GitHub\\project-aieng-interactive-travel-planner\\data\\landmarks_cleanng\\step3_cleaned_data"

# Create the final cleaned output directory; exist_ok=True makes this a no-op
# when it is already there and avoids the race between an exists() check and
# makedirs().
os.makedirs(final_cleaned_data_path, exist_ok=True)

# Define trusted sources for relevant links
TRUSTED_DOMAINS = [
    "wikipedia.org",
    "wikivoyage.org",
    "discoverpuertorico.com",
    "travel.usnews.com",
    "tripadvisor.com",
    "skyscanner.com",
    "lonelyplanet.com",
    "puertorico.com"
]

# Function to filter relevant links
def filter_relevant_links(links):
    """Keep only links whose host is a trusted domain or a sub-domain of one.

    The host name of each link is compared with TRUSTED_DOMAINS. A plain
    substring test on the whole URL would also accept hosts such as
    "notpuertorico.com" or URLs that merely mention a trusted domain in their
    path or query string.

    Links without a parseable host (relative links, links without a scheme)
    are dropped. Duplicates are removed, keeping the first occurrence, so the
    result order is stable.

    Returns the filtered list of links.
    """
    from urllib.parse import urlparse  # local import keeps this cell self-contained

    kept_links = []
    seen = set()
    for link in links:
        try:
            host = (urlparse(link).hostname or "").lower()
        except ValueError:
            # Raised by urlparse for malformed URLs; such links are not trusted
            continue

        is_trusted = any(
            host == domain or host.endswith("." + domain)
            for domain in TRUSTED_DOMAINS
        )
        if is_trusted and link not in seen:
            seen.add(link)
            kept_links.append(link)
    return kept_links

# Function to remove citation numbers like "[1]" from text
def remove_citation_numbers(text):
    """Return ``text`` without numeric citation markers such as "[1]" or "[ 1 ]".

    Whitespace inside the brackets is optional, so markers are removed whether
    or not spaces surround the number. Brackets with non-numeric content are
    left alone.
    """
    return re.sub(r"\[\s*\d+\s*\]", "", text)

# Filter links and strip citation numbers for every JSON file in final_data_path
for file_name in os.listdir(final_data_path):
    with open(os.path.join(final_data_path, file_name), "r", encoding="utf-8") as source:
        data = json.load(source)

    # Keep only links from trusted sources
    if "relevant_links" in data:
        data["relevant_links"] = filter_relevant_links(data["relevant_links"])

    # Drop citation numbers from the main text
    if "main_content" in data:
        data["main_content"] = remove_citation_numbers(data["main_content"])

    # Save under the same name in final_cleaned_data_path
    destination = os.path.join(final_cleaned_data_path, file_name)
    with open(destination, "w", encoding="utf-8") as output_file:
        json.dump(data, output_file, indent=4, ensure_ascii=False)

    print(f"Final cleaned and saved: {file_name}")

# **Step 4: Removal of links references**

In [None]:
import os
import json
import re

# Define directories
cleaned_data_path = r"C:\\Users\\larry\\OneDrive\\Documents\\GitHub\\project-aieng-interactive-travel-planner\\data\\landmarks_cleanng\\step3_cleaned_data"
final_data_path = r"C:\\Users\\larry\\OneDrive\\Documents\\GitHub\\project-aieng-interactive-travel-planner\\data\\landmarks_cleanng\\final_step_cleaned_data"

# Create the output directory; exist_ok=True makes this a no-op when it is
# already there and avoids the race between an exists() check and makedirs().
os.makedirs(final_data_path, exist_ok=True)

# Function to clean the relevant_links list
def clean_relevant_links(links):
    """Return the distinct entries of ``links`` as a list (order not preserved)."""
    return [*set(links)]

# Function to remove reference numbers like [3], [15], etc.
def remove_reference_numbers(text):
    """Return ``text`` without numeric reference markers such as "[3]" or "[ 15 ]".

    Whitespace inside the brackets is optional. The earlier pattern required
    exactly one space on each side of the number, so compact markers like
    "[3]" were left in place. Brackets with non-numeric content are kept.
    """
    return re.sub(r"\[\s*\d+\s*\]", "", text)

# De-duplicate links and strip reference numbers for every JSON file in cleaned_data_path
for file in os.listdir(cleaned_data_path):
    with open(os.path.join(cleaned_data_path, file), "r", encoding="utf-8") as source:
        content = json.load(source)

    # Distinct links only
    if "relevant_links" in content:
        content["relevant_links"] = clean_relevant_links(content["relevant_links"])

    # Main text without reference numbers
    if "main_content" in content:
        content["main_content"] = remove_reference_numbers(content["main_content"])

    # Save under the same name in final_data_path
    destination = os.path.join(final_data_path, file)
    with open(destination, "w", encoding="utf-8") as out_file:
        json.dump(content, out_file, indent=4, ensure_ascii=False)

    print(f"Final cleaned file saved: {file}")

import os
import json
import re

# Define paths
raw_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks\test"
cleaned_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleanng\final_step_cleaned_data"
output_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleanng\10Feb25_step01_title_cat_geo_clean"

# Create the output directory; exist_ok=True makes this a no-op when it is
# already there and avoids the race between an exists() check and makedirs().
os.makedirs(output_path, exist_ok=True)

# Keyword-based category mapping.
# Keys are category labels; each value lists the keyword strings that
# assign_categories() searches for in the text to attach that label.
category_keywords = {
    "Extreme Adventure": ["zipline", "canopy", "extreme sports", "rock climbing", "caving", "spelunking", "skydiving", "bungee jumping", "off-road", "ATV"],
    "Soft Adventure": ["hiking", "kayaking", "snorkeling", "paddleboarding", "scuba diving", "tubing", "water sports"],
    "Outdoor": ["nature", "park", "trail", "scenic", "river", "camping", "wildlife", "birdwatching", "cave", "natural reserve"],
    "Beach": ["beach", "sand", "ocean", "coast", "surfing", "snorkeling", "seaside"],
    "Culture": ["museum", "history", "heritage", "colonial", "art", "landmark", "architecture"],
    "Food": ["food", "cuisine", "restaurant", "culinary", "gastronomy", "local dishes"],
    "Nightlife": ["nightlife", "bar", "party", "music", "club", "cocktail", "dance"],
    "Shopping": ["shopping", "mall", "store", "boutique", "souvenirs", "market"],
    "Wildlife": ["animals", "zoo", "wildlife", "birdwatching", "conservation"],
    "Family-Friendly": ["family", "kids", "playground", "amusement", "theme park", "activities"],
    "Religion": ["church", "cathedral", "parish", "temple", "sanctuary", "basilica", "shrine", "holy", "worship"],
    "Wellness & Relaxation": ["spa", "wellness", "yoga", "retreat", "meditation", "hot springs", "thermal baths", "relaxation", "peaceful"],
    "Luxury & Resorts": ["resort", "luxury", "boutique hotel", "exclusive", "private beach", "high-end"],
    "Festivals & Events": ["festival", "event", "parade", "local traditions", "carnival", "concert", "Puerto Rican festival"]
}

# Function to extract coordinates
def extract_coordinates(text):
    """Return the first "<decimal>; <decimal>" pair in ``text`` as two floats.

    Each number needs a decimal point and may carry a leading minus sign.
    The pair is returned as a two-element list in the order it appears;
    None is returned when the text contains no such pair.
    """
    found = re.search(r'(-?\d+\.\d+);\s*(-?\d+\.\d+)', text)
    if found is None:
        return None
    first, second = found.groups()
    return [float(first), float(second)]

# Function to remove pronunciation sections (generic patterns within parentheses)
def remove_pronunciation(text):
    """Remove parenthesised pronunciation notes from ``text``.

    A parenthesis is removed when, after optional leading whitespace, its
    content starts with "US:", "UK:" or "Spanish:" (a single space before the
    colon is allowed), contains "pronunciation:", or starts with a
    slash-delimited transcription such as "/.../". The match runs up to the
    first closing parenthesis.

    The part before the keyword may not contain parentheses. With an
    unrestricted lazy wildcard, the match could begin at an earlier, unrelated
    "(" and delete all text up to a later pronunciation note.
    """
    pronunciation_pattern = re.compile(
        r"\(\s*(?:US ?:|UK ?:|Spanish ?:|[^()]*?pronunciation:|/[^()]*?/)[^)]*\)"
    )
    # Remove all matching patterns
    return pronunciation_pattern.sub("", text)

# Function to assign categories based on keywords
def assign_categories(text, keyword_map=None):
    """Return the categories whose keywords occur in ``text``.

    Matching is case-insensitive on both sides: the text and each keyword are
    lower-cased before comparison, so keywords written with capitals still
    match. Matching is by substring, so a short keyword also matches inside a
    longer word.

    Parameters:
        text: the text to classify.
        keyword_map: optional dict mapping category label -> list of keywords.
            Defaults to the module-level ``category_keywords``.

    Returns a list of category labels in the order of the mapping. Each
    label appears at most once.
    """
    if keyword_map is None:
        keyword_map = category_keywords

    text_lower = text.lower()
    return [
        category
        for category, keywords in keyword_map.items()
        if any(keyword.lower() in text_lower for keyword in keywords)
    ]

# Process all files: enrich each cleaned JSON record with title, coordinates and categories
for file_name in os.listdir(cleaned_data_path):
    raw_file_path = os.path.join(raw_data_path, file_name)
    cleaned_file_path = os.path.join(cleaned_data_path, file_name)

    if not os.path.exists(raw_file_path):
        print(f"Skipping {file_name}: Raw file not found.")
        continue

    # Load raw and cleaned data
    with open(raw_file_path, "r", encoding="utf-8") as raw_file:
        raw_content = raw_file.read()

    with open(cleaned_file_path, "r", encoding="utf-8") as cleaned_file:
        cleaned_data = json.load(cleaned_file)

    # Extract title (use file name as fallback)
    title_match = re.search(r"<h1>(.*?)</h1>", raw_content)
    title = title_match.group(1).strip() if title_match else file_name.replace(".txt", "")

    # Extract coordinates
    coordinates = extract_coordinates(raw_content)

    # Assign categories (done on the text before pronunciation notes are removed)
    categories = assign_categories(cleaned_data["main_content"])

    # Remove pronunciation details
    cleaned_data["main_content"] = remove_pronunciation(cleaned_data["main_content"])

    # geo_metadata may be missing or null: the extraction step of this
    # notebook returns None when nothing is found. Fall back to an empty dict
    # so the lookup below cannot fail.
    existing_geo = cleaned_data.get("geo_metadata") or {}

    # Update geo_metadata
    cleaned_data["geo_metadata"] = {
        "coordinates": coordinates if coordinates else existing_geo.get("coordinates", "Not Found"),
        "title": title,
        "type": "Point"
    }

    # Add categories
    cleaned_data["categories"] = categories

    # Save updated JSON
    output_file_path = os.path.join(output_path, file_name)
    with open(output_file_path, "w", encoding="utf-8") as out_file:
        json.dump(cleaned_data, out_file, indent=4, ensure_ascii=False)

    print(f"Updated and saved: {file_name}")

The next cell is an updated version of the previous one, with the new helper functions and logic integrated into the existing code:



import os
import json
import re

# Define paths
raw_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks\test"
cleaned_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleanng\final_step_cleaned_data"
output_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleanng\10Feb25_step01_title_cat_geo_clean"

# Create the output directory; exist_ok=True makes this a no-op when it is
# already there and avoids the race between an exists() check and makedirs().
os.makedirs(output_path, exist_ok=True)

# Keyword-based category mapping.
# Keys are category labels; each value lists the keyword strings that
# assign_categories() searches for in the text to attach that label.
category_keywords = {
    "Extreme Adventure": ["zipline", "canopy", "extreme sports", "rock climbing", "caving", "spelunking", "skydiving", "bungee jumping", "off-road", "ATV"],
    "Soft Adventure": ["hiking", "kayaking", "snorkeling", "paddleboarding", "scuba diving", "tubing", "water sports"],
    "Outdoor": ["nature", "park", "trail", "scenic", "river", "camping", "wildlife", "birdwatching", "cave", "natural reserve"],
    "Beach": ["beach", "sand", "ocean", "coast", "surfing", "snorkeling", "seaside"],
    "Culture": ["museum", "history", "heritage", "colonial", "art", "landmark", "architecture"],
    "Food": ["food", "cuisine", "restaurant", "culinary", "gastronomy", "local dishes"],
    "Nightlife": ["nightlife", "bar", "party", "music", "club", "cocktail", "dance"],
    "Shopping": ["shopping", "mall", "store", "boutique", "souvenirs", "market"],
    "Wildlife": ["animals", "zoo", "wildlife", "birdwatching", "conservation"],
    "Family-Friendly": ["family", "kids", "playground", "amusement", "theme park", "activities"],
    "Religion": ["church", "cathedral", "parish", "temple", "sanctuary", "basilica", "shrine", "holy", "worship"],
    "Wellness & Relaxation": ["spa", "wellness", "yoga", "retreat", "meditation", "hot springs", "thermal baths", "relaxation", "peaceful"],
    "Luxury & Resorts": ["resort", "luxury", "boutique hotel", "exclusive", "private beach", "high-end"],
    "Festivals & Events": ["festival", "event", "parade", "local traditions", "carnival", "concert", "Puerto Rican festival"]
}

# Function to extract coordinates
def extract_coordinates(text):
    """Return the first "<decimal>; <decimal>" pair in ``text`` as two floats.

    Each number needs a decimal point and may carry a leading minus sign.
    The pair is returned as a two-element list in the order it appears;
    None is returned when the text contains no such pair.
    """
    found = re.search(r'(-?\d+\.\d+);\s*(-?\d+\.\d+)', text)
    if found is None:
        return None
    first, second = found.groups()
    return [float(first), float(second)]

# Function to remove pronunciation sections (generic patterns within parentheses)
def remove_pronunciation(text):
    """Remove parenthesised pronunciation notes from ``text``.

    A parenthesis is removed when, after optional leading whitespace, its
    content starts with "US:", "UK:" or "Spanish:" (a single space before the
    colon is allowed), contains "pronunciation:", or starts with a
    slash-delimited transcription such as "/.../". The match runs up to the
    first closing parenthesis.

    The part before the keyword may not contain parentheses. With an
    unrestricted lazy wildcard, the match could begin at an earlier, unrelated
    "(" and delete all text up to a later pronunciation note.
    """
    pronunciation_pattern = re.compile(
        r"\(\s*(?:US ?:|UK ?:|Spanish ?:|[^()]*?pronunciation:|/[^()]*?/)[^)]*\)"
    )
    # Remove all matching patterns
    return pronunciation_pattern.sub("", text)

# Function to assign categories based on keywords
def assign_categories(text, keyword_map=None):
    """Return the categories whose keywords occur in ``text``.

    Matching is case-insensitive on both sides: the text and each keyword are
    lower-cased before comparison, so keywords written with capitals still
    match. Matching is by substring, so a short keyword also matches inside a
    longer word.

    Parameters:
        text: the text to classify.
        keyword_map: optional dict mapping category label -> list of keywords.
            Defaults to the module-level ``category_keywords``.

    Returns a list of category labels in the order of the mapping. Each
    label appears at most once.
    """
    if keyword_map is None:
        keyword_map = category_keywords

    text_lower = text.lower()
    return [
        category
        for category, keywords in keyword_map.items()
        if any(keyword.lower() in text_lower for keyword in keywords)
    ]

# Function to update geo_metadata and categories in cleaned files
def update_cleaned_file(cleaned_data, title, coordinates, categories):
    """Rewrite ``geo_metadata`` and set ``categories`` on a cleaned record.

    Parameters:
        cleaned_data: dict of one cleaned record; modified in place.
        title: fallback title, used when the record's existing geo_metadata
            has no "title" entry.
        coordinates: value stored as-is under geo_metadata["coordinates"].
        categories: value stored under "categories".

    An existing geo_metadata that is missing or None is treated as empty, so
    the title lookup cannot fail on records without geo data.

    Returns the same (mutated) ``cleaned_data`` dict.
    """
    existing_geo = cleaned_data.get("geo_metadata") or {}

    cleaned_data["geo_metadata"] = {
        "coordinates": coordinates,
        "title": existing_geo.get("title", title),
        "type": "Point"
    }

    # Add categories
    cleaned_data["categories"] = categories

    return cleaned_data

# Function to clean and update main content
def update_main_content(cleaned_data):
    """Strip pronunciation notes from the record's "main_content", if present.

    The dict is modified in place and also returned. Records without a
    "main_content" key are returned unchanged.
    """
    if "main_content" not in cleaned_data:
        return cleaned_data

    original_text = cleaned_data["main_content"]
    cleaned_data["main_content"] = remove_pronunciation(original_text)
    return cleaned_data

# Enrich every cleaned record; a failure on one file is reported and the loop moves on
for file in os.listdir(cleaned_data_path):
    try:
        # Cleaned JSON record
        with open(os.path.join(cleaned_data_path, file), "r", encoding="utf-8") as cleaned_handle:
            cleaned_data = json.load(cleaned_handle)

        # Raw source text with the same file name
        with open(os.path.join(raw_data_path, file), "r", encoding="utf-8") as raw_handle:
            raw_content = raw_handle.read()

        # Title from the first <h1>, otherwise the file name without ".txt"
        heading = re.search(r"<h1>(.*?)</h1>", raw_content)
        if heading:
            title = heading.group(1).strip()
        else:
            title = file.replace(".txt", "")

        coordinates = extract_coordinates(raw_content)
        categories = assign_categories(raw_content)

        # Update geo metadata / categories first, then the main text
        cleaned_data = update_cleaned_file(cleaned_data, title, coordinates, categories)
        cleaned_data = update_main_content(cleaned_data)

        # Save under the same name in output_path
        with open(os.path.join(output_path, file), "w", encoding="utf-8") as out_handle:
            json.dump(cleaned_data, out_handle, indent=4, ensure_ascii=False)

        print(f"Updated and saved: {file}")

    except Exception as e:
        print(f"Error processing file {file}: {e}")



This code integrates the new functions `update_cleaned_file` and `update_main_content` into the existing processing loop. It ensures that the cleaned data is updated with the title, coordinates, and categories, and that the main content is cleaned by removing pronunciation sections. The updated cleaned data is then saved to the specified output path.

import os
import json
import re

# Define paths
raw_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks\test"
cleaned_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleanng\final_step_cleaned_data"
output_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleanng\10Feb25a_step01_title_cat_geo_clean"
municipality_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\municipalities_cleaning\step3_final_cleaned_data"

# Create the output directory; exist_ok=True makes this a no-op when it is
# already there and avoids the race between an exists() check and makedirs().
os.makedirs(output_path, exist_ok=True)

# Keyword-based category mapping.
# Keys are category labels; each value lists the keyword strings that
# assign_categories() searches for in the text to attach that label.
category_keywords = {
    "Extreme Adventure": ["zipline", "canopy", "bike", "rappelling", "mountain bike", "paragliding", "skate", "skating", "extreme sports", "rock climbing", "caving", "spelunking", "skydiving", "bungee jumping", "off-road", "ATV"],
    "Soft Adventure": ["hiking", "kayaking", "snorkeling", "paddleboarding", "scuba diving", "tubing", "water sports"],
    "Outdoor": ["nature", "park", "trail", "scenic", "river", "camping", "wildlife", "birdwatching", "cave", "natural reserve"],
    "Beach": ["beach", "sand", "ocean", "coast", "surfing", "snorkeling", "seaside"],
    "Culture": ["museum", "history", "heritage", "colonial", "art", "landmark", "architecture"],
    "Food": ["food", "cuisine", "restaurant", "culinary", "gastronomy", "local dishes"],
    "Nightlife": ["nightlife", "bar", "party", "music", "club", "cocktail", "dance"],
    "Shopping": ["shopping", "mall", "store", "boutique", "souvenirs", "market"],
    "Wildlife": ["animals", "zoo", "wildlife", "birdwatching", "conservation"],
    "Family-Friendly": ["family", "kids", "playground", "amusement", "theme park", "activities"],
    "Religion": ["church", "cathedral", "parish", "temple", "sanctuary", "basilica", "shrine", "holy", "worship"],
    "Wellness & Relaxation": ["spa", "wellness", "yoga", "retreat", "meditation", "hot springs", "thermal baths", "relaxation", "peaceful"],
    "Luxury & Resorts": ["resort", "luxury", "boutique hotel", "exclusive", "private beach", "high-end"],
    "Festivals & Events": ["festival", "event", "parade", "local traditions", "carnival", "concert", "Puerto Rican festival"]
}

# Function to extract coordinates
def extract_coordinates(text):
    """Return the first ``lat; lon`` decimal pair found in ``text``.

    The pair is two signed decimal numbers separated by a semicolon, e.g.
    ``18.4655; -66.1057``. Returns ``[lat, lon]`` as floats, or ``None`` when
    the text contains no such pair.
    """
    first_pair = re.search(r'(-?\d+\.\d+);\s*(-?\d+\.\d+)', text)
    if first_pair is None:
        return None
    return [float(part) for part in first_pair.groups()]

# Function to remove pronunciation sections (generic patterns within parentheses)
def remove_pronunciation(text):
    """Strip parenthesised pronunciation notes from ``text``.

    A parenthesised group is removed when it starts with ``US:``, ``UK:`` or
    ``Spanish:`` (optional whitespace before the colon), contains
    ``pronunciation:``, or starts with a ``/.../`` transcription.

    The match is confined to a single parenthesised group (one level of
    nested parentheses is tolerated). The previous ``.*?`` + DOTALL pattern
    could start at an earlier, unrelated "(" and delete everything up to the
    pronunciation note. One space directly before the group is removed too,
    so no double space is left behind.
    """
    pronunciation_pattern = re.compile(
        r" ?\(\s*"
        r"(?:(?:US|UK|Spanish)\s*:|[^()]*?pronunciation:|/[^()]*?/)"
        # Rest of the group: plain text plus at most one nested (...) level.
        r"[^()]*(?:\([^()]*\)[^()]*)*"
        r"\)"
    )
    return pronunciation_pattern.sub("", text)

# Function to assign categories based on keywords
def assign_categories(text, keyword_map=None):
    """Return the categories whose keywords occur in ``text``.

    Args:
        text: Free text to classify.
        keyword_map: Optional ``{category: [keywords]}`` mapping. Defaults to
            the module-level ``category_keywords``.

    Returns:
        A list of category names in ``keyword_map`` order, each at most once.

    Keywords are matched case-insensitively as whole words or phrases. Plain
    substring matching produced false positives ("art" inside "party", "spa"
    inside "spanish"), and capitalised keywords never matched the lowercased
    text.
    """
    if keyword_map is None:
        keyword_map = category_keywords
    text_lower = text.lower()
    assigned_categories = []
    for category, keywords in keyword_map.items():
        for keyword in keywords:
            # Lookarounds instead of \b so phrases and hyphenated keywords work.
            whole_word = r"(?<!\w)" + re.escape(keyword.lower()) + r"(?!\w)"
            if re.search(whole_word, text_lower):
                assigned_categories.append(category)
                break
    return assigned_categories

# Function to update geo_metadata and categories in cleaned files
def update_cleaned_file(cleaned_data, title, coordinates, categories):
    """Rebuild ``geo_metadata`` and set ``categories`` on ``cleaned_data``.

    Args:
        cleaned_data: Record dict; modified in place and returned.
        title: Fallback title, used when the record has no stored title.
        coordinates: Extracted coordinates; when falsy, the record's stored
            coordinates are kept, or ``"Not Found"`` if there are none.
        categories: Value stored under ``"categories"``.

    Returns:
        The same ``cleaned_data`` dict.
    """
    # A missing or None "geo_metadata" used to raise when no coordinates were
    # extracted; treat both cases as an empty mapping.
    existing_geo = cleaned_data.get("geo_metadata") or {}

    cleaned_data["geo_metadata"] = {
        "coordinates": coordinates if coordinates else existing_geo.get("coordinates", "Not Found"),
        "title": existing_geo.get("title", title),
        "type": "Point"
    }

    # Add categories
    cleaned_data["categories"] = categories

    return cleaned_data

# Function to clean and update main content
def update_main_content(cleaned_data):
    """Strip pronunciation notes from ``cleaned_data["main_content"]``.

    Records without a ``"main_content"`` key are returned untouched. The
    dict is modified in place and returned.
    """
    if "main_content" not in cleaned_data:
        return cleaned_data
    original_text = cleaned_data["main_content"]
    cleaned_data["main_content"] = remove_pronunciation(original_text)
    return cleaned_data

# Process files and handle exceptions
# Pass 1: enrich every cleaned record with title, coordinates and categories,
# strip pronunciation notes, and write the result to output_path.
files_without_coordinates = []
for file in os.listdir(cleaned_data_path):
    try:
        cleaned_file_path = os.path.join(cleaned_data_path, file)
        raw_file_path = os.path.join(raw_data_path, file)

        # Load cleaned data
        with open(cleaned_file_path, "r", encoding="utf-8") as f:
            cleaned_data = json.load(f)

        # Extract title, coordinates, and categories from raw data
        with open(raw_file_path, "r", encoding="utf-8") as f:
            raw_content = f.read()
        # Run the <h1> search once and reuse the match.
        h1_match = re.search(r"<h1>(.*?)</h1>", raw_content)
        title = h1_match.group(1).strip() if h1_match else file.replace(".txt", "")
        coordinates = extract_coordinates(raw_content)
        categories = assign_categories(cleaned_data["main_content"])

        # Update cleaned data
        cleaned_data = update_cleaned_file(cleaned_data, title, coordinates, categories)
        cleaned_data = update_main_content(cleaned_data)

        # Save updated cleaned data
        output_file_path = os.path.join(output_path, file)
        with open(output_file_path, "w", encoding="utf-8") as f:
            json.dump(cleaned_data, f, indent=4, ensure_ascii=False)

        print(f"Updated and saved: {file}")

        # Decide from what was actually saved: update_cleaned_file keeps
        # coordinates already stored in the record, and those must not be
        # overwritten by a municipality centroid in pass 2.
        saved_coordinates = cleaned_data["geo_metadata"].get("coordinates")
        if not saved_coordinates or saved_coordinates == "Not Found":
            files_without_coordinates.append(file)

    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"Error processing file {file}: {e}")

# List files without coordinates
print("Files without coordinates:")
for file in files_without_coordinates:
    print(file)

# Pass 2: fall back to the coordinates of a municipality whose file name
# (minus ".json") appears in the record's title.
for file in files_without_coordinates:
    try:
        cleaned_file_path = os.path.join(output_path, file)
        with open(cleaned_file_path, "r", encoding="utf-8") as f:
            cleaned_data = json.load(f)

        title_lower = cleaned_data["geo_metadata"]["title"].lower()
        matched_municipality = None
        for municipality_file in os.listdir(municipality_data_path):
            municipality_name = municipality_file.replace(".json", "")
            if municipality_name.lower() in title_lower:
                municipality_file_path = os.path.join(municipality_data_path, municipality_file)
                with open(municipality_file_path, "r", encoding="utf-8") as mf:
                    municipality_data = json.load(mf)
                cleaned_data["geo_metadata"]["coordinates"] = municipality_data["geo_metadata"]["coordinates"]
                matched_municipality = municipality_name
                break

        if matched_municipality is None:
            # Nothing changed, so do not rewrite the file or claim success.
            print(f"No municipality match for: {file}")
            continue

        # Save updated cleaned data with municipality coordinates
        with open(cleaned_file_path, "w", encoding="utf-8") as f:
            json.dump(cleaned_data, f, indent=4, ensure_ascii=False)

        print(f"Assigned coordinates to: {file}")

    except Exception as e:
        print(f"Error assigning coordinates to file {file}: {e}")

Entiendo, vamos a realizar las siguientes modificaciones para solucionar los problemas:

1. Asegurarnos de que la verificación de `geo_metadata` maneje correctamente los casos en los que no existe.
2. Ajustar la asignación de categorías para que haga una coincidencia exacta de las palabras clave.
3. Asegurarnos de que el sistema intente aplicar la información de los municipios cuando no se encuentren coordenadas.

Aquí está el código actualizado:



import os
import json
import re

# Define paths
# NOTE(review): "landmarks_cleanng" is spelled differently from the
# "landmarks_cleaning" folder used by later cells -- confirm this is the
# intended directory before running.
raw_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks\test"
cleaned_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleanng\final_step_cleaned_data"
output_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleanng\10Feb25b_step01_title_cat_geo_clean"
municipality_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\municipalities_cleaning\step3_final_cleaned_data"

# Create the output directory; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs(output_path, exist_ok=True)

# Keyword-based category mapping
# Maps each category name to the keywords that trigger it.
# Keywords are compared against lowercased text, so every keyword must itself
# be lowercase ("ATV" / "Puerto Rican festival" could never match before).
# "forrest" was a typo for "forest".
category_keywords = {
    "Extreme Adventure": ["zipline", "canopy", "rappelling", "mountain bike", "paragliding", "skate", "skating", "extreme sports", "rock climbing", "caving", "spelunking", "skydiving", "bungee jumping", "off-road", "atv"],
    "Soft Adventure": ["hiking", "kayaking", "bike", "snorkeling", "paddleboarding", "scuba diving", "tubing", "water sports"],
    "Outdoor": ["nature", "park", "trail", "scenic", "river", "trails", "camping", "picnic", "wildlife", "birdwatching", "cave", "natural reserve", "outdoor", "forest"],
    "Beach": ["beach", "sand", "ocean", "coast", "surfing", "snorkeling", "seaside"],
    "Culture": ["museum", "history", "heritage", "colonial", "art", "architecture"],
    "Food": ["food", "cuisine", "restaurant", "culinary", "gastronomy", "local dishes"],
    "Nightlife": ["nightlife", "bar", "party", "dance music", "night club", "cocktail", "dancing stage", "pub", "sport bar"],
    "Shopping": ["shopping", "shop", "mall", "store", "boutique", "souvenirs", "market"],
    "Wildlife": ["animals", "zoo", "wildlife", "birdwatching", "feeding"],
    "Family-Friendly": ["family", "kids", "playground", "camping", "park", "amusement", "theme park", "picnic", "kite flying", "kites"],
    "Religion": ["church", "cathedral", "parish", "temple", "sanctuary", "basilica", "shrine", "holy", "worship"],
    "Wellness & Relaxation": ["spa", "wellness", "yoga", "retreat", "meditation", "hot springs", "thermal baths", "relaxation", "peaceful"],
    "Luxury & Resorts": ["resort", "luxury", "boutique hotel", "exclusive", "private beach", "high-end", "hotel", "inn", "lodging"],
    "Festivals & Events": ["festival", "event", "parade", "local traditions", "carnival", "concert", "puerto rican festival"]
}

# Function to extract coordinates
def extract_coordinates(text):
    """Find the first ``<decimal>; <decimal>`` pair in ``text``.

    Returns the two numbers as a ``[lat, lon]`` list of floats, or ``None``
    if the text has no semicolon-separated decimal pair.
    """
    hit = next(re.finditer(r'(-?\d+\.\d+);\s*(-?\d+\.\d+)', text), None)
    return [float(hit.group(1)), float(hit.group(2))] if hit else None

# Function to remove pronunciation sections (generic patterns within parentheses)
def remove_pronunciation(text):
    """Strip parenthesised pronunciation notes from ``text``.

    A parenthesised group is removed when it starts with ``US:``, ``UK:`` or
    ``Spanish:`` (optional whitespace before the colon), contains
    ``pronunciation:``, or starts with a ``/.../`` transcription.

    Matching never crosses a closing parenthesis (one nested level is
    tolerated). The earlier ``.*?`` + DOTALL pattern could begin at an
    unrelated "(" and swallow all text up to the pronunciation note. A single
    space directly before the group is removed as well.
    """
    opener = r"(?:(?:US|UK|Spanish)\s*:|[^()]*?pronunciation:|/[^()]*?/)"
    # Remainder of the group: plain text plus at most one nested (...) level.
    remainder = r"[^()]*(?:\([^()]*\)[^()]*)*"
    return re.sub(r" ?\(\s*" + opener + remainder + r"\)", "", text)

# Function to assign categories based on keywords
def assign_categories(text, keyword_map=None):
    """Return the categories whose keywords occur in ``text``.

    Args:
        text: Free text to classify.
        keyword_map: Optional ``{category: [keywords]}`` mapping. Defaults to
            the module-level ``category_keywords``.

    Returns:
        A list of category names in ``keyword_map`` order, each at most once.

    Keywords are matched case-insensitively as whole words or phrases.
    Testing membership in a list of single-word tokens meant multi-word and
    hyphenated keywords ("mountain bike", "off-road") and capitalised ones
    ("ATV") could never match.
    """
    if keyword_map is None:
        keyword_map = category_keywords
    text_lower = text.lower()

    def occurs(keyword):
        # Lookarounds instead of \b so phrases and hyphenated keywords work.
        return re.search(r"(?<!\w)" + re.escape(keyword.lower()) + r"(?!\w)", text_lower) is not None

    return [
        category
        for category, keywords in keyword_map.items()
        if any(occurs(keyword) for keyword in keywords)
    ]

# Function to update geo_metadata and categories in cleaned files
def update_cleaned_file(cleaned_data, title, coordinates, categories):
    """Fill in ``geo_metadata`` and ``categories`` on ``cleaned_data``.

    A missing or ``None`` ``geo_metadata`` is replaced by an empty dict.
    Extracted ``coordinates`` win when truthy; otherwise stored coordinates
    are kept, defaulting to ``"Not Found"``. A stored title is kept; ``title``
    is used only when none is stored. The record is modified in place and
    returned.
    """
    geo = cleaned_data.get("geo_metadata")
    if geo is None:
        geo = cleaned_data["geo_metadata"] = {}

    if not coordinates:
        coordinates = geo.get("coordinates", "Not Found")
    geo["coordinates"] = coordinates
    geo.setdefault("title", title)
    geo["type"] = "Point"

    cleaned_data["categories"] = categories
    return cleaned_data

# Function to clean and update main content
def update_main_content(cleaned_data):
    """Run ``remove_pronunciation`` over the record's ``"main_content"``.

    Does nothing when the key is absent. Modifies ``cleaned_data`` in place
    and returns it.
    """
    has_content = "main_content" in cleaned_data
    if has_content:
        stripped = remove_pronunciation(cleaned_data["main_content"])
        cleaned_data["main_content"] = stripped
    return cleaned_data

# Process files and handle exceptions
# Pass 1: enrich every cleaned record with title, coordinates and categories,
# strip pronunciation notes, and write the result to output_path.
files_without_coordinates = []
for file in os.listdir(cleaned_data_path):
    try:
        cleaned_file_path = os.path.join(cleaned_data_path, file)
        raw_file_path = os.path.join(raw_data_path, file)

        # Load cleaned data
        with open(cleaned_file_path, "r", encoding="utf-8") as f:
            cleaned_data = json.load(f)

        # Extract title, coordinates, and categories from raw data
        with open(raw_file_path, "r", encoding="utf-8") as f:
            raw_content = f.read()
        # Run the <h1> search once and reuse the match.
        h1_match = re.search(r"<h1>(.*?)</h1>", raw_content)
        title = h1_match.group(1).strip() if h1_match else file.replace(".txt", "")
        coordinates = extract_coordinates(raw_content)
        categories = assign_categories(cleaned_data["main_content"])

        # Update cleaned data
        cleaned_data = update_cleaned_file(cleaned_data, title, coordinates, categories)
        cleaned_data = update_main_content(cleaned_data)

        # Save updated cleaned data
        output_file_path = os.path.join(output_path, file)
        with open(output_file_path, "w", encoding="utf-8") as f:
            json.dump(cleaned_data, f, indent=4, ensure_ascii=False)

        print(f"Updated and saved: {file}")

        # Decide from what was actually saved: update_cleaned_file keeps
        # coordinates already stored in the record, and those must not be
        # overwritten by a municipality centroid in pass 2.
        saved_coordinates = cleaned_data["geo_metadata"].get("coordinates")
        if not saved_coordinates or saved_coordinates == "Not Found":
            files_without_coordinates.append(file)

    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"Error processing file {file}: {e}")

# List files without coordinates
print("Files without coordinates:")
for file in files_without_coordinates:
    print(file)

# Pass 2: fall back to the coordinates of a municipality whose file name
# (minus ".json") appears in the record's title.
for file in files_without_coordinates:
    try:
        cleaned_file_path = os.path.join(output_path, file)
        with open(cleaned_file_path, "r", encoding="utf-8") as f:
            cleaned_data = json.load(f)

        title_lower = cleaned_data["geo_metadata"]["title"].lower()
        matched_municipality = None
        for municipality_file in os.listdir(municipality_data_path):
            municipality_name = municipality_file.replace(".json", "")
            if municipality_name.lower() in title_lower:
                municipality_file_path = os.path.join(municipality_data_path, municipality_file)
                with open(municipality_file_path, "r", encoding="utf-8") as mf:
                    municipality_data = json.load(mf)
                cleaned_data["geo_metadata"]["coordinates"] = municipality_data["geo_metadata"]["coordinates"]
                matched_municipality = municipality_name
                break

        if matched_municipality is None:
            # Nothing changed, so do not rewrite the file or claim success.
            print(f"No municipality match for: {file}")
            continue

        # Save updated cleaned data with municipality coordinates
        with open(cleaned_file_path, "w", encoding="utf-8") as f:
            json.dump(cleaned_data, f, indent=4, ensure_ascii=False)

        print(f"Assigned coordinates to: {file}")

    except Exception as e:
        print(f"Error assigning coordinates to file {file}: {e}")



Este código realiza las siguientes modificaciones:
1. Asegura que la verificación de `geo_metadata` maneje correctamente los casos en los que no existe.
2. Ajusta la asignación de categorías para que haga una coincidencia exacta de las palabras clave.
3. Asegura que el sistema intente aplicar la información de los municipios cuando no se encuentren coordenadas.

Asegúrate de que los archivos de municipios en la carpeta `municipality_data_path` tengan la estructura correcta y contengan las coordenadas necesarias.

# **Final Cleaning Step No. 1: Adding Title, geo info and Categories & removal of pronunciation section**

In [None]:
import os
import json
import re

# Define paths
# NOTE(review): these are absolute, machine-specific paths; the notebook only
# runs where this exact directory layout exists.
raw_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks"
cleaned_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleaning\final_step_cleaned_data"
output_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleaning\10Feb25_final_step01_title_cat_geo_clean"
municipality_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\municipalities_cleaning\step3_final_cleaned_data"
no_coordinates_output_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleaning\no_coordinates_records.json"

# Create the output directory; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs(output_path, exist_ok=True)

# Keyword-based category mapping
# Maps each category name to the keywords that trigger it.
# Keywords are compared against lowercased text, so every keyword must itself
# be lowercase ("ATV" / "Puerto Rican festival" could never match before).
# "forrest" was a typo for "forest".
category_keywords = {
    "Extreme Adventure": ["zipline", "canopy", "rappelling", "mountain bike", "paragliding", "skate", "skating", "extreme sports", "rock climbing", "caving", "spelunking", "skydiving", "bungee jumping", "off-road", "atv"],
    "Soft Adventure": ["hiking", "kayaking", "bike", "snorkeling", "paddleboarding", "scuba diving", "tubing", "water sports"],
    "Outdoor": ["nature", "park", "trail", "scenic", "river", "trails", "camping", "picnic", "wildlife", "birdwatching", "cave", "natural reserve", "outdoor", "forest"],
    "Beach": ["beach", "sand", "ocean", "coast", "surfing", "snorkeling", "seaside"],
    "Culture": ["museum", "history", "heritage", "colonial", "art", "architecture"],
    "Food": ["food", "cuisine", "restaurant", "culinary", "gastronomy", "local dishes"],
    "Nightlife": ["nightlife", "bar", "party", "dance music", "night club", "cocktail", "dancing stage", "pub", "sport bar"],
    "Shopping": ["shopping", "shop", "mall", "store", "boutique", "souvenirs", "market"],
    "Wildlife": ["animals", "zoo", "wildlife", "birdwatching", "feeding"],
    "Family-Friendly": ["family", "kids", "playground", "camping", "park", "amusement", "theme park", "picnic", "kite flying", "kites"],
    "Religion": ["church", "cathedral", "parish", "temple", "sanctuary", "basilica", "shrine", "holy", "worship"],
    "Wellness & Relaxation": ["spa", "wellness", "yoga", "retreat", "meditation", "hot springs", "thermal baths", "relaxation", "peaceful"],
    "Luxury & Resorts": ["resort", "luxury", "boutique hotel", "exclusive", "private beach", "high-end", "hotel", "inn", "lodging"],
    "Festivals & Events": ["festival", "event", "parade", "local traditions", "carnival", "concert", "puerto rican festival"]
}

# Function to extract coordinates
def extract_coordinates(text):
    """Extract the first semicolon-separated decimal pair from ``text``.

    Returns ``[lat, lon]`` (floats) for the earliest ``<decimal>; <decimal>``
    occurrence, or ``None`` when there is none.
    """
    found = re.compile(r'(-?\d+\.\d+);\s*(-?\d+\.\d+)').search(text)
    if not found:
        return None
    lat_text, lon_text = found.group(1, 2)
    return [float(lat_text), float(lon_text)]

# Function to remove pronunciation sections (generic patterns within parentheses)
def remove_pronunciation(text):
    """Strip parenthesised pronunciation notes from ``text``.

    A parenthesised group is removed when it starts with ``US:``, ``UK:`` or
    ``Spanish:`` (optional whitespace before the colon), contains
    ``pronunciation:``, or starts with a ``/.../`` transcription.

    The match stays inside one parenthesised group (a single nested level is
    allowed). The old ``.*?`` + DOTALL pattern could start at an earlier,
    unrelated "(" and delete all text up to the pronunciation note. One space
    directly before the group is dropped so no double space remains.
    """
    pronunciation_pattern = re.compile(
        r"""
        [ ]?\(\s*                          # optional leading space, opening paren
        (?: (?:US|UK|Spanish)\s*:          # "US:", "UK :", "Spanish:" ...
          | [^()]*?pronunciation:          # "... pronunciation:" in this group
          | /[^()]*?/                      # "/.../" transcription
        )
        [^()]* (?: \([^()]*\) [^()]* )*    # rest of group, one nested level
        \)
        """,
        flags=re.VERBOSE,
    )
    return pronunciation_pattern.sub("", text)

# Function to assign categories based on keywords
def assign_categories(text, keyword_map=None):
    """Return the categories whose keywords occur in ``text``.

    Args:
        text: Free text to classify.
        keyword_map: Optional ``{category: [keywords]}`` mapping. Defaults to
            the module-level ``category_keywords``.

    Returns:
        A list of category names in ``keyword_map`` order, each at most once.

    Keywords are matched case-insensitively as whole words or phrases.
    Testing membership in a list of single-word tokens meant multi-word and
    hyphenated keywords ("hot springs", "high-end") and capitalised ones
    ("ATV") could never match.
    """
    if keyword_map is None:
        keyword_map = category_keywords
    text_lower = text.lower()
    assigned_categories = []
    for category, keywords in keyword_map.items():
        # One alternation per category; lookarounds instead of \b so phrases
        # and hyphenated keywords are matched as units.
        alternatives = "|".join(re.escape(keyword.lower()) for keyword in keywords)
        if alternatives and re.search(r"(?<!\w)(?:" + alternatives + r")(?!\w)", text_lower):
            assigned_categories.append(category)
    return assigned_categories

# Function to format title from filename
def format_title(filename):
    """Turn a file name such as ``el_morro.txt`` into a title (``El Morro``).

    Removes ``.txt``, treats underscores as spaces, and capitalises each word
    with ``str.capitalize`` (so the rest of every word is lowercased).
    """
    stem = filename.replace(".txt", "")
    pieces = stem.replace("_", " ").split()
    return ' '.join(map(str.capitalize, pieces))

# Function to update geo_metadata and categories in cleaned files
def update_cleaned_file(cleaned_data, title, coordinates, categories):
    """Set ``geo_metadata`` fields and ``categories`` on ``cleaned_data``.

    ``geo_metadata`` is created when missing or ``None``. Truthy
    ``coordinates`` are stored; otherwise the stored value is kept, falling
    back to ``"Not Found"``. The stored title is kept when present, else
    ``title`` is used. The ``type`` is always ``"Point"``. The record is
    modified in place and returned.
    """
    if cleaned_data.get("geo_metadata") is None:
        cleaned_data["geo_metadata"] = {}
    geo_metadata = cleaned_data["geo_metadata"]

    geo_metadata.update({
        "coordinates": coordinates or geo_metadata.get("coordinates", "Not Found"),
        "title": geo_metadata.get("title", title),
        "type": "Point",
    })

    cleaned_data["categories"] = categories
    return cleaned_data

# Function to clean and update main content
def update_main_content(cleaned_data):
    """Remove pronunciation notes from the record's main text, if present.

    Only acts when ``"main_content"`` is a key of ``cleaned_data``; the dict
    is modified in place and returned either way.
    """
    if "main_content" not in cleaned_data:
        return cleaned_data
    cleaned_data["main_content"] = remove_pronunciation(cleaned_data["main_content"])
    return cleaned_data

# Process files and handle exceptions
# Pass 1: enrich every cleaned record with title, coordinates and categories,
# strip pronunciation notes, and write the result to output_path.
files_without_coordinates = []
for file in os.listdir(cleaned_data_path):
    try:
        cleaned_file_path = os.path.join(cleaned_data_path, file)
        raw_file_path = os.path.join(raw_data_path, file)

        # Load cleaned data
        with open(cleaned_file_path, "r", encoding="utf-8") as f:
            cleaned_data = json.load(f)

        # Extract title, coordinates, and categories from raw data
        with open(raw_file_path, "r", encoding="utf-8") as f:
            raw_content = f.read()
        # Run the <h1> search once and reuse the match.
        h1_match = re.search(r"<h1>(.*?)</h1>", raw_content)
        title = h1_match.group(1).strip() if h1_match else format_title(file)
        coordinates = extract_coordinates(raw_content)
        categories = assign_categories(cleaned_data["main_content"])

        # Update cleaned data
        cleaned_data = update_cleaned_file(cleaned_data, title, coordinates, categories)
        cleaned_data = update_main_content(cleaned_data)

        # Save updated cleaned data
        output_file_path = os.path.join(output_path, file)
        with open(output_file_path, "w", encoding="utf-8") as f:
            json.dump(cleaned_data, f, indent=4, ensure_ascii=False)

        print(f"Updated and saved: {file}")

        # Decide from what was actually saved: update_cleaned_file keeps
        # coordinates already stored in the record, and those must not be
        # overwritten by a municipality centroid in pass 2.
        saved_coordinates = cleaned_data["geo_metadata"].get("coordinates")
        if not saved_coordinates or saved_coordinates == "Not Found":
            files_without_coordinates.append(file)

    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"Error processing file {file}: {e}")

# List files without coordinates
print("Files without coordinates:")
for file in files_without_coordinates:
    print(file)

# Pass 2: fall back to the coordinates of a municipality whose file name
# (minus ".json") appears in the record's title.
for file in files_without_coordinates:
    try:
        cleaned_file_path = os.path.join(output_path, file)
        with open(cleaned_file_path, "r", encoding="utf-8") as f:
            cleaned_data = json.load(f)

        title_lower = cleaned_data["geo_metadata"]["title"].lower()
        matched_municipality = None
        for municipality_file in os.listdir(municipality_data_path):
            municipality_name = municipality_file.replace(".json", "")
            if municipality_name.lower() in title_lower:
                municipality_file_path = os.path.join(municipality_data_path, municipality_file)
                with open(municipality_file_path, "r", encoding="utf-8") as mf:
                    municipality_data = json.load(mf)
                cleaned_data["geo_metadata"]["coordinates"] = municipality_data["geo_metadata"]["coordinates"]
                matched_municipality = municipality_name
                break

        if matched_municipality is None:
            # Nothing changed, so do not rewrite the file or claim success.
            print(f"No municipality match for: {file}")
            continue

        # Save updated cleaned data with municipality coordinates
        with open(cleaned_file_path, "w", encoding="utf-8") as f:
            json.dump(cleaned_data, f, indent=4, ensure_ascii=False)

        print(f"Assigned coordinates to: {file}")

    except Exception as e:
        print(f"Error assigning coordinates to file {file}: {e}")

# Save the list of files that had no coordinates of their own (including
# those that received municipality coordinates in pass 2) to a JSON file.
with open(no_coordinates_output_path, "w", encoding="utf-8") as f:
    json.dump(files_without_coordinates, f, indent=4, ensure_ascii=False)

print(f"List of files without coordinates saved to {no_coordinates_output_path}")

# **Final Cleaning Step No. 2: Update relevant links & Special Characters Removal**

In [None]:
import os
import json
import re

# Define input and output directories
# Input: output of cleaning step 1. Output: records with filtered links and
# repaired characters.
final_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleaning\10Feb25_final_step01_title_cat_geo_clean"
final_cleaned_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleaning\10Feb25_final_step02_links_clean"

# Create the output directory; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs(final_cleaned_data_path, exist_ok=True)

# Define whitelisted domains for relevant links
# Domains whose links are kept in "relevant_links".
# ("lonelyplanet.com" was listed twice; the duplicate is removed.)
WHITELISTED_DOMAINS = [
    "en.wikipedia.org",
    "es.wikipedia.org",
    "en.wikivoyage.org",
    "es.wikivoyage.org",
    "discoverpuertorico.com",
    "travel.usnews.com",
    "tripadvisor.com",
    "skyscanner.com",
    "lonelyplanet.com",
    "puertorico.com",
    "geohack.toolforge.org",
    "paralanaturaleza.org",
    "sanjuanpuertorico.com",
    "zeepuertorico.com",
    "weather-us.com",
    "estuario.org",
    "toroverdepr.com",
    "distritot-mobile.com",
    "prcomiccon.com",
    "www.nps.gov",
    "prconvention.com",
    "nationalgeographic.com",
    "areciboweb.50megs.com",
    "flickr.com",
    "islaculebra.com",
]
# Function to filter relevant links based on whitelist
def filter_relevant_links(links, whitelist=None):
    """Keep only links whose host is a whitelisted domain.

    Args:
        links: Iterable of URL strings; non-string entries are skipped.
        whitelist: Optional iterable of domains. Defaults to the module-level
            ``WHITELISTED_DOMAINS``.

    Returns:
        The kept links, de-duplicated, in first-seen order.

    A link is kept when its host equals a whitelisted domain or is a
    subdomain of one. Matching the domain as a substring of the whole URL let
    "puertorico.com" accept any "*puertorico.com" host, and accepted URLs
    that merely mention a whitelisted domain in their path or query string.
    Links containing "web.archive.org", "www.gotopuertorico.com" or
    "stats.wikimedia.org" are always dropped, as before.
    """
    from urllib.parse import urlsplit  # local import keeps the cell self-contained

    allowed_domains = [domain.lower() for domain in (WHITELISTED_DOMAINS if whitelist is None else whitelist)]
    excluded_markers = ("web.archive.org", "www.gotopuertorico.com", "stats.wikimedia.org")

    def host_of(link):
        try:
            # Scheme-less links ("en.wikipedia.org/wiki/X") have no netloc
            # until they are prefixed with "//".
            return urlsplit(link).hostname or urlsplit("//" + link).hostname
        except ValueError:
            return None

    kept = {}  # dict keys give order-preserving de-duplication
    for link in links:
        if not isinstance(link, str):
            continue
        if any(marker in link for marker in excluded_markers):
            continue
        host = host_of(link)
        if host and any(host == domain or host.endswith("." + domain) for domain in allowed_domains):
            kept[link] = None
    return list(kept)

# Define Main Special Characters Mapping
# Literal "\xNN" escape runs (escaped UTF-8 bytes left in the text) mapped to
# the characters they stand for. Some are deliberately simplified to ASCII
# (curly quotes -> straight quotes, en dash -> "-").
special_chars_mapping = {
    "\\xc3\\xa1": "á",
    "\\xc3\\xa9": "é",
    "\\xc3\\xad": "í",
    "\\xc3\\xb3": "ó",
    "\\xc3\\xba": "ú",
    "\\xc3\\xb1": "ñ",
    "\\xc3\\x9c": "Ü",
    "\\xc3\\xbc": "ü",
    "\\xe2\\x80\\x99": "'",
    "\\xe2\\x80\\x9c": "\"",
    "\\xe2\\x80\\x9d": "\"",
    # NOTE(review): these bytes decode to an IPA fragment, not "ü" -- confirm
    # this replacement is intended.
    "\\xca\\x9du\\xca\\x9d": "ü",
    "\\xcb\\x88": "'",
    # Non-breaking space. The previous key "\\xa" was an incomplete escape:
    # it left a stray hex digit behind and corrupted every other "\xaN" byte.
    "\\xc2\\xa0": " ",
    "\\xc2\\xb0F": "°F",
    "\\xc2\\xb0C": "°C",
    "\\xc2\\xb0": "°",
    "\\xe2\\x80\\x93": "-"
}

# Function for Fixing Special Characters
def fix_special_characters(text, mapping=None):
    """Replace literal ``\\xNN`` escape runs in ``text`` with real characters.

    Args:
        text: String that may contain escaped UTF-8 bytes such as
            ``\\xc3\\xa9`` written out as literal characters.
        mapping: Optional ``{escaped: replacement}`` dict applied first, in
            insertion order. Defaults to ``special_chars_mapping``.

    Returns:
        The repaired string.

    After the explicit mapping, any remaining run of escaped high bytes
    (0x80-0xFF) that forms valid UTF-8 is decoded, so characters missing from
    the mapping (e.g. uppercase accented letters) are repaired too. Runs that
    are not valid UTF-8 are left unchanged.
    """
    replacements = special_chars_mapping if mapping is None else mapping
    for wrong, correct in replacements.items():
        text = text.replace(wrong, correct)

    def decode_run(match):
        escaped = match.group(0)
        # Every escape is exactly 4 characters: backslash, "x", two hex digits.
        raw = bytes(int(escaped[i + 2:i + 4], 16) for i in range(0, len(escaped), 4))
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError:
            return escaped

    return re.sub(r"(?:\\x[89a-fA-F][0-9a-fA-F])+", decode_run, text)

# Function to ensure the relevant Wikipedia link is present
def ensure_wikipedia_link(links, landmark_name):
    """Make sure ``links`` contains the English Wikipedia URL for the landmark.

    The URL is ``https://en.wikipedia.org/wiki/`` followed by
    ``landmark_name`` as given (no escaping). It is appended to ``links`` in
    place when absent; the same list object is returned.
    """
    expected_url = "https://en.wikipedia.org/wiki/{}".format(landmark_name)
    if expected_url in links:
        return links
    links.append(expected_url)
    return links

# Process each file in the final_data directory
# Step 2 driver: for every record, repair the main text, filter its links to
# the whitelist, make sure the landmark's Wikipedia link is present, and save
# the result under final_cleaned_data_path.
for file_name in os.listdir(final_data_path):
    file_path = os.path.join(final_data_path, file_name)
    output_file_path = os.path.join(final_cleaned_data_path, file_name)

    with open(file_path, "r", encoding="utf-8") as source:
        data = json.load(source)

    # Clean main content from Special Characters
    if "main_content" in data:
        data["main_content"] = fix_special_characters(data["main_content"])

    # Filter the links, then guarantee the landmark's own Wikipedia link
    if "relevant_links" in data:
        landmark_name = os.path.splitext(file_name)[0]  # file name without extension
        kept_links = filter_relevant_links(data["relevant_links"])
        data["relevant_links"] = ensure_wikipedia_link(kept_links, landmark_name)

    with open(output_file_path, "w", encoding="utf-8") as destination:
        json.dump(data, destination, indent=4, ensure_ascii=False)

    print(f"Final cleaned and saved: {file_name}")

# **Final Cleaning Step No. 3: Fix Special Characters on Title**

In [None]:
import os
import json

# Define input and output directories
# Input: output of cleaning step 2. Output: records with repaired titles.
final_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleaning\10Feb25_final_step02_links_clean"
final_cleaned_data_path = r"C:\Users\larry\OneDrive\Documents\GitHub\project-aieng-interactive-travel-planner\data\landmarks_cleaning\11Feb25_final_step03_chars_title_clean"

# Create the output directory; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs(final_cleaned_data_path, exist_ok=True)

# Define Main Special Characters Mapping
# Literal "\xNN" escape runs (escaped UTF-8 bytes left in the text) mapped to
# the characters they stand for. Some are deliberately simplified to ASCII
# (curly quotes -> straight quotes, en dash -> "-").
special_chars_mapping = {
    "\\xc3\\xa1": "á",
    "\\xc3\\xa9": "é",
    "\\xc3\\xad": "í",
    "\\xc3\\xb3": "ó",
    "\\xc3\\xba": "ú",
    "\\xc3\\xb1": "ñ",
    "\\xc3\\x9c": "Ü",
    "\\xc3\\xbc": "ü",
    "\\xe2\\x80\\x99": "'",
    "\\xe2\\x80\\x9c": "\"",
    "\\xe2\\x80\\x9d": "\"",
    # NOTE(review): these bytes decode to an IPA fragment, not "ü" -- confirm
    # this replacement is intended.
    "\\xca\\x9du\\xca\\x9d": "ü",
    "\\xcb\\x88": "'",
    # Non-breaking space. The previous key "\\xa" was an incomplete escape:
    # it left a stray hex digit behind and corrupted every other "\xaN" byte.
    "\\xc2\\xa0": " ",
    "\\xc2\\xb0F": "°F",
    "\\xc2\\xb0C": "°C",
    "\\xc2\\xb0": "°",
    "\\xe2\\x80\\x93": "-",
    # Escaped HTML ampersand entity.
    "\\\\u0026amp;": "&"
}

# Function for Fixing Special Characters
def fix_special_characters(text, mapping=None):
    """Replace literal ``\\xNN`` escape runs in ``text`` with real characters.

    Args:
        text: String that may contain escaped UTF-8 bytes such as
            ``\\xc3\\xa9`` written out as literal characters.
        mapping: Optional ``{escaped: replacement}`` dict applied first, in
            insertion order. Defaults to ``special_chars_mapping``.

    Returns:
        The repaired string.

    After the explicit mapping, any remaining run of escaped high bytes
    (0x80-0xFF) that forms valid UTF-8 is decoded, so characters missing from
    the mapping (e.g. uppercase accented letters) are repaired too. Runs that
    are not valid UTF-8 are left unchanged.
    """
    import re  # this cell does not import re itself

    replacements = special_chars_mapping if mapping is None else mapping
    for wrong, correct in replacements.items():
        text = text.replace(wrong, correct)

    def decode_run(match):
        escaped = match.group(0)
        # Every escape is exactly 4 characters: backslash, "x", two hex digits.
        raw = bytes(int(escaped[i + 2:i + 4], 16) for i in range(0, len(escaped), 4))
        try:
            return raw.decode("utf-8")
        except UnicodeDecodeError:
            return escaped

    return re.sub(r"(?:\\x[89a-fA-F][0-9a-fA-F])+", decode_run, text)

# Process each file in the final_data directory
# Step 3 driver: repair special characters in each record's
# geo_metadata title and save the record under final_cleaned_data_path.
for file_name in os.listdir(final_data_path):
    file_path = os.path.join(final_data_path, file_name)
    output_file_path = os.path.join(final_cleaned_data_path, file_name)

    with open(file_path, "r", encoding="utf-8") as source:
        data = json.load(source)

    # Only touch records that actually carry a geo_metadata title
    if "geo_metadata" in data:
        geo_metadata = data["geo_metadata"]
        if "title" in geo_metadata:
            geo_metadata["title"] = fix_special_characters(geo_metadata["title"])

    with open(output_file_path, "w", encoding="utf-8") as destination:
        json.dump(data, destination, indent=4, ensure_ascii=False)

    print(f"Final cleaned and saved: {file_name}")