In [40]:
import json
import re
from pathlib import Path

import requests
from bs4 import BeautifulSoup

def clean_content(text):
    """Normalize scraped text: drop accents, reference markers and extra spaces.

    Args:
        text: The string to clean.

    Returns:
        The text with the listed accented Spanish characters replaced by
        their plain ASCII letters, bracketed markers such as ``[1]`` or
        ``[ 99 ]`` removed, every run of whitespace collapsed to a single
        space, and leading/trailing whitespace stripped.
    """
    # One-pass translation table: accented character -> plain letter.
    accent_table = str.maketrans({
        'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u',
        'Á': 'A', 'É': 'E', 'Í': 'I', 'Ó': 'O', 'Ú': 'U',
        'ñ': 'n', 'Ñ': 'N', 'ü': 'u', 'Ü': 'U',
    })
    unaccented = text.translate(accent_table)

    # Brackets holding only lowercase letters/digits (optionally padded with
    # whitespace) are treated as reference markers and dropped.
    without_refs = re.sub(r'\[\s*[a-z0-9]*\s*\]', '', unaccented)

    # Squash each whitespace run to one space, then trim both ends.
    return re.sub(r'\s+', ' ', without_refs).strip()
    

# URL of the Wikipedia page to scrape
url = "https://en.wikipedia.org/wiki/Public_holidays_in_Puerto_Rico"

# Download the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops here on an HTTP error
# instead of silently parsing an error page that has no holiday tables.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Start of the progress line; extract_holidays() appends one '.' per table row
print('Processing', end=' ')
# Extract the rows of a holiday table as structured records
def extract_holidays(table, event, city_object, full_url, image_url):
    """Convert the data rows of a holiday table into a list of event records.

    Args:
        table: Parsed table element exposing ``find_all``; its first ``<tr>``
            is treated as the header and skipped.
        event: Label for the table. Currently unused; kept so existing calls
            keep working.
        city_object: Mapping whose ``"city"`` value is copied into each record.
        full_url: Value stored under ``metadata["url"]`` in each record.
        image_url: Value stored under ``metadata["image_url"]`` in each record.

    Returns:
        A list with one dict per row that has at least four ``<td>`` cells.
        Each dict has ``"content"`` (the cleaned fourth column) and
        ``"metadata"`` (type, city, name, categories, date, url, image_url).
        Rows with fewer cells are skipped.

    Side effects:
        Prints one ``.`` per data row (no newline) as a progress indicator.
    """
    def cell_text(cell):
        # get_text(strip=True) strips every text fragment and joins them with
        # an empty separator, which glues words around inline tags
        # ("<a>Foo</a> bar" -> "Foobar"). Take the raw text instead and
        # normalize the whitespace ourselves.
        return ' '.join(cell.get_text().split())

    holidays = []

    # Iterate over every row except the header
    for row in table.find_all("tr")[1:]:
        print('.', end='')
        cols = row.find_all("td")
        if len(cols) < 4:  # Skip rows that lack the required columns
            continue

        # Column 2 (the local name) is not part of the output record.
        date = cell_text(cols[0])
        english_name = clean_content(cell_text(cols[1]))
        remarks = clean_content(cell_text(cols[3]))

        holidays.append({
            "content": remarks,
            "metadata": {
                "type": 'event',
                "city": city_object["city"],
                "name": english_name,
                "categories": [],
                'date': date,
                "url": full_url,
                "image_url": image_url
            }
        })
    return holidays

# Shared metadata applied to every holiday record
city_object = {"city": "Puerto Rico"}
full_url = url
image_url = ""  # No image was specified, so this stays empty

# Locate the holiday tables (official and religious holidays)
tables = soup.find_all("table", {"class": "wikitable"})

# Make sure both required tables were found before indexing into the result,
# so a page layout change fails with a clear message instead of an IndexError
if len(tables) < 2:
    raise ValueError(
        f"Expected at least 2 'wikitable' tables at {url}, found {len(tables)}"
    )

official_holidays = extract_holidays(tables[0], "Official public holidays", city_object, full_url, image_url)
religious_holidays = extract_holidays(tables[1], "Religious holidays", city_object, full_url, image_url)

# Combine both tables into a single list of records
all_holidays = official_holidays + religious_holidays

print('Data processing finish.')

Processing .............................Data processing finish.


In [42]:
# Save the data into a JSON file
output_path = Path('./datasets/puerto_rico_events.json')

# Create the target directory if it is missing so open() does not fail with
# FileNotFoundError on a fresh checkout
output_path.parent.mkdir(parents=True, exist_ok=True)

# Explicit UTF-8 so the file does not depend on the platform default encoding
with open(output_path, 'w', encoding='utf-8') as file:
    json.dump(all_holidays, file, indent=4)

print('Data saved to puerto_rico_events.json')

Data saved to puerto_rico_events.json
