# Testing notebook

refs
* https://github.com/iodepo/odis-in/blob/master/dataGraphs/thematics/expinst/graphs/odisCatOrganization-example.json#L14

In [3]:
import sys
import httpx  # Replacing requests with httpx
import xml.etree.ElementTree as ET
import json
from urllib.parse import urlparse
import pyoxigraph
from pyld import jsonld
import io
import pandas as pd
import extruct
from w3lib.html import get_base_url
import string
from tqdm import tqdm  # Importing tqdm for progress indication
from zmq.backend import backend


In [4]:

def extract_value(cell):
    if isinstance(cell, (pyoxigraph.Literal, pyoxigraph.NamedNode, pyoxigraph.BlankNode)):
        return cell.value
    return cell


def parse_sitemap(sitemap_url):
    try:
        # Fetch the sitemap
        response = httpx.get(sitemap_url)
        response.raise_for_status()

        # Parse the XML
        root = ET.fromstring(response.content)

        # Handle potential XML namespaces
        namespace = {'ns': root.tag.split('}')[0].strip('{')} if '}' in root.tag else ''

        # Extract URLs based on whether there's a namespace or not
        if namespace:
            urls = [url.find('ns:loc', namespace).text for url in root.findall('.//ns:url', namespace)]
        else:
            urls = [url.find('loc').text for url in root.findall('.//url')]

        return urls

    except httpx.RequestError as e:
        print(f"Error fetching sitemap: {e}")
        return []
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
        return []

def trimit(input_str):
    # Define the control characters
    control_chars = ''.join(map(chr, range(0, 32))) + chr(127)

    # Create a translation table
    translation_table = str.maketrans('', '', control_chars + string.whitespace)

    # Translate the input string using the translation table
    result_str = input_str.translate(translation_table)

    return result_str

def extract_jsonld(url):
    try:
        # Fetch the webpage
        response = httpx.get(trimit(url))
        response.raise_for_status()

        # Get base URL for handling relative URLs in the HTML
        base_url = get_base_url(response.text, str(response.url))

        # Extract all metadata formats using extruct
        data = extruct.extract(
            response.text,
            base_url=base_url,
            syntaxes=['json-ld']  # Only extract JSON-LD
        )

        # Get JSON-LD data
        jsonld_data = data.get('json-ld', [])

        if jsonld_data:
            # If we found JSON-LD data, return the first item pretty-printed
            # print(json.dumps(jsonld_data[0], indent=2))
            # print("============================")
            return json.dumps(jsonld_data[0], indent=2)

        return None

    except httpx.RequestError as e:
        print(f"Error fetching URL {url}: {e}")
        return None



In [9]:
 # set up oxygraph
store = pyoxigraph.Store()  # store = pyoxigraph.Store(path="./store")
mime_type = "application/n-triples"


sitemap_url = "https://catalogue.odis.org/sitemapIndex.xml"

In [13]:


# Validate URL format
try:
    result = urlparse(sitemap_url)
    if not all([result.scheme, result.netloc]):
        raise ValueError("Invalid URL format")
except ValueError as e:
    print(f"Error: {e}")
    sys.exit(1)

# Parse sitemap
print(f"Parsing sitemap: {sitemap_url}")
urls = parse_sitemap(sitemap_url)

if not urls:
    print("No URLs found in sitemap")
    sys.exit(1)

print(f"Found {len(urls)} URLs in sitemap")

for url in tqdm(urls, desc="Processing URLs", ncols=100):
    try:
        # print(f"\nChecking {trimit(url)} for JSON-LD data...")
        jsonld_content = extract_jsonld(url)
        if jsonld_content:
            normalized = jsonld.normalize(json.loads(jsonld_content),
                                          {'algorithm': 'URDNA2015', 'format': 'application/n-quads'})
            store.load(io.StringIO(normalized), mime_type, base_iri=None, to_graph=None)
        else:
            pass
            # print("No JSON-LD content found")
    except Exception as e:
        print(f"An error occurred while processing URL {url}: {e}")


Parsing sitemap: https://catalogue.odis.org/sitemapIndex.xml
Found 3138 URLs in sitemap


  store.load(io.StringIO(normalized), mime_type, base_iri=None, to_graph=None)
Processing URLs:  41%|█████████████████▏                        | 1287/3138 [16:22<30:14,  1.02it/s]

An error occurred while processing URL 
            https://catalogue.odis.org/view/1105
        : Invalid \escape: line 16 column 665 (char 1223)


Processing URLs: 100%|██████████████████████████████████████████| 3138/3138 [40:31<00:00,  1.29it/s]


In [36]:
# ref: https://github.com/iodepo/odis-in/blob/master/dataGraphs/thematics/expinst/graphs/odisCatOrganization-example.json#L14
sparql = """
    PREFIX shacl: <http://www.w3.org/ns/shacl#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX schema: <https://schema.org/>

    SELECT ?s ?url ?value
    WHERE {
    ?s a schema:CreativeWork .
    ?s schema:additionalType "sitemap" .
    ?s schema:additionalProperty ?p .
        ?p schema:propertyID "iode-approved" .
        ?p schema:value ?value .
    ?s schema:url ?url .
    }
    """

r = store.query(sparql)
q1 = list(r)
v = r.variables
value_list = [variable.value for variable in v]

df = pd.DataFrame(q1, columns=value_list)
df = df.map(extract_value)


In [37]:
df

Unnamed: 0,s,url,value
0,a386b2b9d169c8710a7cec80a850f2bb,https://pensoft.net/marine-sitemap.xml,false
1,dca00f7e79b3c337928dabc534af9c22,https://github.com/lechatpito/NASA-ODIS-Exampl...,false
2,c1ab879d9efebb31ae44b1d99cbfedc6,https://data.ioos.us/sitemap.xml,false
3,d5408f08a0e425ab76a6ed576f7ebf92,https://osmc.noaa.gov/erddap/sitemap.xml,false
4,d64f0af1c88836dfedda2f9dcf0f3597,https://bioeco-graph.s3.amazonaws.com/bioeco_g...,false
...,...,...,...
134,f22091ba2a571ed3c2e935f38f2d4455,https://portal.medin.org.uk/portal/sitemap.php,false
135,d84d17db0aeb39843114679498db45c,https://api.linked-systems.uk/sitemap_pap_api.xml,false
136,f04fd3c5d2ddace57cb6cad67fcc6efa,https://oceanexpert.org/assets/sitemaps/sitema...,false
137,f500a17f3154ad7050801d8691b4eef0,https://oceanexpert.org/assets/sitemaps/sitema...,false


In [112]:
from datetime import date
from typing import Optional, List
from pydantic import BaseModel, HttpUrl, Field
import yaml
from pathlib import Path

class Source(BaseModel):
    name: str = Field(description="Short identifier for the source")
    propername: str = Field(description="Full proper name of the catalogue")
    catalogue: Optional[str] = Field(description="URL of the catalogue")
    domain: str = Field(description="Base domain URL")
    logo: Optional[str] = Field(description="URL of the source logo")
    pid: Optional[str] = Field(description="Persistent identifier URL")
    sourcetype: str = Field(description="Type of the source (e.g., sitemap)")
    url: str = Field(description="URL to the source data")
    changefreq: Optional[str] = Field(None, description="Change frequency")
    backend: Optional[str] = Field(None, description="Backend system type")
    headless: bool = Field(description="Whether the source is headless")
    dateadded: Optional[str] = Field(description="Date when the source was added")
    cron: Optional[str] = Field(description="Cron schedule expression")
    active: Optional[str] = Field(description="Whether the source is active")

class SourceConfig(BaseModel):
    sources: List[Source] = Field(description="List of source configurations")

def remove_none_values(d):
    """Recursively remove keys with None values from dictionaries"""
    if not isinstance(d, dict):
        return d
    return {
        k: remove_none_values(v)
        for k, v in d.items()
        if v is not None
    }

In [113]:

def generate_yaml_config(config: SourceConfig, output_path: Optional[Path] = None) -> str:
    """
    Generate YAML from a SourceConfig object.
    
    Args:
        config (SourceConfig): The configuration object to serialize
        output_path (Optional[Path]): If provided, writes the YAML to this file
        
    Returns:
        str: The generated YAML content
    """
    # Convert Pydantic model to dict and remove None values
    config_dict = config.model_dump()
    config_dict = remove_none_values(config_dict)

    # Custom representer for HttpUrl to convert to string
    def represent_http_url(dumper, data):
        return dumper.represent_str(str(data))

    # Custom representer for date to convert to ISO format string
    def represent_date(dumper, data):
        return dumper.represent_str(data.isoformat())

    # Add custom representers
    yaml.add_representer(HttpUrl, represent_http_url)
    yaml.add_representer(date, represent_date)

    # Generate YAML with proper formatting
    yaml_content = yaml.dump(config_dict, sort_keys=False, allow_unicode=True, default_flow_style=False)

    # Write to file if output path is provided
    if output_path:
        output_path.write_text(yaml_content)

    return yaml_content

In [116]:

#             backend="GeoNode",

sample_config = SourceConfig(
    sources=[
        Source(
            name="acma",
            propername="African Coastal and Marine Atlas catalogue (ACMA)",
            catalogue="https://acma.africanmarineatlas.org",
            domain="https://africanmarineatlas.org",
            logo="https://acma.africanmarineatlas.org/static/mapstore/img/geonode-logo.svg",
            pid="https://catalogue.odis.org/view/3125",
            sourcetype="sitemap",
            url="https://raw.githubusercontent.com/iodepo/odis-arch/master/collection/tempHosting/data-acma/sitemap.xml",
            changefreq="None",
            backend= None,
            headless="false",
            dateadded="2024, 4, 23",
            cron="0 6 * * 0",
            active="true"
        )
    ]
)

ValidationError: 4 validation errors for Source
headless
  Field required [type=missing, input_value={'name': 'acma', 'propern...'None', 'backend': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
dateadded
  Field required [type=missing, input_value={'name': 'acma', 'propern...'None', 'backend': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
cron
  Field required [type=missing, input_value={'name': 'acma', 'propern...'None', 'backend': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing
active
  Field required [type=missing, input_value={'name': 'acma', 'propern...'None', 'backend': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.9/v/missing

In [117]:
yaml_output = generate_yaml_config(sample_config)
print("Generated YAML:")
print(remove_none_values(yaml_output))


Generated YAML:
sources:
- name: acma
  propername: African Coastal and Marine Atlas catalogue (ACMA)
  catalogue: https://acma.africanmarineatlas.org
  domain: https://africanmarineatlas.org
  logo: https://acma.africanmarineatlas.org/static/mapstore/img/geonode-logo.svg
  pid: https://catalogue.odis.org/view/3125
  sourcetype: sitemap
  url: https://raw.githubusercontent.com/iodepo/odis-arch/master/collection/tempHosting/data-acma/sitemap.xml
  changefreq: None
  backend: null
  headless: false
  dateadded: 2024, 4, 23
  cron: 0 6 * * 0
  active: 'true'

