# Testing notebook

refs
* https://github.com/iodepo/odis-in/blob/master/dataGraphs/thematics/expinst/graphs/odisCatOrganization-example.json#L14

In [209]:
import sys
import httpx  # Replacing requests with httpx
import xml.etree.ElementTree as ET
import json
from urllib.parse import urlparse
import pyoxigraph
from pyld import jsonld
import io
import pandas as pd
import extruct
from w3lib.html import get_base_url
import string
from tqdm import tqdm  # Importing tqdm for progress indication
# from zmq.backend import backend
import re
import yaml

In [4]:

def extract_value(cell):
    if isinstance(cell, (pyoxigraph.Literal, pyoxigraph.NamedNode, pyoxigraph.BlankNode)):
        return cell.value
    return cell


def parse_sitemap(sitemap_url):
    try:
        # Fetch the sitemap
        response = httpx.get(sitemap_url)
        response.raise_for_status()

        # Parse the XML
        root = ET.fromstring(response.content)

        # Handle potential XML namespaces
        namespace = {'ns': root.tag.split('}')[0].strip('{')} if '}' in root.tag else ''

        # Extract URLs based on whether there's a namespace or not
        if namespace:
            urls = [url.find('ns:loc', namespace).text for url in root.findall('.//ns:url', namespace)]
        else:
            urls = [url.find('loc').text for url in root.findall('.//url')]

        return urls

    except httpx.RequestError as e:
        print(f"Error fetching sitemap: {e}")
        return []
    except ET.ParseError as e:
        print(f"Error parsing XML: {e}")
        return []

def trimit(input_str):
    # Define the control characters
    control_chars = ''.join(map(chr, range(0, 32))) + chr(127)

    # Create a translation table
    translation_table = str.maketrans('', '', control_chars + string.whitespace)

    # Translate the input string using the translation table
    result_str = input_str.translate(translation_table)

    return result_str

def extract_jsonld(url):
    try:
        # Fetch the webpage
        response = httpx.get(trimit(url))
        response.raise_for_status()

        # Get base URL for handling relative URLs in the HTML
        base_url = get_base_url(response.text, str(response.url))

        # Extract all metadata formats using extruct
        data = extruct.extract(
            response.text,
            base_url=base_url,
            syntaxes=['json-ld']  # Only extract JSON-LD
        )

        # Get JSON-LD data
        jsonld_data = data.get('json-ld', [])

        if jsonld_data:
            # If we found JSON-LD data, return the first item pretty-printed
            # print(json.dumps(jsonld_data[0], indent=2))
            # print("============================")
            return json.dumps(jsonld_data[0], indent=2)

        return None

    except httpx.RequestError as e:
        print(f"Error fetching URL {url}: {e}")
        return None



In [9]:
 # set up oxygraph
store = pyoxigraph.Store()  # store = pyoxigraph.Store(path="./store")
mime_type = "application/n-triples"


sitemap_url = "https://catalogue.odis.org/sitemapIndex.xml"

In [13]:


# Validate URL format
try:
    result = urlparse(sitemap_url)
    if not all([result.scheme, result.netloc]):
        raise ValueError("Invalid URL format")
except ValueError as e:
    print(f"Error: {e}")
    sys.exit(1)

# Parse sitemap
print(f"Parsing sitemap: {sitemap_url}")
urls = parse_sitemap(sitemap_url)

if not urls:
    print("No URLs found in sitemap")
    sys.exit(1)

print(f"Found {len(urls)} URLs in sitemap")

for url in tqdm(urls, desc="Processing URLs", ncols=100):
    try:
        # print(f"\nChecking {trimit(url)} for JSON-LD data...")
        jsonld_content = extract_jsonld(url)
        if jsonld_content:
            normalized = jsonld.normalize(json.loads(jsonld_content),
                                          {'algorithm': 'URDNA2015', 'format': 'application/n-quads'})
            store.load(io.StringIO(normalized), mime_type, base_iri=None, to_graph=None)
        else:
            pass
            # print("No JSON-LD content found")
    except Exception as e:
        print(f"An error occurred while processing URL {url}: {e}")


Parsing sitemap: https://catalogue.odis.org/sitemapIndex.xml
Found 3138 URLs in sitemap


  store.load(io.StringIO(normalized), mime_type, base_iri=None, to_graph=None)
Processing URLs:  41%|█████████████████▏                        | 1287/3138 [16:22<30:14,  1.02it/s]

An error occurred while processing URL 
            https://catalogue.odis.org/view/1105
        : Invalid \escape: line 16 column 665 (char 1223)


Processing URLs: 100%|██████████████████████████████████████████| 3138/3138 [40:31<00:00,  1.29it/s]


In [165]:
# ref: https://github.com/iodepo/odis-in/blob/master/dataGraphs/thematics/expinst/graphs/odisCatOrganization-example.json#L14
sparql = """
    PREFIX shacl: <http://www.w3.org/ns/shacl#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX schema: <https://schema.org/>

    SELECT ?pid ?propername ?url ?value
    WHERE {
    ?pid a schema:Organization .
    ?pid schema:name ?propername .
    ?pid schema:makesOffer ?offer .
    ?offer schema:itemOffered ?s .
    ?s a schema:CreativeWork .
    ?s schema:additionalType "sitemap" .
    ?s schema:additionalProperty ?p .
        ?p schema:propertyID "iode-approved" .
        ?p schema:value ?value .
    ?s schema:url ?url .
    }
    """

r = store.query(sparql)
q1 = list(r)
v = r.variables
value_list = [variable.value for variable in v]

df = pd.DataFrame(q1, columns=value_list)
df = df.map(extract_value)


In [166]:
def process_string(input_string: str) -> str:
    # Step 1: Lowercase the string
    lowercased_string = input_string.lower()
    # Step 2: Remove all spaces and non-alphabetic characters
    cleaned_string = re.sub(r'[^a-z]', '', lowercased_string)
    # Step 3: Return the first 8 characters
    return cleaned_string[:12]


In [167]:
def get_last_path_element(url: str) -> str:
    parsed_url = urlparse(url)

    # Split the path and get the last element
    path_elements = parsed_url.path.strip('/').split('/')
    if path_elements:
        return path_elements[-1]
    else:
        return ''


In [168]:
# Process the dataframe
df['name'] = df.apply(lambda row: get_last_path_element(row['pid']) + process_string(row['propername']), axis=1)

In [169]:
df

Unnamed: 0,pid,propername,url,value,name
0,https://catalogue.odis.org/view/3312,ARPHA Journals - Мarine content source journals,https://pensoft.net/marine-sitemap.xml,false,3312arphajournal
1,https://catalogue.odis.org/view/3310,Space Physics Archive Search and Extract,https://github.com/lechatpito/NASA-ODIS-Exampl...,false,3310spacephysics
2,https://catalogue.odis.org/view/3308,Integrated Ocean Observing System (IOOS®) Data...,https://data.ioos.us/sitemap.xml,false,3308integratedoc
3,https://catalogue.odis.org/view/3307,GOOS OCG ERDDAP server,https://osmc.noaa.gov/erddap/sitemap.xml,false,3307goosocgerdda
4,https://catalogue.odis.org/view/3306,GOOS BioEco metadata portal,https://bioeco-graph.s3.amazonaws.com/bioeco_g...,false,3306goosbioecome
...,...,...,...,...,...
134,https://catalogue.odis.org/view/4,OceanExpert - A Directory of Marine and Freshw...,https://oceanexpert.org/assets/sitemaps/sitema...,false,4oceanexperta
135,https://catalogue.odis.org/view/4,OceanExpert - A Directory of Marine and Freshw...,https://oceanexpert.org/assets/sitemaps/sitema...,false,4oceanexperta
136,https://catalogue.odis.org/view/29,All data series - UK National Oceanographic Da...,https://api.linked-systems.uk/sitemap_pap_api.xml,false,29alldataserie
137,https://catalogue.odis.org/view/29,All data series - UK National Oceanographic Da...,https://api.linked-systems.uk/sitemap_pap_api.xml,false,29alldataserie


In [229]:
from datetime import date
from typing import Optional, List
from pydantic import BaseModel, HttpUrl, Field
import yaml
from pathlib import Path

class Source(BaseModel):
    name: str = Field(description="Short identifier for the source")
    propername:  Optional[str] = Field(description="Full proper name of the catalogue")
    catalogue: Optional[str] = Field(description="URL of the catalogue")
    domain: Optional[str] = Field(description="Base domain URL")
    logo: Optional[str] = Field(description="URL of the source logo")
    pid: Optional[str] = Field(description="Persistent identifier URL")
    sourcetype: Optional[str] = Field(description="Type of the source (e.g., sitemap)")
    url: Optional[str] = Field(description="URL to the source data")
    changefreq: Optional[str] = Field(None, description="Change frequency")
    backend: Optional[str] = Field(None, description="Backend system type")
    headless: Optional[str] = Field(description="Whether the source is headless")
    dateadded: Optional[str] = Field(description="Date when the source was added")
    cron: Optional[str] = Field(description="Cron schedule expression")
    active: Optional[str] = Field(description="Whether the source is active")

class SourceConfig(BaseModel):
    sources: List[Source] = Field(description="List of source configurations")

def remove_none_values(d):
    """Recursively remove keys with None values from dictionaries"""
    if not isinstance(d, dict):
        return d
    return {
        k: remove_none_values(v)
        for k, v in d.items()
        if v is not None and v != 'None'
    }

In [237]:

def generate_yaml_config(config: SourceConfig) -> str:
    config_dict = config.model_dump()
    config_dict = remove_none_values(config_dict)

    # Custom representer for HttpUrl to convert to string
    # def represent_http_url(dumper, data):
    #     return dumper.represent_str(str(data))

    # Custom representer for date to convert to ISO format string
    # def represent_date(dumper, data):
    #     return dumper.represent_str(data.isoformat())

    # Add custom representers
    # yaml.add_representer(HttpUrl, represent_http_url)
    # yaml.add_representer(date, represent_date)

    # Generate YAML with proper formatting
    yaml_content = yaml.dump(config_dict, sort_keys=False, allow_unicode=True, default_flow_style=False)

    # # Write to file if output path is provided
    # if output_path:
    #     output_path.write_text(yaml_content)

    return yaml_content

In [240]:

#             backend="GeoNode",

sample_config = SourceConfig(
    sources=[
        Source(
            name="acma",
            propername="African Coastal and Marine Atlas catalogue (ACMA)",
            catalogue="https://acma.africanmarineatlas.org",
            domain="https://africanmarineatlas.org",
            logo="https://acma.africanmarineatlas.org/static/mapstore/img/geonode-logo.svg",
            pid="https://catalogue.odis.org/view/3125",
            sourcetype="sitemap",
            url="https://raw.githubusercontent.com/iodepo/odis-arch/master/collection/tempHosting/data-acma/sitemap.xml",
            changefreq="None",
            backend= None,
            headless="false",
            dateadded="2024, 4, 23",
            cron="0 6 * * 0",
            active="true"
        )
    ]
)

In [241]:
yaml_output = generate_yaml_config(sample_config)
print("Generated YAML:")
print(remove_none_values(yaml_output))


Generated YAML:
sources:
- name: acma
  propername: African Coastal and Marine Atlas catalogue (ACMA)
  catalogue: https://acma.africanmarineatlas.org
  domain: https://africanmarineatlas.org
  logo: https://acma.africanmarineatlas.org/static/mapstore/img/geonode-logo.svg
  pid: https://catalogue.odis.org/view/3125
  sourcetype: sitemap
  url: https://raw.githubusercontent.com/iodepo/odis-arch/master/collection/tempHosting/data-acma/sitemap.xml
  changefreq: None
  backend: null
  headless: 'false'
  dateadded: 2024, 4, 23
  cron: 0 6 * * 0
  active: 'true'



In [197]:
default_fields = {field: None for field in Source.model_fields.keys()}


In [198]:
# Create a list of Source instances
sources: List[Source] = []

for _, row in df.iterrows():
    # Create a dictionary of data for the Source pydantic model
    row_data = {**default_fields, **row.to_dict()}
    # Create an instance of Source
    source_instance = Source(**row_data)
    sources.append(source_instance)

In [199]:
print(sources)

[Source(name='3312arphajournal', propername='ARPHA Journals - Мarine content source journals', catalogue=None, domain=None, logo=None, pid='https://catalogue.odis.org/view/3312', sourcetype=None, url='https://pensoft.net/marine-sitemap.xml', changefreq=None, backend=None, headless=None, dateadded=None, cron=None, active=None), Source(name='3310spacephysics', propername='Space Physics Archive Search and Extract', catalogue=None, domain=None, logo=None, pid='https://catalogue.odis.org/view/3310', sourcetype=None, url='https://github.com/lechatpito/NASA-ODIS-Examples/raw/main/sitemap.xml', changefreq=None, backend=None, headless=None, dateadded=None, cron=None, active=None), Source(name='3308integratedoc', propername='Integrated Ocean Observing System (IOOS®) Data Catalog', catalogue=None, domain=None, logo=None, pid='https://catalogue.odis.org/view/3308', sourcetype=None, url='https://data.ioos.us/sitemap.xml', changefreq=None, backend=None, headless=None, dateadded=None, cron=None, acti

In [200]:
sample_config = SourceConfig(
    sources=sources
)

In [218]:
prefix = """---
minio:
  address:  
  port:  
  accessKey:
  secretKey:
  ssl:  
  bucket: oih
gleaner:
  runid: oih # this will be the bucket the output is placed in...
  summon: true # do we want to visit the web sites and pull down the files
  mill: false
context:
  cache: true
contextmaps:
  - prefix: "https://schema.org/"
    file: "./assets/jsonldcontext.json"  # wget http://schema.org/docs/jsonldcontext.jsonld
  - prefix: "http://schema.org/"
    file: "./assets/jsonldcontext.json"  # wget http://schema.org/docs/jsonldcontext.jsonld
summoner:
  after: ""      # "21 May 20 10:00 UTC"   
  mode: full  # full || diff:  If diff compare what we have currently in gleaner to sitemap, get only new, delete missing
  threads: 5
  delay:  # milliseconds (1000 = 1 second) to delay between calls (will FORCE threads to 1) 
  headless: http://0.0.0.0:9222  # URL for headless see docs/headless
millers:
  graph: false
"""

In [227]:
def remove_none_values(d):
    """Recursively remove keys with None or 'null' string values from dictionaries"""
    if not isinstance(d, dict):
        return d
    return {
        k: remove_none_values(v)
        for k, v in d.items()
        if v is not None and v != 'null'
    }

# Function to read, clean, and write the YAML file
def clean_yaml_string(yaml_string):
    # Load the YAML from the string
    data = yaml.safe_load(yaml_string)

    # Clean the data
    cleaned_data = remove_none_values(data)

    # Convert cleaned data back to YAML string
    cleaned_yaml_string = yaml.dump(cleaned_data, sort_keys=False, default_flow_style=False)
    return cleaned_yaml_string



In [221]:
yaml_output = generate_yaml_config(sample_config)
full = prefix + yaml_output

yaml_string = """
key1: value1
key2: null
key3: "null"
nested:
  key4: value4
  key5: null
  key6: "null"
"""

cleaned_yaml_string = clean_yaml_string(full)

print("Generated YAML:")
print(cleaned_yaml_string)
# print(remove_none_values(cleaned_yaml_string))


minio:
  bucket: oih
gleaner:
  runid: oih
  summon: true
  mill: false
context:
  cache: true
contextmaps:
- prefix: https://schema.org/
  file: ./assets/jsonldcontext.json
- prefix: http://schema.org/
  file: ./assets/jsonldcontext.json
summoner:
  after: ''
  mode: full
  threads: 5
  headless: http://0.0.0.0:9222
millers:
  graph: false
sources:
- name: 3312arphajournal
  propername: "ARPHA Journals - \u041Carine content source journals"
  catalogue: null
  domain: null
  logo: null
  pid: https://catalogue.odis.org/view/3312
  sourcetype: null
  url: https://pensoft.net/marine-sitemap.xml
  changefreq: null
  backend: null
  headless: null
  dateadded: null
  cron: null
  active: null
- name: 3310spacephysics
  propername: Space Physics Archive Search and Extract
  catalogue: null
  domain: null
  logo: null
  pid: https://catalogue.odis.org/view/3310
  sourcetype: null
  url: https://github.com/lechatpito/NASA-ODIS-Examples/raw/main/sitemap.xml
  changefreq: null
  backend: null
