In [1]:
import json
import numbers

import xarray as xr


In [2]:
# NetCDF file whose global attributes are converted to schema.org JSON-LD.
file_path = "/home/kobrien/schema/MEOP_profiles_v2024_b58d_3806_3fe3.nc"

# Only the global attributes are used by the rest of the notebook, so open the
# file inside a context manager (the handle is released as soon as the
# attributes are read) and keep a plain-dict copy of them.
# decode_times=False tells xarray not to decode time variables.
# NOTE: `ds` stays bound after this cell but refers to a closed dataset.
with xr.open_dataset(file_path, decode_times=False) as ds:
    attrs = dict(ds.attrs)


In [3]:
# Global attributes that have a known schema.org counterpart, written as
# attribute_name="target field label". The keys of this dict are what the
# additionalProperty loop checks in order to skip attributes.
known_mappings = dict(
    title="name",
    summary="description",
    creator_name="creator",
    institution="publisher",
    keywords="keywords",
    license="license",
    references="citation",
    source="measurementTechnique",
    geospatial_lat_min="lat_min",
    geospatial_lat_max="lat_max",
    geospatial_lon_min="lon_min",
    geospatial_lon_max="lon_max",
    time_coverage_start="time_start",
    time_coverage_end="time_end",
)


In [4]:
# Start the schema.org Dataset record with its JSON-LD header, then attach the
# download description and an empty list that collects extra attributes.
jsonld = {"@context": "https://schema.org/", "@type": "Dataset"}
jsonld["distribution"] = {
    "@type": "DataDownload",
    "encodingFormat": "application/x-netcdf",
    "contentUrl": "https://example.org/data/MEOP_profiles_v2024.nc",
}
jsonld["additionalProperty"] = []


In [5]:
# Handle known fields: copy each global attribute that has a direct schema.org
# counterpart onto the Dataset record. Attributes that are absent are skipped.
if "title" in attrs:
    jsonld["name"] = attrs["title"]
if "summary" in attrs:
    jsonld["description"] = attrs["summary"]
# NOTE(review): creator is always emitted as an Organization; confirm this is
# appropriate when creator_name holds a person's name.
if "creator_name" in attrs:
    jsonld["creator"] = { "@type": "Organization", "name": attrs["creator_name"] }
if "institution" in attrs:
    jsonld["publisher"] = { "@type": "Organization", "name": attrs["institution"] }
if "keywords" in attrs:
    # Split on the bare comma and trim each entry, so "a,b", "a, b" and
    # "a ,  b" all yield the same list; empty entries (e.g. from a trailing
    # comma) are dropped.
    jsonld["keywords"] = [
        keyword.strip()
        for keyword in attrs["keywords"].split(",")
        if keyword.strip()
    ]
if "license" in attrs:
    jsonld["license"] = attrs["license"]
if "references" in attrs:
    jsonld["citation"] = attrs["references"]
if "source" in attrs:
    jsonld["measurementTechnique"] = attrs["source"]


In [6]:
# Add spatial/temporal coverage if available
lat_min = attrs.get("geospatial_lat_min")
lat_max = attrs.get("geospatial_lat_max")
lon_min = attrs.get("geospatial_lon_min")
lon_max = attrs.get("geospatial_lon_max")
# Test for presence with `is not None` rather than truthiness: a bound of 0
# (equator or prime meridian) is a valid coordinate and must not suppress the
# bounding box.
if all(bound is not None for bound in (lat_min, lat_max, lon_min, lon_max)):
    jsonld["spatialCoverage"] = {
        "@type": "Place",
        "geo": {
            "@type": "GeoShape",
            # Two corner points, each written as "lat lon": lower corner first.
            "box": f"{lat_min} {lon_min} {lat_max} {lon_max}"
        }
    }
time_start = attrs.get("time_coverage_start")
time_end = attrs.get("time_coverage_end")
# Truthiness is kept here on purpose: a missing or empty start/end means no
# interval can be written.
if time_start and time_end:
    # Interval written as "start/end".
    jsonld["temporalCoverage"] = f"{time_start}/{time_end}"


In [7]:
# Add all remaining global attributes to additionalProperty
def _schema_value_type(value):
    """Return the type label stored alongside an attribute value.

    Numeric scalars (Python or NumPy, excluding booleans) are labelled
    "Number"; everything else, including strings and array-valued attributes,
    is labelled "Text" because the value itself is emitted via ``str()``.
    Always returning a label avoids the NameError / stale-label problem that
    occurs when only ``str`` and ``float64`` are recognised.
    """
    if isinstance(value, numbers.Number) and not isinstance(value, bool):
        return "Number"
    return "Text"


# Rebuild the list from scratch (instead of appending) so that re-running this
# cell does not duplicate entries. Attributes listed in known_mappings are
# skipped because they are handled by dedicated fields.
jsonld["additionalProperty"] = [
    {
        "@type": "PropertyValue",
        "propertyID": key,
        "value": str(value),
        "type": _schema_value_type(value),
    }
    for key, value in attrs.items()
    if key not in known_mappings
]


In [8]:
# Serialize the record with 2-space indentation and write it to disk.
output_path = "/home/kobrien/notebooks/schema/MEOP_profiles_v2024_schemaorg_all2.jsonld"
with open(output_path, "w") as out_file:
    out_file.write(json.dumps(jsonld, indent=2))

# Display the destination path as the cell result.
output_path


'/home/kobrien/notebooks/schema/MEOP_profiles_v2024_schemaorg_all2.jsonld'