In [1]:
from mango_mdschema import Schema

In [2]:
import json
import os, os.path

In [44]:
# NOTE(review): when the cells run top to bottom, this demo schema is replaced by the
# RDR schema that the next cell binds to the same name - confirm whether this cell is still needed.
dv_schema = Schema("../doc/metadata/mango2dv-demo-1.0.0-published.json")
dv_schema

<mango_mdschema.schema.Schema at 0x7fb5a47f6410>

In [3]:
# Load the RDR schema; this binding of dv_schema is the one the following cells work with.
dv_schema = Schema("../doc/metadata/mango2dv-rdr-1.0.0-published.json")
dv_schema

<mango_mdschema.schema.Schema at 0x7f062663a7d0>

In [156]:
?Schema

[0;31mInit signature:[0m [0mSchema[0m[0;34m([0m[0mpath[0m[0;34m:[0m [0mstr[0m[0;34m,[0m [0mprefix[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'mgs'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Class representing a Metadata Schema.

Attributes:
    name (str): Name of the schema.
    version (str): Version of the schema.
    title (str): Title of the schema, for messages.
        The name if no such title is provided in the JSON (which should not happen).
    root (CompositeField): Root field of the schema.
    fields (dict): Dictionary of fields in the schema (alias for root.fields)
    required_fields (dict): Dictionary of required fields and their default values.
        If a field is required and has no default value, it is not present in the dictionary.
[0;31mInit docstring:[0m
Init a Schema object from a JSON file.

Args:
    path (str): Path to the metadata schema.
    prefix (str): Prefix to add to the metadata names. Default is 'mgs'

In [4]:
print(dv_schema)

[1mManGO Dataset to RDR[0m
Metadata annotated with the schema 'mango2dv-rdr' (1.0.0) carry the prefix 'mgs'.
This schema contains the following 7 fields:
- [1mtitle[0m, of type 'text' (required).
- [1mauthor[0m, of type 'object' (required).
- [1mdatasetContact[0m, of type 'object' (required).
- [1mdsDescription[0m, of type 'object' (required).
- [1mkeyword[0m, of type 'object' (required).
- [1mtechnicalFormat[0m, of type 'text' (required).
- [1maccess[0m, of type 'object'.


In [5]:
dv_schema.print_requirements("author")

[1mType[0m: object.
[1mRequired[0m: True. (2 of its 2 fields are required.)
[1mRepeatable[0m: False.

Composed of the following fields:
[4mmango2dv-rdr.author.authorName[0m
[1mType[0m: text.
[1mRequired[0m: True. [1mDefault[0m: None.
[1mRepeatable[0m: False.

[4mmango2dv-rdr.author.authorAffiliation[0m
[1mType[0m: text.
[1mRequired[0m: True. [1mDefault[0m: None.
[1mRepeatable[0m: False.


In [183]:
# Read the RDR template. The encoding is given explicitly so the file is decoded
# the same way on every platform instead of relying on the locale default.
with open("../doc/metadata/template_RDR.json", encoding="utf-8") as f:
    template = json.load(f)
template

{'datasetVersion': {'metadataBlocks': {'citation': {'fields': [{'value': '...Title...',
      'typeClass': 'primitive',
      'multiple': False,
      'typeName': 'title'},
     {'value': [{'authorName': {'value': '...LastName..., ...FirstName...',
         'typeClass': 'primitive',
         'multiple': False,
         'typeName': 'authorName'},
        'authorAffiliation': {'value': '...Affiliation...',
         'typeClass': 'primitive',
         'multiple': False,
         'typeName': 'authorAffiliation'}}],
      'typeClass': 'compound',
      'multiple': False,
      'typeName': 'author'},
     {'value': [{'datasetContactEmail': {'value': '...Email...',
         'typeClass': 'primitive',
         'multiple': False,
         'typeName': 'datasetContactEmail'},
        'datasetContactName': {'value': '...LastName..., ...FirstName...',
         'typeClass': 'primitive',
         'multiple': False,
         'typeName': 'datasetContactName'}}],
      'typeClass': 'compound',
      'mult

In [109]:
# The citation block's "fields" entry is a list with one dict per metadata field.
# NOTE(review): the output shown for `fields` holds real values, while the template loaded
# above shows placeholders - presumably a different template state was in the kernel; re-run to confirm.
fields = template["datasetVersion"]["metadataBlocks"]["citation"]["fields"]

In [110]:
fields

[{'value': 'Minimum Viable Workflow - 16 May 2024',
  'typeClass': 'primitive',
  'multiple': False,
  'typeName': 'title'},
 {'value': [{'authorName': {'value': 'Kafetzaki, Danai',
     'typeClass': 'primitive',
     'multiple': False,
     'typeName': 'authorName'},
    'authorAffiliation': {'value': 'KU Leuven',
     'typeClass': 'primitive',
     'multiple': False,
     'typeName': 'authorAffiliation'}}],
  'typeClass': 'compound',
  'multiple': False,
  'typeName': 'author'},
 {'value': [{'datasetContactEmail': {'value': 'danai.kafetzaki@kuleuven.be',
     'typeClass': 'primitive',
     'multiple': False,
     'typeName': 'datasetContactEmail'},
    'datasetContactName': {'value': 'Kafetzaki, Danai',
     'typeClass': 'primitive',
     'multiple': False,
     'typeName': 'datasetContactName'}}],
  'typeClass': 'compound',
  'multiple': False,
  'typeName': 'datasetContact'},
 {'value': [{'dsDescriptionValue': {'value': 'This is a minimal end-to-end implementation for iRODS-Dataver

# Simulate getting metadata from iRODS

The code below uses the template metadata to simulate the result of running the following against iRODS:

```python
with iRODSSession(irods_env_file=env_file) as session:
     obj = session.data_objects.get('path/to/my/object')
     avus_as_json = dv_schema.extract(obj)

```

In [16]:
def field(f):
    """Flatten one template field into a ``(typeName, value)`` pair.

    Args:
        f (dict): Template field with the keys ``typeName``, ``typeClass``
            and ``value``. For a compound field, ``value`` is either one
            entry (a dict of sub-fields) or a list of such entries.

    Returns:
        tuple: ``(typeName, value)``. A primitive field keeps its raw value.
        A compound field with a single entry becomes a dict of its flattened
        sub-fields; one with several entries becomes a list of such dicts,
        and an empty list of entries stays an empty list.
    """
    if f["typeClass"] != "compound":
        return (f["typeName"], f["value"])
    raw = f["value"]
    entries = raw if isinstance(raw, list) else [raw]
    flattened = [dict(field(sub) for sub in entry.values()) for entry in entries]
    # A single entry keeps the plain-dict shape; several entries are all kept
    # as a list instead of silently discarding everything after the first.
    return (f["typeName"], flattened[0] if len(flattened) == 1 else flattened)

In [22]:
# Each call to field() yields a (typeName, value) pair; collect the pairs into one dict.
metadata_dict = dict(field(f) for f in fields)

In [23]:
metadata_dict

{'title': 'Minimum Viable Workflow - 16 May 2024',
 'author': {'authorName': 'Kafetzaki, Danai',
  'authorAffiliation': 'KU Leuven'},
 'datasetContact': {'datasetContactEmail': 'danai.kafetzaki@kuleuven.be',
  'datasetContactName': 'Kafetzaki, Danai'},
 'dsDescription': {'dsDescriptionValue': 'This is a minimal end-to-end implementation for iRODS-Dataverse integration, a KU Leuven and SURF collaboration'},
 'keyword': {'keywordValue': 'required-keyword'},
 'technicalFormat': 'json',
 'access': {'accessRights': 'open'}}

In [24]:
# In the output below, dsDescription and keyword come back wrapped in lists.
# NOTE(review): the returned dict is only displayed, not stored - later cells keep using metadata_dict.
dv_schema.validate(metadata_dict)

{'title': 'Minimum Viable Workflow - 16 May 2024',
 'author': {'authorName': 'Kafetzaki, Danai',
  'authorAffiliation': 'KU Leuven'},
 'datasetContact': {'datasetContactEmail': 'danai.kafetzaki@kuleuven.be',
  'datasetContactName': 'Kafetzaki, Danai'},
 'dsDescription': [{'dsDescriptionValue': 'This is a minimal end-to-end implementation for iRODS-Dataverse integration, a KU Leuven and SURF collaboration'}],
 'keyword': [{'keywordValue': 'required-keyword'}],
 'technicalFormat': 'json',
 'access': {'accessRights': 'open'}}

In [25]:
# Turn the metadata dict into AVUs; this stands in for the AVUs that would be
# read from an iRODS object in a real run.
as_avus = dv_schema.to_avus(metadata_dict)

In [26]:
as_avus

[<iRODSMeta None mgs.mango2dv-rdr.title Minimum Viable Workflow - 16 May 2024 None>,
 <iRODSMeta None mgs.mango2dv-rdr.author.authorName Kafetzaki, Danai 1>,
 <iRODSMeta None mgs.mango2dv-rdr.author.authorAffiliation KU Leuven 1>,
 <iRODSMeta None mgs.mango2dv-rdr.datasetContact.datasetContactEmail danai.kafetzaki@kuleuven.be 1>,
 <iRODSMeta None mgs.mango2dv-rdr.datasetContact.datasetContactName Kafetzaki, Danai 1>,
 <iRODSMeta None mgs.mango2dv-rdr.dsDescription.dsDescriptionValue This is a minimal end-to-end implementation for iRODS-Dataverse integration, a KU Leuven and SURF collaboration 1>,
 <iRODSMeta None mgs.mango2dv-rdr.keyword.keywordValue required-keyword 1>,
 <iRODSMeta None mgs.mango2dv-rdr.technicalFormat json None>,
 <iRODSMeta None mgs.mango2dv-rdr.access.accessRights open 1>]

# Convert AVUs to template JSON

In [27]:
# Rebuild the nested metadata dict from the flat list of AVUs.
avus_as_json = dv_schema.from_avus(as_avus)
avus_as_json

{'access': {'accessRights': 'open'},
 'author': {'authorAffiliation': 'KU Leuven',
  'authorName': 'Kafetzaki, Danai'},
 'datasetContact': {'datasetContactEmail': 'danai.kafetzaki@kuleuven.be',
  'datasetContactName': 'Kafetzaki, Danai'},
 'dsDescription': [{'dsDescriptionValue': 'This is a minimal end-to-end implementation for iRODS-Dataverse integration, a KU Leuven and SURF collaboration'}],
 'keyword': [{'keywordValue': 'required-keyword'}],
 'technicalFormat': 'json',
 'title': 'Minimum Viable Workflow - 16 May 2024'}

In [31]:
# Show which Python type each top-level metadata value came back as.
for value_type in map(type, avus_as_json.values()):
    print(value_type)

<class 'dict'>
<class 'dict'>
<class 'dict'>
<class 'list'>
<class 'list'>
<class 'str'>
<class 'str'>


In [47]:
# Chained comparison: evaluated as (str == str) and (str == int), hence False.
# NOTE(review): looks like a scratch check unrelated to the workflow - confirm whether it can be removed.
type("mari") == type("taihou") == int

False

In [180]:
avus_as_json

{'access': {'accessRights': 'open'},
 'author': [{'authorAffiliation': 'KU Leuven',
   'authorName': 'Kafetzaki, Danai'}],
 'datasetContact': [{'datasetContactEmail': 'danai.kafetzaki@kuleuven.be',
   'datasetContactName': 'Kafetzaki, Danai'}],
 'dsDescription': [{'dsDescriptionValue': 'This is a minimal end-to-end implementation for iRODS-Dataverse integration, a KU Leuven and SURF collaboration'}],
 'keyword': [{'keywordValue': 'required-keyword'}],
 'technicalFormat': 'json',
 'title': 'Minimum Viable Workflow - 16 May 2024'}

In [188]:
def return_dict(value, fromAvu):
    """Fill every sub-field of one compound template entry.

    Args:
        value (dict): One compound entry of the template, mapping sub-field
            names to template fields (dicts with ``typeName``, ``typeClass``
            and ``value``).
        fromAvu (dict): Metadata for this entry, keyed by sub-field ``typeName``.

    Returns:
        dict: A new dict with the same keys as ``value``; each sub-field has
        been updated in place by ``update_template`` and is reused as the
        mapped value.
    """
    return {name: update_template(sub_field, fromAvu) for name, sub_field in value.items()}


def update_template(field, avus_as_json):
    """Write the metadata found in ``avus_as_json`` into a template field.

    The field is modified in place and also returned.

    Args:
        field (dict): Template field with the keys ``typeName``, ``typeClass``
            and ``value``. For a compound field, ``value`` is either a dict of
            sub-fields or a list of such dicts.
        avus_as_json (dict): Metadata keyed by ``typeName``. For a compound
            field the entry is a dict of sub-field values, or a list of them.

    Returns:
        dict: The same ``field`` object, with ``value`` filled in.

    Raises:
        KeyError: If ``avus_as_json`` has no entry for the field's ``typeName``.
    """
    from copy import deepcopy  # local import keeps this cell self-contained

    typeName = field["typeName"]
    value = field["value"]
    fromAvu = avus_as_json[typeName]
    if field["typeClass"] != "compound":
        field["value"] = fromAvu
    elif isinstance(value, list):
        entries = fromAvu if isinstance(fromAvu, list) else [fromAvu]
        # zip() stops at the shorter side, so a template holding a single
        # placeholder entry would drop every metadata entry after the first.
        # Pad with clones of the last template entry (taken before anything
        # is filled in) so each metadata entry gets its own slot.
        slots = value[: len(entries)]
        if value:
            slots += [deepcopy(value[-1]) for _ in range(len(entries) - len(value))]
        field["value"] = [return_dict(slot, entry) for slot, entry in zip(slots, entries)]
    else:
        field["value"] = return_dict(value, fromAvu)
    return field

In [189]:
# Reload a pristine template: update_template() writes into the dict it is given,
# so a copy already filled in by an earlier run should not be reused.
# The encoding is explicit so the file is decoded the same way on every platform.
with open("../doc/metadata/template_RDR.json", encoding="utf-8") as f:
    template = json.load(f)
template

{'datasetVersion': {'metadataBlocks': {'citation': {'fields': [{'value': '...Title...',
      'typeClass': 'primitive',
      'multiple': False,
      'typeName': 'title'},
     {'value': [{'authorName': {'value': '...LastName..., ...FirstName...',
         'typeClass': 'primitive',
         'multiple': False,
         'typeName': 'authorName'},
        'authorAffiliation': {'value': '...Affiliation...',
         'typeClass': 'primitive',
         'multiple': False,
         'typeName': 'authorAffiliation'}}],
      'typeClass': 'compound',
      'multiple': False,
      'typeName': 'author'},
     {'value': [{'datasetContactEmail': {'value': '...Email...',
         'typeClass': 'primitive',
         'multiple': False,
         'typeName': 'datasetContactEmail'},
        'datasetContactName': {'value': '...LastName..., ...FirstName...',
         'typeClass': 'primitive',
         'multiple': False,
         'typeName': 'datasetContactName'}}],
      'typeClass': 'compound',
      'mult

In [190]:
# update_template() modifies each field in place, so `template` itself is filled in here;
# the resulting list is only displayed, not stored.
[update_template(field, avus_as_json) for field in template["datasetVersion"]["metadataBlocks"]["citation"]["fields"]]
    

[{'value': 'Minimum Viable Workflow - 16 May 2024',
  'typeClass': 'primitive',
  'multiple': False,
  'typeName': 'title'},
 {'value': [{'authorName': {'value': 'Kafetzaki, Danai',
     'typeClass': 'primitive',
     'multiple': False,
     'typeName': 'authorName'},
    'authorAffiliation': {'value': 'KU Leuven',
     'typeClass': 'primitive',
     'multiple': False,
     'typeName': 'authorAffiliation'}}],
  'typeClass': 'compound',
  'multiple': False,
  'typeName': 'author'},
 {'value': [{'datasetContactEmail': {'value': 'danai.kafetzaki@kuleuven.be',
     'typeClass': 'primitive',
     'multiple': False,
     'typeName': 'datasetContactEmail'},
    'datasetContactName': {'value': 'Kafetzaki, Danai',
     'typeClass': 'primitive',
     'multiple': False,
     'typeName': 'datasetContactName'}}],
  'typeClass': 'compound',
  'multiple': False,
  'typeName': 'datasetContact'},
 {'value': [{'dsDescriptionValue': {'value': 'This is a minimal end-to-end implementation for iRODS-Dataver

In [177]:
template

{'datasetVersion': {'metadataBlocks': {'citation': {'fields': [{'value': 'Minimum Viable Workflow - 16 May 2024',
      'typeClass': 'primitive',
      'multiple': False,
      'typeName': 'title'},
     {'value': [{'authorName': None, 'authorAffiliation': None}],
      'typeClass': 'compound',
      'multiple': False,
      'typeName': 'author'},
     {'value': [{'datasetContactEmail': None, 'datasetContactName': None}],
      'typeClass': 'compound',
      'multiple': False,
      'typeName': 'datasetContact'},
     {'value': [{'dsDescriptionValue': None}],
      'typeClass': 'compound',
      'multiple': True,
      'typeName': 'dsDescription'},
     {'value': [{'keywordValue': None}],
      'typeClass': 'compound',
      'multiple': True,
      'typeName': 'keyword'},
     {'value': 'json',
      'typeClass': 'primitive',
      'multiple': False,
      'typeName': 'technicalFormat'},
     {'value': {'accessRights': None},
      'typeClass': 'compound',
      'multiple': False,
    

In [163]:
# NOTE(review): fill_in_template is not defined in any cell visible in this notebook;
# presumably left over from an earlier draft - confirm, then replace or remove this cell.
fill_in_template(template, avus_as_json)
print(template)

{'datasetVersion': {'metadataBlocks': {'citation': {'fields': [1, 2, 3], 'displayName': 'Citation Metadata'}}}}


In [155]:
# Fill the citation fields from the AVU-derived metadata and write the result to disk.
# new_fields is built here so the cell does not depend on a variable left over in the kernel.
citation = template["datasetVersion"]["metadataBlocks"]["citation"]
new_fields = [update_template(f, avus_as_json) for f in citation["fields"]]
citation["fields"] = new_fields
with open("../doc/metadata/filled_in_template_RDR.json", "w", encoding="utf-8") as f:
    json.dump(template, f, indent=4)