# CDM -> CDEF JSON Translator
This notebook investigates whether the existing SADL documents can be translated to JSON Schema, and demonstrates one way of doing so.

### Imports

In [1]:
import os
import copy
import json

### Constants

In [2]:
# Skeleton for every generated class schema. prepare_new_class() works on a deep copy,
# appends the class name to '$id' and fills in 'title'; properties are added afterwards.
SCHEMA_TEMPLATE = {
    '$schema': 'https://json-schema.org/draft/2020-12/schema',
    '$id': 'http://example-schema.org/cdm/',
    'title': None,
    'type': 'object',
    'properties': {},
}

# SADL primitive type name -> JSON schema equivalent.
# 'dateTime' maps to a JSON schema *format* rather than a type: the add_*_val_prop
# functions emit it as type 'string' with format 'date-time'.
PRIMITIVES = {
    'string': 'string',
    'double': 'number',
    'float': 'number',
    'int': 'integer',
    'dateTime': 'date-time',
}

### Functions

In [11]:
def prepare_new_class(line):
    """Create the JSON-schema skeleton for the class declared on ``line``.

    Args:
        line: SADL line declaring a class. The class name is taken to be the
            first word; an optional ``(note "...")`` supplies the description.

    Returns:
        Tuple ``(new_schema, class_name)``. ``new_schema`` is a deep copy of
        ``SCHEMA_TEMPLATE`` whose ``$id`` has the class name appended and whose
        ``title`` is the class name. ``description`` is only present when the
        line contains a ``note "`` marker.
    """
    # Deep copy so the nested 'properties' dict is never shared between classes
    new_schema = copy.deepcopy(SCHEMA_TEMPLATE)

    # First whitespace-separated word is the class name (tolerates indented declarations)
    words = line.split()
    class_name = words[0] if words else ''
    new_schema['$id'] = new_schema['$id'] + class_name
    new_schema['title'] = class_name

    # Look for the actual note marker: testing for the bare word "note" would also
    # match class names such as "Footnote" and produce a garbage description.
    marker = 'note "'
    note_start = line.find(marker)
    if note_start != -1:
        note_start += len(marker)
        note_end = line.rfind('")')
        if note_end < note_start:
            # No closing '")' after the marker: stop at the next quote, else at end of line
            note_end = line.find('"', note_start)
            if note_end == -1:
                note_end = len(line.rstrip())
        new_schema['description'] = line[note_start:note_end]
    return new_schema, class_name

In [12]:
def add_single_val_prop(curr_line, new_schema, new_property):
    """Set the type of a single-valued property from a SADL property line.

    Args:
        curr_line: SADL line containing ``single value of type <Type>``.
        new_schema: Schema dictionary; ``new_schema['properties'][new_property]``
            must already exist and is updated in place.
        new_property: Name of the property being typed.

    Primitive SADL types (keys of ``PRIMITIVES``) become a JSON schema ``type``;
    ``dateTime`` becomes ``type: string`` with ``format: date-time``. Any other
    type name is emitted as a ``$ref`` to that class under the CDM base URI.
    """
    marker = 'single value of type '
    remainder = curr_line[curr_line.find(marker) + len(marker):]
    # Take the first word after the marker and drop the SADL separator (',' or '.').
    # Stripping only those characters (instead of blindly cutting the last one) keeps
    # the type intact when the line has no trailing newline or no separator.
    words = remainder.split()
    prop_type = words[0].rstrip(',.') if words else ''

    prop_schema = new_schema['properties'][new_property]
    if prop_type in PRIMITIVES:
        if prop_type == 'dateTime':
            # date-time is a string format in JSON schema, not a type of its own
            prop_schema['type'] = 'string'
            prop_schema['format'] = PRIMITIVES[prop_type]
        else:
            prop_schema['type'] = PRIMITIVES[prop_type]
    else:
        # Non-primitive: reference the schema of the named class
        prop_schema['$ref'] = "http://example-schema.org/cdm/" + prop_type

In [13]:
def add_multi_val_prop(curr_line, new_schema, new_property):
    """Set the type of a multi-valued property from a SADL property line.

    Args:
        curr_line: SADL line containing ``values of type <Type>``.
        new_schema: Schema dictionary; ``new_schema['properties'][new_property]``
            must already exist and is updated in place.
        new_property: Name of the property being typed.

    The property becomes a JSON schema ``array`` whose ``items`` describe the
    element type: primitive SADL types (keys of ``PRIMITIVES``) map to ``type``,
    ``dateTime`` maps to ``type: string`` with ``format: date-time``, and any
    other type name is emitted as a ``$ref`` to that class under the CDM base URI.
    """
    marker = 'values of type '
    remainder = curr_line[curr_line.find(marker) + len(marker):]
    # Take the first word after the marker and drop the SADL separator (',' or '.').
    # Stripping only those characters (instead of blindly cutting the last one) keeps
    # the type intact when the line has no trailing newline or no separator.
    words = remainder.split()
    prop_type = words[0].rstrip(',.') if words else ''

    prop_schema = new_schema['properties'][new_property]
    prop_schema['type'] = 'array'
    items = prop_schema['items'] = {}
    if prop_type in PRIMITIVES:
        if prop_type == 'dateTime':
            # date-time is a string format in JSON schema, not a type of its own
            items['type'] = 'string'
            items['format'] = PRIMITIVES[prop_type]
        else:
            items['type'] = PRIMITIVES[prop_type]
    else:
        # Non-primitive: each element references the schema of the named class
        items['$ref'] = "http://example-schema.org/cdm/" + prop_type

In [14]:
def add_prop_description(curr_line, new_schema, new_property):
    """Copy the ``note "..."`` text of a SADL property line into the schema.

    Args:
        curr_line: SADL line describing the property.
        new_schema: Schema dictionary; ``new_schema['properties'][new_property]``
            must already exist and is updated in place.
        new_property: Name of the property receiving the description.

    The description is the text between the first ``note "`` and the last
    ``")`` on the line. When the line has no ``note "`` marker (for example
    "note" only occurs inside a property name) nothing is written.
    """
    marker = 'note "'
    note_start = curr_line.find(marker)
    if note_start == -1:
        # No real note on this line; slicing from find() == -1 would store garbage
        return
    note_start += len(marker)
    note_end = curr_line.rfind('")')
    if note_end < note_start:
        # No closing '")' after the marker: stop at the next quote, else at end of line
        note_end = curr_line.find('"', note_start)
        if note_end == -1:
            note_end = len(curr_line.rstrip())
    new_schema['properties'][new_property]['description'] = curr_line[note_start:note_end]

In [15]:
def prepare_new_property(curr_line, new_schema):
    """Register an empty entry for the property declared on ``curr_line``.

    Args:
        curr_line: SADL line of the form ``described by <name> ...``.
        new_schema: Schema dictionary whose ``properties`` mapping receives
            the new (initially empty) property entry.

    Returns:
        The property name that was added.

    Raises:
        IndexError: If no property name can be found on the line.
    """
    # The name is the first word after 'described by'. Splitting on any whitespace
    # (rather than on single spaces) keeps a trailing newline out of the name.
    _, keyword, remainder = curr_line.partition('described by')
    # Without the keyword, fall back to the third word of the line
    words = remainder.split() if keyword else curr_line.split()[2:]
    # Drop a SADL separator glued to the name (e.g. "described by foo.")
    new_property = words[0].rstrip(',.')
    new_schema['properties'][new_property] = {}
    return new_property

In [16]:
def get_class_schemas(sadl_folder, sadl_schema):
    """Generate JSON schemas for top-level SADL classes.

    Every regular file directly inside ``sadl_folder`` is scanned for lines
    ending in ``is a class,`` or ``is a class.``. For each such class, the
    following lines up to the next blank line (or the end of the file) are
    searched for ``described by`` property declarations.

    Args:
        sadl_folder: Directory containing the SADL files.
        sadl_schema: Dictionary updated in place; each class found is stored
            under its class name.

    Returns:
        None. Results are written into ``sadl_schema``.
    """
    # os.listdir() returns entries in arbitrary order; sorting keeps the key order
    # of sadl_schema (and of any file dumped from it) reproducible.
    for sadl_file in sorted(os.listdir(sadl_folder)):
        sadl_path = os.path.join(sadl_folder, sadl_file)
        if not os.path.isfile(sadl_path):
            continue
        # Read the contents of the SADL file
        with open(sadl_path, "r") as f:
            sadl_contents = f.readlines()

        for idx, line in enumerate(sadl_contents):
            # Only class declarations start a new schema (rstrip tolerates a missing newline)
            if not line.rstrip().endswith(('is a class,', 'is a class.')):
                continue
            new_schema, class_name = prepare_new_class(line)

            # Collect properties until the blank line that ends the class description
            for jdx in range(idx + 1, len(sadl_contents)):
                curr_line = sadl_contents[jdx]
                if curr_line == '\n':
                    break
                if 'described by' not in curr_line:
                    continue
                new_property = prepare_new_property(curr_line, new_schema)
                # Require the real note marker so names containing "note" are not mistaken for one
                if 'note "' in curr_line:
                    add_prop_description(curr_line, new_schema, new_property)
                if 'single value of type' in curr_line:
                    add_single_val_prop(curr_line, new_schema, new_property)
                elif 'values of type' in curr_line:
                    add_multi_val_prop(curr_line, new_schema, new_property)

            # Register here rather than only on the blank line, so a class that runs
            # to the end of the file is not silently dropped.
            sadl_schema[class_name] = new_schema

In [17]:
def get_subclass_schemas(sadl_file, sadl_schema):
    """Generate JSON schemas for SADL subclasses declared in one file.

    Works like the top-level class pass, but looks for ``is a type of <Parent>``
    declarations. A subclass starts from a deep copy of its parent's
    properties and then adds its own ``described by`` properties.

    NOTE: a subclass is skipped when its parent is not yet in ``sadl_schema``,
          so results depend on the order in which files (and classes within a
          file) are processed. A more robust version would resolve parents
          recursively and share the parsing code with the top-level class pass.

    Args:
        sadl_file: Path of the SADL file to scan. Nothing happens if it is not
            an existing file.
        sadl_schema: Dictionary of schemas keyed by class name; read to look up
            parents and updated in place with each subclass found.

    Returns:
        None. Results are written into ``sadl_schema``.
    """
    if not os.path.isfile(sadl_file):
        return
    with open(sadl_file, "r") as f:
        sadl_contents = f.readlines()

    for idx, line in enumerate(sadl_contents):
        # Subclass case
        _, keyword, remainder = line.partition('is a type of')
        if not keyword:
            continue
        # Parent is the first word after the keyword, minus a SADL separator (',' or '.').
        # Stripping only separators keeps the name intact when none is present.
        words = remainder.split()
        parent = words[0].rstrip(',.') if words else ''
        # If parent is not in existing list, skip this for now
        if parent not in sadl_schema:
            continue

        new_schema, class_name = prepare_new_class(line)
        # Inherit by copying; the deep copy keeps parent and child schemas independent
        new_schema['properties'] = copy.deepcopy(sadl_schema[parent]['properties'])

        # Collect the subclass's own properties until the blank line ending its description
        for jdx in range(idx + 1, len(sadl_contents)):
            curr_line = sadl_contents[jdx]
            if curr_line == '\n':
                break
            if 'described by' not in curr_line:
                continue
            new_property = prepare_new_property(curr_line, new_schema)
            # Require the real note marker so names containing "note" are not mistaken for one
            if 'note "' in curr_line:
                add_prop_description(curr_line, new_schema, new_property)
            if 'single value of type' in curr_line:
                add_single_val_prop(curr_line, new_schema, new_property)
            elif 'values of type' in curr_line:
                add_multi_val_prop(curr_line, new_schema, new_property)

        # Register here rather than only on the blank line, so a class that runs
        # to the end of the file is not silently dropped.
        sadl_schema[class_name] = new_schema

### Generate the full CDM
The functions above appear to work at this point. Let's run them on the full CDM and add in inheritance.

In [18]:
# Both passes below fill this initially empty dictionary in place
sadl_schema = {}
sadl_folder = '../SADL/'

# Pass 1: top-level classes from every file in the folder
get_class_schemas(sadl_folder, sadl_schema)

# Pass 2: subclasses. Not elegant but functional for now: the files are visited in this
# fixed order because a subclass is skipped when its parent is not yet in sadl_schema.
subclass_files = [
    sadl_folder + file_name
    for file_name in (
        'AM-CDM-base.sadl',
        'AM-CDM-build.sadl',
        'AM-CDM-material.sadl',
        'AM-CDM-process.sadl',
        'AM-CDM-system.sadl',
        'AM-CDM-testInspectionCharacterization.sadl',
    )
]
for sadl_file in subclass_files:
    get_subclass_schemas(sadl_file, sadl_schema)

Important problem with the above: the order in which we load files matters (e.g., if TIC is loaded first, it won't know how to inherit from ProcessStep). For now, we work around this by specifying the file order explicitly, but a more robust solution would be better.

It would probably be more elegant to parse the strings with regular expressions.

Note that after building, some issues are apparent (e.g., the start/end times are redundant when TIC inherits from ProcessStep).

In [20]:
# Serialize all generated schemas into one JSON document (4-space indentation)
with open("full_cdm_v2.json", "w") as write_file:
    write_file.write(json.dumps(sadl_schema, indent=4))

In [19]:
# Show the generated schema dictionary as the cell output for a visual check
sadl_schema

{'TestResult': {'$schema': 'https://json-schema.org/draft/2020-12/schema',
  '$id': 'http://example-schema.org/cdm/TestResult',
  'title': 'TestResult',
  'type': 'object',
  'properties': {'ticPassFail': {'description': 'Indication of whether the results of the test, inspection, or characterization passed or failed and is for informational purposes only.',
    '$ref': 'http://example-schema.org/cdm/ResultStatus'},
   'ticResultMeasurement': {'$ref': 'http://example-schema.org/cdm/Measurement'},
   'ticReport': {'$ref': 'http://example-schema.org/cdm/Document'},
   'ticMaterialProperty': {'type': 'array',
    'items': {'$ref': 'http://example-schema.org/cdm/MaterialProperties'}}}},
 'Specimen': {'$schema': 'https://json-schema.org/draft/2020-12/schema',
  '$id': 'http://example-schema.org/cdm/Specimen',
  'title': 'Specimen',
  'type': 'object',
  'properties': {'specimenID': {'description': 'The identifier of the individual specimen.',
    'type': 'string'},
   'specimenDescription': 