diff --git a/src/app.py b/src/app.py
index 3782d8eb..e6a10bf3 100644
--- a/src/app.py
+++ b/src/app.py
@@ -134,6 +134,7 @@ def close_neo4j_driver(error):
 schema_manager.initialize(app.config['SCHEMA_YAML_FILE'],
                           app.config['UUID_API_URL'],
                           app.config['INGEST_API_URL'],
+                          app.config['SEARCH_API_URL'],
                           auth_helper_instance,
                           neo4j_driver_instance)
@@ -812,6 +813,14 @@ def create_entity(entity_type):
         # No need to log the validation errors
         bad_request_error(str(e))
 
+    # Execute property level validators defined in schema yaml before entity property creation
+    # Use empty dict {} to indicate there's no existing_data_dict
+    try:
+        schema_manager.execute_property_level_validators('before_property_create_validators', normalized_entity_type, request, {}, json_data_dict)
+    # Currently only ValueError
+    except ValueError as e:
+        bad_request_error(e)
+
     # Sample and Dataset: additional validation, create entity, after_create_trigger
     # Collection and Donor: create entity
     if normalized_entity_type == 'Sample':
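
[Editor's note] A hedged sketch of what execute_property_level_validators() is expected to do, inferred only from how this diff calls it; the _schema global and the schema_validators module name are assumptions, not confirmed against schema_manager.py:

    import schema_validators

    def execute_property_level_validators(validator_type, normalized_entity_type, request,
                                          existing_data_dict, new_data_dict):
        properties = _schema['ENTITIES'][normalized_entity_type]['properties']
        for key, prop in properties.items():
            # Only run validators declared for this property and this phase
            if key in new_data_dict and validator_type in prop:
                for validator_name in prop[validator_type]:
                    validator = getattr(schema_validators, validator_name)
                    # Property-level validators raise ValueError on failure
                    validator(key, normalized_entity_type, request, existing_data_dict, new_data_dict)
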
@@ -2049,11 +2058,22 @@ def retract_dataset(id):
 1-5 will be returned in reverse order (newest first). Non-public access is only required to retrieve
 information on non-published datasets. Output will be a list of dictionaries. Each dictionary contains
 the dataset revision number and its uuid. Optionally, the full dataset can be included for each.
+By default, only the revision number and uuid are included. To include the full dataset, the query
+parameter "include_dataset" can be given with the value of "true". If this parameter is not included
+or is set to false, the dataset will not be included. For example, to include the full datasets for
+each revision, use '/datasets/<id>/revisions?include_dataset=true'. To omit the datasets, either set
+include_dataset=false, or simply do not include this parameter.
+
+Parameters
+----------
+id : str
+    The HuBMAP ID (e.g. HBM123.ABCD.456) or UUID of target dataset
+
+Returns
+-------
+list
+    The list of revision datasets
 """
 @app.route('/datasets/<id>/revisions', methods=['GET'])
 def get_revisions_list(id):
diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py
index e02c3e03..f761936a 100644
--- a/src/app_neo4j_queries.py
+++ b/src/app_neo4j_queries.py
@@ -386,7 +386,6 @@ def update_entity(neo4j_driver, entity_type, entity_data_dict, uuid):
             raise TransactionError(msg)
 
-
 """
 Get all ancestors by uuid
diff --git a/src/schema/provenance_schema.yaml b/src/schema/provenance_schema.yaml
index d9fbd2c6..bd0fe97f 100644
--- a/src/schema/provenance_schema.yaml
+++ b/src/schema/provenance_schema.yaml
@@ -1,7 +1,7 @@
 ############################################# Rules #############################################
 # Entity properties:
 # - type: data type of the property, one of the following: string|integer|boolean|list|json_string
-# - generated: whether the property is auto genearated or user provided, default to false
+# - generated: whether the property is auto generated (either with a `before_create_trigger` or not) or user provided, default to false
 # - required_on_create: whether the property is required from user request JSON for entity creation via POST
 # - immutable: whether the property can NOT be updated once being created, default to false
 # - transient: whether the property to persist in database or not, default to false
@@ -13,11 +13,11 @@
 # - types: before_entity_create_validator, a single validation method needed for creating or updating the entity
 
 # Property level validators:
-# - types: before_property_update_validators, a list of validation methods
+# - types: before_property_create_validators|before_property_update_validators, a list of validation methods
 
 # Entity creation via http POST request:
 # - Use `generated: true` to mark a property as to be auto generated by the program instead of from user input JSON
-# - If a property is marked as `generated: true`, either no trigger method needed (E.g. Donor.image_files) or a `before_create_trigger` can be used to generate the value
+# - If a property is marked as `generated: true`, either no trigger method needed (E.g. Donor.image_files, Dataset.title) or a `before_create_trigger` can be used to generate the value
 # - If a property has `before_create_trigger`, it can't be specified in client request JSON
 # - If a property is marked as `generated: true`, it can't be `required_on_create: true` at the same time
 # - Use `required_on_create: true` to mark a property as required from user input
@@ -232,21 +232,29 @@ ENTITIES:
       required_on_create: true # Only required for create via POST, not update via PUT
       description: "True if the data contains any human genetic sequence information."
     status:
+      type: string
       before_property_update_validators:
         - validate_application_header_before_property_update
         - validate_dataset_status_value
-      type: string
       generated: true
       description: "One of: New|Processing|QA|Published|Error|Hold|Invalid"
       before_create_trigger: set_dataset_status_new
       after_update_trigger: update_dataset_and_ancestors_data_access_level
     title:
       type: string
+      generated: true # Disallow entry from users via POST
+      immutable: true # Disallow update via PUT
+      on_read_trigger: get_dataset_title
       description: "The auto generated dataset title."
     lab_dataset_id:
       type: string
       description: "A name or identifier used by the lab who is uploading the data to cross reference the data locally"
     data_types:
+      before_property_create_validators:
+        - validate_no_duplicates_in_list
+      before_property_update_validators:
+        - validate_no_duplicates_in_list
       type: list
       required_on_create: true # Only required for create via POST, not update via PUT
       description: "The data or assay types contained in this dataset as a json array of strings. Each is an assay code from [assay types](https://github.com/hubmapconsortium/search-api/blob/master/src/search-schema/data/definitions/enums/assay_types.yaml)."
@@ -268,6 +276,10 @@ ENTITIES:
     direct_ancestor_uuids:
       required_on_create: true # Only required for create via POST, not update via PUT
       type: list
+      before_property_create_validators:
+        - validate_no_duplicates_in_list
+      before_property_update_validators:
+        - validate_no_duplicates_in_list
       transient: true
       exposed: false
       description: "The uuids of source entities from which this new entity is derived. Used to pass source entity ids in on POST or PUT calls used to create the linkages."
@@ -707,6 +719,8 @@ ENTITIES:
     #datasets will be added only via PUT/update, NEVER on POST/create
     dataset_uuids_to_link:
       type: list
+      before_property_update_validators:
+        - validate_no_duplicates_in_list
       generated: true # Disallow user input from request json when being created
       transient: true
       exposed: false
@@ -716,6 +730,8 @@ ENTITIES:
     #transient attribute used to pass in a list of dataset ids of datasets to be removed from a DataSubmission
     dataset_uuids_to_unlink:
       type: list
+      before_property_update_validators:
+        - validate_no_duplicates_in_list
       generated: true # Disallow user input from request json when being created
       transient: true
       exposed: false
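
[Editor's note] With the validators above wired in, a create request whose data_types list carries duplicates now fails fast with HTTP 400 before any entity is written. Hypothetical request for illustration; the deployment URL, token, and uuid are placeholders, not taken from this diff:

    import requests

    response = requests.post(
        'https://entity-api.example.org/entities/dataset',  # hypothetical deployment URL
        headers={'Authorization': 'Bearer <token>'},
        json={
            'contains_human_genetic_sequences': False,
            'direct_ancestor_uuids': ['<sample-uuid>'],
            'data_types': ['bulk-RNA', 'bulk-RNA'],  # duplicate items -> rejected
        },
    )
    print(response.status_code)  # 400, raised by validate_no_duplicates_in_list via bad_request_error()
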
diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py
index acf2d7e9..1cd54585 100644
--- a/src/schema/schema_manager.py
+++ b/src/schema/schema_manager.py
@@ -37,6 +37,7 @@
 _schema = None
 _uuid_api_url = None
 _ingest_api_url = None
+_search_api_url = None
 _auth_helper = None
 _neo4j_driver = None
@@ -57,19 +58,22 @@
 """
 def initialize(valid_yaml_file,
                uuid_api_url,
-               ingest_api_url,
+               ingest_api_url,
+               search_api_url,
                auth_helper_instance,
                neo4j_driver_instance):
     # Specify as module-scope variables
     global _schema
     global _uuid_api_url
     global _ingest_api_url
+    global _search_api_url
     global _auth_helper
     global _neo4j_driver
 
     _schema = load_provenance_schema(valid_yaml_file)
     _uuid_api_url = uuid_api_url
     _ingest_api_url = ingest_api_url
+    _search_api_url = search_api_url
 
     # Get the helper instances
     _auth_helper = auth_helper_instance
@@ -538,23 +542,12 @@ def normalize_entity_result_for_response(entity_dict, properties_to_exclude = []
 
             # Skip properties with None value and the ones that are marked as not to be exposed.
             # By default, all properties are exposed if not marked as `exposed: false`
             # It's still possible to see `exposed: true` marked explicitly
-            if (entity_dict[key] is not None) and ('exposed' not in properties[key]) or (('exposed' in properties[key]) and properties[key]['exposed']):
-                # Safely evaluate a string containing a Python dict or list literal
-                # Only convert to Python list/dict when the string literal is not empty
-                # instead of returning the json-as-string or array-as-string
-                if isinstance(entity_dict[key], str) and entity_dict[key] and (properties[key]['type'] in ['list', 'json_string']):
-                    # ast uses compile to compile the source string (which must be an expression) into an AST
-                    # If the source string is not a valid expression (like an empty string), a SyntaxError will be raised by compile
-                    # If, on the other hand, the source string would be a valid expression (e.g. a variable name like foo),
-                    # compile will succeed but then literal_eval() might fail with a ValueError
-                    # Also this fails with a TypeError: literal_eval("{{}: 'value'}")
-                    try:
-                        entity_dict[key] = ast.literal_eval(entity_dict[key])
-                    except (SyntaxError, ValueError, TypeError) as e:
-                        logger.debug(f"Invalid expression (string value) of key: {key} for ast.literal_eval()")
-                        logger.debug(entity_dict[key])
-                        msg = "Failed to convert the source string with ast.literal_eval()"
-                        logger.exception(msg)
+            # Note: parenthesized as A and (B or C); the bare `A and B or C` form would expose
+            # None values on properties explicitly marked `exposed: true`
+            if (entity_dict[key] is not None) and (('exposed' not in properties[key]) or properties[key]['exposed']):
+                if entity_dict[key] and (properties[key]['type'] in ['list', 'json_string']):
+                    # Safely evaluate a string containing a Python dict or list literal
+                    # Only convert to Python list/dict when the string literal is not empty
+                    # instead of returning the json-as-string or array-as-string
+                    entity_dict[key] = convert_str_to_data(entity_dict[key])
 
                 # Add the target key with correct value of data type to the normalized_entity dict
                 normalized_entity[key] = entity_dict[key]
@@ -740,13 +733,13 @@ def execute_entity_level_validator(validator_type, normalized_entity_type, reque
 Parameters
 ----------
 validator_type : str
-    For now only: before_property_update_validators (support multiple validators)
+    before_property_create_validators|before_property_update_validators (support multiple validators)
 normalized_entity_type : str
     One of the normalized entity types defined in the schema yaml: Donor, Sample, Dataset, Upload
 request: Flask request object
     The instance of Flask request passed in from application request
 existing_data_dict : dict
-    A dictionary that contains all existing entity properties
+    A dictionary that contains all existing entity properties, {} for before_property_create_validators
 new_data_dict : dict
     The json data in request body, already after the regular validations
 """
@@ -890,10 +883,10 @@ def validate_entity_level_validator_type(validator_type):
 Parameters
 ----------
 validator_type : str
-    One of the validator types: before_property_update_validators
+    One of the validator types: before_property_create_validators|before_property_update_validators
 """
 def validate_property_level_validator_type(validator_type):
-    accepted_validator_types = ['before_property_update_validators']
+    accepted_validator_types = ['before_property_create_validators', 'before_property_update_validators']
 
     separator = ', '
 
     if validator_type.lower() not in accepted_validator_types:
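
[Editor's note] The extra parentheses in the exposure check above are deliberate: Python parses `A and B or C` as `(A and B) or C`, so under the old grouping a property explicitly marked `exposed: true` passed the check even when its value was None. A minimal illustration:

    value_is_not_none = False
    prop = {'exposed': True}

    # Old grouping: (A and B) or C -> True even though the value is None
    old = value_is_not_none and ('exposed' not in prop) or (('exposed' in prop) and prop['exposed'])
    # New grouping: A and (B or C) -> False, as intended
    new = value_is_not_none and (('exposed' not in prop) or prop['exposed'])

    print(old, new)  # True False
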
@@ -1439,6 +1432,20 @@ def get_ingest_api_url():
     return _ingest_api_url
 
+"""
+Get the search-api URL to be used by trigger methods
+
+Returns
+-------
+str
+    The search-api URL
+"""
+def get_search_api_url():
+    global _search_api_url
+
+    return _search_api_url
+
+
 """
 Get the AuthHelper instance to be used by trigger methods
@@ -1466,6 +1473,40 @@ def get_neo4j_driver_instance():
     return _neo4j_driver
 
+"""
+Convert a string representation of a Python list/dict (either nested or not) to a Python list/dict
+
+Parameters
+----------
+data_str: str
+    The string representation of the Python list/dict stored in Neo4j.
+    It's not stored in Neo4j as a json string! And we can't store it as a json string
+    due to the way that Cypher handles single/double quotes.
+
+Returns
+-------
+list or dict
+    The real Python list or dict after evaluation
+"""
+def convert_str_to_data(data_str):
+    if isinstance(data_str, str):
+        # ast uses compile to compile the source string (which must be an expression) into an AST
+        # If the source string is not a valid expression (like an empty string), a SyntaxError will be raised by compile
+        # If, on the other hand, the source string would be a valid expression (e.g. a variable name like foo),
+        # compile will succeed but then literal_eval() might fail with a ValueError
+        # Also this fails with a TypeError: literal_eval("{{}: 'value'}")
+        try:
+            data = ast.literal_eval(data_str)
+        except (SyntaxError, ValueError, TypeError):
+            logger.debug(f"Invalid expression (string value): {data_str} to be evaluated by ast.literal_eval()")
+            msg = "Failed to convert the source string with ast.literal_eval()"
+            logger.exception(msg)
+            # Fall back to the original string rather than returning an unbound name
+            data = data_str
+    else:
+        data = data_str
+
+    return data
+
 ####################################################################################################
 ## Internal functions
@@ -1495,3 +1536,4 @@ def _create_request_headers(user_token):
 
     return headers_dict
+
diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py
index f66fd123..f2f729f5 100644
--- a/src/schema/schema_neo4j_queries.py
+++ b/src/schema/schema_neo4j_queries.py
@@ -56,6 +56,45 @@ def get_dataset_direct_ancestors(neo4j_driver, uuid, property_key = None):
     return results
 
+
+"""
+Get the sample organ name and donor metadata information of the given dataset uuid
+
+Parameters
+----------
+neo4j_driver : neo4j.Driver object
+    The neo4j database connection pool
+uuid : str
+    The uuid of target entity
+
+Returns
+-------
+str: The sample organ name
+str: The donor metadata (string representation of a Python dict)
+"""
+def get_dataset_organ_and_donor_info(neo4j_driver, uuid):
+    organ_name = None
+    donor_metadata = None
+
+    query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor) "
+             # Filter out the Lab entities
+             f"WHERE e.uuid='{uuid}' AND s.specimen_type='organ' AND EXISTS(s.organ) "
+             f"RETURN s.organ AS organ_name, d.metadata AS donor_metadata")
+
+    logger.debug("======get_dataset_organ_and_donor_info() query======")
+    logger.debug(query)
+
+    with neo4j_driver.session() as session:
+        record = session.read_transaction(_execute_readonly_tx, query)
+
+        if record:
+            organ_name = record['organ_name']
+            donor_metadata = record['donor_metadata']
+
+    return organ_name, donor_metadata
+
 """
 Create or recreate one or more linkages (via Activity nodes)
 between the target entity node and the direct ancestor nodes in neo4j
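
[Editor's note] The query above interpolates uuid directly into the Cypher text. For review discussion only, a parameterized variant using the same neo4j driver session API sidesteps quoting and injection concerns:

    query = ("MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample)"
             "<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor) "
             "WHERE e.uuid = $uuid AND s.specimen_type = 'organ' AND EXISTS(s.organ) "
             "RETURN s.organ AS organ_name, d.metadata AS donor_metadata")

    with neo4j_driver.session() as session:
        # tx.run() binds $uuid server-side, so no string interpolation is needed
        record = session.read_transaction(lambda tx: tx.run(query, uuid=uuid).single())
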
diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py
index dcf9dc0e..54d049ca 100644
--- a/src/schema/schema_triggers.py
+++ b/src/schema/schema_triggers.py
@@ -1,9 +1,11 @@
 import os
 import ast
 import json
+import yaml
 import logging
 import datetime
 import requests
+import urllib.request
 from neo4j.exceptions import TransactionError
 
 # Local modules
@@ -527,11 +529,7 @@ def update_file_descriptions(property_key, normalized_type, user_token, existing
         # Note: The property, name specified by `target_property_key`, is stored in Neo4j as a string representation of the Python list
         # It's not stored in Neo4j as a json string! And we can't store it as a json string
         # due to the way that Cypher handles single/double quotes.
-        ext_prop = existing_data_dict[property_key]
-        if isinstance(ext_prop, str):
-            existing_files_list = ast.literal_eval(ext_prop)
-        else:
-            existing_files_list = ext_prop
+        existing_files_list = schema_manager.convert_str_to_data(existing_data_dict[property_key])
     else:
         if not property_key in generated_dict:
             raise KeyError(f"Missing '{property_key}' key in 'generated_dict' during calling 'update_file_descriptions()' trigger method.")
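
[Editor's note] The four hand-rolled isinstance/ast.literal_eval blocks in this file now funnel through the new schema_manager.convert_str_to_data() helper. Its round-trip behavior, for reference (illustrative session, not part of the diff):

    >>> schema_manager.convert_str_to_data("['a', 'b']")
    ['a', 'b']
    >>> schema_manager.convert_str_to_data("{'organ_donor_data': []}")
    {'organ_donor_data': []}
    >>> schema_manager.convert_str_to_data(['already', 'a', 'list'])  # non-strings pass through
    ['already', 'a', 'list']
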
@@ -901,6 +899,111 @@ def link_to_previous_revision(property_key, normalized_type, user_token, existin
     # No need to log
     raise
 
+"""
+Trigger event method of auto generating the dataset title
+
+Parameters
+----------
+property_key : str
+    The target property key
+normalized_type : str
+    One of the types defined in the schema yaml: Activity, Collection, Donor, Sample, Dataset
+user_token: str
+    The user's globus nexus token
+existing_data_dict : dict
+    A dictionary that contains all existing entity properties
+new_data_dict : dict
+    A merged dictionary that contains all possible input data to be used
+
+Returns
+-------
+str: The target property key
+str: The generated dataset title
+"""
+def get_dataset_title(property_key, normalized_type, user_token, existing_data_dict, new_data_dict):
+    if 'uuid' not in existing_data_dict:
+        raise KeyError("Missing 'uuid' key in 'existing_data_dict' during calling 'get_dataset_title()' trigger method.")
+
+    # Default to an empty string; the organ lookup below is expected to set this,
+    # otherwise the description parsing raises an error
+    organ_desc = ''
+
+    age = None
+    race = None
+    sex = None
+
+    # Parse assay_type from the Dataset
+    try:
+        # Note: The existing_data_dict['data_types'] is stored in Neo4j as a string representation of the Python list
+        # It's not stored in Neo4j as a json string! And we can't store it as a json string
+        # due to the way that Cypher handles single/double quotes.
+        data_types_list = schema_manager.convert_str_to_data(existing_data_dict['data_types'])
+        assay_type_desc = _get_assay_type_description(data_types_list)
+    except requests.exceptions.RequestException as e:
+        raise requests.exceptions.RequestException(e)
+
+    # Get the sample organ name and donor metadata information of this dataset
+    organ_name, donor_metadata = schema_neo4j_queries.get_dataset_organ_and_donor_info(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'])
+
+    # Can we move organ_types.yaml to commons or make it an API call to avoid parsing the raw yaml?
+    # Parse the organ description
+    if organ_name is not None:
+        try:
+            # The organ_name is the two-letter code only set if specimen_type == 'organ'
+            # Convert the two-letter code to a description
+            # https://github.com/hubmapconsortium/search-api/blob/test-release/src/search-schema/data/definitions/enums/organ_types.yaml
+            organ_desc = _get_organ_description(organ_name)
+        except yaml.YAMLError as e:
+            raise yaml.YAMLError(e)
+
+    # Parse age, race, and sex
+    if donor_metadata is not None:
+        try:
+            # Note: The donor_metadata is stored in Neo4j as a string representation of the Python dict
+            # It's not stored in Neo4j as a json string! And we can't store it as a json string
+            # due to the way that Cypher handles single/double quotes.
+            ancestor_metadata_dict = schema_manager.convert_str_to_data(donor_metadata)
+
+            # Easier to ask for forgiveness than permission (EAFP)
+            # Rather than checking key existence at every level
+            data_list = ancestor_metadata_dict['organ_donor_data']
+
+            for data in data_list:
+                if 'grouping_concept_preferred_term' in data:
+                    if data['grouping_concept_preferred_term'].lower() == 'age':
+                        # The actual value of age is stored in 'data_value' instead of 'preferred_term'
+                        age = data['data_value']
+
+                    if data['grouping_concept_preferred_term'].lower() == 'race':
+                        race = data['preferred_term'].lower()
+
+                    if data['grouping_concept_preferred_term'].lower() == 'sex':
+                        sex = data['preferred_term'].lower()
+        except KeyError:
+            pass
+
+    age_race_sex_info = None
+
+    if (age is None) and (race is not None) and (sex is not None):
+        age_race_sex_info = f"{race} {sex} of unknown age"
+    elif (race is None) and (age is not None) and (sex is not None):
+        age_race_sex_info = f"{age}-year-old {sex} of unknown race"
+    elif (sex is None) and (age is not None) and (race is not None):
+        age_race_sex_info = f"{age}-year-old {race} donor of unknown sex"
+    elif (age is None) and (race is None) and (sex is not None):
+        age_race_sex_info = f"{sex} donor of unknown age and race"
+    elif (age is None) and (sex is None) and (race is not None):
+        age_race_sex_info = f"{race} donor of unknown age and sex"
+    elif (race is None) and (sex is None) and (age is not None):
+        age_race_sex_info = f"{age}-year-old donor of unknown race and sex"
+    elif (age is None) and (race is None) and (sex is None):
+        age_race_sex_info = "donor of unknown age, race and sex"
+    else:
+        age_race_sex_info = f"{age}-year-old {race} {sex}"
+
+    generated_title = f"{assay_type_desc} data from the {organ_desc} of a {age_race_sex_info}"
+
+    return property_key, generated_title
+
 """
 Trigger event method of getting the uuid of the previous revision dataset if one exists
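
[Editor's note] The eight-branch chain above is exhaustive, but dense to review. A compact sketch that is intended to be behavior-equivalent, for comparison only (the rule it encodes: "donor" is spelled out exactly when sex is unknown, or when both age and race are unknown; worth double-checking the edge cases before adopting anything like it):

    def _age_race_sex_phrase(age, race, sex):
        known = []
        if age is not None:
            known.append(f"{age}-year-old")
        if race is not None:
            known.append(race)
        if sex is not None:
            known.append(sex)
        if (sex is None) or (age is None and race is None):
            known.append("donor")
        unknown = [label for label, value in (('age', age), ('race', race), ('sex', sex)) if value is None]
        if not unknown:
            return ' '.join(known)
        joined = 'age, race and sex' if len(unknown) == 3 else ' and '.join(unknown)
        return f"{' '.join(known)} of unknown {joined}"

    print(_age_race_sex_phrase('63', 'white', 'male'))  # 63-year-old white male
    print(_age_race_sex_phrase(None, None, 'male'))     # male donor of unknown age and race
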
@@ -1105,11 +1208,7 @@ def delete_thumbnail_file(property_key, normalized_type, user_token, existing_da
         # is stored in Neo4j as a string representation of the Python dict
         # It's not stored in Neo4j as a json string! And we can't store it as a json string
         # due to the way that Cypher handles single/double quotes.
-        ext_prop = existing_data_dict[target_property_key]
-        if isinstance(ext_prop, str):
-            file_info_dict = ast.literal_eval(ext_prop)
-        else:
-            file_info_dict = ext_prop
+        file_info_dict = schema_manager.convert_str_to_data(existing_data_dict[target_property_key])
     else:
         file_info_dict = generated_dict[target_property_key]
@@ -1622,11 +1721,7 @@ def _commit_files(target_property_key, property_key, normalized_type, user_token
         # Note: The property, name specified by `target_property_key`, is stored in Neo4j as a string representation of the Python list
         # It's not stored in Neo4j as a json string! And we can't store it as a json string
         # due to the way that Cypher handles single/double quotes.
-        ext_prop = existing_data_dict[target_property_key]
-        if isinstance(ext_prop, str):
-            files_info_list = ast.literal_eval(ext_prop)
-        else:
-            files_info_list = ext_prop
+        files_info_list = schema_manager.convert_str_to_data(existing_data_dict[target_property_key])
     else:
         files_info_list = generated_dict[target_property_key]
@@ -1736,11 +1831,7 @@ def _delete_files(target_property_key, property_key, normalized_type, user_token
         # Note: The property, name specified by `target_property_key`, is stored in Neo4j as a string representation of the Python list
         # It's not stored in Neo4j as a json string! And we can't store it as a json string
         # due to the way that Cypher handles single/double quotes.
-        ext_prop = existing_data_dict[target_property_key]
-        if isinstance(ext_prop, str):
-            files_info_list = ast.literal_eval(ext_prop)
-        else:
-            files_info_list = ext_prop
+        files_info_list = schema_manager.convert_str_to_data(existing_data_dict[target_property_key])
     else:
         files_info_list = generated_dict[target_property_key]
@@ -1773,3 +1864,85 @@ def _delete_files(target_property_key, property_key, normalized_type, user_token
 
     generated_dict[target_property_key] = files_info_list
     return generated_dict
+
+"""
+Compose the assay type description
+
+Parameters
+----------
+data_types : list
+    A list of dataset data types
+
+Returns
+-------
+str: The formatted assay type description
+"""
+def _get_assay_type_description(data_types):
+    assay_types = []
+    assay_type_desc = ''
+
+    for data_type in data_types:
+        # The assaytype endpoint in search-api is publicly accessible, no token needed
+        search_api_target_url = schema_manager.get_search_api_url() + f"/assaytype/{data_type}"
+
+        # Disable ssl certificate verification
+        response = requests.get(url = search_api_target_url, verify = False)
+
+        if response.status_code == 200:
+            assay_type_info = response.json()
+            # Add to the list
+            assay_types.append(assay_type_info['description'])
+        else:
+            msg = f"Unable to query the assay type details of: {data_type} via search-api"
+
+            logger.error(msg)
+
+            logger.debug("======status code from search-api======")
+            logger.debug(response.status_code)
+
+            logger.debug("======response text from search-api======")
+            logger.debug(response.text)
+
+            raise requests.exceptions.RequestException(response.text)
+
+    # Formatting based on the number of items in the list
+    if assay_types:
+        if len(assay_types) == 1:
+            assay_type_desc = assay_types[0]
+        elif len(assay_types) == 2:
+            # e.g. "A and B"
+            assay_type_desc = ' and '.join(assay_types)
+        else:
+            # e.g. "A, B, and C"
+            assay_type_desc = f"{', '.join(assay_types[:-1])}, and {assay_types[-1]}"
+    else:
+        msg = "Empty list of assay_types"
+
+        logger.error(msg)
+
+        raise ValueError(msg)
+
+    return assay_type_desc
+
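
[Editor's note] Quick reference for the three join shapes produced above; note the two-item form has no comma while the three-or-more form uses a serial comma, which is worth confirming as intentional:

    ['RNA-seq']                       -> "RNA-seq"
    ['RNA-seq', 'ATAC-seq']           -> "RNA-seq and ATAC-seq"
    ['RNA-seq', 'ATAC-seq', 'CODEX']  -> "RNA-seq, ATAC-seq, and CODEX"
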
+"""
+Get the organ description based on the given organ code
+
+Parameters
+----------
+organ_code : str
+    The two-letter organ code
+
+Returns
+-------
+str: The organ code description
+"""
+def _get_organ_description(organ_code):
+    yaml_file_url = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/master/src/search-schema/data/definitions/enums/organ_types.yaml'
+
+    with urllib.request.urlopen(yaml_file_url) as response:
+        yaml_file = response.read()
+
+        try:
+            organ_types_dict = yaml.safe_load(yaml_file)
+            return organ_types_dict[organ_code]['description'].lower()
+        except yaml.YAMLError as e:
+            raise yaml.YAMLError(e)
diff --git a/src/schema/schema_validators.py b/src/schema/schema_validators.py
index b4e68aef..b3af375b 100644
--- a/src/schema/schema_validators.py
+++ b/src/schema/schema_validators.py
@@ -41,6 +41,29 @@ def validate_application_header_before_entity_create(normalized_entity_type, req
 ## Property Level Validators
 ####################################################################################################
 
+"""
+Validate that the target list has no duplicated items
+
+Parameters
+----------
+property_key : str
+    The target property key
+normalized_entity_type : str
+    One of the normalized entity types defined in the schema yaml: Donor, Sample, Dataset, Upload
+request: Flask request object
+    The instance of Flask request passed in from application request
+existing_data_dict : dict
+    A dictionary that contains all existing entity properties
+new_data_dict : dict
+    The json data in request body, already after the regular validations
+"""
+def validate_no_duplicates_in_list(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
+    # Use lowercase for comparison via a list comprehension
+    target_list = [v.lower() for v in new_data_dict[property_key]]
+    if len(set(target_list)) != len(target_list):
+        raise ValueError(f"The {property_key} field must only contain unique items")
+
+
 """
 Validate the provided value of Dataset.status on update via PUT
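
[Editor's note] A minimal sanity check of the new validator; the import path is an assumption based on this repo's src/schema layout, not confirmed by the diff. Because the comparison lowercases items first, 'AF' and 'af' count as duplicates:

    from schema import schema_validators

    try:
        schema_validators.validate_no_duplicates_in_list(
            'data_types', 'Dataset', None, {}, {'data_types': ['AF', 'af']})
    except ValueError as e:
        print(e)  # The data_types field must only contain unique items
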