From 29e4fad2abb5c6e48b51c149d624c3888fdfaa5e Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Tue, 28 Sep 2021 23:11:41 -0400 Subject: [PATCH 1/8] Generate dataset title on read --- src/app.py | 1 + src/app_neo4j_queries.py | 1 - src/schema/provenance_schema.yaml | 1 + src/schema/schema_manager.py | 20 ++- src/schema/schema_neo4j_queries.py | 38 ++++++ src/schema/schema_triggers.py | 188 +++++++++++++++++++++++++++++ 6 files changed, 247 insertions(+), 2 deletions(-) diff --git a/src/app.py b/src/app.py index cc75d6cd..8cddf435 100644 --- a/src/app.py +++ b/src/app.py @@ -134,6 +134,7 @@ def close_neo4j_driver(error): schema_manager.initialize(app.config['SCHEMA_YAML_FILE'], app.config['UUID_API_URL'], app.config['INGEST_API_URL'], + app.config['SEARCH_API_URL'], auth_helper_instance, neo4j_driver_instance) diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py index e02c3e03..f761936a 100644 --- a/src/app_neo4j_queries.py +++ b/src/app_neo4j_queries.py @@ -386,7 +386,6 @@ def update_entity(neo4j_driver, entity_type, entity_data_dict, uuid): raise TransactionError(msg) - """ Get all ancestors by uuid diff --git a/src/schema/provenance_schema.yaml b/src/schema/provenance_schema.yaml index 2a7647de..39d74989 100644 --- a/src/schema/provenance_schema.yaml +++ b/src/schema/provenance_schema.yaml @@ -240,6 +240,7 @@ ENTITIES: title: type: string description: "The dataset title." + on_read_trigger: get_dataset_title lab_dataset_id: type: string description: "A name or identifier used by the lab who is uploading the data to cross reference the data locally" diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index acf2d7e9..941293b6 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -37,6 +37,7 @@ _schema = None _uuid_api_url = None _ingest_api_url = None +_search_api_url = None _auth_helper = None _neo4j_driver = None @@ -57,19 +58,22 @@ """ def initialize(valid_yaml_file, uuid_api_url, - ingest_api_url, + ingest_api_url, + search_api_url, auth_helper_instance, neo4j_driver_instance): # Specify as module-scope variables global _schema global _uuid_api_url global _ingest_api_url + global _search_api_url global _auth_helper global _neo4j_driver _schema = load_provenance_schema(valid_yaml_file) _uuid_api_url = uuid_api_url _ingest_api_url = ingest_api_url + _search_api_url = search_api_url # Get the helper instances _auth_helper = auth_helper_instance @@ -1439,6 +1443,20 @@ def get_ingest_api_url(): return _ingest_api_url +""" +Get the search-api URL to be used by trigger methods + +Returns +------- +str + The search-api URL +""" +def get_search_api_url(): + global _search_api_url + + return _search_api_url + + """ Get the AUthHelper instance to be used by trigger methods diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py index f66fd123..7f2195dd 100644 --- a/src/schema/schema_neo4j_queries.py +++ b/src/schema/schema_neo4j_queries.py @@ -56,6 +56,44 @@ def get_dataset_direct_ancestors(neo4j_driver, uuid, property_key = None): return results + +""" +Get all ancestors of the given dataset uuid + +Parameters +---------- +neo4j_driver : neo4j.Driver object + The neo4j database connection pool +uuid : str + The uuid of target entity + +Returns +------- +list + A list of unique ancestor dictionaries returned from the Cypher query +""" +def get_dataset_ancestors(neo4j_driver, uuid): + results = [] + + query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(ancestor:Entity) " + # Filter out the Lab 
entities
+             f"WHERE e.uuid='{uuid}' AND ancestor.entity_type <> 'Lab' "
+             # COLLECT() returns a list
+             # apoc.coll.toSet() returns a set containing unique nodes
+             f"RETURN apoc.coll.toSet(COLLECT(ancestor)) AS {record_field_name}")
+
+    logger.debug("======get_dataset_ancestors() query======")
+    logger.debug(query)
+
+    with neo4j_driver.session() as session:
+        record = session.read_transaction(_execute_readonly_tx, query)
+
+        if record and record[record_field_name]:
+            # Convert the list of nodes to a list of dicts
+            results = _nodes_to_dicts(record[record_field_name])
+
+    return results
+
 """
 Create or recreate one or more linkages (via Activity nodes)
 between the target entity node and the direct ancestor nodes in neo4j
diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py
index dcf9dc0e..db838546 100644
--- a/src/schema/schema_triggers.py
+++ b/src/schema/schema_triggers.py
@@ -1,9 +1,11 @@
 import os
 import ast
 import json
+import yaml
 import logging
 import datetime
 import requests
+import urllib.request
 from neo4j.exceptions import TransactionError
 
 # Local modules
@@ -901,6 +903,125 @@ def link_to_previous_revision(property_key, normalized_type, user_token, existin
         # No need to log
         raise
 
+"""
+Trigger event method of auto generating the dataset title
+
+Parameters
+----------
+property_key : str
+    The target property key
+normalized_type : str
+    One of the types defined in the schema yaml: Activity, Collection, Donor, Sample, Dataset
+user_token: str
+    The user's globus nexus token
+existing_data_dict : dict
+    A dictionary that contains all existing entity properties
+new_data_dict : dict
+    A merged dictionary that contains all possible input data to be used
+
+Returns
+-------
+str: The target property key
+str: The generated dataset title
+"""
+def get_dataset_title(property_key, normalized_type, user_token, existing_data_dict, new_data_dict):
+    if 'uuid' not in existing_data_dict:
+        raise KeyError("Missing 'uuid' key in 'existing_data_dict' during calling 'get_dataset_title()' trigger method.")
+
+    # Assume organ_desc is always available, otherwise will throw parsing error
+    organ_desc = ''
+
+    age = None
+    race = None
+    sex = None
+
+    # Parse assay_type from the Dataset
+    try:
+        # Note: The 'data_types' is stored in Neo4j as a string representation of the Python list
+        # It's not stored in Neo4j as a json string! And we can't store it as a json string
+        # due to the way that Cypher handles single/double quotes.
+        stored_data_types = existing_data_dict['data_types']
+        if isinstance(stored_data_types, str):
+            data_types_list = ast.literal_eval(stored_data_types)
+        else:
+            data_types_list = stored_data_types
+
+        assay_type_desc = _get_assay_type_description(data_types_list)
+    except requests.exceptions.RequestException as e:
+        raise requests.exceptions.RequestException(e)
+
+    # Get all the ancestors of this dataset
+    ancestors = schema_neo4j_queries.get_dataset_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'])
+
+    # Parse organ_name, age, race, and sex from ancestor Sample and Donor
+    for ancestor in ancestors:
+        if 'entity_type' in ancestor:
+            # 'specimen_type' is a key in search-api/src/search-schema/data/definitions/enums/tissue_sample_types.yaml
+            if ancestor['entity_type'] == 'Sample':
+                if 'specimen_type' in ancestor and ancestor['specimen_type'].lower() == 'organ' and 'organ' in ancestor:
+                    try:
+                        # ancestor['organ'] is the two-letter code only set if sample_type == organ.
+                        # Convert the two-letter code to a description
+                        # https://github.com/hubmapconsortium/search-api/blob/test-release/src/search-schema/data/definitions/enums/organ_types.yaml
+                        organ_desc = _get_organ_description(ancestor['organ'])
+                    except yaml.YAMLError as e:
+                        raise yaml.YAMLError(e)
+
+            if ancestor['entity_type'] == 'Donor':
+                # Easier to ask for forgiveness than permission (EAFP)
+                # Rather than checking key existence at every level
+                try:
+                    # Note: The 'metadata' is stored in Neo4j as a string representation of the Python dict
+                    # It's not stored in Neo4j as a json string! And we can't store it as a json string
+                    # due to the way that Cypher handles single/double quotes.
+                    stored_ancestor_metadata = ancestor['metadata']
+                    if isinstance(stored_ancestor_metadata, str):
+                        ancestor_metadata_dict = ast.literal_eval(stored_ancestor_metadata)
+                    else:
+                        ancestor_metadata_dict = stored_ancestor_metadata
+
+                    data_list = ancestor_metadata_dict['organ_donor_data']
+
+                    for data in data_list:
+                        if 'grouping_concept_preferred_term' in data:
+                            if data['grouping_concept_preferred_term'].lower() == 'age':
+                                # The actual value of age stored in 'data_value' instead of 'preferred_term'
+                                age = data['data_value']
+
+                            if data['grouping_concept_preferred_term'].lower() == 'race':
+                                race = data['preferred_term'].lower()
+
+                            if data['grouping_concept_preferred_term'].lower() == 'sex':
+                                sex = data['preferred_term'].lower()
+                except KeyError:
+                    pass
+
+    age_race_sex_info = None
+
+    if (age is None) and (race is not None) and (sex is not None):
+        age_race_sex_info = f"{race} {sex} of unknown age"
+    elif (race is None) and (age is not None) and (sex is not None):
+        age_race_sex_info = f"{age}-year-old {sex} of unknown race"
+    elif (sex is None) and (age is not None) and (race is not None):
+        age_race_sex_info = f"{age}-year-old {race} donor of unknown sex"
+    elif (age is None) and (race is None) and (sex is not None):
+        age_race_sex_info = f"{sex} donor of unknown age and race"
+    elif (age is None) and (sex is None) and (race is not None):
+        age_race_sex_info = f"{race} donor of unknown age and sex"
+    elif (race is None) and (sex is None) and (age is not None):
+        age_race_sex_info = f"{age}-year-old donor of unknown race and sex"
+    elif (age is None) and (race is None) and (sex is None):
+        age_race_sex_info = "donor of unknown age, race and sex"
+    else:
+        age_race_sex_info = f"{age}-year-old {race} {sex}"
+
+    generated_title = f"{assay_type_desc} data from the {organ_desc} of a {age_race_sex_info}"
+
+    logger.debug("===========Auto generated Title===========")
+    logger.debug(generated_title)
+
+    return property_key, generated_title
+
 """
 Trigger event method of getting the uuid of the previous revision dataset if exists
 
@@ -1773,3 +1894,70 @@ def _delete_files(target_property_key, property_key, normalized_type, user_token
     generated_dict[target_property_key] = files_info_list
 
     return generated_dict
+
+
+
+
+#======================================================================
+
+def _get_assay_type_description(data_types):
+    assay_types = []
+    assay_type_desc = ''
+
+    for data_type in data_types:
+        # The assaytype endpoint in search-api is publicly accessible, no token needed
+        search_api_target_url = schema_manager.get_search_api_url() + f"/assaytype/{data_type}"
+
+        # Disable ssl certificate verification
+        response = requests.get(url = search_api_target_url, verify = False)
+
+        if response.status_code == 200:
+            assay_type_info = response.json()
+            # Add to the list
+            assay_types.append(assay_type_info['description'])
+        else:
+            msg = f"Unable to query the assay type details of: {data_type} via search-api"
+
+            # Log the full stack trace, prepend a line with our message
+            logger.exception(msg)
+
+            logger.debug("======status code from search-api======")
+            logger.debug(response.status_code)
+
+            logger.debug("======response text from search-api======")
+            logger.debug(response.text)
+
+            raise requests.exceptions.RequestException(response.text)
+
+    # Formatting based on the number of items in the list
+    if assay_types:
+        if len(assay_types) == 1:
+            assay_type_desc = assay_types[0]
+        elif len(assay_types) == 2:
+            # <assay_type1> and <assay_type2>
+            assay_type_desc = ' and '.join(assay_types)
+        else:
+            # <assay_type1>, <assay_type2>, and <assay_type3>
+            assay_type_desc = f"{', '.join(assay_types[:-1])}, and {assay_types[-1]}"
+    else:
+        msg = "Empty list of assay_types"
+
+        logger.error(msg)
+
+        raise ValueError(msg)
+
+    return assay_type_desc
+
+def _get_organ_types_dict():
+    yaml_file_url = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/master/src/search-schema/data/definitions/enums/organ_types.yaml'
+    with urllib.request.urlopen(yaml_file_url) as response:
+        yaml_file = response.read()
+    try:
+        return yaml.safe_load(yaml_file)
+    except yaml.YAMLError as e:
+        raise yaml.YAMLError(e)
+
+def _get_organ_description(organ_code):
+    organ_types_dict = _get_organ_types_dict()
+    return organ_types_dict[organ_code]['description'].lower()
+
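Note: the on_read trigger added in PATCH 1/8 builds the title by pure string formatting, so its output is easy to sanity-check in isolation. A quick illustration (not part of the patches) with hypothetical stand-in values for the parsed assay, organ, and donor data:

    # Sketch of the title assembly in get_dataset_title() -- illustration only
    assay_type_desc = "snRNA-seq"           # description from the search-api /assaytype endpoint
    organ_desc = "kidney"                   # organ_types.yaml description, lowercased
    age, race, sex = "53", "white", "male"  # parsed out of organ_donor_data

    # Happy path: all three donor attributes are known
    print(f"{assay_type_desc} data from the {organ_desc} of a {age}-year-old {race} {sex}")
    # -> snRNA-seq data from the kidney of a 53-year-old white male

    # A missing attribute falls into one of the "unknown ..." branches instead,
    # e.g. with age = None:
    print(f"{assay_type_desc} data from the {organ_desc} of a {race} {sex} of unknown age")
    # -> snRNA-seq data from the kidney of a white male of unknown age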
From d22491c944570129566ad7c2198f49663d266c34 Mon Sep 17 00:00:00 2001
From: yuanzhou
Date: Wed, 29 Sep 2021 13:07:38 -0400
Subject: [PATCH 2/8] Reuse data convert from a func

---
 src/schema/schema_manager.py  | 24 ++++++++++++++++++++
 src/schema/schema_triggers.py | 42 +++++++----------------------------
 2 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py
index 941293b6..c3517e82 100644
--- a/src/schema/schema_manager.py
+++ b/src/schema/schema_manager.py
@@ -1484,6 +1484,29 @@ def get_neo4j_driver_instance():
 
     return _neo4j_driver
 
+
+"""
+Convert a string representation of the Python list/dict (either nested or not) to a Python list/dict
+
+Parameters
+----------
+data_str: str
+    The string representation of the Python list/dict stored in Neo4j.
+    It's not stored in Neo4j as a json string! And we can't store it as a json string
+    due to the way that Cypher handles single/double quotes.
+
+Returns
+-------
+list or dict
+    The real Python list or dict after evaluation
+"""
+def convert_str_to_data(data_str):
+    if isinstance(data_str, str):
+        data = ast.literal_eval(data_str)
+    else:
+        data = data_str
+
+    return data
+
 ####################################################################################################
 ## Internal functions
 
@@ -1513,3 +1536,4 @@ def _create_request_headers(user_token):
 
     return headers_dict
 
+
diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py
index db838546..a1f4f0b3 100644
--- a/src/schema/schema_triggers.py
+++ b/src/schema/schema_triggers.py
@@ -529,11 +529,7 @@ def update_file_descriptions(property_key, normalized_type, user_token, existing
         # Note: The property, name specified by `target_property_key`, is stored in Neo4j as a string representation of the Python list
         # It's not stored in Neo4j as a json string! And we can't store it as a json string
         # due to the way that Cypher handles single/double quotes.
- ext_prop = existing_data_dict[property_key] - if isinstance(ext_prop, str): - existing_files_list = ast.literal_eval(ext_prop) - else: - existing_files_list = ext_prop + existing_files_list = schema_manager.convert_str_to_data(existing_data_dict[property_key]) else: if not property_key in generated_dict: raise KeyError(f"Missing '{property_key}' key in 'generated_dict' during calling 'update_file_descriptions()' trigger method.") @@ -937,15 +933,10 @@ def get_dataset_title(property_key, normalized_type, user_token, existing_data_d # Parse assay_type from the Dataset try: - # Note: The 'data_types' is stored in Neo4j as a string representation of the Python list + # Note: The existing_data_dict['data_types'] is stored in Neo4j as a string representation of the Python list # It's not stored in Neo4j as a json string! And we can't store it as a json string # due to the way that Cypher handles single/double quotes. - stored_data_types = existing_data_dict['data_types'] - if isinstance(stored_data_types, str): - data_types_list = ast.literal_eval(stored_data_types) - else: - data_types_list = stored_data_types - + data_types_list = schema_manager.convert_str_to_data(existing_data_dict['data_types']) assay_type_desc = _get_assay_type_description(data_types_list) except requests.exceptions.RequestException as e: raise requests.exceptions.RequestException(e) @@ -971,15 +962,10 @@ def get_dataset_title(property_key, normalized_type, user_token, existing_data_d # Easier to ask for forgiveness than permission (EAFP) # Rather than checking key existence at every level try: - # Note: The 'metadata' is stored in Neo4j as a string representation of the Python dict + # Note: The ancestor['metadata'] is stored in Neo4j as a string representation of the Python dict # It's not stored in Neo4j as a json string! And we can't store it as a json string # due to the way that Cypher handles single/double quotes. - stored_ancestor_metadata = ancestor['metadata'] - if isinstance(stored_ancestor_metadata, str): - ancestor_metadata_dict = ast.literal_eval(stored_ancestor_metadata) - else: - ancestor_metadata_dict = stored_ancestor_metadata - + ancestor_metadata_dict = schema_manager.convert_str_to_data(ancestor['metadata']) data_list = ancestor_metadata_dict['organ_donor_data'] for data in data_list: @@ -1226,11 +1212,7 @@ def delete_thumbnail_file(property_key, normalized_type, user_token, existing_da # is stored in Neo4j as a string representation of the Python dict # It's not stored in Neo4j as a json string! And we can't store it as a json string # due to the way that Cypher handles single/double quotes. - ext_prop = existing_data_dict[target_property_key] - if isinstance(ext_prop, str): - file_info_dict = ast.literal_eval(ext_prop) - else: - file_info_dict = ext_prop + file_info_dict = schema_manager.convert_str_to_data(existing_data_dict[target_property_key]) else: file_info_dict = generated_dict[target_property_key] @@ -1743,11 +1725,7 @@ def _commit_files(target_property_key, property_key, normalized_type, user_token # Note: The property, name specified by `target_property_key`, is stored in Neo4j as a string representation of the Python list # It's not stored in Neo4j as a json string! And we can't store it as a json string # due to the way that Cypher handles single/double quotes. 
- ext_prop = existing_data_dict[target_property_key] - if isinstance(ext_prop, str): - files_info_list = ast.literal_eval(ext_prop) - else: - files_info_list = ext_prop + files_info_list = schema_manager.convert_str_to_data(existing_data_dict[target_property_key]) else: files_info_list = generated_dict[target_property_key] @@ -1857,11 +1835,7 @@ def _delete_files(target_property_key, property_key, normalized_type, user_token # Note: The property, name specified by `target_property_key`, is stored in Neo4j as a string representation of the Python list # It's not stored in Neo4j as a json string! And we can't store it as a json string # due to the way that Cypher handles single/double quotes. - ext_prop = existing_data_dict[target_property_key] - if isinstance(ext_prop, str): - files_info_list = ast.literal_eval(ext_prop) - else: - files_info_list = ext_prop + files_info_list = schema_manager.convert_str_to_data(existing_data_dict[target_property_key]) else: files_info_list = generated_dict[target_property_key] From cc73079bd6202db05f97fb49ee84e55a1d9f6303 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Wed, 29 Sep 2021 13:15:56 -0400 Subject: [PATCH 3/8] More code reuse --- src/schema/schema_manager.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index c3517e82..54e8a992 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -542,23 +542,12 @@ def normalize_entity_result_for_response(entity_dict, properties_to_exclude = [] # Skip properties with None value and the ones that are marked as not to be exposed. # By default, all properties are exposed if not marked as `exposed: false` # It's still possible to see `exposed: true` marked explictly - if (entity_dict[key] is not None) and ('exposed' not in properties[key]) or (('exposed' in properties[key]) and properties[key]['exposed']): - # Safely evaluate a string containing a Python dict or list literal - # Only convert to Python list/dict when the string literal is not empty - # instead of returning the json-as-string or array-as-string - if isinstance(entity_dict[key], str) and entity_dict[key] and (properties[key]['type'] in ['list', 'json_string']): - # ast uses compile to compile the source string (which must be an expression) into an AST - # If the source string is not a valid expression (like an empty string), a SyntaxError will be raised by compile - # If, on the other hand, the source string would be a valid expression (e.g. 
a variable name like foo),
-                # compile will succeed but then literal_eval() might fail with a ValueError
-                # Also this fails with a TypeError: literal_eval("{{}: 'value'}")
-                try:
-                    entity_dict[key] = ast.literal_eval(entity_dict[key])
-                except (SyntaxError, ValueError, TypeError) as e:
-                    logger.debug(f"Invalid expression (string value) of key: {key} for ast.literal_eval()")
-                    logger.debug(entity_dict[key])
-                    msg = "Failed to convert the source string with ast.literal_eval()"
-                    logger.exception(msg)
+        if (entity_dict[key] is not None) and ('exposed' not in properties[key]) or (('exposed' in properties[key]) and properties[key]['exposed']):
+            if entity_dict[key] and (properties[key]['type'] in ['list', 'json_string']):
+                # Safely evaluate a string containing a Python dict or list literal
+                # Only convert to Python list/dict when the string literal is not empty
+                # instead of returning the json-as-string or array-as-string
+                entity_dict[key] = convert_str_to_data(entity_dict[key])
 
             # Add the target key with correct value of data type to the normalized_entity dict
             normalized_entity[key] = entity_dict[key]
@@ -1501,7 +1490,19 @@ def get_neo4j_driver_instance():
 """
 def convert_str_to_data(data_str):
     if isinstance(data_str, str):
+        # ast uses compile to compile the source string (which must be an expression) into an AST
+        # If the source string is not a valid expression (like an empty string), a SyntaxError will be raised by compile
+        # If, on the other hand, the source string would be a valid expression (e.g. a variable name like foo),
+        # compile will succeed but then literal_eval() might fail with a ValueError
+        # Also this fails with a TypeError: literal_eval("{{}: 'value'}")
-        data = ast.literal_eval(data_str)
+        try:
+            data = ast.literal_eval(data_str)
+        except (SyntaxError, ValueError, TypeError) as e:
+            logger.debug(f"Invalid expression (string value): {data_str} to be evaluated by ast.literal_eval()")
+            msg = "Failed to convert the source string with ast.literal_eval()"
+            logger.exception(msg)
+            # Re-raise so the caller never receives an undefined 'data'
+            raise
     else:
         data = data_str
 

From fba609ca5c489c5dea89ed1cf935ba764e46da59 Mon Sep 17 00:00:00 2001
From: yuanzhou
Date: Wed, 29 Sep 2021 13:18:28 -0400
Subject: [PATCH 4/8] Add code comments

---
 src/app.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/app.py b/src/app.py
index a8011704..d6a2ee7f 100644
--- a/src/app.py
+++ b/src/app.py
@@ -2050,11 +2050,22 @@ def retract_dataset(id):
 1-5 will be returned in reverse order (newest first).  Non-public access is only required to 
 retrieve information on non-published datasets.  Output will be a list of dictionaries.  Each dictionary
 contains the dataset revision number and its uuid.  Optionally, the full dataset can be included for each.
+
 By default, only the revision number and uuid is included.  To include the full dataset, the query 
 parameter "include_dataset" can be given with the value of "true". If this parameter is not included or 
 is set to false, the dataset will not be included. For example, to include the full datasets for each revision,
 use '/datasets/<id>/revisions?include_dataset=true'.  To omit the datasets, either set include_dataset=false, or
 simply do not include this parameter.
+
+Parameters
+----------
+id : str
+    The HuBMAP ID (e.g. HBM123.ABCD.456) or UUID of target dataset
+
+Returns
+-------
+list
+    The list of revision datasets
 """
 @app.route('/datasets/<id>/revisions', methods=['GET'])
 def get_revisions_list(id):
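Note: the reason PATCH 2/8 and 3/8 funnel everything through convert_str_to_data() is that these Neo4j properties hold Python-literal strings (single quotes), not JSON. A minimal standalone sketch of that behavior, using only the standard library (the stored values below are hypothetical):

    import ast

    stored_data_types = "['bulk-RNA', 'SNARE-Seq2-AC']"               # hypothetical stored list
    stored_metadata = "{'organ_donor_data': [{'data_value': '53'}]}"  # hypothetical stored dict

    print(ast.literal_eval(stored_data_types))  # ['bulk-RNA', 'SNARE-Seq2-AC']
    print(ast.literal_eval(stored_metadata))    # {'organ_donor_data': [{'data_value': '53'}]}

    # Non-literal or malformed strings raise instead of returning garbage,
    # which is why convert_str_to_data() catches all three exception types:
    for bad in ("", "foo", "{{}: 'value'}"):
        try:
            ast.literal_eval(bad)
        except (SyntaxError, ValueError, TypeError) as e:
            print(f"{bad!r} rejected with {type(e).__name__}")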
From ddf44d452f7cf3265a0bd01a2b25e5397aff32fe Mon Sep 17 00:00:00 2001
From: yuanzhou
Date: Wed, 29 Sep 2021 14:59:04 -0400
Subject: [PATCH 5/8] Use cypher query to get organ and donor info

---
 src/schema/schema_neo4j_queries.py |  27 +++----
 src/schema/schema_triggers.py      | 117 ++++++++++++++++-------------
 2 files changed, 78 insertions(+), 66 deletions(-)

diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py
index 7f2195dd..f2f729f5 100644
--- a/src/schema/schema_neo4j_queries.py
+++ b/src/schema/schema_neo4j_queries.py
@@ -58,7 +58,7 @@ def get_dataset_direct_ancestors(neo4j_driver, uuid, property_key = None):
 
 """
-Get all ancestors of the given dataset uuid
+Get the sample organ name and donor metadata information of the given dataset uuid
 
 Parameters
 ----------
@@ -69,30 +69,28 @@
 
 Returns
 -------
-list
-    A list of unique ancestor dictionaries returned from the Cypher query
+str: The sample organ name
+str: The donor metadata (string representation of a Python dict)
 """
-def get_dataset_ancestors(neo4j_driver, uuid):
-    results = []
+def get_dataset_organ_and_donor_info(neo4j_driver, uuid):
+    organ_name = None
+    donor_metadata = None
 
-    query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(ancestor:Entity) "
-             # Filter out the Lab entities
-             f"WHERE e.uuid='{uuid}' AND ancestor.entity_type <> 'Lab' "
-             # COLLECT() returns a list
-             # apoc.coll.toSet() returns a set containing unique nodes
-             f"RETURN apoc.coll.toSet(COLLECT(ancestor)) AS {record_field_name}")
+    query = (f"MATCH (e:Dataset)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(s:Sample)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(d:Donor) "
+             f"WHERE e.uuid='{uuid}' AND s.specimen_type='organ' AND EXISTS(s.organ) "
+             f"RETURN s.organ AS organ_name, d.metadata AS donor_metadata")
 
-    logger.debug("======get_dataset_ancestors() query======")
+    logger.debug("======get_dataset_organ_and_donor_info() query======")
     logger.debug(query)
 
     with neo4j_driver.session() as session:
         record = session.read_transaction(_execute_readonly_tx, query)
 
-        if record and record[record_field_name]:
-            # Convert the list of nodes to a list of dicts
-            results = _nodes_to_dicts(record[record_field_name])
+        if record:
+            organ_name = record['organ_name']
+            donor_metadata = record['donor_metadata']
 
-    return results
+    return organ_name, donor_metadata
 
 """
 Create or recreate one or more linkages
diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py
index a1f4f0b3..54d049ca 100644
--- a/src/schema/schema_triggers.py
+++ b/src/schema/schema_triggers.py
@@ -941,46 +941,45 @@ def get_dataset_title(property_key, normalized_type, user_token, existing_data_d
     except requests.exceptions.RequestException as e:
         raise requests.exceptions.RequestException(e)
 
-    # Get all the ancestors of this dataset
-    ancestors = schema_neo4j_queries.get_dataset_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'])
-
-    # Parse organ_name, age, race, and sex from ancestor Sample and Donor
-    for ancestor in ancestors:
-        if 'entity_type' in ancestor:
-            # 'specimen_type' is a key in search-api/src/search-schema/data/definitions/enums/tissue_sample_types.yaml
-            if ancestor['entity_type'] == 'Sample':
-                if 'specimen_type' in ancestor and 
ancestor['specimen_type'].lower() == 'organ' and 'organ' in ancestor: - try: - # ancestor['organ'] is the two-letter code only set if sample_type == organ. - # Convert the two-letter code to a description - # https://github.com/hubmapconsortium/search-api/blob/test-release/src/search-schema/data/definitions/enums/organ_types.yaml - organ_desc = _get_organ_description(ancestor['organ']) - except yaml.YAMLError as e: - raise yaml.YAMLError(e) - - if ancestor['entity_type'] == 'Donor': - # Easier to ask for forgiveness than permission (EAFP) - # Rather than checking key existence at every level - try: - # Note: The ancestor['metadata'] is stored in Neo4j as a string representation of the Python dict - # It's not stored in Neo4j as a json string! And we can't store it as a json string - # due to the way that Cypher handles single/double quotes. - ancestor_metadata_dict = schema_manager.convert_str_to_data(ancestor['metadata']) - data_list = ancestor_metadata_dict['organ_donor_data'] - - for data in data_list: - if 'grouping_concept_preferred_term' in data: - if data['grouping_concept_preferred_term'].lower() == 'age': - # The actual value of age stored in 'data_value' instead of 'preferred_term' - age = data['data_value'] - - if data['grouping_concept_preferred_term'].lower() == 'race': - race = data['preferred_term'].lower() - - if data['grouping_concept_preferred_term'].lower() == 'sex': - sex = data['preferred_term'].lower() - except KeyError: - pass + # Get the sample organ name and donor metadata information of this dataset + organ_name, donor_metadata = schema_neo4j_queries.get_dataset_organ_and_donor_info(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) + + # Can we move organ_types.yaml to commons or make it an API call to avoid parsing the raw yaml? + # Parse the organ description + if organ_name is not None: + try: + # The organ_name is the two-letter code only set if specimen_type == 'organ' + # Convert the two-letter code to a description + # https://github.com/hubmapconsortium/search-api/blob/test-release/src/search-schema/data/definitions/enums/organ_types.yaml + organ_desc = _get_organ_description(organ_name) + except yaml.YAMLError as e: + raise yaml.YAMLError(e) + + # Parse age, race, and sex + if donor_metadata is not None: + try: + # Note: The donor_metadata is stored in Neo4j as a string representation of the Python dict + # It's not stored in Neo4j as a json string! And we can't store it as a json string + # due to the way that Cypher handles single/double quotes. 
+
+            ancestor_metadata_dict = schema_manager.convert_str_to_data(donor_metadata)
+
+            # Easier to ask for forgiveness than permission (EAFP)
+            # Rather than checking key existence at every level
+            data_list = ancestor_metadata_dict['organ_donor_data']
+
+            for data in data_list:
+                if 'grouping_concept_preferred_term' in data:
+                    if data['grouping_concept_preferred_term'].lower() == 'age':
+                        # The actual value of age is stored in 'data_value' instead of 'preferred_term'
+                        age = data['data_value']
+
+                    if data['grouping_concept_preferred_term'].lower() == 'race':
+                        race = data['preferred_term'].lower()
+
+                    if data['grouping_concept_preferred_term'].lower() == 'sex':
+                        sex = data['preferred_term'].lower()
+        except KeyError:
+            pass
 
     age_race_sex_info = None
 
@@ -1003,9 +1002,6 @@ def get_dataset_title(property_key, normalized_type, user_token, existing_data_d
     generated_title = f"{assay_type_desc} data from the {organ_desc} of a {age_race_sex_info}"
 
-    logger.debug("===========Auto generated Title===========")
-    logger.debug(generated_title)
-
     return property_key, generated_title
 
 """
@@ -1869,11 +1865,19 @@ def _delete_files(target_property_key, property_key, normalized_type, user_token
     return generated_dict
 
+"""
+Compose the assay type description
+
+Parameters
+----------
+data_types : list
+    A list of dataset data types
 
-
-
-
-#======================================================================
-
+Returns
+-------
+str: The formatted assay type description
+"""
 def _get_assay_type_description(data_types):
     assay_types = []
     assay_type_desc = ''
 
@@ -1922,16 +1925,24 @@
     return assay_type_desc
 
+"""
+Get the organ description based on the given organ code
+
+Parameters
+----------
+organ_code : str
+    The two-letter organ code
+
+Returns
+-------
+str: The organ code description
+"""
-def _get_organ_types_dict():
+def _get_organ_description(organ_code):
     yaml_file_url = 'https://raw.githubusercontent.com/hubmapconsortium/search-api/master/src/search-schema/data/definitions/enums/organ_types.yaml'
     with urllib.request.urlopen(yaml_file_url) as response:
         yaml_file = response.read()
     try:
-        return yaml.safe_load(yaml_file)
+        organ_types_dict = yaml.safe_load(yaml_file)
+        return organ_types_dict[organ_code]['description'].lower()
     except yaml.YAMLError as e:
         raise yaml.YAMLError(e)
-
-def _get_organ_description(organ_code):
-    organ_types_dict = _get_organ_types_dict()
-    return organ_types_dict[organ_code]['description'].lower()
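Note: PATCH 5/8 collapses the Python-side walk over every ancestor into one Cypher round trip, then resolves the two-letter organ code through the search-api enum file. A rough sketch of that code-to-description lookup on its own (the code 'HT' and the yaml layout are assumptions inferred from how _get_organ_description() indexes the result):

    import urllib.request
    import yaml

    yaml_file_url = ('https://raw.githubusercontent.com/hubmapconsortium/search-api/'
                     'master/src/search-schema/data/definitions/enums/organ_types.yaml')

    # Fetch and parse the organ types enum file, as the trigger helper does
    with urllib.request.urlopen(yaml_file_url) as response:
        organ_types_dict = yaml.safe_load(response.read())

    # Each top-level key appears to be a two-letter organ code with a
    # 'description' field, e.g. organ_types_dict['HT']['description']
    # might be 'Heart' (assumed code for illustration)
    print(organ_types_dict['HT']['description'].lower())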
From b7f2ee44e13a5ac8e8a05a61e15f39ba30c334a6 Mon Sep 17 00:00:00 2001
From: yuanzhou
Date: Fri, 1 Oct 2021 14:31:38 -0400
Subject: [PATCH 6/8] Mark Dataset.title as immutable and generated

---
 src/schema/provenance_schema.yaml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/schema/provenance_schema.yaml b/src/schema/provenance_schema.yaml
index 39d74989..2901a3e9 100644
--- a/src/schema/provenance_schema.yaml
+++ b/src/schema/provenance_schema.yaml
@@ -1,7 +1,7 @@
 ############################################# Rules #############################################
 # Entity properties:
 # - type: data type of the property, one of the following: string|integer|boolean|list|json_string
-# - generated: whether the property is auto genearated or user provided, default to false
+# - generated: whether the property is auto generated (either with a `before_create_trigger` or not) or user provided, default to false
 # - required_on_create: whether the property is required from user request JSON for entity creation via POST
 # - immutable: whether the property can NOT be updated once being created, default to false
 # - transient: whether the property to persist in database or not, default to false
@@ -17,7 +17,7 @@
 # Entity creation via http POST request:
 # - Use `generated: true` to mark a property as to be auto generated by the program instead of from user input JSON
-# - If a property is marked as `generated: true`, either no trigger method needed (E.g. Donor.image_files) or a `before_create_trigger` can be used to generate the value
+# - If a property is marked as `generated: true`, either no trigger method needed (E.g. Donor.image_files, Dataset.title) or a `before_create_trigger` can be used to generate the value
 # - If a property has `before_create_trigger`, it can't be specified in client request JSON
 # - If a property is marked as `generated: true`, it can't be `required_on_create: true` at the same time
 # - Use `required_on_create: true` to mark a property as required from user input
@@ -239,7 +239,9 @@ ENTITIES:
         after_update_trigger: update_dataset_and_ancestors_data_access_level
       title:
         type: string
-        description: "The dataset title."
+        generated: true # Disallow entry from users via POST
+        immutable: true # Disallow update via PUT
+        description: "The auto-generated dataset title."
         on_read_trigger: get_dataset_title
       lab_dataset_id:
         type: string
         description: "A name or identifier used by the lab who is uploading the data to cross reference the data locally"

From f75d022d15f203e316acb3bda6915f4a9cb91c01 Mon Sep 17 00:00:00 2001
From: yuanzhou
Date: Wed, 6 Oct 2021 14:46:08 -0400
Subject: [PATCH 7/8] Add validators on create/update for unique list check

---
 src/app.py                        |  7 +++++++
 src/schema/provenance_schema.yaml | 16 ++++++++++++++--
 src/schema/schema_manager.py      |  8 ++++----
 src/schema/schema_validators.py   | 22 ++++++++++++++++++++
 4 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/src/app.py b/src/app.py
index d6a2ee7f..9948e2a9 100644
--- a/src/app.py
+++ b/src/app.py
@@ -813,6 +813,13 @@ def create_entity(entity_type):
         # No need to log the validation errors
         bad_request_error(str(e))
 
+    # Execute property level validators defined in schema yaml before entity property creation
+    # Use empty dict {} to indicate there's no existing_data_dict
+    try:
+        schema_manager.execute_property_level_validators('before_property_create_validators', normalized_entity_type, request, {}, json_data_dict)
+    except (KeyError, ValueError) as e:
+        bad_request_error(e)
+
     # Sample and Dataset: additional validation, create entity, after_create_trigger
     # Collection and Donor: create entity
     if normalized_entity_type == 'Sample':
diff --git a/src/schema/provenance_schema.yaml b/src/schema/provenance_schema.yaml
index 287ad11f..bd0fe97f 100644
--- a/src/schema/provenance_schema.yaml
+++ b/src/schema/provenance_schema.yaml
@@ -13,7 +13,7 @@
 # - types: before_entity_create_validator, a single validation method needed for creating or updating the entity
 
 # Property level validators:
-# - types: before_property_update_validators, a list of validation methods
+# - types: before_property_create_validators|before_property_update_validators, a list of validation methods
 
 # Entity creation via http POST request:
 # - Use `generated: true` to mark a property as to be auto generated by the program instead of from user input JSON
@@ -232,10 +232,10 @@ ENTITIES:
       required_on_create: true # Only required for create via POST, not update via PUT
       description: "True if the data contains any human genetic sequence information."
status: + type: string before_property_update_validators: - validate_application_header_before_property_update - validate_dataset_status_value - type: string generated: true description: "One of: New|Processing|QA|Published|Error|Hold|Invalid" before_create_trigger: set_dataset_status_new @@ -251,6 +251,10 @@ ENTITIES: type: string description: "A name or identifier used by the lab who is uploading the data to cross reference the data locally" data_types: + before_property_create_validators: + - validate_no_duplicates_in_list + before_property_update_validators: + - validate_no_duplicates_in_list type: list required_on_create: true # Only required for create via POST, not update via PUT description: "The data or assay types contained in this dataset as a json array of strings. Each is an assay code from [assay types](https://github.com/hubmapconsortium/search-api/blob/master/src/search-schema/data/definitions/enums/assay_types.yaml)." @@ -272,6 +276,10 @@ ENTITIES: direct_ancestor_uuids: required_on_create: true # Only required for create via POST, not update via PUT type: list + before_property_create_validators: + - validate_no_duplicates_in_list + before_property_update_validators: + - validate_no_duplicates_in_list transient: true exposed: false description: "The uuids of source entities from which this new entity is derived. Used to pass source entity ids in on POST or PUT calls used to create the linkages." @@ -711,6 +719,8 @@ ENTITIES: #datasets will be added only via PUT/update, NEVER on POST/create dataset_uuids_to_link: type: list + before_property_update_validators: + - validate_no_duplicates_in_list generated: true # Disallow user input from request json when being created transient: true exposed: false @@ -720,6 +730,8 @@ ENTITIES: #transient attribute used to pass in a list of dataset ids of datasets to be removed from a DataSubmisison dataset_uuids_to_unlink: type: list + before_property_update_validators: + - validate_no_duplicates_in_list generated: true # Disallow user input from request json when being created transient: true exposed: false diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 54e8a992..1cd54585 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -733,13 +733,13 @@ def execute_entity_level_validator(validator_type, normalized_entity_type, reque Parameters ---------- validator_type : str - For now only: before_property_update_validators (support multiple validators) + before_property_create_validators|before_property_update_validators (support multiple validators) normalized_entity_type : str One of the normalized entity types defined in the schema yaml: Donor, Sample, Dataset, Upload request: Flask request object The instance of Flask request passed in from application request existing_data_dict : dict - A dictionary that contains all existing entity properties + A dictionary that contains all existing entity properties, {} for before_property_create_validators new_data_dict : dict The json data in request body, already after the regular validations """ @@ -883,10 +883,10 @@ def validate_entity_level_validator_type(validator_type): Parameters ---------- validator_type : str - One of the validator types: before_property_update_validators + One of the validator types: before_property_create_validators|before_property_update_validators """ def validate_property_level_validator_type(validator_type): - accepted_validator_types = ['before_property_update_validators'] + accepted_validator_types = 
['before_property_create_validators', 'before_property_update_validators']
     separator = ', '
 
     if validator_type.lower() not in accepted_validator_types:
diff --git a/src/schema/schema_validators.py b/src/schema/schema_validators.py
index b4e68aef..9575fe73 100644
--- a/src/schema/schema_validators.py
+++ b/src/schema/schema_validators.py
@@ -41,6 +41,28 @@ def validate_application_header_before_entity_create(normalized_entity_type, req
 ## Property Level Validators
 ####################################################################################################
 
+"""
+Validate the target list has no duplicated items
+
+Parameters
+----------
+property_key : str
+    The target property key
+normalized_entity_type : str
+    One of the normalized entity types defined in the schema yaml
+request: Flask request object
+    The instance of Flask request passed in from application request
+existing_data_dict : dict
+    A dictionary that contains all existing entity properties
+new_data_dict : dict
+    The json data in request body, already after the regular validations
+"""
+def validate_no_duplicates_in_list(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
+    target_list = new_data_dict[property_key]
+    if len(set(target_list)) != len(target_list):
+        raise ValueError(f"The {property_key} field must only contain unique items")
+
+
 """
 Validate the provided value of Dataset.status on update via PUT

From 7b3453d9bb04c8c915841bd088035da1aadfcc0e Mon Sep 17 00:00:00 2001
From: yuanzhou
Date: Wed, 6 Oct 2021 15:37:47 -0400
Subject: [PATCH 8/8] Made duplicate check case insensitive

---
 src/app.py                      | 3 ++-
 src/schema/schema_validators.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/app.py b/src/app.py
index 9948e2a9..e6a10bf3 100644
--- a/src/app.py
+++ b/src/app.py
@@ -817,7 +817,8 @@ def create_entity(entity_type):
     # Use empty dict {} to indicate there's no existing_data_dict
     try:
         schema_manager.execute_property_level_validators('before_property_create_validators', normalized_entity_type, request, {}, json_data_dict)
-    except (KeyError, ValueError) as e:
+    # Currently only ValueError
+    except ValueError as e:
         bad_request_error(e)
 
diff --git a/src/schema/schema_validators.py b/src/schema/schema_validators.py
index 9575fe73..b3af375b 100644
--- a/src/schema/schema_validators.py
+++ b/src/schema/schema_validators.py
@@ -58,7 +58,8 @@ def validate_application_header_before_entity_create(normalized_entity_type, req
     The json data in request body, already after the regular validations
 """
 def validate_no_duplicates_in_list(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
-    target_list = new_data_dict[property_key]
+    # Use lowercase for comparison via list comprehensions
+    target_list = [v.lower() for v in new_data_dict[property_key]]
     if len(set(target_list)) != len(target_list):
         raise ValueError(f"The {property_key} field must only contain unique items")
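Note: with PATCH 7/8 and 8/8 combined, the duplicate check is case-insensitive. A standalone sketch exercising the final form of the validator (None and {} stand in for the request and existing_data_dict arguments, which this validator does not use):

    # Final (case-insensitive) form of the validator, reproduced for a quick check
    def validate_no_duplicates_in_list(property_key, normalized_entity_type, request,
                                       existing_data_dict, new_data_dict):
        # Lowercase first so 'AF' and 'af' count as the same item
        target_list = [v.lower() for v in new_data_dict[property_key]]
        if len(set(target_list)) != len(target_list):
            raise ValueError(f"The {property_key} field must only contain unique items")

    # 'AF' and 'af' now collide, which the PATCH 7/8 version would have accepted
    try:
        validate_no_duplicates_in_list('data_types', 'Dataset', None, {},
                                       {'data_types': ['AF', 'af']})
    except ValueError as e:
        print(e)  # The data_types field must only contain unique items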