In [35]:
#!/usr/bin/env python
# coding: utf-8

# List the indexes and attributes available to the search-api.
# The documents indexed in ElasticSearch are JSON files that can nest up to 4 levels.

# If a document is associated with an element in the Entity Provenance Hierarchy, the index will include information that helps to locate
# the document in the hierarchy. 

# The Entity Provenance (poly)Hierarchy organizes information in ways that include:
# 1. Donor -> Sample -> Dataset
# 2. Collection -> Dataset

# Entity Provenance elements are related to one another through "ancestor" and "descendant" relationships.

# Elements can contain other elements of the same entity type hierarchically, to represent division or derivation--e.g.,
# 1. A Sample of type organ can be the ancestor of a Sample of type organ_piece.
# 2. A primary Dataset can be the ancestor of a derived Dataset entity.

# Assumption:
# Authentication to ElasticSearch is handled outside of this script.
# Because this notebook sends requests directly to the ElasticSearch server, the account
# running this notebook should be white-listed on the server.
#----------------

import requests
import pandas as pd

def getsearchablefields(url, timeout=30):
    """Return the searchable, non-private attributes of one ElasticSearch index.

    Sends a GET request to ``url`` and flattens the ``fields`` section of the
    JSON response. Assumes that the url corresponds to one ElasticSearch index;
    only the first entry of the response's ``indices`` list is used.

    Args:
        url: Field capabilities URL for a single index.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so that an
            unresponsive server cannot block the notebook indefinitely.

    Returns:
        A list of 7-element tuples that can be converted to a DataFrame.
        Each tuple contains:
        1. the name of the index
        2. the name of the attribute, containing the full index path
        3. the index type of the attribute
        4. the root name of the attribute, down to the level before "keyword"
        5. the name of the container of the attribute
        6. the name of the ancestor of #5
        7. the name of the ancestor of #6
        Levels that do not exist are returned as ''. Paths deeper than 4 levels
        keep only the 4 levels nearest the attribute, so every tuple has exactly
        7 elements; the complete path remains available in element #2.

        e.g., for the attribute
        immediate_ancestors.metadata.metadata.rnaseq_assay_input_value.keyword,
        the return will be:
        (<index>, immediate_ancestors.metadata.metadata.rnaseq_assay_input_value.keyword,
         keyword, rnaseq_assay_input_value, metadata, metadata, immediate_ancestors).

        An empty list is returned when the response status is not 200 (a warning
        is issued in that case) or when the response names no index.
    """
    # The attribute key plus three ancestor levels.
    maxlevels = 4

    # Obtain index data.
    headers = {'Accept': 'application/json', 'Content-Type': 'application/json'}
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        # Keep the best-effort behavior (empty result), but make the failure visible.
        import warnings
        warnings.warn(
            'Request to {} returned HTTP status {}; no attributes collected.'.format(
                url, response.status_code),
            stacklevel=2)
        return []

    rjson = response.json()

    # Assumption: one index
    indices = rjson.get('indices') or []
    if not indices:
        return []
    index = indices[0]

    listret = []
    for attributename, capsbytype in (rjson.get('fields') or {}).items():
        # Private attributes start with an underscore.
        if attributename.startswith('_'):
            continue

        # Capabilities are keyed by type; with one index, the first entry is used.
        typeentries = list(capsbytype.values())
        if not typeentries:
            continue
        attributeproperties = typeentries[0]
        if attributeproperties.get('searchable') is not True:
            continue

        # Analyze the path for each attribute--i.e., the location of the attribute's key in the JSON.
        # Levels are listed from the attribute key outward, excluding "keyword".
        listpath = [level for level in reversed(attributename.split('.')) if level != 'keyword']
        # Truncate or pad with blanks so that there are always exactly 4 levels.
        listpath = listpath[:maxlevels]
        listpath += [''] * (maxlevels - len(listpath))

        listret.append((index, attributename, attributeproperties.get('type')) + tuple(listpath))

    return listret

# ---------

# Build the set of index-attribute mappings from field capabilities (_field_caps) queries to the ElasticSearch API.
# Column names for the 7-element tuples returned by getsearchablefields.
colnames = ['index','attribute','index_type','attribute_key','ancestor_level_1','ancestor_level_2','ancestor_level_3']

# _field_caps endpoints noted for this server (only those in the urls list below are queried):
#https://search-hubmap-dev-test-hfnqv4ylo5ywvc42vwnyptbup4.us-east-1.es.amazonaws.com/hm_dev_public_entities/_field_caps?fields=*
#https://search-hubmap-dev-test-hfnqv4ylo5ywvc42vwnyptbup4.us-east-1.es.amazonaws.com/hm_dev_consortium_entities/_field_caps?fields=*
#https://search-hubmap-dev-test-hfnqv4ylo5ywvc42vwnyptbup4.us-east-1.es.amazonaws.com/hm_antibodies/_field_caps?fields=*
#https://search-hubmap-dev-test-hfnqv4ylo5ywvc42vwnyptbup4.us-east-1.es.amazonaws.com/hm_dev_public_files/_field_caps?fields=*
#https://search-hubmap-dev-test-hfnqv4ylo5ywvc42vwnyptbup4.us-east-1.es.amazonaws.com/hm_dev_consortium_files/_field_caps?fields=*
urls = [
    'https://search-hubmap-dev-test-hfnqv4ylo5ywvc42vwnyptbup4.us-east-1.es.amazonaws.com/hm_dev_consortium_entities/_field_caps?fields=*',
    'https://search-hubmap-dev-test-hfnqv4ylo5ywvc42vwnyptbup4.us-east-1.es.amazonaws.com/hm_antibodies/_field_caps?fields=*',
    'https://search-hubmap-dev-test-hfnqv4ylo5ywvc42vwnyptbup4.us-east-1.es.amazonaws.com/hm_dev_consortium_files/_field_caps?fields=*',
]

# Collect the tuples of searchable attributes for every index first, then build the
# DataFrame once. Growing a DataFrame with pd.concat inside the loop copies the data
# on every iteration and repeats row labels across indexes.
listattributes = []
for u in urls:
    listattributes.extend(getsearchablefields(u))

dfindexattributes = pd.DataFrame.from_records(listattributes, columns=colnames)

# Sort the DataFrame by index, then by path from the outermost ancestor down to the attribute key.
dfindexattributes = dfindexattributes.sort_values(by=['index','ancestor_level_3','ancestor_level_2','ancestor_level_1','attribute_key'])


In [36]:
# Write the attribute table to index_attributes.csv, leaving out the DataFrame's row labels.
dfindexattributes.to_csv(path_or_buf='index_attributes.csv', index=False)

In [33]:
# Show the attribute table; as the last expression in the cell it is rendered as the cell's output.
dfindexattributes

Unnamed: 0,index,attribute,index_type,level_0,level_1,level_2,level_3
0,hm_antibodies,antibody_name,text,antibody_name,,,
7,hm_antibodies,antibody_name.keyword,keyword,antibody_name,,,
24,hm_antibodies,antibody_uuid.keyword,keyword,antibody_uuid,,,
35,hm_antibodies,antibody_uuid,text,antibody_uuid,,,
4,hm_antibodies,avr_filename.keyword,keyword,avr_filename,,,
...,...,...,...,...,...,...,...
36,hm_dev_consortium_files,samples.uuid,text,uuid,samples,,
1,hm_dev_consortium_files,organs.type.description.keyword,keyword,description,type,organs,
32,hm_dev_consortium_files,organs.type.description,text,description,type,organs,
2,hm_dev_consortium_files,organs.type.iri,text,iri,type,organs,
