Notebook that lists the GEOME columns that are required (by a `RequiredValue` rule at `ERROR` level) in the network configuration or in at least one project.

In [1]:
import logging
import pandas as pd
import numpy as np
import requests
import concurrent.futures

# Rule name compared against each rule's "name" field when collecting required columns.
RULE_REQUIRED = "RequiredValue"
# Rule level compared against each rule's "level" field; other levels are ignored.
ERROR_LEVEL = "ERROR"
# Base URL that every GEOME API request path in this notebook is appended to.
GEOME_API = "https://api.geome-db.org/v1"
# Passed as the timeout= argument of every requests.get call below.
TIMEOUT = 30 #seconds
# INFO level so the "loader" logger's per-project "Loading: ..." messages are emitted.
logging.basicConfig(level=logging.INFO)

def processGeomeConfig(doc, proj_id=0, title="network"):
    """Collect the columns that a GEOME configuration marks as required.

    A column counts as required when it is listed under "columns" by a rule
    whose "name" equals RULE_REQUIRED and whose "level" equals ERROR_LEVEL.

    Args:
        doc: Parsed configuration document (dict). Entities are read from its
            "entities" list; a missing or null list yields no rows.
        proj_id: Identifier stored in every returned row. Defaults to 0, the
            value rows get when the caller passes no project id.
        title: Accepted for call compatibility; not included in the rows.

    Returns:
        A ``(rows, defns)`` tuple. ``rows`` is a list of
        ``(conceptAlias, column, proj_id)`` tuples, one per required column
        of each entity, in the order the columns are first listed by the
        rules. ``defns`` maps a required column name to a dict holding the
        "definedBy", "group" and "definition" values of the matching
        attribute. It is keyed by column name only, so a later entity
        overwrites an earlier one that uses the same column name.
    """
    res = []
    defns = {}
    # "or []" also covers keys that are present but explicitly null.
    for entity in doc.get("entities") or []:
        alias = entity.get("conceptAlias")
        # A dict serves as an insertion-ordered set, so duplicates collapse
        # while the row order stays reproducible from run to run.
        cols = {}
        # Gather the required columns; entities without rules and rules
        # without a column list simply contribute nothing.
        for rule in entity.get("rules") or []:
            if rule.get("name") == RULE_REQUIRED and rule.get("level") == ERROR_LEVEL:
                cols.update(dict.fromkeys(rule.get("columns") or []))
        # Record definition details for the attributes that are required.
        for attr in entity.get("attributes") or []:
            cname = attr.get("column")
            if cname in cols:
                defns[cname] = {
                    "definedBy": attr.get("definedBy"),
                    "group": attr.get("group"),
                    "definition": attr.get("definition"),
                }
        res.extend((alias, cname, proj_id) for cname in cols)
    return res, defns

def getProjectConfig(project_id):
    """Fetch the configuration document of a single GEOME project.

    Sends a GET request to ``{GEOME_API}/projects/{project_id}/config`` using
    the notebook-wide ``TIMEOUT`` and returns the decoded JSON body.
    """
    return requests.get(
        f"{GEOME_API}/projects/{project_id}/config",
        timeout=TIMEOUT,
    ).json()


def processProject(project_id, project_title):
    """Download one project's configuration and extract its required columns.

    Returns the ``(rows, definitions)`` pair produced by
    ``processGeomeConfig`` for the project's configuration document.
    """
    rows, column_defs = processGeomeConfig(
        getProjectConfig(project_id),
        proj_id=project_id,
        title=project_title,
    )
    return rows, column_defs

def processProjects(max_workers=20):
    """Gather required columns and definitions for every listed GEOME project.

    Fetches the project listing from ``{GEOME_API}/projects/`` and runs
    ``processProject`` for each entry on a thread pool.

    Args:
        max_workers: Size of the thread pool used to fetch project configs.

    Returns:
        A ``(res, definitions)`` tuple: ``res`` is the concatenation of the
        rows of every project, in listing order, and ``definitions`` is the
        merged column-definition dict. When several projects define the same
        column, the project that comes last in the listing wins.

    Raises:
        Any exception raised while processing a project propagates to the
        caller when that project's result is collected.
    """
    L = logging.getLogger("loader")
    url = f"{GEOME_API}/projects/"
    params = {"includePublic": "true", "admin": "false"}
    response = requests.get(url, params=params, timeout=TIMEOUT)
    projects = response.json()
    res = []
    definitions = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for project in projects:
            proj_id = project.get("projectId")
            title = project.get("projectTitle")
            L.info("Loading: %s - %s", proj_id, title)
            futures.append(executor.submit(processProject, proj_id, title))
        # Collect in submission order rather than completion order: the
        # requests still run concurrently, but the row order and the winner
        # among duplicate definitions no longer depend on thread timing.
        for future in futures:
            reqs, defns = future.result()
            res.extend(reqs)
            definitions.update(defns)
    return res, definitions


# Start from the network-wide configuration. processGeomeConfig is called
# without a project id, so these rows carry its default proj_id of 0.
response = requests.get(f"{GEOME_API}/network/1/config", timeout=TIMEOUT)
# Fail loudly on an HTTP error instead of parsing an error payload as a config.
response.raise_for_status()
network_config = response.json()
required, definitions = processGeomeConfig(network_config)

# Append every project's required columns. For a column name present in both,
# the project-level definition replaces the network-level one.
preq, pdefns = processProjects()
required += preq
definitions.update(pdefns)

INFO:loader:Loading: 114 - Collins Lab
INFO:loader:Loading: 125 - European ARMS project
INFO:loader:Loading: 131 - Amalda_mitochondria_genomes
INFO:loader:Loading: 147 - Pillar Point - CALeDNA
INFO:loader:Loading: 355 - NMNH TEST PROJECT
INFO:loader:Loading: 183 - DiSSCo
INFO:loader:Loading: 41 - Zostera capensis pooled RADseq
INFO:loader:Loading: 332 - HIV PROJECT
INFO:loader:Loading: 90 - Bionetwork
INFO:loader:Loading: 61 - Smithsonian DROP Project
INFO:loader:Loading: 67 - Cairns LAB
INFO:loader:Loading: 83 - Windsor LAB
INFO:loader:Loading: 376 - Early presence of Batrachochytrium dendrobatidis in Mexico with a contemporary dominance of the global panzootic lineage
INFO:loader:Loading: 303 - Tata PowerPoint
INFO:loader:Loading: 168 - Bighorn_RAD_Parentage_2018
INFO:loader:Loading: 158 - Candida ITS
INFO:loader:Loading: 181 - Mexican_gray_wolf_RAD_Parentage_2018
INFO:loader:Loading: 89 - Mekong River
INFO:loader:Loading: 118 - Crandall Lab
INFO:loader:Loading: 13 - Genetic diversit

In [2]:
# One row per (entity alias, required column, project id) tuple collected above.
df = pd.DataFrame.from_records(
    data=required,
    columns=["record_alias", "column_name", "project_id"],
)
df

Unnamed: 0,record_alias,column_name,project_id
0,Event,country,0
1,Event,yearCollected,0
2,Event,locality,0
3,Sample,eventID,0
4,Diagnostics,materialSampleID,0
...,...,...,...
1863,Diagnostics,materialSampleID,292
1864,Diagnostics,diagnosticID,292
1865,Diagnostics,diseaseDetected,292
1866,Diagnostics,fatal,292


In [3]:
# Row count for each (entity alias, column name) pair.
dfg = df.groupby(by=["record_alias", "column_name"]).count()

# Show floating-point cells without decimals, using thousands separators.
pd.options.display.float_format = "{:,.0f}".format

# Rows are column names, columns are entity aliases, and each cell holds the
# number of df rows for that pair (0 where the pair never occurs).
p = df.pivot_table(
    values="project_id",
    index="column_name",
    columns="record_alias",
    aggfunc=lambda ids: len(ids),
).fillna(0)
p

record_alias,Diagnostics,Event,Event_Photo,Sample,Sample_Photo,Tissue,fastaSequence,fastqMetadata
column_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
basisOfRecord,0,0,0,26,0,0,0,0
collectorList,0,23,0,0,0,0,0,0
coordinateUncertaintyInMeters,0,1,0,0,0,0,0,0
country,0,84,0,0,0,0,0,0
decimalLatitude,0,60,0,0,0,0,0,0
decimalLongitude,0,60,0,0,0,0,0,0
designDescription,0,0,0,0,0,0,0,57
diagnosticID,28,0,0,0,0,0,0,0
diseaseDetected,23,0,0,0,0,0,0,0
diseaseTested,23,0,0,0,0,0,0,0


In [4]:
import textwrap

# Print one labelled block per required column. Wrapped continuation lines
# are indented 14 spaces so they line up under the text after "  definition: ".
for column_name, info in definitions.items():
    wrapped_definition = textwrap.fill(
        info.get('definition') or '',
        70,
        subsequent_indent='              ',
    )
    print(f"{column_name}")
    print(f"       group: {info.get('group') or ''}")
    print(f"   definedBy: {info.get('definedBy') or ''}")
    print(f"  definition: {wrapped_definition}")
    print("")

country
       group: 
   definedBy: http://rs.tdwg.org/dwc/terms/country
  definition: The name of the country or major administrative unit in which the
              Location occurs.

locality
       group: 
   definedBy: http://rs.tdwg.org/dwc/terms/locality
  definition: The specific description of the place. Less specific geographic
              information can be provided in other geographic terms
              (higherGeography, continent, country, stateProvince,
              county, municipality, waterBody, island, islandGroup).
              This term may contain information modified from the
              original to correct perceived errors or standardize the
              description.

yearCollected
       group: 
   definedBy: urn:yearCollected
  definition: The year the collecting event took place

eventID
       group: Record and Owner Details
   definedBy: http://rs.tdwg.org/dwc/terms/eventID
  definition: The collector's event identifier.

diagnosticID
       group: 
