# Identifiers.org resource metadata exploration

## Step 1

Get dataset.

In [1]:
import requests

get_all_dataset_url = "https://registry.api.identifiers.org/restApi/namespaces?size=1000"
dataset_request = requests.get(get_all_dataset_url)
dataset = dataset_request.json()
dataset = dataset["_embedded"]["namespaces"]

print(f"Found {len(dataset)} namespaces")

Found 684 namespaces


## Step 2
Iterate namespaces in dataset and get metadata from them.

In [273]:
import aiohttp
import asyncio

metadata_service_url = 'http://127.0.0.1:8082' # https://metadata.api.identifiers.org/


# Async request
async def fetch(url, session):
    async with session.get(url) as response:
        if response.status == 200:
            return await response.json()
        else:
            return {}

# Async semaphore fetch
metadata_results = {}

empty_metadata_resource_urls = []

async def bound_fetch(sem, url, session, namespace):
    async with sem:
        metadata = await fetch(url, session)
        # print(f"got metadata for {url}")
        
        payload = metadata.get("payload")
        print(f'\nMetadata for {url}: {payload}')
        
        if payload is not None:
            new_metadata_result = payload.get("metadata")
            namespace["metadata"] = new_metadata_result
            if len(payload["metadata"]) == 0:
                empty_metadata_resource_urls.append(url.split('/', 3)[3])
        else:
            namespace["metadata"] = []


async with aiohttp.ClientSession() as session:
    metadata_requests = []
    sem = asyncio.Semaphore(5)
    
    for namespace in dataset:
        # print(f"getting metadata for {namespace['prefix']}")

        if namespace['namespaceEmbeddedInLui']:
            print(f"* [{namespace['prefix']}] has a special LUI.")
            prefix = namespace['pattern'][1:].split(':')[0]
        else:
            prefix = namespace['prefix']

        get_metadata_url = f"{metadata_service_url}/{prefix}:{namespace['sampleId']}"

        task = asyncio.ensure_future(bound_fetch(sem, get_metadata_url, session, namespace))
        metadata_requests.append(task)

    await asyncio.gather(*metadata_requests)


* [chebi] has a special LUI.
* [go] has a special LUI.
* [sbo] has a special LUI.
* [mgi] has a special LUI.
* [eco] has a special LUI.
* [mod] has a special LUI.
* [mge] has a special LUI.
* [fma] has a special LUI.
* [so] has a special LUI.
* [psimi] has a special LUI.
* [cl] has a special LUI.
* [bto] has a special LUI.
* [pato] has a special LUI.
* [uo] has a special LUI.
* [pr] has a special LUI.
* [doid] has a special LUI.
* [cco] has a special LUI.
* [pw] has a special LUI.
* [po] has a special LUI.
* [vario] has a special LUI.
* [ma] has a special LUI.
* [uberon] has a special LUI.
* [gramene.growthstage] has a special LUI.
* [eo] has a special LUI.
* [swisslipid] has a special LUI.
* [hp] has a special LUI.
* [mp] has a special LUI.
* [ms] has a special LUI.
* [envo] has a special LUI.
* [ark] has a special LUI.
* [mir] has a special LUI.
* [mzspec] has a special LUI.
* [nmr] has a special LUI.
* [swh] has a special LUI.
* [foodon] has a special LUI.
* [did] has a special LUI.


Metadata for http://127.0.0.1:8082/kegg.glycan:G00123: {'metadata': []}

Metadata for http://127.0.0.1:8082/wb:WBGene00000001: {'metadata': []}

Metadata for http://127.0.0.1:8082/pfam:PF01234: {'metadata': []}

Metadata for http://127.0.0.1:8082/insdc:X58356: {'metadata': []}

Metadata for http://127.0.0.1:8082/fb:FBgn0011293: {'metadata': []}

Metadata for http://127.0.0.1:8082/pubchem.compound:100101: {'metadata': []}

Metadata for http://127.0.0.1:8082/prosite:PS00001: {'metadata': [{'@context': ['https://schema.org/'], '@type': 'Organization', 'url': 'https://prosite.expasy.org', 'name': 'PROSITE', 'logo': 'https://prosite.expasy.org//images/prosite/prosite.gif', 'memberof': {'@type': 'NGO', 'name': 'SIB Swiss Institute of Bioinformatics', 'url': 'https://sib.swiss/'}, 'parentOrganization': {'@type': 'Organization', 'name': 'ExPASy', 'url': 'https://www.expasy.org/'}}]}

Metadata for http://127.0.0.1:8082/arxiv:0807.4956v1: {'metadata': []}

Metadata for http://127.0.0.1:8082/wor


Metadata for http://127.0.0.1:8082/pharmgkb.pathways:PA146123006: {'metadata': []}

Metadata for http://127.0.0.1:8082/pharmgkb.drug:PA448710: {'metadata': []}

Metadata for http://127.0.0.1:8082/pharmgkb.disease:PA447218: {'metadata': []}

Metadata for http://127.0.0.1:8082/ttd.target:TTDS00056: {'metadata': []}

Metadata for http://127.0.0.1:8082/ttd.drug:DAP000773: {'metadata': []}

Metadata for http://127.0.0.1:8082/lgic:5HT3Arano: {'metadata': []}

Metadata for http://127.0.0.1:8082/neuromorpho:Rosa2: {'metadata': []}

Metadata for http://127.0.0.1:8082/chemidplus:57-27-2: {'metadata': []}

Metadata for http://127.0.0.1:8082/ctd.chemical:D001151: {'metadata': []}

Metadata for http://127.0.0.1:8082/biosystems:001: {'metadata': []}

Metadata for http://127.0.0.1:8082/neurondb:265: {'metadata': []}

Metadata for http://127.0.0.1:8082/ctd.gene:101: {'metadata': []}

Metadata for http://127.0.0.1:8082/ctd.disease:D053716: {'metadata': []}

Metadata for http://127.0.0.1:8082/bionumber


Metadata for http://127.0.0.1:8082/cath.domain:1cukA01: {'metadata': [{'@context': 'http://schema.org', '@type': 'WebSite', 'url': 'http://www.cathdb.info//version/v4_2_0/domain/1cukA01', 'mainEntityOfPage': {'@type': ['DataRecord'], 'identifier': 'cath.domain:1cukA01', 'isPartOf': {'@type': 'Dataset', '@id': 'CATH release v4_2_0'}, 'mainEntity': {'@type': ['StructuredValue', 'BioChemEntity'], 'additionalType': 'http://semanticscience.org/resource/SIO_001379', 'identifier': 'cathdomain:1cukA01', 'name': 'CATH Domain - 1cukA01 (PDB: 1cuk, chain A, domain domain.domain_number', 'isContainedIn': [{'@type': ['StructuredValue', 'BioChemEntity'], 'additionalType': 'http://semanticscience.org/resource/SIO_001379', 'additionalProperty': {'@type': 'BioChemEntity', '@id': 'http://identifiers.org/cath.superfamily:1.10.8.10', 'identifier': '1.10.8.10', 'name': '"winged helix" repressor DNA binding domain'}}, {'@type': 'DataRecord', 'identifier': '1cuk', 'isPartOf': {'@type': 'Dataset', '@id': 'ht


Metadata for http://127.0.0.1:8082/jcsd:J55.713G: {'metadata': []}

Metadata for http://127.0.0.1:8082/pharmgkb.gene:PA131: {'metadata': []}

Metadata for http://127.0.0.1:8082/scretf:RSC3: {'metadata': []}

Metadata for http://127.0.0.1:8082/insdc.sra:SRX000007: {'metadata': []}

Metadata for http://127.0.0.1:8082/PW:0000208: {'metadata': []}

Metadata for http://127.0.0.1:8082/napp:351: {'metadata': []}

Metadata for http://127.0.0.1:8082/mirnest:MNEST029358: {'metadata': []}

Metadata for http://127.0.0.1:8082/noncodev3:377550: {'metadata': []}

Metadata for http://127.0.0.1:8082/virsirna:virsi1909: {'metadata': []}

Metadata for http://127.0.0.1:8082/elm:CLV_MEL_PAP_1: {'metadata': []}

Metadata for http://127.0.0.1:8082/mimodb:1: {'metadata': []}

Metadata for http://127.0.0.1:8082/bykdb:A0A009E7X8: {'metadata': []}

Metadata for http://127.0.0.1:8082/sitex:1000: {'metadata': []}

Metadata for http://127.0.0.1:8082/topfind:Q9UKQ2: {'metadata': []}

Metadata for http://127.0.0.1:8


Metadata for http://127.0.0.1:8082/opm:1h68: {'metadata': []}

Metadata for http://127.0.0.1:8082/allergome:1948: {'metadata': []}

Metadata for http://127.0.0.1:8082/pombase:SPCC13B11.01: {'metadata': []}

Metadata for http://127.0.0.1:8082/hpa:ENSG00000026508: {'metadata': []}

Metadata for http://127.0.0.1:8082/jaxmice:005012: {'metadata': []}

Metadata for http://127.0.0.1:8082/ubio.namebank:2555646: {'metadata': []}

Metadata for http://127.0.0.1:8082/yetfasco:YOR172W_571.0: {'metadata': []}

Metadata for http://127.0.0.1:8082/tarbase:hsa-let-7a-2-3p: {'metadata': []}

Metadata for http://127.0.0.1:8082/charprot:CH_001923: {'metadata': []}

Metadata for http://127.0.0.1:8082/oma.protein:HUMAN16963: {'metadata': []}

Metadata for http://127.0.0.1:8082/genpept:CAA71118.1: {'metadata': []}

Metadata for http://127.0.0.1:8082/oma.grp:LCSCCPN: {'metadata': []}

Metadata for http://127.0.0.1:8082/unigene:4900: {'metadata': []}

Metadata for http://127.0.0.1:8082/ncbiprotein:CAA71118.1:


Metadata for http://127.0.0.1:8082/metabolights:MTBLS1: {'metadata': []}

Metadata for http://127.0.0.1:8082/orcid:0000-0002-5355-2576: {'metadata': []}

Metadata for http://127.0.0.1:8082/nbn:urn:nbn:fi:tkk-004781: {'metadata': []}

Metadata for http://127.0.0.1:8082/phosphopoint.kinase:AURKA: {'metadata': []}

Metadata for http://127.0.0.1:8082/uniprot.isoform:Q5BJF6-3: {'metadata': []}

Metadata for http://127.0.0.1:8082/phosphopoint.protein:AURKA: {'metadata': []}

Metadata for http://127.0.0.1:8082/inchikey:RYYVLZVUVIJVGH-UHFFFAOYSA-N: {'metadata': []}

Metadata for http://127.0.0.1:8082/inchi:InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3: {'metadata': []}

Metadata for http://127.0.0.1:8082/wikipedia.en:SM_UB-81: {'metadata': []}

Metadata for http://127.0.0.1:8082/cldb:cl3603: {'metadata': []}

Metadata for http://127.0.0.1:8082/kegg.environ:ev:E00032: {'metadata': []}

Metadata for http://127.0.0.1:8082/aphidbase.transcript:ACYPI000159: {'metadata': []}

Metadata for http://127.0.0.1:8082

Metadata for http://127.0.0.1:8082/noncodev4.rna:NONHSAT000001: {'metadata': []}

Metadata for http://127.0.0.1:8082/vfb:00000001: {'metadata': []}

Metadata for http://127.0.0.1:8082/noncodev4.gene:NONHSAG00001: {'metadata': []}

Metadata for http://127.0.0.1:8082/oryzabase.gene:117: {'metadata': []}

Metadata for http://127.0.0.1:8082/oryzabase.mutant:21393: {'metadata': []}

Metadata for http://127.0.0.1:8082/otl:AADB12: {'metadata': []}

Metadata for http://127.0.0.1:8082/oryzabase.strain:1: {'metadata': []}

Metadata for http://127.0.0.1:8082/oryzabase.stage:34: {'metadata': []}

Metadata for http://127.0.0.1:8082/genewiki:1017: {'metadata': [{'@context': 'https://schema.org', '@type': 'Article', 'name': 'Cyclin-dependent kinase 2', 'url': 'https://en.wikipedia.org/wiki/Cyclin-dependent_kinase_2', 'sameAs': 'http://www.wikidata.org/entity/Q14911732', 'mainEntity': 'http://www.wikidata.org/entity/Q14911732', 'author': {'@type': 'Organization', 'name': 'Contributors to Wikimedia pro


Metadata for http://127.0.0.1:8082/apid.interactions:P01116: {'metadata': []}

Metadata for http://127.0.0.1:8082/probonto:PROB_c0000005: {'metadata': []}

Metadata for http://127.0.0.1:8082/storedb:STOREDB:STUDY1040: {'metadata': []}

Metadata for http://127.0.0.1:8082/kegg:hsa00190: {'metadata': []}

Metadata for http://127.0.0.1:8082/MP:0005452: {'metadata': []}

Metadata for http://127.0.0.1:8082/planttfdb:Ath_AT1G01030.1: {'metadata': []}

Metadata for http://127.0.0.1:8082/multicellds.collection:MCDS_C_0000000001: {'metadata': []}

Metadata for http://127.0.0.1:8082/multicellds.cell_line:MCDS_L_0000000001: {'metadata': []}

Metadata for http://127.0.0.1:8082/multicellds.snapshot:MCDS_S_0000000001: {'metadata': []}

Metadata for http://127.0.0.1:8082/ecyano.entity:23: {'metadata': []}

Metadata for http://127.0.0.1:8082/lincs.data:LDS-1110: {'metadata': []}

Metadata for http://127.0.0.1:8082/ecyano.model:26: {'metadata': []}

Metadata for http://127.0.0.1:8082/ecyano.rule:56: {'


Metadata for http://127.0.0.1:8082/FOODON:03307879: {'metadata': []}

Metadata for http://127.0.0.1:8082/ricenetdb.gene:LOC_Os01g49190.1: {'metadata': []}

Metadata for http://127.0.0.1:8082/ricenetdb.protein:LOC_Os01g49190: {'metadata': []}

Metadata for http://127.0.0.1:8082/gtex:BRIP1: {'metadata': []}

Metadata for http://127.0.0.1:8082/ricenetdb.compound:OSC1416: {'metadata': []}

Metadata for http://127.0.0.1:8082/ricenetdb.reaction:OSR0818: {'metadata': []}

Metadata for http://127.0.0.1:8082/ricenetdb.mirna:osa-miR446: {'metadata': []}


Metadata for http://127.0.0.1:8082/asin:0471491039: {'metadata': []}

Metadata for http://127.0.0.1:8082/oclc:634515043: {'metadata': []}

Metadata for http://127.0.0.1:8082/ga4ghdos:dg.4503/01b048d0-e128-4cb0-94e9-b2d2cab7563d: {'metadata': []}

Metadata for http://127.0.0.1:8082/oid:2.16.840: {'metadata': []}

Metadata for http://127.0.0.1:8082/flowrepository:FR-FCM-ZYGW: {'metadata': []}



Metadata for http://127.0.0.1:8082/hpscreg:BCRTi001-A: {'metadata': [{'@context': 'http://schema.org', '@type': 'WebSite', 'url': 'http://hpscreg.eu/', 'potentialAction': {'@type': 'SearchAction', 'target': 'http://hpscreg.eu/search?q={search_term_string}', 'query-input': 'required name=search_term_string'}}]}

Metadata for http://127.0.0.1:8082/addgene:50943: {'metadata': []}

Metadata for http://127.0.0.1:8082/gdsc:1242: {'metadata': []}

Metadata for http://127.0.0.1:8082/bacdive:131392: {'metadata': []}

Metadata for http://127.0.0.1:8082/ido:0000504: {'metadata': []}

Metadata for http://127.0.0.1:8082/isni:000000012281955X: {'metadata': []}

Metadata for http://127.0.0.1:8082/lei:HWUPKR0MPOU8FGXBT394: {'metadata': [{'@context': 'http://schema.org', '@type': 'WebSite', 'name': 'Global Legal Entity Identifier Foundation', 'alternateName': 'GLEIF', 'url': 'http://gleif.org', 'potentialAction': {'@type': 'SearchAction', 'target': 'https://www.gleif.org/en/lei/search#query={search_te

In [3]:
print(dataset)



In [204]:
import pandas as pd


dataset_keys_to_keep = ["prefix", "metadata"]
final_dataset = [{ key: namespace[key] for key in dataset_keys_to_keep } for namespace in dataset]

df = pd.DataFrame.from_dict(final_dataset)

bad_metadata_count = df[df.metadata == 'ERROR'].shape[0]
no_metadata_count = df[df.metadata.map(len) == 0].shape[0]
total_metadata = df.shape[0] - bad_metadata_count - no_metadata_count

print(f"Bad metadata: {bad_metadata_count}")
print(f"No metadata: {no_metadata_count}")
print(f"Contains metadata: {total_metadata}")

Bad metadata: 0
No metadata: 651
Contains metadata: 33


In [205]:
namespaces_with_metadata = df[df.metadata.map(len) != 0]
namespaces_with_metadata

Unnamed: 0,prefix,metadata
1,ensembl,"[{'identifier': 'ENSG00000139618', '@context':..."
16,reactome,"[{'@context': 'http://schema.org', '@type': 'W..."
29,prosite,"[{'@context': ['https://schema.org/'], '@type'..."
43,rgd,"[{'@context': 'http://schema.org', '@type': 'D..."
54,biogrid,"[{'@context': 'https://schema.org/', '@type': ..."
80,chembl.compound,"[{'identifier': 'CHEMBL308052', 'inChIKey': 'B..."
158,dbsnp,"[{'@context': 'http://schema.org', '@type': 'D..."
195,disprot,"[{'@context': 'http://schema.org/', '@type': '..."
206,cath.domain,"[{'@context': 'http://schema.org', '@type': 'W..."
232,nextprot,"[{'@context': 'http://schema.org', '@graph': [..."


In [206]:
# Pie chart style
custom_style = Style(
    opacity='0.8',
    opacity_hover='0.5',
    title_font_size=36,
    tooltip_font_size=10,
    inner_radius=0.75,
    plot_background="rgba(249, 249, 249, 1)"
)

In [207]:
import pygal
from pygal.style import Style
from IPython.display import SVG, display

metadata_contents_pie = pygal.Pie(height=400, tooltip_border_radius=1, style=custom_style)

metadata_contents_pie.add("Bad metadata", bad_metadata_count)
metadata_contents_pie.add("No metadata", no_metadata_count)
metadata_contents_pie.add("Contains metadata", total_metadata)

display_interactive_pygal(metadata_contents_pie)


In [208]:
def get_schema_type(metadata):
    if len(metadata) == 0:
        return None

    # flatten lists of lists... :(
    if isinstance(metadata[0], list):
        metadata[0] = metadata[0][0]
    
    result = metadata[0].get("@type", None)
    if isinstance(result, list):
        result = result[0]
        
    return result

df["metadata_type"] = df["metadata"].map(lambda x: get_schema_type(x))

df[df["metadata"].map(len) != 0]



Unnamed: 0,prefix,metadata,metadata_type
1,ensembl,"[{'identifier': 'ENSG00000139618', '@context':...",DataRecord
16,reactome,"[{'@context': 'http://schema.org', '@type': 'W...",WebSite
29,prosite,"[{'@context': ['https://schema.org/'], '@type'...",Organization
43,rgd,"[{'@context': 'http://schema.org', '@type': 'D...",Dataset
54,biogrid,"[{'@context': 'https://schema.org/', '@type': ...",Dataset
80,chembl.compound,"[{'identifier': 'CHEMBL308052', 'inChIKey': 'B...",MolecularEntity
158,dbsnp,"[{'@context': 'http://schema.org', '@type': 'D...",Dataset
195,disprot,"[{'@context': 'http://schema.org/', '@type': '...",DataCatalog
206,cath.domain,"[{'@context': 'http://schema.org', '@type': 'W...",WebSite
232,nextprot,"[{'@context': 'http://schema.org', '@graph': [...",


In [210]:
from IPython.display import HTML
import pygal

def display_interactive_pygal(pygal_chart):
    base_html = """
    <!DOCTYPE html>
    <html>
      <head>
      <script type="text/javascript" src="http://kozea.github.com/pygal.js/javascripts/svg.jquery.js"></script>
      <script type="text/javascript" src="https://kozea.github.io/pygal.js/2.0.x/pygal-tooltips.min.js""></script>
      </head>
      <body>
        <figure>
          {rendered_chart}
        </figure>
      </body>
    </html>
    """

    return HTML(base_html.format(rendered_chart=pygal_chart.render(is_unicode=True)))


In [211]:
metadata_type_counts = df.metadata_type.value_counts()
metadata_type_counts

Dataset            9
Organization       5
DataCatalog        5
WebSite            4
DataRecord         3
CreativeWork       1
Article            1
MolecularEntity    1
Product            1
Experiment         1
Name: metadata_type, dtype: int64

In [213]:
import pygal
from pygal.style import Style
from IPython.display import SVG, display

metadata_type_counts_list = [{"label": x, "value": y} for x, y in metadata_type_counts.items()]
metadata_type_counts_list

metadata_contents_pie_exploded = pygal.Pie(height=400, tooltip_border_radius=1, style=custom_style, print_values=False)

for metadata_type_count in metadata_type_counts_list:
    metadata_contents_pie_exploded.add(metadata_type_count["label"], metadata_type_count["value"])

display_interactive_pygal(metadata_contents_pie_exploded)


In [214]:
def get_metadata_properties(metadata):
    if len(metadata) == 0:
        return [None]

    # flatten lists of lists... :(
    if isinstance(metadata[0], list):
        metadata[0] = metadata[0][0]
    
    result = [x[0] for x in metadata[0].items()]

    return result

df["properties"] = df["metadata"].map(lambda x: get_metadata_properties(x))
df[df["metadata"].map(len) != 0]

Unnamed: 0,prefix,metadata,metadata_type,properties
1,ensembl,"[{'identifier': 'ENSG00000139618', '@context':...",DataRecord,"[identifier, @context, @type, mainEntity, isPa..."
16,reactome,"[{'@context': 'http://schema.org', '@type': 'W...",WebSite,"[@context, @type, url, potentialAction]"
29,prosite,"[{'@context': ['https://schema.org/'], '@type'...",Organization,"[@context, @type, url, name, logo, memberof, p..."
43,rgd,"[{'@context': 'http://schema.org', '@type': 'D...",Dataset,"[@context, @type, name, description, url, keyw..."
54,biogrid,"[{'@context': 'https://schema.org/', '@type': ...",Dataset,"[@context, @type, name, description, url, keyw..."
80,chembl.compound,"[{'identifier': 'CHEMBL308052', 'inChIKey': 'B...",MolecularEntity,"[identifier, inChIKey, @context, biochemicalIn..."
158,dbsnp,"[{'@context': 'http://schema.org', '@type': 'D...",Dataset,"[@context, @type, name, description, version, ..."
195,disprot,"[{'@context': 'http://schema.org/', '@type': '...",DataCatalog,"[@context, @type, @id, name, description, url,..."
206,cath.domain,"[{'@context': 'http://schema.org', '@type': 'W...",WebSite,"[@context, @type, url, mainEntityOfPage]"
232,nextprot,"[{'@context': 'http://schema.org', '@graph': [...",,"[@context, @graph]"


In [216]:
import pandas as pd

def unnest(df, col, reset_index=False):
    col_flat = pd.DataFrame([[i, x] for i, y in df[col].apply(list).iteritems() for x in y], columns=['I', col])
    col_flat = col_flat.set_index('I')
    df = df.drop(col, 1)
    df = df.merge(col_flat, left_index=True, right_index=True)
    if reset_index:
        df = df.reset_index(drop=True)
    return df

unnested_df = unnest(df, "properties")
unnested_df

Unnamed: 0,prefix,metadata,metadata_type,properties
0,chebi,[],,
1,ensembl,"[{'identifier': 'ENSG00000139618', '@context':...",DataRecord,identifier
1,ensembl,"[{'identifier': 'ENSG00000139618', '@context':...",DataRecord,@context
1,ensembl,"[{'identifier': 'ENSG00000139618', '@context':...",DataRecord,@type
1,ensembl,"[{'identifier': 'ENSG00000139618', '@context':...",DataRecord,mainEntity
...,...,...,...,...
679,cameo,[],,
680,nemo,[],,
681,insdc.gca,[],,
682,rrid,[],,


In [249]:
df_grouped_by_metadata_type = [x for _, x in unnested_df.groupby('metadata_type')]

for group_df in df_grouped_by_metadata_type:
    print(f"Properties for schema type {group_df['metadata_type'].values[0]}")
    print(group_df["properties"].value_counts())
    print('\n\n')


Properties for schema type Article
publisher        1
url              1
author           1
headline         1
@context         1
datePublished    1
image            1
sameAs           1
@type            1
dateModified     1
mainEntity       1
name             1
Name: properties, dtype: int64



Properties for schema type CreativeWork
publisher           1
url                 1
@context            1
inLanguage          1
name                1
mainEntityOfPage    1
identifier          1
headline            1
author              1
description         1
datePublished       1
additionalType      1
@type               1
@id                 1
Name: properties, dtype: int64



Properties for schema type DataCatalog
@context              5
dataset               5
name                  5
@type                 5
description           5
license               5
provider              5
url                   5
keywords              5
citation              3
fileFormat            3
dateCreated       

In [258]:
pd.set_option('display.max_rows', None)

print(unnested_df.properties.value_counts())

@context                       33
@type                          31
name                           23
url                            23
description                    17
license                        13
identifier                     10
keywords                       10
datePublished                   8
@id                             7
version                         6
creator                         5
dataset                         5
sourceOrganization              5
dateModified                    5
citation                        5
provider                        5
distribution                    5
includedInDataCatalog           4
dateCreated                     4
mainEntity                      4
sameAs                          4
logo                            4
fileFormat                      3
alternateName                   3
potentialAction                 3
parentOrganization              3
image                           3
memberof                        3
mainEntityOfPa

# Metadata service accuracy

Cannot check. Google pagespeed insights do not show if metadata is valid.

In [275]:
resolver_service_url = "http://resolver.api.identifiers.org"
pagespeed_apikey = "" # input api key for google pagespeed

for namespace in dataset[16:17]:
    if namespace['namespaceEmbeddedInLui']:
        prefix = namespace['pattern'][1:].split(':')[0]
    else:
        prefix = namespace['prefix']

    response = requests.get(f"{resolver_service_url}/{prefix}:{namespace['sampleId']}").json()
    
    namespace_provider_urls = []
    
    if response["payload"] is not None and response["payload"]["resolvedResources"] is not None:
        for resolvedResource in response["payload"]["resolvedResources"]:
            namespace_provider_urls.append(resolvedResource["compactIdentifierResolvedUrl"])
    
    print(namespace_provider_urls)
    
    for url in namespace_provider_urls:
        pagespeed_url = f"https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url={url}&{pagespeed_apikey}"
        



TypeError: string indices must be integers

In [274]:
empty_metadata_resource_urls

['CHEBI:36927',
 'uniprot:P0DP23',
 'ec-code:1.1.1.1',
 'miriam.collection:MIR:00000008',
 'taxonomy:9606',
 'biomodels.db:BIOMD0000000048',
 'intact:EBI-2307691',
 'interpro:IPR000100',
 'kegg.pathway:hsa00620',
 'icd:C34',
 'pirsf:PIRSF000100',
 'pubmed:16333295',
 'mim:603903',
 'kegg.compound:C12345',
 'kegg.reaction:R00100',
 'doi:10.1038/nbt1156',
 'pdb:2gc4',
 'GO:0006915',
 'kegg.drug:D00123',
 'sgd:S000003909',
 'SBO:0000262',
 'kegg.glycan:G00123',
 'wb:WBGene00000001',
 'pfam:PF01234',
 'insdc:X58356',
 'fb:FBgn0011293',
 'pubchem.compound:100101',
 'arxiv:0807.4956v1',
 'wormpep:CE28239',
 'pubchem.substance:100101',
 'sabiork.reaction:75',
 'arrayexpress:E-MEXP-1712',
 'MGI:2442292',
 'refseq:NP_012345',
 'tcdb:5.A.1.1.1',
 'uniparc:UPI000000000A',
 'mint:MINT-10000',
 'signaling-gateway:A001094',
 'dip:DIP-743N',
 'resid:AA0001',
 'tair.protein:AASequence:1009107926',
 'tair.locus:2200950',
 'tair.gene:Gene:2200934',
 'peptideatlas:PAp00000009',
 'lipidmaps:LMPR0102010012

In [286]:
namespace_provider_urls = []

for ci in empty_metadata_resource_urls:
    response = requests.get(f"{resolver_service_url}/{ci}").json()

    if response["payload"] is not None and response["payload"]["resolvedResources"] is not None:
        for resolvedResource in response["payload"]["resolvedResources"]:
            namespace_provider_urls.append(resolvedResource["compactIdentifierResolvedUrl"])


In [288]:
import urllib.parse

print("\n".join([urllib.parse.quote(x, safe="") for x in namespace_provider_urls]))

https%3A%2F%2Fwww.ebi.ac.uk%2Fchebi%2FsearchId.do%3FchebiId%3DCHEBI%3A36927
https%3A%2F%2Fwww.ebi.ac.uk%2Fols%2Fontologies%2Fchebi%2Fterms%3Fobo_id%3DCHEBI%3A36927
http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FCHEBI%2FCHEBI%3A36927
https%3A%2F%2Fpurl.uniprot.org%2Funiprot%2FP0DP23
https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2Fprotein%2FP0DP23
https%3A%2F%2Fwww.ebi.ac.uk%2Fintenz%2Fquery%3Fcmd%3DSearchEC%26ec%3D1.1.1.1
https%3A%2F%2Fwww.genome.jp%2Fdbget-bin%2Fwww_bget%3Fec%3A1.1.1.1
http%3A%2F%2Fwww.enzyme-database.org%2Fquery.php%3Fec%3D1.1.1.1
https%3A%2F%2Fwww.ebi.ac.uk%2Fenzymeportal%2Fsearch%2Fec%2F1.1.1.1
https%3A%2F%2Fenzyme.expasy.org%2FEC%2F1.1.1.1
https%3A%2F%2Fwww.ebi.ac.uk%2Fmiriam%2Fmain%2FMIR%3A00000008
https%3A%2F%2Fwww.ncbi.nlm.nih.gov%2FTaxonomy%2FBrowser%2Fwwwtax.cgi%3Fmode%3DInfo%26id%3D9606
https%3A%2F%2Fpurl.uniprot.org%2Ftaxonomy%2F9606
https%3A%2F%2Fwww.ebi.ac.uk%2Fena%2Fdata%2Fview%2FTaxon%3A9606
http%3A%2F%2Fpurl.bioontology.org%2Fontology%2FNCBITAXON%2F9606
http%3A%2F