In [1]:
# This helper fetches the resolver dataset from the EBI identifiers.org platform, transforms it and loads it into
# the new MongoDB-based data backend.
# Author: Manuel Bernal Llinares <mbdebian@gmail.com>
import os
import random
import threading
import time
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import pymongo
import requests

import data.models.hq.registry as hqregistry

%matplotlib inline

In [2]:
# Set up the environment
identifiersorg_url_resolver_data_source = 'https://identifiers.org/rest/collections/expand'
# MongoDB connection settings. Each value can be overridden through the environment variable named in
# its lookup; the fallbacks are the local development values, so the notebook still runs unconfigured.
mongodb_username = os.environ.get('MONGODB_USERNAME', 'devusername')
mongodb_password = os.environ.get('MONGODB_PASSWORD', 'devpassword')
mongodb_database = os.environ.get('MONGODB_DATABASE', 'devdbregistry')
mongodb_endpoint = os.environ.get('MONGODB_ENDPOINT', 'mongodb://localhost:27017')
# The authenticated URI is derived from mongodb_endpoint so both always point at the same server.
# rpartition() yields an empty scheme when the endpoint has no '://', in which case 'mongodb' is assumed.
_mongodb_scheme, _, _mongodb_hosts = mongodb_endpoint.rpartition('://')
# Credentials are percent-escaped: reserved characters such as ':', '/' or '@' would otherwise corrupt the URI.
mongodb_authendpoint = "{}://{}:{}@{}".format(
    _mongodb_scheme or 'mongodb',
    quote_plus(mongodb_username),
    quote_plus(mongodb_password),
    _mongodb_hosts,
)
# Collections
collection_name_institution = 'institution'
collection_name_location = 'location'
collection_name_namespaceSynonym = 'namespace_synonym'
collection_name_resource = 'resource'
collection_name_namespace = 'namespace'

In [3]:
# Helpers
def make_rest_request_content_type_json(url, n_attempts=42, max_wait_on_error=30, max_wait_on_failure=10):
    """
    GET the given URL and return the decoded JSON body, retrying on failure.

    Both raised exceptions and non-OK HTTP responses count as a failed attempt. Between attempts the
    helper sleeps a random whole number of seconds; no sleep happens after the final attempt.

    :param url: URL to request
    :param n_attempts: maximum number of requests to issue, must be at least 1
    :param max_wait_on_error: upper bound, in seconds, of the random wait after a raised exception
    :param max_wait_on_failure: upper bound, in seconds, of the random wait after a non-OK response
    :return: the value produced by 'response.json()' for the first OK response
    :raises ValueError: if 'n_attempts' is lower than 1
    :raises Exception: whatever the final attempt raised, or the error produced by
        'response.raise_for_status()' when the final attempt got a non-OK response
    """
    if n_attempts < 1:
        raise ValueError("n_attempts must be at least 1")
    response = None
    last_error = None
    for attempt in range(1, n_attempts + 1):
        # Forget the previous attempt's response so that a stale one is never reported at the end
        response = None
        try:
            response = requests.get(url, headers={"Content-Type": "application/json"})
        except Exception as e:
            # Any possible exception counts towards the attempt counter
            last_error = e
            max_wait = max_wait_on_error
        else:
            if response.ok:
                return response.json()
            max_wait = max_wait_on_failure
        if attempt < n_attempts:
            # randint needs both bounds; the wait is a whole number of seconds in [0, max_wait]
            time.sleep(random.randint(0, max_wait))
    if response is None:
        # The final attempt never produced a response, surface the error that prevented it
        raise last_error
    response.raise_for_status()

In [4]:
# Seed the pseudo-random number generator with the current wall-clock time
random.seed(a=time.time())

In [5]:
# Open the authenticated MongoDB client, report the visible databases and select the registry database
dbclient = pymongo.MongoClient(host=mongodb_authendpoint)
print("---> DB Client instantiated")
print("\tDatabases: {}".format(dbclient.list_database_names()))
dbregistry = dbclient.get_database(mongodb_database)

---> DB Client instantiated
	Databases: ['admin', 'config', 'devdbregistry', 'local']


In [6]:
# Download the resolution source dataset (decoded JSON) from identifiers.org
resolution_dataset_source = make_rest_request_content_type_json(
    url=identifiersorg_url_resolver_data_source
)

In [12]:
# Process PID entries
# Every entry of the resolution dataset is translated into a Namespace document, using its position in the
# dataset as '_id', and stored in the namespace collection.
mongodb_collection = dbregistry[collection_name_namespace]
for index, pid_entry in enumerate(resolution_dataset_source):
    # Read the fields once with .get() so an entry lacking 'id' or 'prefix' is logged instead of raising KeyError
    prefix = pid_entry.get('prefix', None)
    mir_id = pid_entry.get('id', None)
    print("---> [{}] Processing Prefix '{}' - '{}'".format(index, mir_id, prefix))
    # Translate PID Entry to Namespace model
    namespace = hqregistry.Namespace()
    namespace._id = index
    namespace.prefix = prefix
    namespace.mirId = mir_id
    namespace.name = pid_entry.get('name', None)
    namespace.pattern = pid_entry.get('pattern', None)
    namespace.description = pid_entry.get('definition', None)
    # TODO creation and modification dates are missing from the dataset
    # There is no deprecation date for now
    # Flag all namespaces as 'NOT Deprecated' for now
    namespace.deprecated = False
    # TODO Register Institutions
    # TODO Register Resources
    # Upsert by '_id' rather than insert: '_id' is the dataset index, so a plain insert fails with a duplicate
    # key error as soon as this cell is run a second time against the same database
    mongodb_collection.replace_one({'_id': namespace._id}, vars(namespace), upsert=True)
    print("\tNamespace inserted, ({}, {}, {})".format(namespace._id, namespace.mirId, namespace.prefix))

---> [0] Processing Prefix 'MIR:00000002' - 'chebi'
	Inserting namespace record: '{'_id': 0, 'prefix': 'chebi', 'mirId': 'MIR:00000002', 'name': 'ChEBI', 'pattern': '^CHEBI:\\d+$', 'description': "Chemical Entities of Biological Interest (ChEBI) is a freely available dictionary of molecular entities focused on 'small' chemical compounds.", 'created': None, 'modified': None, 'deprecated': False, 'deprecationDate': None, 'resources': [], 'namespaceSynonmyns': []}'
	Namespace inserted, (0, MIR:00000002, chebi)
---> [1] Processing Prefix 'MIR:00000003' - 'ensembl'
	Inserting namespace record: '{'_id': 1, 'prefix': 'ensembl', 'mirId': 'MIR:00000003', 'name': 'Ensembl', 'pattern': '^((ENS[FPTG]\\\\d{11}(\\\\.\\\\d+)?)|(FB\\\\w{2}\\\\d{7})|(Y[A-Z]{2}\\\\d{3}[a-zA-Z](\\\\-[A-Z])?)|([A-Z_a-z0-9]+(\\\\.)?(t)?(\\\\d+)?([a-z])?))$', 'description': 'Ensembl is a joint project between EMBL - EBI and the Sanger Institute  to develop a software system which produces and maintains automatic annotation 

	Namespace inserted, (189, MIR:00000195, cazy)
---> [190] Processing Prefix 'MIR:00000196' - 'goa'
	Inserting namespace record: '{'_id': 190, 'prefix': 'goa', 'mirId': 'MIR:00000196', 'name': 'GOA', 'pattern': '^(([A-N,R-Z][0-9][A-Z][A-Z, 0-9][A-Z, 0-9][0-9])|([O,P,Q][0-9][A-Z, 0-9][A-Z, 0-9][A-Z, 0-9][0-9]))|(URS[0-9A-F]{10}(_[0-9]+){0,1})|(EBI-[0-9]+)$', 'description': 'The GOA (Gene Ontology Annotation) project provides high-quality Gene Ontology (GO) annotations to proteins in the UniProt Knowledgebase (UniProtKB) and International Protein Index (IPI). This involves electronic annotation and the integration of high-quality manual GO annotation from all GO Consortium model organism groups and specialist groups.', 'created': None, 'modified': None, 'deprecated': False, 'deprecationDate': None, 'resources': [], 'namespaceSynonmyns': []}'
	Namespace inserted, (190, MIR:00000196, goa)
---> [191] Processing Prefix 'MIR:00000197' - 'paleodb'
	Inserting namespace record: '{'_id': 191, 'pre

	Namespace inserted, (403, MIR:00000418, bgee.gene)
---> [404] Processing Prefix 'MIR:00000419' - 'bgee.stage'
	Inserting namespace record: '{'_id': 404, 'prefix': 'bgee.stage', 'mirId': 'MIR:00000419', 'name': 'Bgee stage', 'pattern': '^(FBvd|XtroDO|HsapDO|MmusDO)\\:\\d+$', 'description': 'Bgee is a database of gene expression patterns within particular anatomical structures within a species, and between different animal species. This collection refers to developmental stages.', 'created': None, 'modified': None, 'deprecated': False, 'deprecationDate': None, 'resources': [], 'namespaceSynonmyns': []}'
	Namespace inserted, (404, MIR:00000419, bgee.stage)
---> [405] Processing Prefix 'MIR:00000420' - 'bgee.organ'
	Inserting namespace record: '{'_id': 405, 'prefix': 'bgee.organ', 'mirId': 'MIR:00000420', 'name': 'Bgee organ', 'pattern': '^(XAO|ZFA|EHDAA|EMAPA|EV|MA)\\:\\d+$', 'description': 'Bgee is a database of gene expression patterns within particular anatomical structures within a s

	Namespace inserted, (629, MIR:00000647, mw.project)
---> [630] Processing Prefix 'MIR:00000648' - 'mex'
	Inserting namespace record: '{'_id': 630, 'prefix': 'mex', 'mirId': 'MIR:00000648', 'name': 'Metabolome Express', 'pattern': '^\\d+$', 'description': 'A public place to process, interpret and share GC/MS metabolomics datasets.', 'created': None, 'modified': None, 'deprecated': False, 'deprecationDate': None, 'resources': [], 'namespaceSynonmyns': []}'
	Namespace inserted, (630, MIR:00000648, mex)
---> [631] Processing Prefix 'MIR:00000649' - 'gpmdb'
	Inserting namespace record: '{'_id': 631, 'prefix': 'gpmdb', 'mirId': 'MIR:00000649', 'name': 'GPMDB', 'pattern': '^GPM\\d+$', 'description': 'The Global Proteome Machine Database was constructed to utilize the information obtained by GPM servers to aid in the difficult process of validating peptide MS/MS spectra as well as protein coverage patterns.', 'created': None, 'modified': None, 'deprecated': False, 'deprecationDate': None, 're