In [1]:
# This helper fetches the resolver dataset from the EBI identifiers.org platform, transforms it and loads it into
# the new MongoDB-based data backend.
# Author: Manuel Bernal Llinares <mbdebian@gmail.com>
import os
import random
import threading
import time
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import pymongo
import requests

import data.models.hq.registry as hqregistry

%matplotlib inline

In [2]:
# Set up the environment
identifiersorg_url_resolver_data_source = 'https://identifiers.org/rest/collections/expand'
# MongoDB connection settings. Each value can be overridden through an environment variable, so real credentials
# never have to be written into the notebook; the fallbacks below are the local development values.
mongodb_username = os.environ.get('MONGODB_USERNAME', 'devusername')
mongodb_password = os.environ.get('MONGODB_PASSWORD', 'devpassword')
mongodb_database = os.environ.get('MONGODB_DATABASE', 'devdbregistry')
# Single definition of 'host:port', shared by both endpoints below
mongodb_host = os.environ.get('MONGODB_HOST', 'localhost:27017')
mongodb_endpoint = 'mongodb://{}'.format(mongodb_host)
# Username and password are percent-encoded: characters such as '@', ':' or '/' would otherwise corrupt the URI
mongodb_authendpoint = "mongodb://{}:{}@{}".format(quote_plus(mongodb_username),
                                                   quote_plus(mongodb_password),
                                                   mongodb_host)
# Collections
collection_name_institution = 'institution'
collection_name_location = 'location'
collection_name_namespaceSynonym = 'namespace_synonym'
collection_name_resource = 'resource'
collection_name_namespace = 'namespace'

In [3]:
# Helpers
def make_rest_request_content_type_json(url, max_attempts=42, max_wait_on_error=30, max_wait_on_failure=10):
    """
    Issue a GET request for JSON content, retrying with a random back-off until it succeeds.

    :param url: URL to send the GET request to
    :param max_attempts: maximum number of requests to issue before giving up (must be at least 1)
    :param max_wait_on_error: upper bound, in whole seconds, of the random wait after an attempt raised an exception
    :param max_wait_on_failure: upper bound, in whole seconds, of the random wait after a non-OK HTTP response
    :return: the decoded JSON body of the first successful response
    :raises ValueError: if 'max_attempts' is lower than 1
    :raises Exception: when all attempts are exhausted, either the exception raised by the last attempt or,
        if the last attempt produced a non-OK response, whatever 'response.raise_for_status()' raises for it
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1, got {}".format(max_attempts))
    response = None
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers={"Content-Type": "application/json"})
        except Exception as e:
            # Any possible exception counts towards the attempt counter
            last_error = e
            response = None
            max_wait = max_wait_on_error
        else:
            if response.ok:
                return response.json()
            last_error = None
            max_wait = max_wait_on_failure
        # Random wait before the next attempt; there is nothing to wait for once the attempts are exhausted
        if attempt < max_attempts:
            # random.randint() needs both bounds, and they are inclusive
            time.sleep(random.randint(0, max_wait))
    if response is None:
        # The last attempt never produced a response, so surface the error that prevented it
        raise last_error
    response.raise_for_status()

In [4]:
# Seed the pseudo-random number generator with the current time
random.seed(a=time.time())

In [5]:
# Connect to MongoDB through the credential-bearing endpoint, report the databases visible to this client
# and keep a handle on the registry database for the cells below
dbclient = pymongo.MongoClient(mongodb_authendpoint)
print("---> DB Client instantiated")
available_databases = dbclient.list_database_names()
print("\tDatabases: {}".format(available_databases))
dbregistry = dbclient[mongodb_database]

---> DB Client instantiated
	Databases: ['admin', 'config', 'local']


In [6]:
# Download the resolution source dataset from the resolver endpoint; the decoded JSON payload is kept
# for the processing cells below
resolution_dataset_source = make_rest_request_content_type_json(
    url=identifiersorg_url_resolver_data_source
)

In [8]:
# Process PID entries: translate every entry of the resolution dataset into a Namespace model and store it
# NOTE(review): each run inserts one new document per entry, so re-running this cell presumably duplicates the
# namespaces unless the collection enforces a unique index - confirm before re-running
mongodb_collection = dbregistry[collection_name_namespace]
for pid_entry in resolution_dataset_source:
    # Read the identifying fields once, tolerating missing keys exactly like the rest of the translation below
    mir_id = pid_entry.get('id', None)
    prefix = pid_entry.get('prefix', None)
    print("---> Processing Prefix '{}' - '{}'".format(mir_id, prefix))
    # Translate PID Entry to Namespace model
    namespace = hqregistry.Namespace()
    namespace.prefix = prefix
    namespace.mirId = mir_id
    namespace.name = pid_entry.get('name', None)
    namespace.pattern = pid_entry.get('pattern', None)
    namespace.description = pid_entry.get('definition', None)
    # TODO creation and modification dates are missing from the dataset
    # There is no deprecation date for now
    # Flag all namespaces as 'NOT Deprecated' for now
    namespace.deprecated = False
    # insert_one() adds the generated '_id' to the dict it receives; pass a copy so the model instance
    # does not silently grow an '_id' attribute
    inserted_object = mongodb_collection.insert_one(dict(vars(namespace)))
    print("\tNamespace inserted, ({}, {}, {})".format(inserted_object.inserted_id, namespace.mirId, namespace.prefix))

---> Processing Prefix 'MIR:00000002' - 'chebi'
	Namespace inserted, (5bc0b686bc79c2dd14c59223, MIR:00000002, chebi)
---> Processing Prefix 'MIR:00000003' - 'ensembl'
	Namespace inserted, (5bc0b686bc79c2dd14c59224, MIR:00000003, ensembl)
---> Processing Prefix 'MIR:00000004' - 'ec-code'
	Namespace inserted, (5bc0b686bc79c2dd14c59225, MIR:00000004, ec-code)
---> Processing Prefix 'MIR:00000005' - 'uniprot'
	Namespace inserted, (5bc0b686bc79c2dd14c59226, MIR:00000005, uniprot)
---> Processing Prefix 'MIR:00000006' - 'taxonomy'
	Namespace inserted, (5bc0b686bc79c2dd14c59227, MIR:00000006, taxonomy)
---> Processing Prefix 'MIR:00000007' - 'biomodels.db'
	Namespace inserted, (5bc0b686bc79c2dd14c59228, MIR:00000007, biomodels.db)
---> Processing Prefix 'MIR:00000008' - 'miriam.collection'
	Namespace inserted, (5bc0b686bc79c2dd14c59229, MIR:00000008, miriam.collection)
---> Processing Prefix 'MIR:00000009' - 'icd'
	Namespace inserted, (5bc0b686bc79c2dd14c5922a, MIR:00000009, icd)
---> Process

	Namespace inserted, (5bc0b686bc79c2dd14c59308, MIR:00000235, mirbase.mature)
---> Processing Prefix 'MIR:00000236' - 'nextprot'
	Namespace inserted, (5bc0b686bc79c2dd14c59309, MIR:00000236, nextprot)
---> Processing Prefix 'MIR:00000237' - 'cas'
	Namespace inserted, (5bc0b686bc79c2dd14c5930a, MIR:00000237, cas)
---> Processing Prefix 'MIR:00000238' - 'kegg.genome'
	Namespace inserted, (5bc0b686bc79c2dd14c5930b, MIR:00000238, kegg.genome)
---> Processing Prefix 'MIR:00000239' - 'kegg.metagenome'
	Namespace inserted, (5bc0b686bc79c2dd14c5930c, MIR:00000239, kegg.metagenome)
---> Processing Prefix 'MIR:00000240' - 'narcis'
	Namespace inserted, (5bc0b686bc79c2dd14c5930d, MIR:00000240, narcis)
---> Processing Prefix 'MIR:00000241' - 'jcsd'
	Namespace inserted, (5bc0b686bc79c2dd14c5930e, MIR:00000241, jcsd)
---> Processing Prefix 'MIR:00000242' - 'pw'
	Namespace inserted, (5bc0b686bc79c2dd14c5930f, MIR:00000242, pw)
---> Processing Prefix 'MIR:00000243' - 'insdc.sra'
	Namespace inserted, (5

	Namespace inserted, (5bc0b686bc79c2dd14c59421, MIR:00000528, drugbankv4.target)
---> Processing Prefix 'MIR:00000529' - 'zinc'
	Namespace inserted, (5bc0b686bc79c2dd14c59422, MIR:00000529, zinc)
---> Processing Prefix 'MIR:00000530' - 'foodb.compound'
	Namespace inserted, (5bc0b686bc79c2dd14c59423, MIR:00000530, foodb.compound)
---> Processing Prefix 'MIR:00000531' - 'unii'
	Namespace inserted, (5bc0b686bc79c2dd14c59424, MIR:00000531, unii)
---> Processing Prefix 'MIR:00000532' - 'orphanet.ordo'
	Namespace inserted, (5bc0b686bc79c2dd14c59425, MIR:00000532, orphanet.ordo)
---> Processing Prefix 'MIR:00000533' - 'psipar'
	Namespace inserted, (5bc0b686bc79c2dd14c59426, MIR:00000533, psipar)
---> Processing Prefix 'MIR:00000534' - 'clinvar.record'
	Namespace inserted, (5bc0b686bc79c2dd14c59427, MIR:00000534, clinvar.record)
---> Processing Prefix 'MIR:00000535' - 'mgnify.proj'
	Namespace inserted, (5bc0b686bc79c2dd14c59428, MIR:00000535, mgnify.proj)
---> Processing Prefix 'MIR:00000536' 