In [1]:
# This helper fetches the resolver dataset from the EBI identifiers.org platform, transforms it and loads it into
# the new MongoDB-based data backend.
# Author: Manuel Bernal Llinares <mbdebian@gmail.com>
import os
import random
import threading
import time
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import pymongo
import requests
from bson.dbref import DBRef

import data.models.hq.registry as hqregistry

%matplotlib inline

In [2]:
# Constants
# Resolution Source Data Model
# PID_ENTRY_KEY_*: keys read from each top-level entry ("PID entry") of the source dataset.
# NOTE(review): PID_ENTRY_KEY_URL and PID_ENTRY_KEY_PREFIXED are not referenced by the visible cells.
PID_ENTRY_KEY_ID = 'id'
PID_ENTRY_KEY_NAME = 'name'
PID_ENTRY_KEY_PATTERN = 'pattern'
PID_ENTRY_KEY_DEFINITION = 'definition'
PID_ENTRY_KEY_PREFIX = 'prefix'
PID_ENTRY_KEY_URL = 'url'
PID_ENTRY_KEY_PREFIXED = 'prefixed'
PID_ENTRY_KEY_RESOURCES = 'resources'
# RESOURCE_ENTRY_KEY_*: keys read from each item of a PID entry's 'resources' list.
# NOTE(review): RESOURCE_ENTRY_KEY_TEST_STRING is not referenced by the visible cells.
RESOURCE_ENTRY_KEY_ID = 'id'
RESOURCE_ENTRY_KEY_ACCESS_URL = 'accessURL'
RESOURCE_ENTRY_KEY_INFO = 'info'
RESOURCE_ENTRY_KEY_INSTITUTION = 'institution'
RESOURCE_ENTRY_KEY_LOCATION = 'location'
RESOURCE_ENTRY_KEY_OFFICIAL = 'official'
RESOURCE_ENTRY_KEY_RESOURCE_PREFIX = 'resourcePrefix'
RESOURCE_ENTRY_KEY_LOCAL_ID = 'localId'
RESOURCE_ENTRY_KEY_TEST_STRING = 'testString'
RESOURCE_ENTRY_KEY_RESOURCE_URL = 'resourceURL'

In [3]:
# Set up the environment
identifiersorg_url_resolver_data_source = 'https://identifiers.org/rest/collections/expand'
# MongoDB connection settings. Each value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the fallbacks are the local development values.
mongodb_username = os.environ.get('MONGODB_USERNAME', 'devusername')
mongodb_password = os.environ.get('MONGODB_PASSWORD', 'devpassword')
mongodb_database = os.environ.get('MONGODB_DATABASE', 'devdbregistry')
mongodb_endpoint = os.environ.get('MONGODB_ENDPOINT', 'mongodb://localhost:27017')
# Build the authenticated URI from the plain endpoint, so host and port are defined in one place only.
# User and password are percent-escaped because characters such as '@', ':' or '/' would otherwise corrupt the URI.
_mongodb_scheme, _scheme_separator, _mongodb_host = mongodb_endpoint.partition('://')
if not _scheme_separator:
    # The endpoint was given without a scheme (e.g. 'localhost:27017')
    _mongodb_scheme, _mongodb_host = 'mongodb', mongodb_endpoint
mongodb_authendpoint = "{}://{}:{}@{}".format(_mongodb_scheme,
                                              quote_plus(mongodb_username),
                                              quote_plus(mongodb_password),
                                              _mongodb_host)
# Collections
collection_name_institution = 'institution'
collection_name_location = 'location'
collection_name_namespaceSynonym = 'namespace_synonym'
collection_name_resource = 'resource'
collection_name_namespace = 'namespace'
# ID sync point - next free '_id' for each entity type, advanced by the process_* helpers
id_counter_resources = 0
id_counter_institution = 0
id_counter_location = 0

In [4]:
# Helpers
# Lookup tables used to recognise entities that have already been processed
institution_by_attribute_name = {}
location_by_country_code = {}
# Accumulators, one list per target collection
institutions, resources, namespaces, locations = [], [], [], []

def make_rest_request_content_type_json(url, n_attempts=42, max_wait_on_error=30, max_wait_on_failure=10,
                                        timeout=60):
    """Issue an HTTP GET against ``url`` and return the decoded JSON body, retrying on failure.

    A random pause is taken between attempts; no pause is taken after the last one.

    :param url: address to request
    :param n_attempts: maximum number of requests to issue (must be at least 1)
    :param max_wait_on_error: upper bound, in whole seconds, of the pause after an attempt that raised
    :param max_wait_on_failure: upper bound, in whole seconds, of the pause after a non-OK HTTP response
    :param timeout: value handed to ``requests.get(timeout=...)`` so a stalled server cannot block forever
    :return: the decoded JSON payload of the first OK response
    :raises ValueError: if ``n_attempts`` is lower than 1
    :raises Exception: the error of the last attempt, when that attempt raised instead of getting a response
        (the HTTP error produced by ``raise_for_status()`` when the last attempt got a non-OK response)
    """
    if n_attempts < 1:
        raise ValueError("n_attempts must be at least 1, got {}".format(n_attempts))
    response = None
    last_error = None
    for attempt in range(1, n_attempts + 1):
        try:
            # NOTE(review): 'Content-Type' is kept as in the original request; confirm whether 'Accept' was meant.
            response = requests.get(url, headers={"Content-Type": "application/json"}, timeout=timeout)
        except Exception as e:
            # Any possible exception counts towards the attempt counter
            last_error = e
            max_wait = max_wait_on_error
        else:
            last_error = None
            if response.ok:
                return response.json()
            max_wait = max_wait_on_failure
        if attempt < n_attempts:
            # randint() needs both bounds; the wait is a whole number of seconds in [0, max_wait]
            time.sleep(random.randint(0, max_wait))
    if last_error is not None:
        # The final attempt never produced a response, so surface its error instead of a stale response
        raise last_error
    response.raise_for_status()
    
def process_location(country_code):
    """Return the location model for ``country_code``, creating and registering it on first sight.

    A new location gets the next value of ``id_counter_location`` as its ``_id``, is indexed in
    ``location_by_country_code`` and appended to ``locations``. A country code seen before returns the
    already registered object, without building a new model.

    :param country_code: key identifying the location in the source dataset
    :return: the ``hqregistry.Location`` registered for that country code
    """
    global id_counter_location
    print("Processing location '{}'".format(country_code))
    location = location_by_country_code.get(country_code)
    if location is None:
        # Only build the model when it is really needed (no throwaway object on cache hits)
        location = hqregistry.Location(country_code=country_code)
        location._id = id_counter_location
        id_counter_location += 1
        # Dictionary
        location_by_country_code[country_code] = location
        # Collection
        locations.append(location)
    return location
    
def process_institution(name, description="No description provided"):
    """Return the institution model called ``name``, creating and registering it on first sight.

    A new institution gets the next value of ``id_counter_institution`` as its ``_id``, is indexed in
    ``institution_by_attribute_name`` and appended to ``institutions``. A name seen before returns the
    already registered object; in that case ``description`` is not used.

    :param name: institution name, used as the de-duplication key
    :param description: description given to the model when it is created
    :return: the ``hqregistry.Institution`` registered under that name
    """
    global id_counter_institution
    # In the case of institution, we do have to check whether we've seen this institution before
    institution = institution_by_attribute_name.get(name)
    if institution is not None:
        print("[CACHED] Processing institution '{}'".format(name))
        return institution
    print("[NEW] Processing institution '{}'".format(name))
    # Only build the model when it is really needed (no throwaway object on cache hits)
    institution = hqregistry.Institution(name=name, description=description)
    institution._id = id_counter_institution
    id_counter_institution += 1
    # Dictionary
    institution_by_attribute_name[name] = institution
    # Collection
    institutions.append(institution)
    return institution
    
def process_resource(source_resource):
    """Translate one resource entry of the source dataset into a new resource model and register it.

    Every call creates a new ``hqregistry.Resource`` (no de-duplication), gives it the next value of
    ``id_counter_resources`` as ``_id`` and appends it to ``resources``. When the entry names an
    institution and/or a location, those are resolved through ``process_institution`` /
    ``process_location`` and linked to the resource through ``DBRef`` objects; the institution also
    records the resource ``_id`` in its ``resourcesFk`` list.

    :param source_resource: dictionary describing the resource, keyed by the ``RESOURCE_ENTRY_KEY_*`` names;
        every key is optional
    :return: the newly created resource model
    """
    global id_counter_resources
    # 'info' is optional everywhere else in this function, so the log line must not require it either
    print("Processing resource '{}'".format(source_resource.get(RESOURCE_ENTRY_KEY_INFO)))
    # Every time we process a resource, it's basically a new one, so there is no need to check if we have already
    # processed the resource
    resource = hqregistry.Resource()
    # Set entry ID
    resource._id = id_counter_resources
    id_counter_resources += 1
    # Plain attributes, copied over from the source entry (missing keys become None / False)
    resource.mirId = source_resource.get(RESOURCE_ENTRY_KEY_ID)
    resource.accessUrl = source_resource.get(RESOURCE_ENTRY_KEY_ACCESS_URL)
    resource.info = source_resource.get(RESOURCE_ENTRY_KEY_INFO)
    resource.official = source_resource.get(RESOURCE_ENTRY_KEY_OFFICIAL, False)
    resource.resourcePrefix = source_resource.get(RESOURCE_ENTRY_KEY_RESOURCE_PREFIX)
    resource.localId = source_resource.get(RESOURCE_ENTRY_KEY_LOCAL_ID)
    resource.resourceUrl = source_resource.get(RESOURCE_ENTRY_KEY_RESOURCE_URL)
    # Foreign keys
    if RESOURCE_ENTRY_KEY_INSTITUTION in source_resource:
        institution = process_institution(source_resource[RESOURCE_ENTRY_KEY_INSTITUTION])
        resource.institution = DBRef(collection=collection_name_institution, id=institution._id)
        # Back reference from the institution to the resources it provides
        institution.resourcesFk.append(resource._id)
        print("\tInstitution #{} reference back to #{}, number of resources referenced #{}".format(institution._id, resource._id, len(institution.resourcesFk)))
    if RESOURCE_ENTRY_KEY_LOCATION in source_resource:
        location = process_location(source_resource[RESOURCE_ENTRY_KEY_LOCATION])
        resource.location = DBRef(collection=collection_name_location, id=location._id)
    # Collection
    resources.append(resource)
    return resource

In [5]:
# Initialise pseudo-random number generator
# In the visible cells, `random` is only used by make_rest_request_content_type_json() to pick how long to wait
# between request attempts; seeding with the current time makes those waits differ from one run to the next.
random.seed(time.time())

In [6]:
# Retrieve the Resolution Source dataset
# The decoded JSON payload is kept in memory as a whole; the processing cell iterates over it, treating each
# item as one PID entry.
resolution_dataset_source = make_rest_request_content_type_json(identifiersorg_url_resolver_data_source)

In [7]:
# Process PID entries - This is done this way because at the time of developing this dataset helper, identifiers.org
# registry was no bigger than 800KiB, and its growth forecast was not too steep.
# Each PID entry becomes one namespace, whose '_id' is its position in the source dataset. Nothing is written to
# the database here: the models are only accumulated in memory.
for index, pid_entry in enumerate(resolution_dataset_source):
    # Read the identifying fields once, through the key constants and tolerating missing keys, exactly like the
    # rest of the attributes below
    pid_entry_mir_id = pid_entry.get(PID_ENTRY_KEY_ID)
    pid_entry_prefix = pid_entry.get(PID_ENTRY_KEY_PREFIX)
    print("---> [{}] Processing Prefix '{}' - '{}'".format(index, pid_entry_mir_id, pid_entry_prefix))
    # Translate PID Entry to Namespace model
    namespace = hqregistry.Namespace()
    namespace._id = index
    namespace.prefix = pid_entry_prefix
    namespace.mirId = pid_entry_mir_id
    namespace.name = pid_entry.get(PID_ENTRY_KEY_NAME)
    namespace.pattern = pid_entry.get(PID_ENTRY_KEY_PATTERN)
    namespace.description = pid_entry.get(PID_ENTRY_KEY_DEFINITION)
    # TODO creation and modification dates are missing from the dataset
    # There is no deprecation date for now
    # Flag all namespaces as 'NOT Deprecated' for now
    namespace.deprecated = False
    # Process resources for this PID Entry
    for source_resource in pid_entry.get(PID_ENTRY_KEY_RESOURCES, []):
        resource = process_resource(source_resource)
        # The resource points at its namespace through the plain '_id' ...
        resource.namespaceFk = namespace._id
        # ... while the namespace points at its resources through DBRef objects
        namespace.resources.append(DBRef(collection=collection_name_resource, id=resource._id))
    # Namespaces collection
    namespaces.append(namespace)

---> [0] Processing Prefix 'MIR:00000002' - 'chebi'
Processing resource 'ChEBI (Chemical Entities of Biological Interest)'
[NEW] Processing institution 'European Bioinformatics Institute, Hinxton, Cambridge'
	Institution #0 reference back to #0, number of resources referenced #1
Processing location 'UK'
Processing resource 'ChEBI through OLS'
[CACHED] Processing institution 'European Bioinformatics Institute, Hinxton, Cambridge'
	Institution #0 reference back to #1, number of resources referenced #2
Processing location 'UK'
Processing resource 'ChEBI through BioPortal'
[NEW] Processing institution 'National Center for Biomedical Ontology, Stanford'
	Institution #1 reference back to #2, number of resources referenced #1
Processing location 'USA'
---> [1] Processing Prefix 'MIR:00000003' - 'ensembl'
Processing resource 'Ensembl at Sanger/EMBL-EBI'
[NEW] Processing institution 'Sanger Institute and European Bioinformatics Institute, Hinxton, Cambridge'
	Institution #2 reference back to #3

Processing location 'USA'
---> [309] Processing Prefix 'MIR:00000323' - 'genecards'
Processing resource 'GeneCards at Weizmann Institute'
[NEW] Processing institution 'Department of Molecular Genetics, Weizmann Institute of Science, Rehovot'
	Institution #220 reference back to #417, number of resources referenced #1
Processing location 'Israel'
---> [310] Processing Prefix 'MIR:00000324' - 'mmrrc'
Processing resource 'MMRRC  at University of North Carolina'
[NEW] Processing institution 'University of North Carolina'
	Institution #221 reference back to #418, number of resources referenced #1
Processing location 'USA'
---> [311] Processing Prefix 'MIR:00000325' - 'umbbd.reaction'
Processing resource 'Biocatalysis/Biodegradation Database Mirror (Reaction) at ETH Zurich'
[CACHED] Processing institution 'ETH, Zurich'
	Institution #187 reference back to #419, number of resources referenced #2
Processing location 'Switzerland'
---> [312] Processing Prefix 'MIR:00000326' - 'umbbd.enzyme'
Proce

---> [578] Processing Prefix 'MIR:00000596' - 'clinvar'
Processing resource 'ClinVar Variant at NCBI'
[CACHED] Processing institution 'National Center for Biotechnology Information (NCBI), NIH, Maryland'
	Institution #341 reference back to #727, number of resources referenced #4
Processing location 'USA'
---> [579] Processing Prefix 'MIR:00000597' - 'bao'
Processing resource 'BioAssay Ontology through BioPortal'
[CACHED] Processing institution 'National Center for Biomedical Ontology, Stanford'
	Institution #1 reference back to #728, number of resources referenced #31
Processing location 'USA'
Processing resource 'BioAssay Ontology through OLS'
[CACHED] Processing institution 'European Bioinformatics Institute, Hinxton, Cambridge'
	Institution #0 reference back to #729, number of resources referenced #113
Processing location 'UK'
---> [580] Processing Prefix 'MIR:00000598' - 'cellosaurus'
Processing resource 'Cellosaurus through SIB'
[CACHED] Processing institution 'Swiss Institute of 

In [8]:
# Report how many entities of each kind were built from the source dataset
summary_lines = [
    "---> Processed Datasource:",
    "\tNamespaces   -> #{}".format(len(namespaces)),
    "\tResources    -> #{}".format(len(resources)),
    "\tInstitutions -> #{}".format(len(institutions)),
    "\tLocations    -> #{}".format(len(locations)),
]
print("\n".join(summary_lines))

---> Processed Datasource:
	Namespaces   -> #662
	Resources    -> #819
	Institutions -> #420
	Locations    -> #50


In [9]:
# Preparing Database
# Connect with the authenticated URI, show which databases the server reports, and keep a handle on the
# registry database that the insertion cell writes to.
dbclient = pymongo.MongoClient(mongodb_authendpoint)
print("---> DB Client instantiated")
available_databases = dbclient.list_database_names()
print("\tDatabases: {}".format(available_databases))
dbregistry = dbclient[mongodb_database]

---> DB Client instantiated
	Databases: ['admin', 'config', 'local']


In [10]:
# Prepare insertion in the database
# (collection name, models to store in it) pairs, in the order in which they will be written
data_for_insertion = [
    (collection_name_location, locations),
    (collection_name_institution, institutions),
    (collection_name_resource, resources),
    (collection_name_namespace, namespaces),
]

In [11]:
# Write every accumulated model into its collection, one bulk insert per collection.
# Each model is stored as its attribute dictionary, so the '_id' assigned during processing becomes the
# document '_id'.
for collection_name, collection_content in data_for_insertion:
    print("\tPopulating collection '{}'".format(collection_name))
    mongodb_collection = dbregistry[collection_name]
    documents = [vars(entry) for entry in collection_content]
    if not documents:
        # insert_many() rejects an empty list of documents, which would abort the whole load
        print("\t\tNo entries for collection '{}', nothing to insert".format(collection_name))
        continue
    mongodb_collection.insert_many(documents)
print("---> DONE. <---")

	Populating collection 'location'
	Populating collection 'institution'
	Populating collection 'resource'
	Populating collection 'namespace'
---> DONE. <---
