In [1]:
# This helper fetches the resolver dataset from the EBI identifiers.org platform, transforms it and loads it into
# the new MongoDB-based data backend.
# Author: Manuel Bernal Llinares <mbdebian@gmail.com>
import os
import random
import threading
import time
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import pymongo
import requests

import data.models.hq.registry as hqregistry

%matplotlib inline

In [2]:
# Constants
# Resolution Source Data Model
# Keys read from each PID entry of the resolver dataset when it is translated into a namespace.
PID_ENTRY_KEY_ID = 'id'
PID_ENTRY_KEY_NAME = 'name'
PID_ENTRY_KEY_PATTERN = 'pattern'
PID_ENTRY_KEY_DEFINITION = 'definition'
PID_ENTRY_KEY_PREFIX = 'prefix'
PID_ENTRY_KEY_URL = 'url'
PID_ENTRY_KEY_PREFIXED = 'prefixed'
# Key under which a PID entry lists its resources
PID_ENTRY_KEY_RESOURCES = 'resources'
# Keys read from each resource entry nested inside a PID entry.
RESOURCE_ENTRY_KEY_ID = 'id'
RESOURCE_ENTRY_KEY_ACCESS_URL = 'accessURL'
RESOURCE_ENTRY_KEY_INFO = 'info'
RESOURCE_ENTRY_KEY_INSTITUTION = 'institution'
RESOURCE_ENTRY_KEY_LOCATION = 'location'
RESOURCE_ENTRY_KEY_OFFICIAL = 'official'
RESOURCE_ENTRY_KEY_RESOURCE_PREFIX = 'resourcePrefix'
RESOURCE_ENTRY_KEY_LOCAL_ID = 'localId'
RESOURCE_ENTRY_KEY_TEST_STRING = 'testString'
RESOURCE_ENTRY_KEY_RESOURCE_URL = 'resourceURL'

In [3]:
# Set up the environment
identifiersorg_url_resolver_data_source = 'https://identifiers.org/rest/collections/expand'
# MongoDB settings. Each value can be overridden through an environment variable, so real credentials never
# have to be written into the notebook; the fallbacks are the local development defaults.
mongodb_username = os.environ.get('HQREGISTRY_MONGODB_USERNAME', 'devusername')
mongodb_password = os.environ.get('HQREGISTRY_MONGODB_PASSWORD', 'devpassword')
mongodb_database = os.environ.get('HQREGISTRY_MONGODB_DATABASE', 'devdbregistry')
mongodb_endpoint = 'mongodb://localhost:27017'
# Credentials are percent-escaped because characters such as '@', ':' or '/' would otherwise corrupt the URI
mongodb_authendpoint = "mongodb://{}:{}@localhost:27017".format(quote_plus(mongodb_username),
                                                                quote_plus(mongodb_password))
# Collections
collection_name_institution = 'institution'
collection_name_location = 'location'
collection_name_namespaceSynonym = 'namespace_synonym'
collection_name_resource = 'resource'
collection_name_namespace = 'namespace'
# ID sync point - next free '_id' for each kind of entry, incremented by the process_* helpers
id_counter_resources = 0
id_counter_institution = 0
id_counter_location = 0

In [4]:
# Helpers
# Lookup tables used to recognise entries that were already processed
institution_by_attribute_name = {}
location_by_country_code = {}
# Entries accumulated per target collection
institutions, resources = [], []
namespaces, locations = [], []

def make_rest_request_content_type_json(url, n_attempts=42, max_wait_on_error=30, max_wait_on_bad_status=10,
                                        timeout=None):
    """
    GET the given URL and return the decoded JSON body, retrying on failure.

    Both an exception raised by the request and a non-OK response count as a failed attempt. Between attempts
    the function sleeps for a random whole number of seconds; no sleep happens after the last attempt.

    :param url: URL to request
    :param n_attempts: maximum number of requests to make, at least 1
    :param max_wait_on_error: upper bound, in seconds, of the random wait after a request raised an exception
    :param max_wait_on_bad_status: upper bound, in seconds, of the random wait after a non-OK response
    :param timeout: passed through to requests.get(); None leaves requests' own default in place
    :return: the JSON-decoded body of the first OK response
    :raises ValueError: if n_attempts is lower than 1
    :raises Exception: when the last attempt raised, that exception is re-raised; when the last attempt got a
        non-OK response, whatever response.raise_for_status() raises
    """
    if n_attempts < 1:
        raise ValueError("n_attempts must be at least 1, got {}".format(n_attempts))
    response = None
    last_error = None
    for attempts_left in range(n_attempts - 1, -1, -1):
        # Forget the previous attempt so the final error reflects what happened last
        response = None
        try:
            response = requests.get(url, headers={"Content-Type": "application/json"}, timeout=timeout)
        except Exception as e:
            # Any possible exception counts towards the attempt counter
            last_error = e
            max_wait = max_wait_on_error
        else:
            if response.ok:
                return response.json()
            max_wait = max_wait_on_bad_status
        if attempts_left:
            # random.randint() needs both bounds; it returns an integer in [0, max_wait]
            time.sleep(random.randint(0, max_wait))
    if response is None:
        # The last attempt never produced a response, surface the error that prevented it
        raise last_error
    response.raise_for_status()
    
def process_location(country_code):
    """
    Return the location entry for the given country code, registering it on first sight.

    An entry whose '_id' is still None is treated as new: it takes the next value of the location ID counter
    and is stored both in the lookup dictionary and in the locations collection.

    :param country_code: country code used as lookup key and handed to the Location constructor
    :return: the location entry registered for that country code
    """
    global id_counter_location
    print("Processing location '{}'".format(country_code))
    fresh_location = hqregistry.Location(country_code=country_code)
    location = location_by_country_code.get(country_code, fresh_location)
    if location._id is not None:
        # Already registered by a previous call
        return location
    location._id = id_counter_location
    id_counter_location += 1
    # Remember it for later lookups, then queue it for insertion
    location_by_country_code[country_code] = location
    locations.append(location)
    return location
    
def process_institution(name, description="No description provided"):
    """
    Return the institution entry for the given name, registering it on first sight.

    An entry whose '_id' is still None is treated as new: it takes the next value of the institution ID
    counter, gets its name and description set, and is stored both in the lookup dictionary and in the
    institutions collection.

    :param name: institution name, used as lookup key
    :param description: description assigned only when the institution is registered for the first time
    :return: the institution entry registered for that name
    """
    global id_counter_institution
    print("Processing institution '{}'".format(name))
    fresh_institution = hqregistry.Institution()
    institution = institution_by_attribute_name.get(name, fresh_institution)
    if institution._id is not None:
        # Seen before, hand back the registered entry untouched
        return institution
    institution._id = id_counter_institution
    id_counter_institution += 1
    institution.name = name
    institution.description = description
    # Remember it for later lookups, then queue it for insertion
    institution_by_attribute_name[name] = institution
    institutions.append(institution)
    return institution
    
def process_resource(source_resource):
    """
    Translate one resource entry of the source dataset into a new Resource model instance.

    Every call creates a new resource with the next value of the resource ID counter; no check is made for
    resources processed before. Missing source fields become None ('official' becomes False). When the source
    entry names an institution or a location, the matching entry is looked up or registered and its '_id' is
    stored on the resource.

    :param source_resource: dictionary describing the resource in the source dataset
    :return: the populated Resource instance; the caller is responsible for adding it to a collection
    """
    global id_counter_resources
    # The info field is optional everywhere else in this function, so it must not break the log line either
    print("Processing resource '{}'".format(source_resource.get(RESOURCE_ENTRY_KEY_INFO)))
    resource = hqregistry.Resource()
    # Set entry ID
    resource._id = id_counter_resources
    id_counter_resources += 1
    resource.mirId = source_resource.get(RESOURCE_ENTRY_KEY_ID)
    resource.accessUrl = source_resource.get(RESOURCE_ENTRY_KEY_ACCESS_URL)
    resource.info = source_resource.get(RESOURCE_ENTRY_KEY_INFO)
    resource.official = source_resource.get(RESOURCE_ENTRY_KEY_OFFICIAL, False)
    resource.resourcePrefix = source_resource.get(RESOURCE_ENTRY_KEY_RESOURCE_PREFIX)
    resource.localId = source_resource.get(RESOURCE_ENTRY_KEY_LOCAL_ID)
    resource.resourceUrl = source_resource.get(RESOURCE_ENTRY_KEY_RESOURCE_URL)
    # Foreign keys are stored as the plain '_id' of the referenced entry - TODO decide whether to use DBRef
    if RESOURCE_ENTRY_KEY_INSTITUTION in source_resource:
        institution = process_institution(source_resource[RESOURCE_ENTRY_KEY_INSTITUTION])
        resource.institution = institution._id
    if RESOURCE_ENTRY_KEY_LOCATION in source_resource:
        location = process_location(source_resource[RESOURCE_ENTRY_KEY_LOCATION])
        resource.location = location._id
    return resource

In [5]:
# Initialise pseudo-random number generator
# Seeded from the wall clock; in the code visible here the generator is only used to randomise the wait
# between HTTP request retries.
random.seed(time.time())

In [6]:
# Retrieve the Resolution Source dataset
# The decoded JSON is kept whole in memory; the processing cell below iterates over it as a sequence of PID
# entries.
resolution_dataset_source = make_rest_request_content_type_json(identifiersorg_url_resolver_data_source)

In [7]:
# Process PID entries - This is done this way because at the time of developing this dataset helper, identifiers.org
# registry was no bigger than 800KiB, and its growth forecast was not too steep.
# NOTE: this cell appends to the collections and advances the ID counters defined in earlier cells, so
# re-running it without re-running those cells first produces duplicated entries.
for index, pid_entry in enumerate(resolution_dataset_source):
    # Every field is optional in the translation below, so the log line must tolerate missing keys as well
    print("---> [{}] Processing Prefix '{}' - '{}'".format(index,
                                                          pid_entry.get(PID_ENTRY_KEY_ID),
                                                          pid_entry.get(PID_ENTRY_KEY_PREFIX)))
    # Translate PID Entry to Namespace model
    namespace = hqregistry.Namespace()
    # The position of the entry in the dataset doubles as the namespace ID
    namespace._id = index
    namespace.prefix = pid_entry.get(PID_ENTRY_KEY_PREFIX)
    namespace.mirId = pid_entry.get(PID_ENTRY_KEY_ID)
    namespace.name = pid_entry.get(PID_ENTRY_KEY_NAME)
    namespace.pattern = pid_entry.get(PID_ENTRY_KEY_PATTERN)
    namespace.description = pid_entry.get(PID_ENTRY_KEY_DEFINITION)
    # TODO creation and modification dates are missing from the dataset
    # There is no deprecation date for now
    # Flag all namespaces as 'NOT Deprecated' for now
    namespace.deprecated = False
    # Process resources for this PID Entry
    for source_resource in pid_entry.get(PID_ENTRY_KEY_RESOURCES, []):
        resource = process_resource(source_resource)
        # Foreign key stored as the plain namespace '_id' - TODO decide whether to use DBRef
        resource.namespace = namespace._id
        # Collection
        resources.append(resource)
    # Namespaces collection
    namespaces.append(namespace)
print("---> Processed Datasource:")
print("\tNamespaces   -> #{}".format(len(namespaces)))
print("\tResources    -> #{}".format(len(resources)))
print("\tInstitutions -> #{}".format(len(institutions)))
print("\tLocations    -> #{}".format(len(locations)))

---> [0] Processing Prefix 'MIR:00000002' - 'chebi'
Processing resource 'ChEBI (Chemical Entities of Biological Interest)'
Processing institution 'European Bioinformatics Institute, Hinxton, Cambridge'
Processing location 'UK'
Processing resource 'ChEBI through OLS'
Processing institution 'European Bioinformatics Institute, Hinxton, Cambridge'
Processing location 'UK'
Processing resource 'ChEBI through BioPortal'
Processing institution 'National Center for Biomedical Ontology, Stanford'
Processing location 'USA'
---> [1] Processing Prefix 'MIR:00000003' - 'ensembl'
Processing resource 'Ensembl at Sanger/EMBL-EBI'
Processing institution 'Sanger Institute and European Bioinformatics Institute, Hinxton, Cambridge'
Processing location 'UK'
Processing resource 'Ensembl US West mirror'
Processing location 'USA'
Processing resource 'Ensembl US East mirror'
Processing location 'USA'
Processing resource 'Ensembl Asia mirror'
Processing location 'Singapore'
---> [2] Processing Prefix 'MIR:000000

Processing institution 'Swiss Institute of Bioinformatics (SIB), Geneva'
Processing location 'Switzerland'
---> [395] Processing Prefix 'MIR:00000409' - 'rfam'
Processing resource 'Rfam at EMBL-EBI'
Processing institution 'European Bioinformatics Institute, Hinxton, Cambridge'
Processing location 'UK'
---> [396] Processing Prefix 'MIR:00000410' - 'fbol'
Processing resource 'Fungal Barcode at Fungal Biodiversity Centre'
Processing institution 'Fungal Biodiversity Centre, Utrecht'
Processing location 'Netherlands'
---> [397] Processing Prefix 'MIR:00000411' - 'aftol.taxonomy'
Processing resource 'AFTOL at University of Minnesota'
Processing institution 'Department of Plant Biology, University of Minnesota, Minnesota'
Processing location 'USA'
---> [398] Processing Prefix 'MIR:00000412' - 'aspgd.locus'
Processing resource 'AspGD at Stanford Medical School'
Processing institution 'Department of Genetics, Stanford University Medical School, Stanford, California'
Processing location 'USA'
--

In [9]:
# Preparing Database
# Connect with the credential-bearing URI and select the registry database; the database names are printed
# as a quick check of what the server exposes to this client.
dbclient = pymongo.MongoClient(mongodb_authendpoint)
print("---> DB Client instantiated")
print("\tDatabases: {}".format(dbclient.list_database_names()))
dbregistry = dbclient[mongodb_database]
# Collection handles are obtained per collection in the insertion cell, so none is created here
#mongodb_collection = dbregistry[collection_name_namespace]

---> DB Client instantiated
	Databases: ['admin', 'config', 'devdbregistry', 'local']


In [10]:
# Prepare insertion in the database: (collection name, entries) pairs, in insertion order
data_for_insertion = [
    (collection_name_location, locations),
    (collection_name_institution, institutions),
    (collection_name_resource, resources),
    (collection_name_namespace, namespaces),
]

In [11]:
# Insert every prepared collection into the database, one bulk insert per collection.
# NOTE(review): the recorded run of this cell stopped with "InvalidDocument: Cannot encode object" for a
# Location instance, which suggests some entry attribute still holds a model object instead of a plain value
# - confirm against the default attributes of the hqregistry models.
for collection_name, collection_content in data_for_insertion:
    if not collection_content:
        # insert_many() rejects an empty list of documents, so an empty collection is skipped instead of
        # aborting the whole load
        print("\tSkipping empty collection '{}'".format(collection_name))
        continue
    print("\tPopulating collection '{}'".format(collection_name))
    mongodb_collection = dbregistry[collection_name]
    # Each entry is stored as the dictionary of its instance attributes
    mongodb_collection.insert_many([entry.__dict__ for entry in collection_content])
print("---> DONE. <---")

	Populating collection 'location'
	Populating collection 'institution'
	Populating collection 'resource'


InvalidDocument: Cannot encode object: <data.models.hq.registry.Location object at 0x11b06a0b8>