## ROR database of research organisations
https://ror.org/
Provides name, aliases and acronyms, but only got a 6% match and no ability to resolve institutions with multiple names

In [7]:
import json

# Load the JSON data
with open('../data/ror/v1.51-2024-08-21-ror-data.json', 'r') as file:
    data = json.load(file)

# Output basic information about the data
print(f"Total number of records: {len(data)}")

# Check the type and structure of the first item in the list
first_item = data[0]
print("\nStructure of the first item:")
for key, value in first_item.items():
    print(f"{key}: {type(value)}")

# Display the first item to understand its content
print("\nContent of the first item:")
print(json.dumps(first_item, indent=2))


Total number of records: 110484

Structure of the first item:
id: <class 'str'>
name: <class 'str'>
types: <class 'list'>
links: <class 'list'>
aliases: <class 'list'>
acronyms: <class 'list'>
status: <class 'str'>
wikipedia_url: <class 'str'>
labels: <class 'list'>
email_address: <class 'NoneType'>
ip_addresses: <class 'list'>
established: <class 'int'>
country: <class 'dict'>
relationships: <class 'list'>
addresses: <class 'list'>
external_ids: <class 'dict'>

Content of the first item:
{
  "id": "https://ror.org/04ttjf776",
  "name": "RMIT University",
  "types": [
    "Education"
  ],
  "links": [
    "https://www.rmit.edu.au/"
  ],
  "aliases": [
    "Royal Melbourne Institute of Technology University"
  ],
  "acronyms": [
    "RMIT"
  ],
  "status": "active",
  "wikipedia_url": "http://en.wikipedia.org/wiki/RMIT_University",
  "labels": [],
  "email_address": null,
  "ip_addresses": [],
  "established": 1887,
  "country": {
    "country_code": "AU",
    "country_name": "Australia

In [8]:
import pandas as pd

# Function to extract relevant information from each record
def extract_info(org):
    org_id = org.get('id', '')
    name = org.get('name', '')
    types = ', '.join(org.get('types', []))
    links = ', '.join(org.get('links', []))
    aliases = ', '.join(org.get('aliases', []))
    acronyms = ', '.join(org.get('acronyms', []))
    status = org.get('status', '')
    wikipedia_url = org.get('wikipedia_url', '')
    established = org.get('established', '')
    
    # Extract country information
    country_info = org.get('country', {})
    country_code = country_info.get('country_code', '')
    country_name = country_info.get('country_name', '')
    
    # Extract relationships
    relationships = []
    for relationship in org.get('relationships', []):
        relationships.append(f"{relationship['type']}: {relationship['label']} ({relationship['id']})")
    relationships = '; '.join(relationships)
    
    # Extract addresses
    addresses = []
    for address in org.get('addresses', []):
        city = address.get('city', '')
        state = address.get('state', '')
        country = country_name
        lat = address.get('lat', '')
        lng = address.get('lng', '')
        addresses.append(f"{city}, {state}, {country} (Lat: {lat}, Lng: {lng})")
    addresses = '; '.join(addresses)
    
    # Extract external IDs
    external_ids = {}
    for id_type, id_info in org.get('external_ids', {}).items():
        preferred_id = id_info.get('preferred', '')
        all_ids = ', '.join(id_info.get('all', [])) if isinstance(id_info.get('all', []), list) else id_info.get('all', '')
        external_ids[id_type] = {
            'preferred': preferred_id,
            'all': all_ids
        }
    
    return {
        'id': org_id,
        'name': name,
        'types': types,
        'links': links,
        'aliases': aliases,
        'acronyms': acronyms,
        'status': status,
        'wikipedia_url': wikipedia_url,
        'established': established,
        'country_code': country_code,
        'country_name': country_name,
        'relationships': relationships,
        'addresses': addresses,
        'external_ids': external_ids
    }

# Extract information from all records
institutions_info = [extract_info(org) for org in data]

# Convert to DataFrame for better visualization
institutions_df = pd.DataFrame(institutions_info)

# Display the first few rows of the DataFrame
institutions_df.head()

Unnamed: 0,id,name,types,links,aliases,acronyms,status,wikipedia_url,established,country_code,country_name,relationships,addresses,external_ids
0,https://ror.org/04ttjf776,RMIT University,Education,https://www.rmit.edu.au/,Royal Melbourne Institute of Technology Univer...,RMIT,active,http://en.wikipedia.org/wiki/RMIT_University,1887.0,AU,Australia,Related: Austin Hospital (https://ror.org/010m...,"Melbourne, Victoria, Australia (Lat: -37.80674...","{'ISNI': {'preferred': None, 'all': '0000 0001..."
1,https://ror.org/01rxfrp27,La Trobe University,Education,http://www.latrobe.edu.au/,,,active,http://en.wikipedia.org/wiki/La_Trobe_University,1964.0,AU,Australia,Related: Austin Hospital (https://ror.org/010m...,"Melbourne, Victoria, Australia (Lat: -37.72179...","{'ISNI': {'preferred': None, 'all': '0000 0001..."
2,https://ror.org/04j757h98,Victoria University,Education,http://www.vu.edu.au/,,VU,active,http://en.wikipedia.org/wiki/Victoria_Universi...,1916.0,AU,Australia,,"Melbourne, Victoria, Australia (Lat: -37.79416...","{'ISNI': {'preferred': None, 'all': '0000 0001..."
3,https://ror.org/04r659a56,University of New England,Education,http://www.une.edu.au/,New England University College of the Universi...,UNE,active,http://en.wikipedia.org/wiki/University_of_New...,1938.0,AU,Australia,Related: Gosford Hospital (https://ror.org/01j...,"Armidale, New South Wales, Australia (Lat: -30...","{'ISNI': {'preferred': None, 'all': '0000 0004..."
4,https://ror.org/02sc3r913,Griffith University,Education,http://www.griffith.edu.au/,,,active,http://en.wikipedia.org/wiki/Griffith_University,1971.0,AU,Australia,Related: Gold Coast Hospital (https://ror.org/...,"Brisbane, Queensland, Australia (Lat: -27.4706...","{'ISNI': {'preferred': None, 'all': '0000 0004..."


In [9]:
# Load the problematic database
with open('../data/raw/uk_data.json', 'r') as file:
    uk_data = json.load(file)

# Extract unique entries in the name field
uk_names = set(entry['name'] for entry in uk_data)

# Display the number of unique names and a few examples
print(f"Total unique names in UK database: {len(uk_names)}")
print("Sample of unique names:")
for name in list(uk_names)[:10]:  # Display the first 10 unique names
    print(name)

Total unique names in UK database: 5422
Sample of unique names:
SCOTTISH HYDRO ELECTRIC TRANSMISSION PLC
JALI LIMITED
TOMMY'S
PROBOTEK LTD
PublicSpace Ltd
SCOTTISH & NEWCASTLE UK LIMITED
CENTRE OF EXCELLENCE FOR LIFE SCIENCES LTD
National Marine Aquarium Ltd.
ANASYS INSTRUMENTS LIMITED
PROGRESSIVE FARMING TRUST LTD LBG


In [11]:
# Create dictionaries for fast lookups
name_to_ror_id = {}
alias_to_ror_id = {}
acronym_to_ror_id = {}

# Populate the dictionaries
for org in institutions_info:  # Assuming institutions_info is the list of ROR records
    ror_id = org['id']
    
    # Add primary name
    primary_name = org['name'].lower()
    if primary_name not in name_to_ror_id:
        name_to_ror_id[primary_name] = ror_id
    
    # Add aliases
    for alias in org.get('aliases', []):
        alias = alias.lower()
        if alias not in alias_to_ror_id:
            alias_to_ror_id[alias] = ror_id
    
    # Add acronyms
    for acronym in org.get('acronyms', []):
        acronym = acronym.lower()
        if acronym not in acronym_to_ror_id:
            acronym_to_ror_id[acronym] = ror_id

print("Hash sets created for ROR names, aliases, and acronyms.")


Hash sets created for ROR names, aliases, and acronyms.


In [13]:
# Function to find matches using hash sets
def find_hash_matches(uk_name, name_dict, alias_dict, acronym_dict):
    uk_name_lower = uk_name.lower()
    
    # Check if the name exists in any of the hash sets
    if uk_name_lower in name_dict:
        return {
            'uk_name': uk_name,
            'ror_id': name_dict[uk_name_lower],
            'matched_field': 'name'
        }
    elif uk_name_lower in alias_dict:
        return {
            'uk_name': uk_name,
            'ror_id': alias_dict[uk_name_lower],
            'matched_field': 'alias'
        }
    elif uk_name_lower in acronym_dict:
        return {
            'uk_name': uk_name,
            'ror_id': acronym_dict[uk_name_lower],
            'matched_field': 'acronym'
        }
    return None

# Compare UK names using hash sets
matches_found = []
for uk_name in uk_names:
    match = find_hash_matches(uk_name, name_to_ror_id, alias_to_ror_id, acronym_to_ror_id)
    if match:
        matches_found.append(match)

# Convert matches to DataFrame for easier analysis
matches_df = pd.DataFrame(matches_found)

# Calculate and print summary information
total_uk_names = len(uk_names)
total_matches = len(matches_df)
percentage_matched = (total_matches / total_uk_names) * 100

print(f"Total UK names: {total_uk_names}")
print(f"Total matches found: {total_matches}")
print(f"Percentage of UK names matched: {percentage_matched:.2f}%")

# Display the first few matches
matches_df.head(20)


Total UK names: 5422
Total matches found: 339
Percentage of UK names matched: 6.25%


Unnamed: 0,uk_name,ror_id,matched_field
0,TOMMY'S,https://ror.org/04rre2k21,name
1,Birmingham Museums Trust,https://ror.org/02cyn4444,name
2,St Andrew's Healthcare,https://ror.org/00d8qv962,name
3,NEWCASTLE CITY COUNCIL,https://ror.org/00ftam505,name
4,INSTITUTE OF NANOTECHNOLOGY,https://ror.org/02ma7yv75,name
5,Education Scotland,https://ror.org/05yapj268,name
6,BOURNEMOUTH UNIVERSITY,https://ror.org/05wwcw481,name
7,QUALITY MEAT SCOTLAND,https://ror.org/01bhb2k54,name
8,UNIVERSITY OF BRISTOL,https://ror.org/0524sp257,name
9,MOORFIELDS EYE HOSPITAL NHS FOUNDATION TRUST,https://ror.org/03zaddr67,name


In [16]:
# Count the number of matches for each field (name, alias, acronym)
matches_by_field = matches_df['matched_field'].value_counts()

print("Number of matches by field:")
print(matches_by_field)


Number of matches by field:
matched_field
name    339
Name: count, dtype: int64


In [15]:
# Group by the ror_id and count the number of unique UK names that matched each ror_id
matches_by_ror_id = matches_df.groupby('ror_id')['uk_name'].nunique().reset_index()

# Rename the columns for clarity
matches_by_ror_id.columns = ['ror_id', 'num_uk_names']

# Sort by the number of UK names matched to each ror_id
matches_by_ror_id = matches_by_ror_id.sort_values(by='num_uk_names', ascending=False)

print("Number of unique UK names matched with each ROR ID:")
print(matches_by_ror_id.head(20))  # Display the top 20 ROR IDs with the most matches


Number of unique UK names matched with each ROR ID:
                        ror_id  num_uk_names
142  https://ror.org/02fsyqx49             2
0    https://ror.org/000wh6t45             1
223  https://ror.org/03w54w620             1
230  https://ror.org/03zjvnn91             1
229  https://ror.org/03zefc030             1
228  https://ror.org/03zaddr67             1
227  https://ror.org/03z50sg96             1
226  https://ror.org/03z28gk75             1
225  https://ror.org/03yeq9x20             1
224  https://ror.org/03x94j517             1
222  https://ror.org/03vyddc91             1
212  https://ror.org/03n5g6138             1
221  https://ror.org/03vnshb93             1
220  https://ror.org/03vbj2a93             1
219  https://ror.org/03v9efr22             1
218  https://ror.org/03tzb7t51             1
217  https://ror.org/03svjbs84             1
216  https://ror.org/03snfqm79             1
215  https://ror.org/03r6k1a05             1
214  https://ror.org/03phf2w66             1
