In [1]:
import collections
import json
import os
import sys
from pprint import pprint
from typing import Any, Dict, List, Set

import pandas as pd


def move_working_dir_to_repo_root(repo_name="orgsync"):
    """
    Move the current working directory to the root of the repository.

    Walks upwards from the current working directory until it reaches a
    directory whose name equals ``repo_name`` (compared case-insensitively),
    changes into that directory and prints the new working directory.

    Args:
        repo_name: Name of the repository's root directory.

    Raises:
        FileNotFoundError: If neither the current directory nor any of its
            ancestors is named ``repo_name``. The working directory is left
            unchanged in that case.
    """
    target_name = repo_name.lower()
    start_dir = os.getcwd()
    current_dir = start_dir
    while os.path.basename(current_dir).lower() != target_name:
        parent_dir = os.path.dirname(current_dir)
        # At the filesystem root dirname() returns its argument unchanged, so
        # without this check the loop would spin forever when nothing matches.
        if parent_dir == current_dir:
            raise FileNotFoundError(
                f"No directory named {repo_name!r} found at or above {start_dir!r}"
            )
        current_dir = parent_dir
    os.chdir(current_dir)
    print("Current working directory: ", os.getcwd())

# Switch to the repository root first: every path below is relative to it.
move_working_dir_to_repo_root(repo_name="orgsync")

# Base directories for the raw data and for the scraped GtR files
base_path = os.path.join("data", "raw")
gtr_base = os.path.join(base_path, "all_scraped", "gtr", "scraped")

# The three GtR JSON files all live in the same "2024_07" sub-folder
# (presumably the scrape snapshot date - TODO confirm).
gtr_persons_json, gtr_projects_json, gtr_organisations_json = (
    os.path.join(gtr_base, "2024_07", json_name)
    for json_name in ("persons.json", "projects.json", "organisations.json")
)

# gtr_persons_json = os.path.join("data", "raw", "example_data", "persons.json")
# gtr_projects_json = os.path.join("data", "raw", "example_data", "projects.json")
# gtr_organisations_json = os.path.join("data", "raw", "example_data", "organisations.json")
# with open(gtr_projects_json, "r") as f:
#     projects_data = json.load(f)

def get_data(file_path: str) -> Any:
    """
    Load a JSON file and return its decoded contents.

    Args:
        file_path: Path to the JSON file to read.

    Returns:
        The decoded JSON document. Its type follows the file's top-level
        value: a list for a JSON array, a dict for a JSON object.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly: without it open() falls back to the platform's
    # locale encoding (e.g. cp1252 on Windows), which garbles or rejects
    # non-ASCII characters in UTF-8 encoded JSON.
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)


Current working directory:  c:\Users\dec2g\GitHub\OrgSync


In [5]:
# Load the persons JSON and pretty-print its first two records to inspect their structure
persons_data = get_data(gtr_persons_json)
pprint(persons_data[:2])

[{'created': 1720391292000,
  'email': None,
  'ext': None,
  'firstName': 'Tom',
  'href': 'http://gtr.ukri.org/gtr/api/persons/0400AE80-674B-4068-A7D4-748DFA887CDE',
  'id': '0400AE80-674B-4068-A7D4-748DFA887CDE',
  'links': {'link': [{'end': None,
                      'href': 'http://gtr.ukri.org/gtr/api/projects/1FF7C213-559F-4374-9932-069540778CFE',
                      'otherAttributes': {},
                      'rel': 'PI_PER',
                      'start': None},
                     {'end': None,
                      'href': 'http://gtr.ukri.org/gtr/api/projects/A69BBDA8-5CE1-41A2-872C-122079954D8F',
                      'otherAttributes': {},
                      'rel': 'COI_PER',
                      'start': None},
                     {'end': None,
                      'href': 'http://gtr.ukri.org/gtr/api/projects/CF64FA13-BEAF-4A54-98E8-49CAF7EE0521',
                      'otherAttributes': {},
                      'rel': 'PI_PER',
                      'start'

In [2]:
def extract_id_from_href(href: str) -> str:
    """
    Return the last path segment of ``href``, i.e. the resource id.

    Example: ``".../gtr/api/persons/0400AE80"`` -> ``"0400AE80"``.

    Args:
        href: A slash-separated URL or path.

    Returns:
        The text after the final ``/``. Trailing slashes are ignored, so
        ``".../persons/0400AE80/"`` also yields ``"0400AE80"``. A string
        without any ``/`` is returned unchanged.
    """
    # Strip trailing slashes first; otherwise the last segment would be "".
    return href.rstrip('/').rsplit('/', 1)[-1]

def extract_domain_from_href(href: str) -> str:
    """
    Return the second-to-last path segment of ``href``.

    For hrefs shaped like ``".../gtr/api/projects/<id>"`` this is the segment
    that precedes the id (``"projects"``), not the host name.

    Args:
        href: A slash-separated URL or path with at least two segments.

    Returns:
        The segment immediately before the last one. Trailing slashes are
        ignored so ``".../projects/<id>/"`` still yields ``"projects"``.

    Raises:
        IndexError: If ``href`` has fewer than two segments.
    """
    # Strip trailing slashes first; otherwise the id itself would be returned.
    return href.rstrip('/').split('/')[-2]

def get_hrefs_as_dict_of_lists(entry, rels):
    """
    Group an entry's link hrefs, and the ids parsed from them, by ``rel``.

    Args:
        entry: A record dict whose ``entry["links"]["link"]`` is a list of
            link dicts, each with a ``"rel"`` and an ``"href"`` key. A missing
            or empty ``"links"`` / ``"link"`` value is treated as "no links".
        rels: The ``rel`` values to collect; links with any other ``rel`` are
            ignored.

    Returns:
        A dict with two keys per requested rel: ``rel`` holds the matching
        hrefs and ``rel + "_ids"`` holds the id extracted from each of those
        hrefs, in the same order. Both lists are always present and are empty
        when no link matched, so every entry yields the same set of keys.
    """
    # Create the href lists and the id lists up front so the "<rel>_ids" keys
    # exist even for rels that never occur in this entry.
    href_lists = {rel: [] for rel in rels}
    id_lists = {rel + "_ids": [] for rel in rels}

    links = entry.get("links") or {}
    for link_item in links.get("link") or []:
        rel = link_item["rel"]
        if rel in href_lists:
            href = link_item["href"]
            href_lists[rel].append(href)
            # Append the single new id instead of rebuilding the whole list.
            id_lists[rel + "_ids"].append(extract_id_from_href(href))

    # Href lists first, then id lists, merged into one flat dict.
    return {**href_lists, **id_lists}

def transform_data(data: List[Dict], keys: Dict[str, str], rels: List[str]) -> List[Dict]:
    """
    Flatten raw records into one flat dict per record.

    Args:
        data: The raw records, one dict per entry.
        keys: Mapping of output field name -> field name in the raw entry,
            for the top-level fields to copy across.
        rels: The link ``rel`` values to collect for each entry (passed to
            ``get_hrefs_as_dict_of_lists``).

    Returns:
        A list with one dict per entry, holding the copied fields followed by
        the per-rel href and id lists.

    Raises:
        KeyError: If an entry lacks one of the source fields named in ``keys``.
    """
    transformed = []
    for entry in data:
        row = {out_key: entry[src_key] for out_key, src_key in keys.items()}
        # The links depend only on the entry, so extract them once per entry
        # (not once per key); this also covers the case of an empty ``keys``.
        row.update(get_hrefs_as_dict_of_lists(entry, rels))
        transformed.append(row)
    return transformed

### Quick check
# get first element of the persons.json
# persons = get_data(gtr_persons_json)
# # person = persons[10] # example of sparse person
# entry = persons[0] # example of person fully populated
# # pprint(persons)
# hrefs = get_hrefs_as_dict_of_lists(entry, person_rels)
# pprint(hrefs)

In [3]:
# person_keys maps each output column name (key) to the top-level, non-nested field of the
# person record it is read from (value).
# Where key and value differ it is just a naming preference, for example to make clear that an id
# is a person id, so we can join with other datasets that also include an id field but for orgs
# or projects.
person_keys = {
    "person_id": "id",
    "firstName": "firstName",
    "surname": "surname",
    "otherNames": "otherNames",
    "email": "email",
    "orcidId": "orcidId",
    "created": "created",
}

# rels is a list of the `rel` values to collect from the nested list of link dictionaries in the json file.
# Each matching link carries an href to a related resource (projects, organisations etc.).
person_rels = ["EMPLOYED", "PI_PER", "COI_PER"]

persons = get_data(gtr_persons_json)
persons_transformed = transform_data(persons, person_keys, person_rels)
# pprint(persons_transformed)
# create the output folder if it doesn't exist
os.makedirs("data/transformed", exist_ok=True)
# save the transformed persons to csv
persons_df = pd.DataFrame(persons_transformed)
persons_df.to_csv("data/transformed/persons.csv", index=False)


In [4]:
# organisation_keys maps each output column name (key) to the top-level field of the
# organisation record it is read from (value).
organisation_keys = {
    "name": "name",
    "organisation_id": "id",
    "website": "website",
    "created": "created"
    # ignore postcode/ location for now.
    # "created" is included here (as for persons); TODO: do the same for projects.
}

# `rel` values of the nested links to collect for each organisation
organisation_rels = ["EMPLOYEE", "PROJECT"]

organisations = get_data(gtr_organisations_json)
organisations_transformed = transform_data(organisations, organisation_keys, organisation_rels)
# pprint(organisations_transformed)

# Create the output folder here too, so this cell does not depend on an
# earlier cell having created it before to_csv runs.
os.makedirs("data/transformed", exist_ok=True)
organisations_df = pd.DataFrame(organisations_transformed)
organisations_df.to_csv("data/transformed/organisations.csv", index=False)

# Joining Datasets



# Questions
1. Do we really want to be renaming fields like id -> organisation_id, or handle that at the point of joining?
2. We should really save the transformed data incrementally, so that an interrupted run can resume from the last processed entry instead of starting over.