## Create persons

With this script you can create persons, including their ORCID and email address, from an Excel file and upload them directly to the Knowledge Graph.

The steps in the notebook are as follows:
1. Create persons, ORCID and contactInformation (i.e. email address) instances from an excel file
2. Post the newly created instances to the KGE

An overview of the newly created instances with their persistent identifiers is saved in the output folder.

To be able to run the script, you need to meet the following requirements:
- Python version >= 3.6
- openMINDS package (can be downloaded from https://pypi.org/project/openMINDS/)
- read and write permission to the KG via the API


In [None]:
# import relevant packages
from getpass import getpass
import requests
import os
import json
import glob
import pandas as pd
import openMINDS
import openMINDS.version_manager

### Import relevant information

Give the path to the file and the file name with the person metadata.

In [None]:
# Place the script in the same folder as the Excel file or define the location of the files
cwd = os.getcwd()

# Keep asking until the user gives an explicit yes (y) or no (n)
answer = ""
while answer not in ["y", "n"]:
    answer = input(f"Is this where your files are stored: {cwd}? yes (y) or no (n) ")

# "y" -> use the current working directory, "n" -> ask for another folder
fpath = cwd if answer == "y" else input("Please define you path: ")

# Load information for the persons.
# The file is read from the folder the user selected (fpath) so that a folder
# other than the current working directory is actually honoured.
person_file = input("What is name of the file with the person information? ")
personList = pd.read_excel(os.path.join(fpath, person_file + '.xlsx'))

# The created instances are stored next to the input file
output_path = os.path.join(fpath, "instances_" + person_file)
print(f"The output folder is {output_path}")

### Create instances for persons

To create instances that conform to openMINDS, we make use of the openMINDS Python package.

In [None]:
# Initialise the local copy of openMINDS
openMINDS.version_manager.init()
# Select version 'v3' before any helper is created
openMINDS.version_manager.version_selection('v3')
# Module-level helper; createInstances() uses it to create one collection per person
helper = openMINDS.Helper()

In [None]:
# Function to create person instances including email address and orcid instances (if available)
def createInstances(df):
    """
    Create a person instance per row, plus email and ORCID instances if given.

    For every row a new collection is created through the module-level
    ``helper``, filled with the person (and, when present, the contact
    information and ORCID) and saved to the module-level ``output_path``.

    Parameters
    ----------
    df : pandas DataFrame
        DataFrame with information to person instances including email address
        and orcid instances. The columns ``firstName``, ``lastName``, ``email``
        and ``ORCID`` are read; a null ``email`` or ``ORCID`` cell means that
        no such instance is created for that person.

    Returns
    -------
    data : pandas DataFrame
        Overview of all information and newly created instances, one row per
        person. ``email_uuid`` / ``orcid_uuid`` are None when the person has
        no email / ORCID. The frame is empty when ``df`` has no rows.

    """

    def last_segment(instance_id):
        # The overview only stores the part after the final "/"; a missing
        # instance (None) stays None instead of raising AttributeError.
        return None if instance_id is None else instance_id.split("/")[-1]

    columns = ["firstName", "lastName", "person_uuid",
               "email", "email_uuid", "orcid", "orcid_uuid"]
    rows = []

    # Iterate over the column values directly, so that the result does not
    # depend on the DataFrame having a default 0..n-1 index.
    for first_name, last_name, email, orcid in zip(
            df.firstName, df.lastName, df.email, df.ORCID):

        print(f"Creating person: {str(first_name)} {str(last_name)}\n")

        # initiate the collection into which you will store all metadata instances
        mycol = helper.create_collection()

        # create a metadata instance for (e.g.) the openMINDS Person schema
        person_id = mycol.add_core_person(givenName=first_name)

        # add more metadata to a created instance
        mycol.get(person_id).familyName = last_name

        # if an email address exists, make the contact information and then add to person instance
        if pd.isnull(email):
            email_id = None
        else:
            email_id = mycol.add_core_contactInformation(email=email)
        mycol.get(person_id).contactInformation = email_id

        # if orcid exists, make orcid and then add to person instance
        if pd.isnull(orcid):
            orcid_id = None
        else:
            orcid_id = mycol.add_core_ORCID(identifier=orcid)
        mycol.get(person_id).digitalIdentifier = orcid_id

        rows.append({"firstName": first_name,
                     "lastName": last_name,
                     "person_uuid": last_segment(person_id),
                     "email": email,
                     "email_uuid": last_segment(email_id),
                     "orcid": orcid,
                     "orcid_uuid": last_segment(orcid_id)})

        mycol.save(os.path.join(output_path, ""))

    # Build the overview in one go; DataFrame.append no longer exists in pandas >= 2.0
    return pd.DataFrame(rows, columns=columns)


Run the cell below to create the instances and save an overview file in the output folder

In [None]:
# Build the instances, then write an overview of them to the output folder
data = createInstances(personList)

if not data.empty:
    # One CSV row per created person, with a header line and without the index column
    savedInstances = os.path.join(output_path, "createdPersons.csv")
    data.to_csv(savedInstances, header=True, index=False)
else:
    print("No instances were created. Try again")

### Authentication

To be able to upload the newly created instances directly to the Knowledge Graph editor via the API, you need an access token. To request a token, follow this link: https://nexus-iam.humanbrainproject.org/v0/oauth2/authorize or copy your token from the Knowledge Graph Editor (if you have access).

In [None]:
# Ask for the token interactively instead of hard-coding it, so that it is not
# stored in the notebook; an empty answer leaves the token empty and the upload
# cell will then refuse to upload.
token = getpass(prompt='Please paste your token: ').strip()

## Upload instances to the KGE

With the upload function we can directly upload the newly created instances to the correct space in the editor.

In [None]:
# Function to upload the instances to the KGE
def upload(instances_fnames, token, space_name):
    """
    Post the instances stored in the given JSON files to the KGE.

    Before posting, the "@id" of every instance and its "contactInformation" /
    "digitalIdentifier" links are rewritten to the KG prefix followed by the
    last "/"-separated segment of the original value, and the capitalisation
    of the "Contactinformation" and "Orcid" type names is corrected.

    Parameters
    ----------
    instances_fnames : List
        list of file paths to instances that need to be uploaded
    token : string
        Authorisation token to get access to the KGE
    space_name : string
        Space that the instances needs to be uploaded to, e.g. "dataset", "common", etc.

    Returns
    -------
    response : dictionary
        Maps the last segment of each instance "@id" to the response object
        returned by ``requests.post`` for that instance. Empty when
        ``instances_fnames`` is empty.

    """

    hed = {"accept": "*/*",
           "Authorization": "Bearer " + token,
           "Content-Type": "application/json"
           }

    # Prefix to upload to the right space
    url = "https://core.kg.ebrains.eu/v3-beta/instances/{}?space=" + space_name
    kg_prefix = "https://kg.ebrains.eu/api/instances/"

    # Properties whose value points to another instance
    link_properties = ("contactInformation", "digitalIdentifier")
    # (ending of the type as found in the file, corrected last segment)
    type_fixes = (("Contactinformation", "ContactInformation"),
                  ("Orcid", "ORCID"))

    new_instances = []
    for fname in instances_fnames:
        # the with-statement closes the file, no explicit close() needed
        with open(fname, 'r') as f:
            new_instances.append(json.load(f))

    for instance in new_instances:
        # only take the UUID and put the KG prefix in front of it
        instance["@id"] = kg_prefix + instance["@id"].split("/")[-1]

        for prop in link_properties:
            # a person without email/ORCID may carry a null link: leave it alone
            if instance.get(prop) is not None:
                instance[prop] = kg_prefix + instance[prop].split("/")[-1]

        # Correct the capitalisation in the openMINDS package
        for wrong_ending, corrected in type_fixes:
            if instance["@type"].endswith(wrong_ending):
                # replace everything after the last "/" by the corrected name
                head, sep, _ = instance["@type"].rpartition("/")
                instance["@type"] = head + sep + corrected

    # Upload to the KGE
    print("\nUploading instances now:\n")

    total = len(new_instances)
    response = {}
    for count, instance in enumerate(new_instances, start=1):
        print("Posting instance " + str(count) + "/" + str(total))
        atid = instance["@id"].split("/")[-1]
        result = requests.post(url.format(atid), json=instance, headers=hed)
        response[atid] = result
        if result.status_code == 200:
            print(result, "OK!")
        elif result.status_code == 409:
            print(result, "Instance already exists")
        elif result.status_code == 401:
            print(result, "Token not valid, authorisation not successful")
        else:
            print(result)

    return response

Running the cell will upload the instances to the KGE

In [None]:
# Upload instances to the KGE
answer = input("Would you like to upload the instances you created to the KGE? yes (y) or no (n) " )

if answer == "y":

    if token != "":
        # Responses of all uploads, keyed by instance UUID
        response = {}

        # The patterns match every entry inside the folders whose name starts
        # with the given prefix. They are built with os.path.join so that the
        # path separator is correct on every operating system.
        orcid_instances = glob.glob(os.path.join(output_path, "ORCID*", "*"))
        if orcid_instances:
            response.update(upload(orcid_instances, token, space_name="common"))

        person_instances = glob.glob(os.path.join(output_path, "person*", "*"))
        if person_instances:
            response.update(upload(person_instances, token, space_name="common"))

        # contact information goes to the "restricted" space, unlike the other two
        email_instances = glob.glob(os.path.join(output_path, "contactInformation*", "*"))
        if email_instances:
            response.update(upload(email_instances, token, space_name="restricted"))

        print("Done!")

    else:
        print("Token is empty, refresh token before trying to upload instances")

elif answer == "n":
    print("\nDone!")