## Creating service links and linking them to file bundles

With this notebook you can do the following:
1. Load CSV file with the tissue sample collection metadata (script "extract_tsc_info.ipynb") and the URL links for the viewer.
2. Create URL instances in the KGE
3. Create service links and link the correct file bundle to the correct URL instance
4. Post the newly created instances to the KGE

To be able to run the script, you need to meet the following requirements:
- Python version >= 3.6
- openMINDS package (can be downloaded from https://pypi.org/project/openMINDS/)
- read and write permission to the KG via the API

Information about the URL links should be stored in a .csv file with the following column names written in the correct way. **Note that more columns can be present in the csv file. They will not be used and do not affect the script.**: 
- sub_name
- tsc_name
- tsc_internalID
- fileBundle_name
- fileBundle_uuid
- URL_link

The subject and the internal identifier of the tissue sample collection will be used to generate a label for the service link, which can be found under the "view data" tab on the dataset card. The convention for this is as follows: "tissue sample collection <tsc_internalID> (subject <sub_name>)". The URL_link and the uuid of the file bundle are used to link the file bundle to the service link.

In [1]:
# import relevant packages
from getpass import getpass
import requests
import os
import json
import glob
import pandas as pd
import openMINDS
import openMINDS.version_manager

### Load metadata

To load the metadata stored in the CSV file, we first define the location where the file is stored and the name of the file.

In [18]:
# Place the script in the same folder as the csv file or define the location of the files
cwd = os.getcwd()

# Keep asking until a valid answer is given, so that `fpath` is always defined
answer = ""
while answer not in ("y", "n"):
    answer = input("Is this where your files are stored: " + cwd + "? yes (y) or no (n) " ).strip().lower()

if answer == "y":
    fpath = cwd
else:
    fpath = input("Please define you path: ")

kg_prefix = "https://kg.ebrains.eu/api/instances/"

# Load information for the service links
dsv_uuid = input("What is the UUID of the dataset? ").strip()
filename = "tsc_" + dsv_uuid + ".csv"
print("The dataset version uuid is: " + dsv_uuid)

# The files are expected in a sub-folder named after the dataset version UUID.
# The trailing "" keeps the trailing path separator that later cells rely on
# when they append a glob pattern to `fpath`.
fpath = os.path.join(fpath, dsv_uuid, "")
os.chdir(fpath)

In [None]:
# Load file and make sure all columns that are read when creating the instances are present
df = pd.read_csv(filename)

required_columns = ["sub_name", "tsc_internalID", "fileBundle_name", "fileBundle_uuid", "URL_link"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError("The following columns are missing in " + filename + ": " + ", ".join(missing_columns))

# Print the first few rows to check
df.head()

### Create instances for the URL and service links

To create instances that conform to openMINDS, we make use of the openMINDS Python package.

In [None]:
# Initialise the local copy of openMINDS
openMINDS.version_manager.init()
# Select openMINDS version 'v3' for the instances that are created below
openMINDS.version_manager.version_selection('v3')
# Helper object; createInstances uses it to create the collections that hold the new instances
helper = openMINDS.Helper()

In [20]:
# Function to create URL instances and service link instances
def createInstances(df): 
    """
    Create a URL instance and a service link instance for every row of df.

    The user is asked interactively which viewer the links should be opened
    in. For each row a new collection is created with the module-level
    ``helper``, a URL and a service link are added to it, the service link
    gets its label and the collection is saved to the current working
    directory.
    
    Parameters
    ----------
    df : pandas DataFrame
        DataFrame with information to create URL and service link instances.
        The columns sub_name, tsc_internalID, fileBundle_name,
        fileBundle_uuid and URL_link are read.

    Returns
    -------
    data : pandas DataFrame
        Overview of all information and newly created instances, one row
        per row of df.

    Raises
    ------
    ValueError
        If the answer to the viewer question is not 1, 2 or 3.

    """
    
    # Ask the service the service links should be opened in.
    services = {
        "1": "https://openminds.ebrains.eu/instances/service/LocaliZoom",
        "2": "https://openminds.ebrains.eu/instances/service/siibraExplorer",
        "3": "https://openminds.ebrains.eu/instances/service/Multi-Image-OSd",
    }
    answer = input("Should the link be opened in 1) LocaliZoom, 2) Siibra-explorer or 3) Multi-Image-OSd: ").strip()
    if answer not in services:
        # Fail before anything is created instead of hitting an unbound variable later on
        raise ValueError("Unknown service '" + answer + "', please answer 1, 2 or 3")
    service_atid = services[answer]
    
    columns = ["subject_name", "tsc_name", "fileBundle_name", "URL_link",
               "URL_uuid", "ServiceLink_uuid", "ServiceLink_dataLocation_uuid",
               "ServiceLink_name", "ServiceLink_service_atid"]
    
    # Rows are collected in a list and turned into a DataFrame once at the end
    # (DataFrame.append is no longer available in recent pandas versions).
    records = []
    rows = zip(df["sub_name"], df["tsc_internalID"], df["fileBundle_name"],
               df["fileBundle_uuid"], df["URL_link"])
    for sub_name, tsc_internalID, fileBundle_name, fileBundle_uuid, URL_link in rows:
   
        print("Creating URL for subject " + str(sub_name) + " tissue sample collection " + str(tsc_internalID) + "\n")
        
        # initiate the collection into which you will store all metadata instances
        mycol = helper.create_collection()
        
        # Create URL link; only the last path segment of the returned @id (the UUID) is reused
        url_atid = mycol.add_core_URL(URL = URL_link)
        url_uuid = url_atid.split("/")[-1]
        
        print("Creating service link for subject " + str(sub_name) + " file bundle " + str(fileBundle_name) + "\n")
    
        # Create Service link    
        link_atid = mycol.add_core_serviceLink(
            dataLocation = [{"@id": kg_prefix + fileBundle_uuid}],
            openDataIn = [{"@id": kg_prefix + url_uuid}],
            service = [{"@id": service_atid}]) 
        
        # Label convention: fall back to the file bundle name when neither the
        # subject nor the tissue sample collection identifier of this row is known.
        if pd.isnull(sub_name) and pd.isnull(tsc_internalID):
            label = "tissue sample collection (subject " + str(fileBundle_name) + ")"
        elif sub_name == tsc_internalID:
            label = "tissue sample collection (subject " + str(sub_name) + ")"
        else:
            label = "tissue sample collection " + str(tsc_internalID) + " (subject " + str(sub_name) + ")"
        mycol.get(link_atid).name = label
    
        records.append({"subject_name" : sub_name,
                        "tsc_name" : tsc_internalID,
                        "fileBundle_name" : fileBundle_name,
                        "URL_link" : URL_link,
                        "URL_uuid" : url_uuid,
                        "ServiceLink_uuid" : link_atid.split("/")[-1],
                        "ServiceLink_dataLocation_uuid" : fileBundle_uuid,
                        "ServiceLink_name" : label,
                        "ServiceLink_service_atid" : service_atid})
    
        # Write the instances of this collection to the current working directory
        mycol.save(".\\")  
        
    return pd.DataFrame(records, columns=columns)

Before the instances can be created, please select which service/viewer the URLs need to be opened in.
Select:
1. LocaliZoom
2. Siibra-explorer
3. Multi-Image-OSd

In [None]:
# Create instances and save an overview of them as CSV in the current working directory
data = createInstances(df)
# os.path.join gives ".\instances_<uuid>.csv" on Windows and a valid relative path on other systems
savedInstances = os.path.join(os.curdir, "instances_" + dsv_uuid + ".csv")
data.to_csv(savedInstances, index = False, header=True)

### Authentication

To be able to upload the newly created instances directly to the Knowledge Graph editor via the API, you need an access token. To request a token, follow this link: https://nexus-iam.humanbrainproject.org/v0/oauth2/authorize or copy your token from the Knowledge Graph Editor (if you have access).

In [22]:
# Prompt for the access token with getpass so that the pasted value is not echoed
token = getpass(prompt='Please paste your token: ')

### Upload instances to the Knowledge Graph editor

We first define the function to upload data to the KGE and then we execute the function.

In [23]:
# Function to upload the instances to the KGE
def upload(instances_fnames, token, space_name):
    """
    Upload locally saved instances to the KGE.

    Every file is read as JSON, its "@id" and the "@id" of all "openDataIn"
    entries are rewritten to the KG instance prefix followed by the UUID, the
    capitalisation of the "@type" is corrected and the instance is posted to
    the given space.
    
    Parameters
    ----------
    instances_fnames : List 
        list of file paths to instances that need to be uploaded
    token : string
        Authorisation token to get access to the KGE
    space_name : string
        Space that the instances need to be uploaded to, e.g. "dataset", "common", etc.

    Returns
    -------
    response : dictionary
        For each UUID the response of the post request is stored, which
        indicates if the upload was successful

    """
    
    hed = {"accept": "*/*",
           "Authorization": "Bearer " + token,
           "Content-Type": "application/json"
           }
    
    # Prefix to upload to the right space
    url = "https://core.kg.ebrains.eu/v3-beta/instances/{}?space=" + space_name
    kg_prefix = "https://kg.ebrains.eu/api/instances/"
    
    # Type endings as written by the openMINDS package and their corrected capitalisation
    type_corrections = (("Servicelink", "ServiceLink"), ("Url", "URL"))
    
    new_instances = []
    for fname in instances_fnames:
        # the with-statement already closes the file
        with open(fname, 'r') as f:
            new_instances.append(json.load(f))
    
    # Correct the capitalisation in the openMINDS package
    for instance in new_instances:
        instance["@id"] = kg_prefix + instance["@id"].split("/")[-1] #only take the UUID 
        
        # Normalise every linked URL instance, not only the first one; a single
        # dictionary is accepted as well as a list of dictionaries.
        targets = instance.get("openDataIn") or []
        if isinstance(targets, dict):
            targets = [targets]
        for target in targets:
            target["@id"] = kg_prefix + target["@id"].split("/")[-1] #only take the UUID 
        
        for wrong, corrected in type_corrections:
            if instance["@type"].endswith(wrong):
                # replace the last path segment of the type
                splittype = instance["@type"].split("/")[:-1]
                splittype.append(corrected)
                instance["@type"] = "/".join(splittype)
    
    # Upload to the KGE
    print("\nUploading instances now:\n")
    
    response = {}    
    for count, instance in enumerate(new_instances, start=1):
        print("Posting instance " + str(count)+"/"+str(len(new_instances)))
        atid = instance["@id"].split("/")[-1] 
        response[atid] = requests.post(url.format(atid), json=instance, headers=hed)
        status = response[atid].status_code
        if status == 200:
            print(response[atid], "OK!" )
        elif status == 409:
            print(response[atid], "Instance already exists")
        elif status == 401:
            print(response[atid], "Token not valid, authorisation not successful")
        else:
            print(response[atid])
        
    return response  

In [None]:
# Upload instances to the KGE
answer = input("Would you like to upload the instances you created to the KGE? yes (y) or no (n) " ) 

if answer == "y":
    # The instance files are looked up one folder level below fpath. os.path.join
    # builds the pattern with the separator of the current operating system, and
    # directories matched by the pattern are skipped because they cannot be read as JSON.
    candidates = glob.glob(os.path.join(fpath, "*", "*"))
    instances_fnames = [fname for fname in candidates if os.path.isfile(fname)]
    
    if token != "":
        response = upload(instances_fnames, token, space_name = "dataset")  
    else:
        # Make it visible that nothing happened instead of skipping silently
        print("\nNo token provided, the instances were not uploaded.")
        
elif answer == "n":
    print("\nDone!")