In [1]:
repo_dir = "Repos"   # Set this to be where your github repos are located.
%load_ext autoreload
%autoreload 2

# Update the load path so python can find modules for the model
import sys
from pathlib import Path
sys.path.insert(0, str(Path.home() / repo_dir / "deriva-ml"))
sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))

In [2]:
# Prerequisites

import json
import os
from eye_ai.eye_ai import EyeAI
import pandas as pd
from pathlib import Path, PurePath
import logging
from deriva.chisel import Model, Schema, Table, Column, Key, ForeignKey, builtin_types
# import torch

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

In [3]:
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
# catalog_id = "362" #@param
catalog_id = "412" #@param
# host = 'www.eye-ai.org'
host = 'dev.eye-ai.org'


gnl = GlobusNativeLogin(host=host)
if gnl.is_logged_in([host]):
    print("You are already logged in.")
else:
    gnl.login([host], no_local_server=True, no_browser=True, refresh_tokens=True, update_bdbag_keychain=True)
    print("Login Successful")

2024-10-17 12:55:54,168 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2024-10-17 12:55:54,169 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>


You are already logged in.


In [4]:
cache_dir = '/data'        # Directory in which to cache materialized BDBags for datasets
working_dir = '/data'    # Directory in which to place output files for later upload.
EA = EyeAI(hostname = host, catalog_id = catalog_id, 
           cache_dir= cache_dir, working_dir=working_dir)

2024-10-17 12:55:56,585 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2024-10-17 12:55:56,586 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>


In [5]:
cv_list = {}
for t in EA.find_vocabularies():
    cv_list[t.name] = {'schema': t.schema.name, 'fks': t.referenced_by}

full_list = cv_list
full_list.pop('Subject_image_quality_factor')
full_list

{'Diagnosis_Subject_Vocab': {'schema': 'eye-ai', 'fks': []},
 'Diagnosis_Observation_Vocab': {'schema': 'eye-ai', 'fks': []},
 'Diagnosis_Image_Vocab': {'schema': 'eye-ai',
  'fks': [<deriva.core.ermrest_model.ForeignKey object 'eye-ai'.'Diagnosis_Diagnosis_Vocab_fkey' at 0x7f3f1b0bc5e0>,
   <deriva.core.ermrest_model.ForeignKey object 'eye-ai'.'Subject_Diagnosis_Diagnosis_Vocab_fkey' at 0x7f3f1b0be950>,
   <deriva.core.ermrest_model.ForeignKey object 'eye-ai'.'Observation_Diagnosis_Diagnosis_Vocab_fkey' at 0x7f3f1b0bf370>]},
 'Image_Side_Vocab': {'schema': 'eye-ai',
  'fks': [<deriva.core.ermrest_model.ForeignKey object 'eye-ai'.'Image_Image_Side_Vocab_fkey' at 0x7f3f1b8de530>,
   <deriva.core.ermrest_model.ForeignKey object 'eye-ai'.'Clinical_Records_Image_Side_Vocab_fkey' at 0x7f3f1b0d3b50>,
   <deriva.core.ermrest_model.ForeignKey object 'eye-ai'.'OCR_RNFL_Image_Side_Vocab_fkey' at 0x7f3f1b0f8550>,
   <deriva.core.ermrest_model.ForeignKey object 'eye-ai'.'OCR_HVF_Image_Side_Vocab_f

In [7]:
# cleanup_table_names(cv_list)

In [6]:
def create_new_cv_table(cv_schema_name, old_cv_table_name, t_name):
    cv_schema = EA.catalog.getPathBuilder().schemas[cv_schema_name]
    
    # retrieve exiting terms
    old_cv_table = cv_schema.tables[old_cv_table_name]
    path = old_cv_table.path
    entities = pd.DataFrame(path.entities())
    terms_ingest = entities[['Name', 'Description', 'Synonyms']]
    terms_ingest_mapping = entities[['RID', 'Name']]
    
    # create new table
    if(t_name not in cv_schema.tables):
        new_vocab_table = EA.create_vocabulary(
            vocab_name = t_name,
            schema= cv_schema_name)

        # ingest exiting terms
        ingest_list = terms_ingest.to_dict(orient='records')
        cv_schema = EA.catalog.getPathBuilder().schemas[cv_schema_name]
        new_vocab = cv_schema.tables[t_name]
        new_vocab.insert(ingest_list, defaults={'ID', 'URI'})
    return terms_ingest_mapping

def update(table, mapping_list, num_up):
    n = len(mapping_list)
    for i in range(n // num_up):
        table.update(mapping_list[i * num_up: (i + 1) * num_up], [table.RID])
        logging.info(f"Updated indices: {i * num_up} to {(i + 1) * num_up}")
    remaining_start = (i + 1) * num_up if n >= num_up else 0
    if remaining_start < n:
        table.update(mapping_list[remaining_start: n], [table.RID])
        logging.info(f"Updated indices: {remaining_start} to {n}")


def build_association(cv_schema_name, old_cv_table_name, asso_schema_name, asso_table_name, 
                      t_name, fk_col_name, terms_ingest_mapping):
    EA = EyeAI(hostname = host, catalog_id = catalog_id, 
           cache_dir= cache_dir, working_dir=working_dir)
    cv_model_schema = EA.model.schemas[cv_schema_name]
    asso_model_schema = EA.model.schemas[asso_schema_name]
    
    # 3. build FK
    asso_table = asso_model_schema.tables[asso_table_name]
    new_vocab_table = cv_model_schema.tables[t_name]
    try:
        asso_table.create_reference((t_name, True, new_vocab_table))
    except:
        pass

    # 4. add new references - Name
    asso_schema = EA.catalog.getPathBuilder().schemas[asso_schema_name]
    asso_entities = asso_schema.tables[asso_table_name]
    path = asso_entities.path

    terms_ingest = pd.DataFrame(path.entities())[['RID', fk_col_name]]
    terms_ingest = pd.merge(terms_ingest, terms_ingest_mapping, how='left', left_on=fk_col_name, right_on='RID')
    
    mapping_ingest = terms_ingest[['RID_x', 'Name']]
    mapping_ingest.rename(columns={'RID_x':'RID', 'Name':t_name}, inplace=True)

    mapping_ingest.dropna(inplace=True)
    mapping_list = mapping_ingest.to_dict(orient='records')
    update(asso_entities, mapping_list, 500)
    # asso_entities.update(mapping_list, [asso_entities.RID])
    

In [9]:
def refactor_cv_tables(cv_table_dict):
    for key, values in cv_table_dict.items():
        old_cv_table_name = key
        logging.info(f"Updating cv table {key}")
        if old_cv_table_name.endswith('_Vocab'):
            t_name = old_cv_table_name[:-6]
        else:
            t_name = old_cv_table_name+'_Vocab'
        cv_schema_name = values['schema']
        # create new cv table with values:
        terms_ingest_mapping = create_new_cv_table(cv_schema_name, old_cv_table_name, t_name)
        
        # find associated tables
        for fk in values['fks']:
            fk_cols = []
            for col in fk.columns:
                asso_schema_name = col.table.schema.name
                asso_table_name = col.table.name
                fk_col_name = col.name
                # Build association and update FK col
                build_association(cv_schema_name, old_cv_table_name, 
                                  asso_schema_name, asso_table_name, 
                                  t_name, fk_col_name,
                                  terms_ingest_mapping)
                fk_cols.append(col)
            fk.drop()
            for c in fk_cols:
                c.drop()

In [10]:
refactor_cv_tables(cv_list)
test_list

2024-10-17 12:58:04,790 - INFO - Updating cv table Diagnosis_Subject_Vocab
2024-10-17 12:58:05,668 - INFO - Updating cv table Diagnosis_Observation_Vocab
2024-10-17 12:58:06,872 - INFO - Updating cv table Diagnosis_Image_Vocab
2024-10-17 12:58:07,607 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2024-10-17 12:58:07,608 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapping_ingest.rename(columns={'RID_x':'RID', 'Name':t_name}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/st

NameError: name 'test_list' is not defined

In [29]:
full_list
def clean_old_cv_table(cv_list):
    for key, values in cv_list.items():
        table = EA.model.schemas[values['schema']].tables[key]
        for fk in values['fks']:
            fk_cols = [col for col in fk.columns]
            fk.drop()
            for c in fk_cols:
                c.drop()
        table.drop()

In [35]:
clean_old_cv_table(full_list)

In [33]:
# full_list.pop('Diagnosis_Subject_Vocab')
# full_list.pop('Diagnosis_Observation_Vocab')
full_list.pop('Diagnosis_Image_Vocab')


{'schema': 'eye-ai', 'fks': []}

In [34]:
full_list

{'Image_Side_Vocab': {'schema': 'eye-ai',
  'fks': [<deriva.core.ermrest_model.ForeignKey object 'eye-ai'.'OCR_HVF_Image_Side_Vocab_fkey' at 0x7f3f1b0f9d80>]},
 'Image_Angle_Vocab': {'schema': 'eye-ai', 'fks': []},
 'Image_Quality_Vocab': {'schema': 'eye-ai', 'fks': []},
 'Subject_Gender': {'schema': 'eye-ai', 'fks': []},
 'Subject_Ethnicity': {'schema': 'eye-ai', 'fks': []},
 'Subject_hypertension': {'schema': 'eye-ai', 'fks': []},
 'Subject_insulin_dependent': {'schema': 'eye-ai', 'fks': []},
 'Subject_pregnant': {'schema': 'eye-ai', 'fks': []},
 'Subject_cataract': {'schema': 'eye-ai', 'fks': []},
 'Subject_maculopathy': {'schema': 'eye-ai', 'fks': []},
 'Subject_other': {'schema': 'eye-ai', 'fks': []},
 'Subject_image_quality': {'schema': 'eye-ai', 'fks': []},
 'Image_Output': {'schema': 'eye-ai', 'fks': []},
 'Diagnosis_Tag': {'schema': 'eye-ai',
  'fks': [<deriva.core.ermrest_model.ForeignKey object 'eye-ai'.'Subject_Diagnosis_Diagnosis_Tag_fkey' at 0x7f3f1b0be800>]},
 'Diagnosis