In [None]:
# Name of the directory, relative to the user's home directory, that holds the local repository checkouts.
repo_dir = "Repos"   # Set this to be where your github repos are located.
# IPython magics: load the autoreload extension and use mode 2, which reloads
# imported modules before each cell runs so edits to the checkouts are picked up.
%load_ext autoreload
%autoreload 2

# Update the load path so python can find modules for the model
import sys
from pathlib import Path
# Inserting at index 0 makes these checkouts the first locations searched on import.
sys.path.insert(0, str(Path.home() / repo_dir / "deriva-ml"))
sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))

In [None]:
# Prerequisites

import json
import os
from eye_ai.eye_ai import EyeAI
import pandas as pd
from pathlib import Path, PurePath
import logging
from deriva.chisel import Model, Schema, Table, Column, Key, ForeignKey, builtin_types
# import torch

# Emit INFO-and-above records with a "timestamp - level - message" layout.
# force=True removes handlers already attached to the root logger, so the
# configuration is applied even when this cell is run again.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

In [None]:
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
# Catalog to operate on. Alternatives noted by the author: "362", 411, 412, 426.
catalog_id = "428" #@param
# Server to connect to. Alternative noted by the author: 'www.eye-ai.org'.
host = 'dev.eye-ai.org'

# Only start the Globus login flow when there is no existing login for this host.
gnl = GlobusNativeLogin(host=host)
if not gnl.is_logged_in([host]):
    gnl.login(
        [host],
        no_local_server=True,
        no_browser=True,
        refresh_tokens=True,
        update_bdbag_keychain=True,
    )
    print("Login Successful")
else:
    print("You are already logged in.")

In [None]:
# Both locations currently point at the same directory.
cache_dir = '/data'        # Directory in which to cache materialized BDBags for datasets
working_dir = '/data'    # Directory in which to place output files for later upload.

# Handle used by every later cell to reach the catalog and its model.
EA = EyeAI(
    hostname=host,
    catalog_id=catalog_id,
    cache_dir=cache_dir,
    working_dir=working_dir,
)

In [None]:
# Index every table returned by EA.find_vocabularies() by table name, recording
# the name of its schema and its `referenced_by` collection (consumed further
# down as the foreign keys that point at the vocabulary table).
cv_list = {}
for t in EA.find_vocabularies():
    cv_list[t.name] = {'schema': t.schema.name, 'fks': t.referenced_by}

# NOTE: full_list is the same dict object as cv_list (not a copy), so the
# removal below is visible through both names.
full_list = cv_list
# Exclude this table from the migration. The default argument keeps the cell from
# raising KeyError when it is run against a catalog that does not have the table.
full_list.pop('Subject_image_quality_factor', None)
full_list

In [None]:
# cleanup_table_names(cv_list)

In [None]:
def create_new_cv_table(EA, cv_schema_name, old_cv_table_name, t_name):
    """Copy the terms of an existing vocabulary table into a new vocabulary table.

    The terms of ``old_cv_table_name`` are always read. When ``t_name`` is not
    yet present in the schema, it is created through ``EA.create_vocabulary``
    and the Name/Description/Synonyms of every existing term are inserted into
    it. When ``t_name`` already exists, nothing is created or inserted.

    Args:
        EA: Object exposing ``catalog.getPathBuilder()`` and ``create_vocabulary``.
        cv_schema_name: Name of the schema holding both vocabulary tables.
        old_cv_table_name: Name of the vocabulary table whose terms are read.
        t_name: Name of the vocabulary table to create and fill.

    Returns:
        DataFrame with the ``RID`` and ``Name`` columns of the old table's terms.
    """
    schema_path = EA.catalog.getPathBuilder().schemas[cv_schema_name]

    # Read every term currently stored in the old table.
    old_terms = pd.DataFrame(schema_path.tables[old_cv_table_name].path.entities())
    term_payload = old_terms[['Name', 'Description', 'Synonyms']]
    rid_to_name = old_terms[['RID', 'Name']]

    # The target table is already there: leave it untouched.
    if t_name in schema_path.tables:
        return rid_to_name

    EA.create_vocabulary(vocab_name=t_name, schema=cv_schema_name)

    # The path builder is fetched again before inserting - presumably so that the
    # table created just above is visible through it.
    refreshed_schema = EA.catalog.getPathBuilder().schemas[cv_schema_name]
    refreshed_schema.tables[t_name].insert(
        term_payload.to_dict(orient='records'),
        defaults={'ID', 'URI'},
    )
    return rid_to_name

def update(table, mapping_list, num_up):
    """Send ``mapping_list`` to ``table.update`` in consecutive batches.

    Args:
        table: Object exposing ``update(rows, key_columns)`` and a ``RID``
            attribute; ``[table.RID]`` is passed as the second argument of
            every call.
        mapping_list: Sequence of row dicts to send. An empty sequence results
            in no calls.
        num_up: Maximum number of rows per ``table.update`` call; must be >= 1.

    Raises:
        ValueError: If ``num_up`` is smaller than 1.
    """
    # Reject batch sizes that cannot make progress instead of failing later with
    # an unrelated error (division by zero / unbound loop variable).
    if num_up < 1:
        raise ValueError(f"num_up must be a positive integer, got {num_up!r}")

    total = len(mapping_list)
    # One stepped range covers both the full batches and the final partial one.
    for start in range(0, total, num_up):
        stop = min(start + num_up, total)
        table.update(mapping_list[start:stop], [table.RID])
        logging.info(f"Updated indices: {start} to {stop}")


def build_association(EA, cv_schema_name, old_cv_table_name, asso_schema_name, asso_table_name,
                      t_name, fk_col_name, terms_ingest_mapping):
    """Add a reference from an associated table to the new vocabulary table and fill it.

    A reference named ``t_name`` is created on the associated table, then each
    row's old foreign-key value (a term RID) is translated to the term's Name
    and written into the ``t_name`` column.

    Args:
        EA: Object exposing ``model.schemas`` and ``catalog.getPathBuilder()``.
        cv_schema_name: Schema that holds the new vocabulary table ``t_name``.
        old_cv_table_name: Not used by this function; kept so existing callers
            keep working.
        asso_schema_name: Schema of the table that references the vocabulary.
        asso_table_name: Name of the table that references the vocabulary.
        t_name: Name of the new vocabulary table, also used as the name of the
            column that receives the term names.
        fk_col_name: Column of the associated table holding the old term RIDs.
        terms_ingest_mapping: DataFrame with ``RID`` and ``Name`` columns for
            the terms of the old vocabulary table.
    """
    cv_model_schema = EA.model.schemas[cv_schema_name]
    asso_model_schema = EA.model.schemas[asso_schema_name]

    # 3. build FK
    asso_table = asso_model_schema.tables[asso_table_name]
    new_vocab_table = cv_model_schema.tables[t_name]
    try:
        # Tuple is presumably (column name, nullable flag, referenced table) -
        # TODO confirm against the chisel create_reference API.
        asso_table.create_reference((t_name, True, new_vocab_table))
    except Exception as exc:
        # Still best-effort (e.g. the reference may exist from an earlier run),
        # but the failure is now reported, and KeyboardInterrupt/SystemExit are
        # no longer swallowed.
        logging.warning(
            f"Could not create reference {t_name} on {asso_schema_name}:{asso_table_name}: {exc}"
        )

    # 4. add new references - Name
    asso_schema = EA.catalog.getPathBuilder().schemas[asso_schema_name]
    asso_entities = asso_schema.tables[asso_table_name]
    asso_rows = pd.DataFrame(asso_entities.path.entities())
    if asso_rows.empty:
        # No rows means no columns in the frame; selecting them would raise KeyError.
        logging.info(f"No rows in {asso_schema_name}:{asso_table_name}; nothing to update")
        return

    # Both frames carry a RID column, so the merge suffixes them: RID_x is the
    # associated row's RID, RID_y the matched term's RID.
    terms_ingest = pd.merge(
        asso_rows[['RID', fk_col_name]],
        terms_ingest_mapping,
        how='left',
        left_on=fk_col_name,
        right_on='RID',
    )

    # Build a new frame instead of mutating a slice in place; rows without a
    # matching term (or without a RID) are discarded.
    mapping_ingest = (
        terms_ingest[['RID_x', 'Name']]
        .rename(columns={'RID_x': 'RID', 'Name': t_name})
        .dropna()
    )
    mapping_list = mapping_ingest.to_dict(orient='records')
    update(asso_entities, mapping_list, 500)
    

In [None]:
def refactor_cv_tables(cv_table_dict):
    """Migrate each vocabulary table in ``cv_table_dict`` to a renamed replacement.

    For every entry a replacement table is created and filled via
    ``create_new_cv_table``. Each column of each foreign key listed for the
    entry is then migrated with ``build_association``, after which the foreign
    key and its columns are dropped.

    Args:
        cv_table_dict: Mapping of vocabulary table name to a dict with the keys
            ``'schema'`` (schema name) and ``'fks'`` (iterable of foreign keys
            exposing ``columns`` and ``drop()``).
    """
    for old_cv_table_name, info in cv_table_dict.items():
        logging.info(f"Updating cv table {old_cv_table_name}")

        # Toggle the '_Vocab' suffix to obtain the replacement table's name.
        has_suffix = old_cv_table_name.endswith('_Vocab')
        t_name = old_cv_table_name[:-6] if has_suffix else old_cv_table_name + '_Vocab'
        cv_schema_name = info['schema']

        # Create the replacement table and carry the existing terms over.
        terms_ingest_mapping = create_new_cv_table(EA, cv_schema_name, old_cv_table_name, t_name)

        # Walk the tables that reference the old vocabulary table.
        for fk in info['fks']:
            print(fk)
            migrated_cols = []
            for col in fk.columns:
                # Add the new reference and copy values over from the old FK column.
                build_association(
                    EA, cv_schema_name, old_cv_table_name,
                    col.table.schema.name, col.table.name,
                    t_name, col.name,
                    terms_ingest_mapping,
                )
                migrated_cols.append(col)
            # The constraint is dropped first, then the columns it covered.
            fk.drop()
            for old_col in migrated_cols:
                old_col.drop()

In [None]:
# refactor_cv_tables(results_cv)
# test_list

In [None]:
# Tables to process in this run; names that are not in full_list are skipped.
keys = ['Diagnosis_Image_Vocab','Image_Side_Vocab', 'Diagnosis_Tag', 'Diagnosis_Status']

remaining_list = dict(
    (name, full_list[name])
    for name in keys
    if name in full_list
)
refactor_cv_tables(remaining_list)

In [None]:
# Run the refactoring again for the Image_Side_Vocab entry only.
remaining_list2 = {'Image_Side_Vocab': remaining_list['Image_Side_Vocab']}
refactor_cv_tables(remaining_list2)

In [None]:
def clean_old_cv_table(cv_list):
    """Drop each listed vocabulary table together with everything referencing it.

    For every entry, each listed foreign key is dropped, then the columns that
    foreign key was made of, and finally the vocabulary table itself.

    Args:
        cv_list: Mapping of table name to a dict with the keys ``'schema'``
            (schema name) and ``'fks'`` (iterable of foreign keys exposing
            ``columns`` and ``drop()``).
    """
    for table_name, info in cv_list.items():
        old_table = EA.model.schemas[info['schema']].tables[table_name]
        for fk in info['fks']:
            # Capture the columns before the constraint is removed.
            referencing_cols = list(fk.columns)
            fk.drop()
            for col in referencing_cols:
                col.drop()
        old_table.drop()

In [None]:
# Destructive: for every entry in full_list this drops the listed foreign keys,
# the columns of those foreign keys, and then the vocabulary table itself.
clean_old_cv_table(full_list)

In [None]:
def consistent_cv_name(cv_tables):
    """Strip the ``_Vocab`` suffix from vocabulary tables and their referencing columns.

    Tables whose name does not end in ``_Vocab`` are only logged and otherwise
    left alone. For a table that does, the table is renamed without the suffix,
    and every column of every listed foreign key whose own name ends in
    ``_Vocab`` is renamed the same way. Columns without the suffix are left
    unchanged instead of losing their last six characters.

    Args:
        cv_tables: Mapping of table name to a dict with the keys ``'schema'``
            (schema name) and ``'fks'`` (iterable of foreign keys exposing
            ``columns``).
    """
    suffix = '_Vocab'
    for tname, values in cv_tables.items():
        logging.info(f"==== Updating cv table {tname}")
        if not tname.endswith(suffix):
            continue

        new_tname = tname[:-len(suffix)]
        table = EA.model.table(values['schema'], tname)
        table.alter(table_name = new_tname)
        for fk in values['fks']:
            for col in fk.columns:
                cname = col.name
                # Only strip a suffix that is really there; blindly slicing
                # would mangle a column that has some other name.
                if not cname.endswith(suffix):
                    logging.info(f"Leaving column {cname} unchanged: no {suffix} suffix")
                    continue
                col.alter(name=cname[:-len(suffix)])
            
# len(full_list)
# Rename the '_Vocab'-suffixed tables in full_list (and the columns of their
# listed foreign keys) in the catalog model.
consistent_cv_name(full_list)