DerivaML is a class library built on the Deriva scientific asset management system. It is designed to simplify many of the basic operations involved in building and testing ML libraries based on common toolkits such as TensorFlow. This notebook reviews the basic features of the DerivaML library.

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from deriva.core import DerivaServer, ErmrestCatalog, get_credential
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
from deriva_ml.deriva_ml_base import DerivaML, DerivaMLException, ColumnDefinition, BuiltinTypes
from deriva_ml.schema_setup.create_schema import create_ml_schema
from deriva_ml.schema_setup.test_catalog import create_test_catalog
from deriva_ml.execution_configuration import ExecutionConfiguration

Set the host name and domain schema of the catalog we want to use, and log in to the server with Globus if we are not already authenticated.

In [None]:
# Server host and domain schema name used by the rest of the notebook.
hostname = 'dev.eye-ai.org'
domain_schema = 'demo-schema'

# Start the Globus native login flow only when there is no active login for
# this host; otherwise just report that we are already authenticated.
gnl = GlobusNativeLogin(host=hostname)
if not gnl.is_logged_in([hostname]):
    gnl.login(
        [hostname],
        no_local_server=True,
        no_browser=True,
        refresh_tokens=True,
        update_bdbag_keychain=True,
    )
    print("Login Successful")
else:
    print("You are already logged in.")


Create a test catalog and get an instance of the DerivaML class.

In [None]:
# Create a throwaway test catalog on the host, then bind a DerivaML instance to it.
test_catalog = create_test_catalog(hostname, domain_schema)

# NOTE(review): the two None values and "1" are passed positionally; their
# meaning is defined by DerivaML's constructor, which is not visible here —
# confirm, and consider keyword arguments once the parameter names are known.
ml_instance = DerivaML(
    hostname,
    test_catalog.catalog_id,
    domain_schema,
    None,
    None,
    "1",
)

In [None]:
# Ask DerivaML for the Chaise URL of the "Subject" table; as the last
# expression in the cell, the returned value is shown as the cell output.
# NOTE(review): presumably a browser link into the Chaise web UI — confirm.
ml_instance.chaise_url("Subject")

In [None]:
# Print the names of the dataset element types, register "Subject" as an
# additional element type, then print the names again to show the change.
element_types_before = [
    element_type.name for element_type in ml_instance.list_dataset_element_types()
]
print(element_types_before)

ml_instance.add_dataset_element_type("Subject")

element_types_after = [
    element_type.name for element_type in ml_instance.list_dataset_element_types()
]
print(element_types_after)

In [None]:
# Create a new dataset
# Add the "TestSet" term to Dataset_Type and create a dataset of that type.
# Despite its name, `type_rid` holds the object returned by add_term(); only
# its `.name` attribute is used below.
type_rid = ml_instance.add_term("Dataset_Type", "TestSet", description="A test")
dataset_rid = ml_instance.create_dataset(type_rid.name, description="A Dataset")

# Collect the RID of every row fetched from the Subject table and add those
# rows to the new dataset as members.
subject_table = ml_instance.domain_path.tables['Subject']
subject_rids = [row['RID'] for row in subject_table.entities().fetch()]
ml_instance.add_dataset_members(dataset_rid=dataset_rid, members=subject_rids)

# Last expression of the cell: display the members of the dataset.
ml_instance.list_dataset_members(dataset_rid)

In [None]:
def strip_system(d):
    """Return a new dict with the RCT, RMT, RCB and RMB keys left out.

    These four keys are the "system" fields this helper is named for. Every
    other key/value pair of ``d`` is copied over in its original order, and
    ``d`` itself is not modified.
    """
    system_keys = ('RCT', 'RMT', 'RCB', 'RMB')
    stripped = {}
    for key, value in d.items():
        if key in system_keys:
            continue
        stripped[key] = value
    return stripped

# Tabulate the records returned by find_datasets(), passing each one through
# strip_system() first so the RCT/RMT/RCB/RMB keys do not appear as columns.
pd.DataFrame(
    list(map(strip_system, ml_instance.find_datasets()))
)

In [None]:
# List the terms of the Dataset_Type vocabulary as a table of name,
# description and synonyms. Dataset_Type is the vocabulary this notebook adds
# the "TestSet" term to; the previous target, "My termset", is never created
# in this notebook, so the cell could not run from a fresh kernel.
# NOTE(review): if create_test_catalog() is expected to provide a "My termset"
# vocabulary, restore that name instead — confirm against the test catalog setup.
pd.DataFrame(
    [
        {'Name': term.name, 'Description': term.description, 'Synonyms': term.synonyms}
        for term in ml_instance.list_vocabulary_terms("Dataset_Type")
    ]
)

In [None]:
# Clean up: delete the test catalog that was created earlier in this notebook.
# NOTE(review): really=True presumably confirms an irreversible delete on the
# server — verify against deriva-py before pointing this at a shared catalog.
test_catalog.delete_ermrest_catalog(really=True)