# Update ontology sources

In [None]:
# Shell escape: run the lamin CLI to connect to laminlabs/bionty-assets.
!lamin connect laminlabs/bionty-assets

In [None]:
import bionty as bt
import lamindb as ln
from bionty.core._source import register_source_in_bionty_assets
from lamin_utils import logger

# Use the "hint" verbosity level so progress updates are shown while
# long-running ontologies are parsed.
ln.settings.verbosity = "hint"

# NOTE(review): presumably starts lamindb run tracking for this notebook under
# the given uid — confirm against the lamindb documentation.
ln.track("7extigZj6QNG")

All entities that are not listed in the `configs` list below must be curated manually because they require manual intervention.
Consult https://bionty-assets-gczz.netlify.app/ingest/ for guidance.

In [None]:
def process_ontology_entities(entity_configs: list[tuple[str, ...]]) -> None:
    """Register ontology entities from various sources as new versions in bionty assets.

    Args:
        entity_configs: One tuple per ontology, shaped
            ``(entity, source_name, organism)`` with an optional fourth
            element giving the version. Without it, ``None`` is passed to
            ``get_ontology_url`` and the version it returns is used.

    Configs are processed in order. A config is skipped with a warning when a
    currently used source exists for it and the freshly loaded table has fewer
    rows than, or lacks columns of, that source's table. Any exception raised
    while processing a config is logged as an error and the next config is
    processed. Unpacking a tuple with fewer than three elements happens outside
    that guard and raises ``ValueError``.
    """
    from bionty.base._ontology_url import get_ontology_url

    total_configs = len(entity_configs)

    for i, config in enumerate(entity_configs, 1):
        entity, source_name, organism, *version = config
        config_id = f"{entity}_{source_name}_{organism}"
        # Optional fourth tuple element; None lets get_ontology_url pick the version.
        requested_version = version[0] if version else None

        logger.info(f"[{i}/{total_configs}] Processing {config_id}")

        try:
            logger.debug(
                f"getting ontology URL for {source_name}, requested version: {requested_version if version else 'latest'}"
            )
            # Only the last element of the returned sequence (the version) is needed.
            *_, version_to_use = get_ontology_url(
                prefix=source_name, version=requested_version
            )
            logger.info(
                f"processing... {entity:<20} {source_name:<10} {version_to_use:<12} {organism}"
            )

            # Shared prefix of the warning messages emitted below.
            description = (
                f"entity {entity} using source {source_name} of version "
                f"{version_to_use} and organism {organism}"
            )

            base_entity = getattr(bt.base, entity)
            new_df = base_entity(source=source_name, version=version_to_use).df()

            logger.debug(
                f"checking for existing source: entity=bionty.{entity}, name={source_name}, organism={organism}"
            )
            try:
                currently_used_source = bt.Source.filter(
                    entity=f"bionty.{entity}",
                    name=source_name,
                    organism=organism,
                    currently_used=True,
                ).one_or_none()

                if currently_used_source:
                    current_version_df = base_entity(
                        source=currently_used_source
                    ).df()
                    n_new = new_df.shape[0]
                    n_old = current_version_df.shape[0]

                    # should have more or equal values than the earlier version
                    if n_new < n_old:
                        logger.warning(
                            f"{description} has fewer rows than current version: {n_new} < {n_old}. Skipping..."
                        )
                        continue

                    # should not lose any column of the earlier version
                    missing_columns = set(current_version_df.columns) - set(
                        new_df.columns
                    )

                    if missing_columns:
                        logger.warning(
                            f"{description} is missing required columns: {missing_columns}. Skipping..."
                        )
                        continue
            except ValueError as e:
                # This occurs during testing in local instances; any other
                # ValueError is re-raised to the per-config handler below.
                if "No source url is available" not in str(e):
                    raise

            logger.debug(f"adding source record for {entity}")
            source_rec = getattr(bt, entity).add_source(
                source=source_name, version=version_to_use
            )

            logger.info(f"registering assets for {config_id}...")
            try:
                # Register the parquet table first, then the ontology file.
                register_source_in_bionty_assets(
                    f"{bt.base.settings.dynamicdir}/df_{organism}__{source_name}__{version_to_use}__{entity}.parquet",
                    source=source_rec,
                    is_dataframe=True,
                )
                register_source_in_bionty_assets(
                    f"{bt.base.settings.dynamicdir}/ontology_{organism}__{source_name}__{version_to_use}__{entity}",
                    source=source_rec,
                    is_dataframe=False,
                )
                logger.info(f"registered a new version {version_to_use} of {entity}.")

            except ValueError as e:
                if "artifact already exists" in str(e):
                    logger.warning(f"{description} is already registered. Skipping...")
                else:
                    raise
            except FileNotFoundError:
                logger.warning(
                    f"{description} file cannot be found. "
                    "This can happen if the ontology was previously registered and the pronto ontology file did not get recreated. Skipping..."
                )

        except Exception as e:
            # Isolate failures per config so one broken ontology does not
            # abort the remaining ones.
            logger.error(
                f"[{i}/{total_configs}] {config_id} failed: {type(e).__name__}: {str(e)}"
            )

In [None]:
# Each entry is (entity, source_name, organism); process_ontology_entities
# also accepts an optional fourth element that pins the version.
configs = [
    ("Disease", "mondo", "all"),
    ("CellType", "cl", "all"),
    ("Organism", "ncbitaxon", "all"),
    # Tissue/uberon may take a long time due to parsing; set verbosity to
    # hint to see progress updates.
    ("Tissue", "uberon", "all"),
    ("Disease", "doid", "human"),
    ("ExperimentalFactor", "efo", "all"),
    ("Phenotype", "pato", "all"),
    ("Phenotype", "hp", "human"),
    ("Pathway", "go", "all"),
    # Disabled: currently leads to a URL error - upstream issue.
    # ("Pathway", "pw", "all"),
    ("DevelopmentalStage", "hsapdv", "human"),
    ("DevelopmentalStage", "mmusdv", "mouse"),
    ("Ethnicity", "hancestro", "human"),
    # Disabled: not a Bionty entity (yet).
    # ("Drug", "dron", "all"),
]

process_ontology_entities(entity_configs=configs)

In [None]:
# NOTE(review): presumably closes the run opened by ln.track(...) earlier in
# this notebook — confirm against the lamindb documentation.
ln.finish()