# Update ontology sources

In [None]:
# Shell escape: run the lamin CLI to connect to the laminlabs/bionty-assets instance.
!lamin connect laminlabs/bionty-assets

In [None]:
import bionty as bt
import lamindb as ln
from bionty.base._ontology_url import get_ontology_url
from bionty.core._source import register_source_in_bionty_assets
from lamin_utils import logger

# "hint" verbosity surfaces progress updates during long-running steps.
ln.settings.verbosity = "hint"

# Start lamindb tracking for this notebook (closed by ln.finish() in the last cell).
# NOTE(review): the string is presumably this notebook's transform uid — confirm.
ln.track("7extigZj6QNG")

All entities that are not listed in the configuration list below must be curated manually because they require manual intervention.
Consult https://bionty-assets-gczz.netlify.app/ingest/ for guidance.

In [None]:
# Ontology sources that the loop below updates automatically.
# Each entry is (entity, source name, organism); an optional fourth item
# pins a specific version.
configs = [
    ("Disease", "mondo", "all"),
    ("CellType", "cl", "all"),
    ("Organism", "ncbitaxon", "all"),
    # uberon may take a long time due to parsing; set verbosity to hint to
    # see progress updates.
    ("Tissue", "uberon", "all"),
    ("Disease", "doid", "human"),
    ("ExperimentalFactor", "efo", "all"),
    ("Phenotype", "pato", "all"),
    ("Phenotype", "hp", "human"),
    ("Pathway", "go", "all"),
    # Disabled: ("Pathway", "pw", "all") currently leads to a URL error
    # (upstream issue).
    ("DevelopmentalStage", "hsapdv", "human"),
    ("DevelopmentalStage", "mmusdv", "mouse"),
    ("Ethnicity", "hancestro", "human"),
    # Disabled: ("Drug", "dron", "all") is not a Bionty entity (yet).
]

In [None]:
# Ingest every configured ontology source and register its dataframe and
# ontology files via register_source_in_bionty_assets. Each source is handled
# independently: a failure is logged and the loop moves on to the next one.
n_configs = len(configs)
for i, config in enumerate(configs, 1):
    # A config is (entity, source_name, organism) plus an optional 4th item
    # that pins the version; when absent, None is passed to get_ontology_url.
    entity, source_name, organism, *version = config
    config_id = f"{entity}_{source_name}_{organism}"
    logger.info(f"[{i}/{n_configs}] Processing {config_id}")

    try:
        # Only the last item returned by get_ontology_url (the version) is used.
        *_, version_to_use = get_ontology_url(
            prefix=source_name, version=version[0] if version else None
        )

        new_df = getattr(bt.base, entity)(
            source=source_name, version=version_to_use
        ).to_dataframe()

        # Sanity check: the new table must be non-empty and carry the
        # "name" and "synonyms" columns.
        if new_df.empty or not {"name", "synonyms"}.issubset(new_df.columns):
            logger.warning(f"{config_id} failed validation. Skipping...")
            continue

        # Regression guard: do not replace the currently used source with a
        # version that has fewer rows.
        try:
            current_source = bt.Source.filter(
                entity=f"bionty.{entity}",
                name=source_name,
                organism=organism,
                currently_used=True,
            ).one_or_none()
            if current_source:
                current_df = getattr(bt.base, entity)(
                    source=current_source
                ).to_dataframe()
                if new_df.shape[0] < current_df.shape[0]:
                    logger.warning(
                        f"{config_id} has fewer rows than current. Skipping..."
                    )
                    continue
        except ValueError as e:
            if "No source url is available" in str(e):
                pass  # This occurs during testing in local instances
            else:
                # Any other ValueError is reported by the outer handler.
                raise
        except Exception as e:
            # The guard is best-effort, so registration still proceeds, but
            # the failure is logged instead of being silently swallowed.
            logger.warning(
                f"{config_id} could not be compared against the current source "
                f"({type(e).__name__}: {e}). Proceeding without row-count check."
            )

        source_rec = getattr(bt, entity).add_source(
            source=source_name, version=version_to_use
        )
        # Register the parsed dataframe (parquet) and the ontology file from
        # bionty's dynamic directory.
        register_source_in_bionty_assets(
            f"{bt.base.settings.dynamicdir}/df_{organism}__{source_name}__{version_to_use}__{entity}.parquet",
            source=source_rec,
            is_dataframe=True,
        )
        register_source_in_bionty_assets(
            f"{bt.base.settings.dynamicdir}/ontology_{organism}__{source_name}__{version_to_use}__{entity}",
            source=source_rec,
            is_dataframe=False,
        )

        logger.info(f"registered {config_id} version {version_to_use}")

    except ValueError as e:
        if "artifact already exists" in str(e):
            logger.warning(f"{config_id} already registered. Skipping...")
        else:
            logger.error(f"{config_id} failed: {e}")
    except FileNotFoundError:
        logger.warning(f"{config_id} files not found. Skipping...")
    except Exception as e:
        # Top-level boundary for one source: log and move on to the next one.
        logger.error(
            f"[{i}/{n_configs}] {config_id} failed: {type(e).__name__}: {e}"
        )

In [None]:
# Close the lamindb run that ln.track() started in the setup cell.
ln.finish()