# `Gene`: ensembl

- Ensembl public MySQL server: https://www.ensembl.org/info/data/mysql.html

- Ensembl core database schema: https://www.ensembl.org/info/docs/api/core/core_schema.html

Install mysqlclient: https://pypi.org/project/mysqlclient/

In [None]:
from typing import Literal

from bionty.base.entities._gene import EnsemblGene
from lamin_utils import logger

In [None]:
def ingest_ensembl_gene_data(
    organism: Literal["human", "mouse", "saccharomyces cerevisiae"], version: str
):
    """Fetch the Ensembl gene table for one organism/release and persist it.

    Steps, in order:

    1. Build an ``EnsemblGene`` for ``organism`` and ``version`` and call its
       ``download_df()``.
    2. Remove every ``[...]`` span from the ``description`` column (shortest
       match per span; whitespace around a removed span is left untouched).
    3. Write the cleaned table to
       ``df_{organism}_ensembl_{version}_gene.parquet``, a path relative to
       the current working directory.
    4. Call ``download_legacy_ids_df`` with the cleaned table and
       ``col="stable_id"``, then log the shape of its result.

    Args:
        organism: Organism name passed through to ``EnsemblGene``.
        version: Version string passed through to ``EnsemblGene``; it is also
            embedded in the output file name.

    Returns:
        A ``(df, df_legacy)`` tuple: the cleaned gene table and the result of
        the legacy-ID download.
    """
    source = EnsemblGene(organism=organism, version=version)

    genes = source.download_df()
    # Non-greedy pattern so each bracketed span is removed on its own rather
    # than everything between the first "[" and the last "]".
    without_brackets = genes["description"].str.replace(r"\[.*?\]", "", regex=True)
    genes["description"] = without_brackets

    parquet_name = f"df_{organism}_ensembl_{version}_gene.parquet"
    genes.to_parquet(parquet_name)

    # The legacy lookup receives the already-cleaned table.
    legacy = source.download_legacy_ids_df(genes, col="stable_id")
    logger.info(f"Legacy data shape: {legacy.shape}")

    return genes, legacy