# `Organism`: ncbitaxon, 2023-06-20

In [None]:
import bionty as bt

In [None]:
# Open the NCBITaxon 2023-06-20 `Organism` ontology file stored in the local
# bionty `_dynamic` folder.
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# re-running on another machine.
ontology_file = "/Users/sunnysun/Documents/repos.nosync/bionty/bionty/_dynamic/ontology_all__ncbitaxon__2023-06-20__Organism"
onto = bt.Ontology(ontology_file)

In [None]:
# Show the ontology object's repr to confirm which file was loaded.
onto

Ontology('/Users/sunnysun/Documents/repos.nosync/bionty/bionty/_dynamic/ontology_all__ncbitaxon__2023-06-20__Organism', timeout=100)

In [None]:
# Spot check: look up a single term by its ontology ID and display it.
# `term` is reused by the following cell to inspect this term's synonyms.
term = onto["NCBITaxon:9606"]
term

Term('NCBITaxon:9606', name='Homo sapiens')

In [None]:
# Descriptions of the spot-check term's synonyms whose scope is "EXACT".
[
    synonym.description
    for synonym in term.synonyms
    if synonym.scope == "EXACT"
]

['human']

In [None]:
# Flatten every ontology term into one tuple:
# (id, common name, name, definition, synonyms, parents).
df_values = []
# A dedicated loop variable is used so that the `term` bound in the earlier
# spot-check cell is not silently rebound to the last term of the ontology.
for onto_term in onto.terms():
    # Definition text, or None when the term has no definition.
    # NOTE(review): `.title()` is kept from the original; if the definition is
    # a plain str this title-cases the whole text — confirm that is intended.
    definition = (
        None if onto_term.definition is None else onto_term.definition.title()
    )

    # Descriptions of the synonyms whose scope is "EXACT".
    exact_synonyms = [
        syn.description for syn in onto_term.synonyms if syn.scope == "EXACT"
    ]
    # The first exact synonym becomes the common name and the remaining ones
    # stay synonyms; without any exact synonym, fall back to the term's name.
    # NOTE(review): "first" follows the iteration order of `synonyms`; if that
    # collection is unordered the pick may differ between runs — confirm.
    if exact_synonyms:
        common_name, *other_synonyms = exact_synonyms
    else:
        common_name, other_synonyms = onto_term.name, []
    # Pipe-separated string, or None when there is nothing left to join.
    synonyms = "|".join(other_synonyms) or None

    # IDs of the direct (distance 1) superclasses, excluding the term itself
    # and anything whose ID does not start with "NCBITaxon". Sorted so the
    # stored list does not depend on the iteration order of the returned set.
    direct_parents = onto_term.superclasses(distance=1, with_self=False).to_set()
    parents = sorted(
        parent.id for parent in direct_parents if parent.id.startswith("NCBITaxon")
    )

    df_values.append(
        (onto_term.id, common_name, onto_term.name, definition, synonyms, parents)
    )

In [None]:
# Number of collected rows (one tuple was appended per ontology term).
len(df_values)

2511800

In [None]:
import pandas as pd

# Build the table from the collected tuples; column order mirrors the order of
# the fields inside each tuple.
df = pd.DataFrame(
    df_values,
    columns=["ontology_id", "name", "scientific_name", "definition", "synonyms", "parents"],
)
# Index the rows by ontology ID so single terms can be looked up with `.loc`.
df = df.set_index("ontology_id")

In [None]:
# Preview the assembled table.
df

Unnamed: 0_level_0,name,scientific_name,definition,synonyms,parents
ontology_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NCBITaxon:1,root,root,,,[]
NCBITaxon:10,Cellvibrio,Cellvibrio,,,[NCBITaxon:1706371]
NCBITaxon:100,Ancylobacter aquaticus,Ancylobacter aquaticus,,,[NCBITaxon:99]
NCBITaxon:100000,Herbaspirillum sp. BA12,Herbaspirillum sp. BA12,,,[NCBITaxon:2624150]
NCBITaxon:1000000,Microbacterium sp. 6.11-VPa,Microbacterium sp. 6.11-VPa,,,[NCBITaxon:2609290]
...,...,...,...,...,...
NCBITaxon:superorder,superorder,superorder,,,[NCBITaxon:taxonomic_rank]
NCBITaxon:superphylum,superphylum,superphylum,,,[NCBITaxon:taxonomic_rank]
NCBITaxon:taxonomic_rank,taxonomic rank,taxonomic rank,,,[]
NCBITaxon:tribe,tribe,tribe,,,[NCBITaxon:taxonomic_rank]


In [None]:
# Lower-case the common names.
# Bracket indexing is used for the assignment: attribute-style assignment
# (`df.name = ...`) only targets the column if it already exists and would
# otherwise just set a plain Python attribute on the DataFrame.
df["name"] = df["name"].str.lower()

In [None]:
# Spot check: display the row for the same ontology ID looked up earlier.
df.loc["NCBITaxon:9606"]

name                          human
scientific_name        Homo sapiens
definition                     None
synonyms                       None
parents            [NCBITaxon:9605]
Name: NCBITaxon:9606, dtype: object

In [None]:
# Write the table to a parquet file next to the source ontology file.
# NOTE(review): absolute, machine-specific path — adjust before re-running
# on another machine.
parquet_file = "/Users/sunnysun/Documents/repos.nosync/bionty/bionty/_dynamic/df_all__ncbitaxon__2023-06-20__Organism.parquet"
df.to_parquet(parquet_file)

In [None]:
from bionty.dev._md5 import calculate_md5

In [None]:
# Compute the MD5 checksum of the parquet file; as the last expression of the
# cell, the result is displayed.
# NOTE(review): this literal must stay identical to the path the parquet file
# was written to.
md5_target = "/Users/sunnysun/Documents/repos.nosync/bionty/bionty/_dynamic/df_all__ncbitaxon__2023-06-20__Organism.parquet"
calculate_md5(md5_target)

'00d97ba65627f1cd65636d2df22ea76c'