# `CellLine`: clo; 2022-03-21

The OWL files for CLO are missing metadata such as definitions and synonyms, so we parse these fields manually from the CSV export instead.

Download `clo.csv.gz` from: https://data.bioontology.org/ontologies/CLO/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv
Ontology page on BioPortal: https://bioportal.bioontology.org/ontologies/CLO

In [None]:
import pandas as pd


def df_from_csv(csv_filepath, prefix):
    """Build a term table from a BioPortal CSV export of an ontology.

    The CSV must provide the columns ``Class ID``, ``Preferred Label``,
    ``Synonyms``, ``Definitions`` and ``Parents``. OBO PURLs are shortened to
    CURIE-style identifiers (``http://purl.obolibrary.org/obo/CLO_0000001``
    becomes ``CLO:0000001``).

    Args:
        csv_filepath: Path of the CSV file, passed straight to
            ``pandas.read_csv``.
        prefix: Ontology prefix (e.g. ``"CLO"``). Only terms whose identifier
            starts with this prefix are kept, and only parents whose
            identifier starts with it are listed.

    Returns:
        A DataFrame indexed by ``ontology_id`` with the columns ``name``,
        ``definition``, ``synonyms`` and ``parents`` (a list of parent
        identifiers, empty when the term has none). Rows are sorted by
        ``ontology_id``; when several terms share a name only the last one
        in that order is kept.

    Note:
        Obsolete terms are not filtered out.
    """
    obo_base = "http://purl.obolibrary.org/obo/"

    def to_curies(cell):
        """Turn a ``|``-separated string of IRIs into a list of CURIEs."""
        # A missing value is read as a non-string (NaN): no parents.
        if not isinstance(cell, str):
            return []
        curies = cell.replace(obo_base, "").replace("_", ":").split("|")
        return [curie for curie in curies if curie.startswith(prefix)]

    df = pd.read_csv(csv_filepath)

    # regex=False: the URL contains "." which must be matched literally, and
    # the default value of `regex` differs between pandas versions.
    df["ontology_id"] = (
        df["Class ID"]
        .str.replace(obo_base, "", regex=False)
        .str.replace("_", ":", regex=False)
    )

    # na=False drops rows without a Class ID instead of producing a NaN mask;
    # .copy() makes the later column assignment act on an independent frame.
    is_own_term = df["ontology_id"].str.startswith(prefix, na=False)
    df = df[is_own_term].copy()

    # A pre-existing lowercase "definition" column would collide with the
    # renamed "Definitions" column; tolerate exports that do not have it.
    df = df.drop(columns=["definition"], errors="ignore")
    df = df.rename(
        columns={
            "Preferred Label": "name",
            "Synonyms": "synonyms",
            "Definitions": "definition",
            "Parents": "parents",
        }
    )

    df["parents"] = [to_curies(cell) for cell in df["parents"]]
    df = df[["ontology_id", "name", "definition", "synonyms", "parents"]]
    df = df.sort_values("ontology_id")

    # Names must be unique: keep the last record of each duplicated name.
    df = df.drop_duplicates("name", keep="last")

    return df.set_index("ontology_id")

In [None]:
# Parse the downloaded CLO export, keeping CLO terms and CLO parents only.
df = df_from_csv(csv_filepath="clo.csv.gz", prefix="CLO")

In [None]:
# Show the parsed table.
df

In [None]:
# Spot-check a single record, looked up by its ontology ID.
df.loc["CLO:0007050"]

In [None]:
# Add the frequently used aliases RPE1, RPE-1 and RPE to the synonyms of
# CLO:0004290.
# A single .loc[row, column] indexer is required here: the chained form
# df.loc[row][column] += ... may assign into a temporary copy of the row and
# leave df itself unchanged.
df.loc["CLO:0004290", "synonyms"] += "|RPE1|RPE-1|RPE"

In [None]:
# Show the synonyms now recorded for CLO:0004290.
df.loc["CLO:0004290", "synonyms"]

In [None]:
# Write the final table to a Parquet file.
df.to_parquet("df_all__clo__2022-03-21__CellLine.parquet")