# `CellMarker`: cellmarker; 2.0

This notebook documents the curation process behind `bionty.CellMarker.df()`.

In [None]:
import pandas as pd

In [None]:
# Source spreadsheet with all CellMarker entries; every column is loaded as a
# string (dtype=str).
url = (
    "http://bio-bigdata.hrbmu.edu.cn/CellMarker/CellMarker_download_files/file/"
    "Cell_marker_All.xlsx"
)
df_cm = pd.read_excel(url, dtype=str)

In [None]:
import unicodedata

# Map every Greek character to the last word of its Unicode name
# (e.g. "GREEK SMALL LETTER ALPHA" -> "ALPHA"); characters whose Unicode name
# does not start with "GREEK " are skipped.
greek_alphabet = "ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω"
greek_to_letter = {
    symbol: unicodedata.name(symbol).split(" ")[-1]
    for symbol in greek_alphabet
    if unicodedata.name(symbol).startswith("GREEK ")
}


def greek2letter(string: str):
    """α -> ALPHA.

    Replace every Greek character that is a key of ``greek_to_letter`` with
    its spelled-out, upper-case letter name; all other characters are kept.
    """
    # The replacement names are plain ASCII, so a single translate pass gives
    # the same result as replacing one symbol after another.
    spelled_out = str.maketrans(greek_to_letter)
    return string.translate(spelled_out)


def greek2latin(string: str):
    """α -> a.

    Transliterate each Greek character to the single Latin letter at the same
    position in the paired alphabets below; other characters are unchanged.
    """
    # Position i of the first string maps to position i of the second one.
    greek_chars = "ΑαΒβΓγΔδΕεΖζΗηΘθΙιΚκΛλΜμΝνΞξΟοΠπΡρΣσςΤτΥυΦφΧχΨψΩω"
    latin_chars = "AaBbGgDdEeZzHhJjIiKkLlMmNnXxOoPpRrSssTtUuFfQqYyWw"
    table = str.maketrans(greek_chars, latin_chars)
    transliterated = string.translate(table)
    return transliterated


# Spelled-out Greek letter name -> Latin letter (e.g. "ALPHA" -> "a").
# Several symbols share one name (capital / small forms); the symbol that
# comes last in greek_to_letter determines the stored Latin letter.
letter_to_latin = {
    letter_name: greek2latin(symbol)
    for symbol, letter_name in greek_to_letter.items()
}


def letter2latin(string: str):
    """ALPHA -> a.

    Replace spelled-out Greek letter names found in ``string`` with the Latin
    letter stored in ``letter_to_latin``. Each name is matched in its
    upper-case, lower-case and capitalized spelling.

    Args:
        string: Text that may contain spelled-out Greek letter names.

    Returns:
        The text with every matched letter name replaced.
    """
    # Longer names must be handled first: "ETA" is a substring of "THETA", so
    # replacing "ETA" before "THETA" would turn "THETA" into "THh" and the
    # "THETA" entry could never match. sorted() is stable, so names of equal
    # length keep their order from letter_to_latin.
    for k in sorted(letter_to_latin, key=len, reverse=True):
        v = letter_to_latin[k]
        string = string.replace(k, v).replace(k.lower(), v).replace(k.capitalize(), v)
    return string


def remove_case_insensitive_dup(myList: list):
    """Drop entries that differ from an earlier entry only by letter case.

    The first spelling encountered is kept and the relative order of the
    surviving entries follows the iteration order of ``myList``.
    """
    # dicts keep insertion order, so the values come out in first-seen order
    first_spelling = {}
    for entry in myList:
        first_spelling.setdefault(entry.lower(), entry)
    return list(first_spelling.values())

In [None]:
def preprocess(df_cm: pd.DataFrame, species: str):
    """Curate the marker table of one species into one row per marker.

    Rows of ``df_cm`` whose ``species`` column equals ``species.capitalize()``
    are kept, reduced to the marker / gene / UniProt columns, de-duplicated on
    the marker name and then grouped by a normalized key (upper-cased shortest
    spelling variant, see ``_expand_name``). Each group becomes one output row.

    Args:
        df_cm: Table with at least the columns ``species``, ``marker``,
            ``GeneID``, ``Symbol`` and ``UNIPROTID``.
        species: Species to keep; compared after ``str.capitalize()``.

    Returns:
        DataFrame indexed by ``name`` with the columns ``synonyms``
        (``"|"``-joined string), ``gene_symbol``, ``ncbi_gene_id`` and
        ``uniprotkb_id``, sorted by ``gene_symbol``.
    """
    df_cm = df_cm[df_cm["species"] == species.capitalize()].copy()
    print(f"Original shape: {df_cm.shape}")
    display(df_cm.head())

    # Drop tissue, cell type, tech, journal, Genename
    df = df_cm[["marker", "GeneID", "Symbol", "UNIPROTID"]].copy()
    # Remove duplications
    df = df.drop_duplicates(subset=["marker", "GeneID", "UNIPROTID"])
    df = df.drop_duplicates(subset=["marker"])
    df.rename(
        columns={
            "marker": "name",
            "GeneID": "ncbi_gene_id",
            "Symbol": "gene_symbol",
            "UNIPROTID": "uniprotkb_id",
        },
        inplace=True,
    )
    df = df[df["name"].notnull()].copy()
    print(f"Unique shape: {df.shape}")
    display(df.head())

    # Aggregate on a Latin name without spaces and hyphens.
    df.rename(columns={"name": "orig_name"}, inplace=True)
    # Build the new columns explicitly instead of assigning into the rows
    # yielded by DataFrame.iterrows(): those rows may be copies, in which case
    # the assignments are silently lost and every marker ends up with agg "".
    expanded = [_expand_name(orig_name) for orig_name in df["orig_name"]]
    df["synonyms"] = pd.Series(
        [synonyms for _, synonyms in expanded], index=df.index, dtype=object
    )
    df["agg"] = pd.Series(
        [agg_key for agg_key, _ in expanded], index=df.index, dtype=object
    )

    # aggregate all synonyms
    df_group = df.groupby("agg").agg(
        {
            "ncbi_gene_id": "first",
            "gene_symbol": "first",
            "uniprotkb_id": "first",
            "synonyms": "sum",  # concatenates the per-row synonym lists
            "orig_name": list,
        }
    )

    # remove the synonyms that are only case different from name
    # use the shortest non-greek original name as the new name
    picked = [
        _pick_name_and_synonyms(orig_names, synonyms)
        for orig_names, synonyms in zip(df_group["orig_name"], df_group["synonyms"])
    ]
    df_group["name"] = [name for name, _ in picked]
    df_group["synonyms"] = [joined for _, joined in picked]

    df_group.reset_index(drop=True, inplace=True)
    df_group = df_group[
        ["name", "synonyms", "gene_symbol", "ncbi_gene_id", "uniprotkb_id"]
    ]
    df_group = df_group.sort_values("gene_symbol")
    df_group = df_group.set_index("name")

    print(f"Final shape: {df_group.shape}")
    display(df_group.head())

    return df_group


def _shortest_first(text: str):
    """Sort key: shorter strings first, equal lengths ordered by the text itself."""
    # The second component makes min() independent of set iteration order.
    return len(text), text


def _expand_name(name: str):
    """Return ``(agg_key, synonyms)`` for one original marker name."""
    # spelling variants with " ", "-" and "‐" removed or swapped
    variants = {name, name.replace(" ", "").replace("-", "").replace("‐", "")}
    if "-" in name:
        variants.update([name.replace("-", " "), name.replace("-", "")])
    if " " in name:
        variants.update([name.replace(" ", "-"), name.replace(" ", "")])

    # convert greek symbols to latin and letters
    latinized = {greek2latin(variant) for variant in variants}
    greeks = (
        {greek2letter(variant) for variant in variants}
        | latinized
        | {letter2latin(variant) for variant in variants}
    )
    all_forms = variants | greeks

    if greek2latin(name) != name:
        # if contains greek symbols, the key must be a latinized variant
        candidates = latinized
    else:
        # otherwise use the shortest of all forms
        candidates = all_forms

    agg_key = min(candidates, key=_shortest_first).upper()
    return agg_key, sorted(all_forms)


def _pick_name_and_synonyms(orig_names, synonyms):
    """Return ``(name, "|"-joined synonyms)`` for one aggregated group."""
    remaining = set(orig_names)
    name = min(remaining, key=_shortest_first)
    if greek2latin(name) != name:
        # a name containing greek symbols is stored in its latinized form;
        # the original greek spelling stays among the synonyms
        name = greek2latin(name)
    else:
        remaining.remove(name)

    pool = remaining.union(synonyms)
    pool = {synonym for synonym in pool if synonym.lower() != name.lower()}
    # sorted() fixes which spelling survives the case-insensitive dedup and
    # the order of the joined string; iterating the set directly would make
    # both depend on string hash randomization.
    unique = remove_case_insensitive_dup(sorted(pool))
    return name, "|".join(unique)

## Human

In [None]:
# Run the curation on the rows whose species column is "Human".
df_human = preprocess(df_cm, species="human")

In [None]:
# confirm that no synonym is attached to multiple entries:
# one row per (name, synonym) pair, empty synonyms dropped; the last
# expression lists every synonym that occurs more than once.
exp = pd.DataFrame(df_human["synonyms"].str.split("|")).explode("synonyms")
has_text = exp["synonyms"].apply(len) > 0
exp = exp[has_text]
exp[exp["synonyms"].duplicated(keep=False)]

In [None]:
# Write the curated human table to a parquet file (relative path).
df_human.to_parquet("df_human__cellmarker__2.0__CellMarker.parquet")

## Mouse

In [None]:
# Run the curation on the rows whose species column is "Mouse".
df_mouse = preprocess(df_cm, species="mouse")

In [None]:
# Write the curated mouse table to a parquet file (relative path).
df_mouse.to_parquet("df_mouse__cellmarker__2.0__CellMarker.parquet")