# `CellMarker`: cellmarker; 2.0

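This notebook documents the curation process underlying `bionty.CellMarker.df()`: it downloads the CellMarker 2.0 spreadsheet, keeps one row per marker name, folds spelling variants of a marker into a `synonyms` column, and writes one parquet lookup table per species.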

In [1]:
import pandas as pd

In [2]:
# Download location of the CellMarker spreadsheet (Cell_marker_All.xlsx);
# preprocess() reads it with pd.read_excel and filters rows by species.
url = "http://bio-bigdata.hrbmu.edu.cn/CellMarker/CellMarker_download_files/file/Cell_marker_All.xlsx"

In [9]:
def add_to_synonyms(df_meta, key, name, no_meta_names):
    """Record ``name`` as a synonym of row ``key`` and mark it as resolved.

    Args:
        df_meta: Frame indexed by marker name with a string ``synonyms``
            column holding ``|``-separated values; modified in place.
        key: Index label of the row that receives the synonym.
        name: Synonym to append.
        no_meta_names: Set of still-unmatched names; ``name`` is removed
            from it.

    Raises:
        KeyError: If ``key`` is not in the index of ``df_meta`` or ``name``
            is not in ``no_meta_names``.
    """
    existing = df_meta.loc[key, "synonyms"]
    # The first synonym is stored bare; later ones are separated by "|".
    df_meta.loc[key, "synonyms"] = name if existing == "" else f"{existing}|{name}"
    no_meta_names.remove(name)


# Ordered (old, new) substring substitutions used to generate spelling
# variants of names containing Greek letters. Groups are tried in order
# (alpha, beta, gamma) and, within a group, pairs are tried in order; the
# first variant found in the index wins, so the order is significant.
_GREEK_SUBSTITUTIONS = (
    (
        ("alpha", "α"),
        ("α", "alpha"),
        ("-alpha", "α"),
        ("α", "-alpha"),
        ("alpha", "a"),
        ("alpha", "A"),
        ("-alpha", "a"),
        ("-alpha", "A"),
        ("α", "A"),
        ("-α", "A"),
        ("-α", "alpha"),
        ("-α", "a"),
    ),
    (
        ("beta", "β"),
        ("β", "beta"),
        ("-beta", "β"),
        ("β", "-beta"),
        ("beta", "b"),
        ("beta", "B"),
        ("-beta", "b"),
        ("-beta", "B"),
        ("β", "B"),
        ("-β", "B"),
        ("-β", "beta"),
        ("-β", "b"),
    ),
    (
        ("gamma", "γ"),
        ("γ", "gamma"),
        ("-gamma", "γ"),
        ("γ", "-gamma"),
        ("gamma", "r"),
        ("-gamma", "r"),
        ("-γ", "r"),
        ("-γ", "gamma"),
    ),
)


def _candidate_keys(name):
    """Yield, in priority order, the spelling variants of ``name`` to look up."""
    n = name.replace(" ", "")
    # Case variants and the hyphen-free form are tried first.
    yield n.lower()
    yield n.upper()
    yield n.capitalize()
    yield n.replace("-", "")
    for group in _GREEK_SUBSTITUTIONS:
        for old, new in group:
            yield n.replace(old, new)


def map(no_meta_names: set, df_meta: pd.DataFrame):
    """Attach unmatched marker names to annotated markers as synonyms.

    For every name in ``no_meta_names`` a list of spelling variants is
    generated (spaces removed; lower/upper/capitalized case; hyphens removed;
    Greek-letter spellings of alpha, beta and gamma). The first variant that
    is an index label of ``df_meta`` receives the original name as a synonym
    via ``add_to_synonyms``, which also removes the name from
    ``no_meta_names``. Names without any matching variant are left untouched.

    NOTE: this function shadows the builtin ``map`` in the notebook namespace;
    the name is kept because ``preprocess`` calls it.

    Args:
        no_meta_names: Set of names lacking metadata; modified in place.
        df_meta: Frame indexed by marker name with a ``synonyms`` column;
            modified in place.
    """
    # A set gives constant-time lookups instead of scanning the index array
    # once per candidate key.
    known_names = set(df_meta.index)
    # Iterate over a snapshot because matched names are removed from the set.
    for name in list(no_meta_names):
        for key in _candidate_keys(name):
            if key in known_names:
                add_to_synonyms(df_meta, key, name, no_meta_names)
                break

In [10]:
def _first_name_and_synonyms(joined_names, joined_synonyms):
    """Split ``|``-joined names/synonyms into (primary name, joined synonyms)."""
    names = joined_names.split("|")
    existing = [syn for syn in joined_synonyms.split("|") if syn != ""]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output is identical from run to run (a set would not guarantee that).
    synonyms = dict.fromkeys(existing + names[1:])
    return names[0], "|".join(synonyms)


def preprocess(url: str, species: str):
    """Build the marker lookup table for one species.

    Reads the spreadsheet at ``url``, keeps the rows of ``species``, reduces
    them to one row per marker name, folds names without a gene symbol into
    the synonyms of matching annotated names (see ``map``), and merges names
    that share the same gene id / gene symbol / UniProt id into one row whose
    ``name`` is the first such name and whose ``synonyms`` lists the others.

    Shapes and previews of the intermediate frames are printed/displayed.

    Args:
        url: Location of the spreadsheet, passed to ``pd.read_excel``.
        species: Species to keep; compared to the ``species`` column after
            ``str.capitalize()`` (e.g. ``"human"`` selects ``"Human"``).

    Returns:
        DataFrame indexed by ``name`` with columns ``synonyms``,
        ``gene_symbol``, ``ncbi_gene_id`` and ``uniprotkb_id``, sorted by
        ``gene_symbol``. Names that could not be matched have missing values
        in the three identifier columns.
    """
    df_cm = pd.read_excel(url, dtype=str)
    df_cm = df_cm[df_cm["species"] == species.capitalize()].copy()
    print(f"Original shape: {df_cm.shape}")
    display(df_cm.head())
    # Drop tissue, cell type, tech, journal, Genename
    df = df_cm[["marker", "GeneID", "Symbol", "UNIPROTID"]].copy()
    # Keep the first row of every marker name. (A prior de-duplication on
    # marker + GeneID + UNIPROTID would be redundant: this one is stricter
    # and keeps the same first occurrences.)
    df = df.drop_duplicates(subset=["marker"])
    display(df.head())
    df = df.rename(
        columns={
            "marker": "name",
            "GeneID": "ncbi_gene_id",
            "Symbol": "gene_symbol",
            "UNIPROTID": "uniprotkb_id",
        }
    )

    df = df[df["name"].notnull()].copy()
    print(f"Unique shape: {df.shape}")
    display(df.head())

    has_symbol = df["gene_symbol"].notnull()
    df_no_meta = df[~has_symbol]
    df_meta = df[has_symbol].copy()
    df_meta["synonyms"] = ""
    df_meta = df_meta.set_index("name")
    no_meta_names = set(df_no_meta["name"])
    # Exclude two spellings that contain a non-breaking space (\xa0).
    # discard() is applied to each independently, so a missing first entry
    # no longer prevents removal of the second.
    # NOTE(review): presumably malformed variants of PDGFRα — confirm.
    for malformed_name in ("PDGFR\xa0α", "PDGFRα\xa0"):
        no_meta_names.discard(malformed_name)

    map(no_meta_names, df_meta)

    print(f"{len(no_meta_names)} names without metadata")

    # group synonyms
    # NOTE(review): groupby uses dropna=True by default, so rows with a gene
    # symbol but a missing gene id or UniProt id are dropped here — confirm
    # that this is intended.
    df_meta = df_meta.reset_index()
    df_meta = (
        df_meta.groupby(["ncbi_gene_id", "gene_symbol", "uniprotkb_id"])
        .agg("|".join)
        .reset_index()
    )

    # After aggregation "name" and "synonyms" are "|"-joined strings. Compute
    # the split values and assign them back as columns; writing to the rows
    # yielded by iterrows() is not guaranteed to modify the frame.
    split_rows = [
        _first_name_and_synonyms(joined_names, joined_synonyms)
        for joined_names, joined_synonyms in zip(df_meta["name"], df_meta["synonyms"])
    ]
    df_meta["name"] = [primary for primary, _ in split_rows]
    df_meta["synonyms"] = [synonyms for _, synonyms in split_rows]

    no_meta_names_df = pd.DataFrame(
        {
            "ncbi_gene_id": None,
            "gene_symbol": None,
            "uniprotkb_id": None,
            # Sorted so the row order does not depend on set iteration order.
            "name": sorted(no_meta_names),
            "synonyms": "",
        }
    )

    df_cb = pd.concat([df_meta, no_meta_names_df])
    df_cb = df_cb[["name", "synonyms", "gene_symbol", "ncbi_gene_id", "uniprotkb_id"]]
    df_cb = df_cb.sort_values("gene_symbol")
    df_cb = df_cb.set_index("name")

    print(f"After synonyms aggregation: {df_cb.shape}")
    display(df_cb.head())

    return df_cb

## Human

In [5]:
# Curate the human subset; preprocess() prints shapes and shows previews of each stage.
df_human = preprocess(url, species="human")

Original shape: (60877, 20)


Unnamed: 0,species,tissue_class,tissue_type,uberonongology_id,cancer_type,cell_type,cell_name,cellontology_id,marker,Symbol,GeneID,Genetype,Genename,UNIPROTID,technology_seq,marker_source,PMID,Title,journal,year
0,Human,Abdomen,Abdomen,UBERON_0000916,Normal,Normal cell,Macrophage,CL_0000235,MERTK,MERTK,10461,protein_coding,"MER proto-oncogene, tyrosine kinase",Q12866,,Experiment,31982413,Peritoneal Level of CD206 Associates With Mort...,Gastroenterology,2020
1,Human,Abdomen,Abdomen,UBERON_0000916,Normal,Normal cell,Macrophage,CL_0000235,CD16,FCGR3A,2215,protein_coding,Fc fragment of IgG receptor IIIb,O75015,,Experiment,31982413,Peritoneal Level of CD206 Associates With Mort...,Gastroenterology,2020
2,Human,Abdomen,Abdomen,UBERON_0000916,Normal,Normal cell,Macrophage,CL_0000235,CD206,MRC1,4360,protein_coding,mannose receptor C-type 1,P22897,,Experiment,31982413,Peritoneal Level of CD206 Associates With Mort...,Gastroenterology,2020
3,Human,Abdomen,Abdomen,UBERON_0000916,Normal,Normal cell,Macrophage,CL_0000235,CRIg,VSIG4,11326,protein_coding,V-set and immunoglobulin domain containing 4,Q9Y279,,Experiment,31982413,Peritoneal Level of CD206 Associates With Mort...,Gastroenterology,2020
4,Human,Abdomen,Abdomen,UBERON_0000916,Normal,Normal cell,Macrophage,CL_0000235,CD163,CD163,9332,protein_coding,CD163 molecule,Q86VB7,,Experiment,31982413,Peritoneal Level of CD206 Associates With Mort...,Gastroenterology,2020


Unnamed: 0,marker,GeneID,Symbol,UNIPROTID
0,MERTK,10461,MERTK,Q12866
1,CD16,2215,FCGR3A,O75015
2,CD206,4360,MRC1,P22897
3,CRIg,11326,VSIG4,Q9Y279
4,CD163,9332,CD163,Q86VB7


Unique shape: (16679, 4)


Unnamed: 0,name,ncbi_gene_id,gene_symbol,uniprotkb_id
0,MERTK,10461,MERTK,Q12866
1,CD16,2215,FCGR3A,O75015
2,CD206,4360,MRC1,P22897
3,CRIg,11326,VSIG4,Q9Y279
4,CD163,9332,CD163,Q86VB7


1995 names without metadata
After synonyms aggregation: (14180, 4)


Unnamed: 0_level_0,synonyms,gene_symbol,ncbi_gene_id,uniprotkb_id
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A1BG,,A1BG,1,P04217
A2ML1,,A2ML1,144568,A8K2U0
A4GALT,,A4GALT,53947,A0A0S2Z5J1
AADAC,,AADAC,13,P22760
AADACL2,,AADACL2,344752,Q6P093


In [6]:
# Persist the curated human lookup table as a parquet file in the working directory.
df_human.to_parquet("human_cellmarker_2.0_CellMarker_lookup.parquet")

## Mouse

In [11]:
# Curate the mouse subset with the same pipeline used for human.
df_mouse = preprocess(url, species="mouse")

Original shape: (35197, 20)


Unnamed: 0,species,tissue_class,tissue_type,uberonongology_id,cancer_type,cell_type,cell_name,cellontology_id,marker,Symbol,GeneID,Genetype,Genename,UNIPROTID,technology_seq,marker_source,PMID,Title,journal,year
8,Mouse,Abdomen,Muscle,UBERON_0001630,Normal,Normal cell,Fibro-adipogenic progenitor cell,,Wisp1,Ccn4,22402,protein_coding,cellular communication network factor 4,O54775,10x Chromium,Experiment,35439171,An estrogen-sensitive fibroblast population dr...,JCI insight,2022
9,Mouse,Abdomen,Muscle,UBERON_0001630,Normal,Normal cell,Myoblast,CL_0000056,Myod1,Myod1,17927,protein_coding,myogenic differentiation 1,P10085,10x Chromium,Experiment,35439171,An estrogen-sensitive fibroblast population dr...,JCI insight,2022
10,Mouse,Abdomen,Muscle,UBERON_0001630,Normal,Normal cell,Muscle satellite cell,CL_0000514,Myf5,Myf5,17877,protein_coding,myogenic factor 5,A2RSK4,10x Chromium,Experiment,35439171,An estrogen-sensitive fibroblast population dr...,JCI insight,2022
11,Mouse,Abdomen,Muscle,UBERON_0001630,Normal,Normal cell,Myocyte,CL_0000187,Ckm,Ckm,12715,protein_coding,"creatine kinase, muscle",A2RTA0,10x Chromium,Experiment,35439171,An estrogen-sensitive fibroblast population dr...,JCI insight,2022
12,Mouse,Abdomen,Muscle,UBERON_0001630,Normal,Normal cell,Myocyte,CL_0000187,Acta1,Acta1,11459,protein_coding,"actin alpha 1, skeletal muscle",P68134,10x Chromium,Experiment,35439171,An estrogen-sensitive fibroblast population dr...,JCI insight,2022


Unnamed: 0,marker,GeneID,Symbol,UNIPROTID
8,Wisp1,22402,Ccn4,O54775
9,Myod1,17927,Myod1,P10085
10,Myf5,17877,Myf5,A2RSK4
11,Ckm,12715,Ckm,A2RTA0
12,Acta1,11459,Acta1,P68134


Unique shape: (12503, 4)


Unnamed: 0,name,ncbi_gene_id,gene_symbol,uniprotkb_id
8,Wisp1,22402,Ccn4,O54775
9,Myod1,17927,Myod1,P10085
10,Myf5,17877,Myf5,A2RSK4
11,Ckm,12715,Ckm,A2RTA0
12,Acta1,11459,Acta1,P68134


1343 names without metadata
After synonyms aggregation: (10609, 4)


Unnamed: 0_level_0,synonyms,gene_symbol,ncbi_gene_id,uniprotkb_id
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0610010K14Rik,,0610010K14Rik,104457,D3Z687
0610030E20Rik,,0610030E20Rik,68364,Q149G0
0610040J01Rik,,0610040J01Rik,76261,Q99K99
1110017D15Rik,,1110017D15Rik,73721,Q2MH31
1110032A03Rik,,1110032A03Rik,68721,Q9D131


In [12]:
# Persist the curated mouse lookup table as a parquet file in the working directory.
df_mouse.to_parquet("mouse_cellmarker_2.0_CellMarker_lookup.parquet")