In [1]:
import pandas as pd
import numpy as np
from functools import lru_cache
from numba import njit, prange
import itertools
from scipy import stats
import owlready2





## getting all possible types of values

In [2]:
import cellxgene_census
import cellxgene_census.experimental.ml as census_ml
import tiledbsoma as soma

census = cellxgene_census.open_soma(census_version = "latest")
experiment = census["census_data"]["homo_sapiens"]
datasets = census["census_info"]["datasets"]

In [6]:
batch = ['self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'assay_ontology_term_id', 'dataset_id']
features = ['tissue_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id']

In [7]:
df = experiment.obs.read(column_names=batch+features, value_filter="is_primary_data == True").concat().to_pandas()

In [8]:
df.shape

(37659244, 9)

In [10]:
# see all unique values for each df columns:
for column in df.columns:
    print(f"Unique values for {column}:")
    print(df[column].unique()[:20])
    print("\n")

Unique values for self_reported_ethnicity_ontology_term_id:
['unknown' 'HANCESTRO:0005' 'HANCESTRO:0439' 'HANCESTRO:0008'
 'HANCESTRO:0014' 'HANCESTRO:0013' 'HANCESTRO:0568' 'HANCESTRO:0590'
 'HANCESTRO:0010' 'HANCESTRO:0588' 'HANCESTRO:0306' 'HANCESTRO:0462'
 'HANCESTRO:0574' 'HANCESTRO:0463' 'HANCESTRO:0025' 'HANCESTRO:0496'
 'HANCESTRO:0487' 'HANCESTRO:0019' 'HANCESTRO:0022' 'HANCESTRO:0383']


Unique values for sex_ontology_term_id:
['PATO:0000383' 'PATO:0000384' 'unknown']


Unique values for assay_ontology_term_id:
['EFO:0009900' 'EFO:0009922' 'EFO:0009899' 'EFO:0700016' 'EFO:0030004'
 'EFO:0011025' 'EFO:0008722' 'EFO:0030003' 'EFO:0008919' 'EFO:0009901'
 'EFO:0008796' 'EFO:0008931' 'EFO:0700003' 'EFO:0700011' 'EFO:0008780'
 'EFO:0030002' 'EFO:0008953' 'EFO:0010010' 'EFO:0700004' 'EFO:0010550']


Unique values for dataset_id:
['f7ec7bd5-04ab-453b-a8a7-c9d14812affb'
 'da75ce6d-a395-4abd-962b-267aadb99666'
 'bcdec5fa-a7fa-4806-92bc-0cd02f40242f'
 '33911db3-f461-464b-8083-a397ab616a

In [164]:
from collections import Counter

In [165]:
# less than N cel per dataset?
list_of_datasets = {k: val for k, val in Counter(df['dataset_id']).items() if val >1000}


Counter({'f7c1c579-2dc0-47e2-ba19-8165c5a0e353': 4062980,
         '9f222629-9e39-47d0-b83f-e08d610c7479': 1959503,
         '9dbab10c-118d-496b-966a-67f1763a6b7d': 1462702,
         '6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3': 1309414,
         '218acb0f-9f2f-4f76-b90b-15a4b7c7f629': 1263676,
         '3faad104-2ab8-4434-816d-474d8d2641db': 1248980,
         'c2876b1b-06d8-4d96-a56b-5304f815b99a': 1149751,
         '56c4912d-2bae-4b64-98f2-af8a84389208': 1092789,
         'b0e547f0-462b-4f81-b31b-5b0a5d96f537': 1058909,
         '1e6a6ef9-7ec9-4c90-bbfb-2ad3c3165fd1': 996759,
         'ebc2e1ff-c8f9-466a-acf4-9d291afaf8b3': 836148,
         '842c6f5d-4a94-4eef-8510-8c792d1124bc': 714331,
         '1a38e762-2465-418f-b81c-6a4bce261c34': 700391,
         '65badd7a-9262-4fd1-9ce2-eb5dc0ca8039': 665955,
         'c7775e88-49bf-4ba2-a03b-93f00447c958': 647366,
         '01ad3cd7-3929-4654-84c0-6db05bd5fd59': 600929,
         '2adb1f8a-a6b1-4909-8ee8-484814e2d4bf': 598266,
         'd4e69e01-3ba

In [166]:
df['group'] = df[features+batch[:-1]].astype(str).agg('_'.join, axis=1)

In [None]:
for k, val in counts.items():
    print()

In [168]:
# compute weightings
# weight on large dev stage status, cell type tissue, disease, assay
counts = Counter(df['group'])

In [None]:
# compute the set of datasets to load

## cell line & tissue grouping

In [57]:
CL_BASIC_PERMANENT_URL_OWL = "https://github.com/obophenotype/cell-ontology/releases/latest/download/cl-basic.owl"
UB_BASIC_PERMANENT_URL_OWL = "https://github.com/obophenotype/uberon/releases/latest/download/uberon-basic.owl"
AC_BASIC_PERMANENT_URL_OWL = "https://raw.githubusercontent.com/EBISPOT/hancestro/main/hancestro-base.owl"
AS_BASIC_PERMANENT_URL_OWL = "https://github.com/obophenotype/uberon/releases/latest/download/uberon-basic.owl"
DV_BASIC_PERMANENT_URL_OWL = "http://purl.obolibrary.org/obo/hsapdv.owl"
DI_BASIC_PERMANENT_URL_OWL = 'https://raw.githubusercontent.com/EBISPOT/efo/master/efo-base.owl'


In [155]:
development_iri = "HsapDv_0000206"
val = _ancestors_dv(development_iri)


KeyboardInterrupt



In [103]:
ontology_dn = owlready2.get_ontology("http://purl.obolibrary.org/obo/hsapdv.owl")
ontology_dn.load()

get_ontology("http://purl.obolibrary.org/obo/hsapdv.owl#")

In [144]:

ontology_cl = owlready2.get_ontology(CL_BASIC_PERMANENT_URL_OWL)
ontology_cl.load()
ontology_ub = owlready2.get_ontology(UB_BASIC_PERMANENT_URL_OWL)
ontology_ub.load()
ontology_ac = owlready2.get_ontology(AC_BASIC_PERMANENT_URL_OWL)
ontology_ac.load()
ontology_di = owlready2.get_ontology()
ontology_di.load()

TypeError: World.get_ontology() missing 1 required positional argument: 'base_iri'

In [152]:
_ancestors_ti("UBERON:0002078")

['Thing',
 'UBERON:0000467',
 'UBERON:0000465',
 'UBERON:0002078',
 'EFO:0000001',
 'EFO:0000635',
 'UBERON:0004535',
 'UBERON:0035554',
 'UBERON:0000064',
 'UBERON:0001062',
 'BFO:0000004',
 'UBERON:0000061',
 'EFO:0001955',
 'EFO:0000786',
 'UBERON:0004120',
 'UBERON:0015212',
 'BFO:0000040',
 'UBERON:0010000',
 'BFO:0000001',
 'BFO:0000002',
 'UBERON:0004151',
 'UBERON:0002081']

In [153]:
@lru_cache(maxsize=None)
def _ancestors_dv(development):
    global ontology_dn
    ancestors = set()
    development_iri = development.replace(":", "_")
    entity = ontology_dn.search_one(iri=f"http://purl.obolibrary.org/obo/{development_iri}")
    for val in entity.ancestors(include_constructs = True, include_self = False):
        try:
            ancestors.add(val.name)
        except AttributeError:
            ancestors.add(val.value.name)
            ancestors |= _ancestors_dv(val.value.name)
    print(len(ancestors))
    return ancestors

In [148]:
@lru_cache(maxsize=None)
def _ancestors_di(disease):
    global ontology_di
    disease_iri = disease.replace(":", "_")
    entity = ontology_di.search_one(iri=f"http://purl.obolibrary.org/obo/{disease_iri}")
    ancestors = (
        [i.name.replace("_", ":") for i in entity.ancestors()]
        if entity
        else [disease]
    )
    return ancestors

@lru_cache(maxsize=None)
def _ancestors_dv(development):
    global ontology_dv
    development_iri = development.replace(":", "_")
    entity = ontology_dv.search_one(iri=f"http://purl.obolibrary.org/obo/{development_iri}")
    ancestors = (
        [i.name.replace("_", ":") for i in entity.ancestors()]
        if entity
        else [development]
    )
    return ancestors

@lru_cache(maxsize=None)
def _ancestors_ac(ancestry):
    global ontology_ac
    ancestry_iri = ancestry.replace(":", "_")
    entity = ontology_ac.search_one(iri=f"http://purl.obolibrary.org/obo/{ancestry_iri}")
    ancestors = (
        [i.name.replace("_", ":") for i in entity.ancestors()]
        if entity
        else [ancestry]
    )
    return ancestors

In [None]:
@lru_cache(maxsize=None)
def _ancestors_cl(cell_type):
    global ontology_cl
    cell_type_iri = cell_type.replace(":", "_")
    entity = ontology_cl.search_one(iri=f"http://purl.obolibrary.org/obo/{cell_type_iri}")
    ancestors = (
        [i.name.replace("_", ":") for i in entity.ancestors()]
        if entity
        else [cell_type]
    )
    return ancestors

In [151]:
@lru_cache(maxsize=None)
def _ancestors_ti(tissue):
    global ontology_ti
    tissue_iri = tissue.replace(":", "_")
    entity = ontology_ub.search_one(iri=f"http://purl.obolibrary.org/obo/{tissue_iri}")
    ancestors = (
        [i.name.replace("_", ":") for i in entity.ancestors()]
        if entity
        else [tissue]
    )
    return ancestors

In [8]:
import json
import urllib.request

url = "https://raw.githubusercontent.com/chanzuckerberg/single-cell-data-portal/main/frontend/src/components/common/Filter/descendant_mappings/cell_type_descendants.json"

In [17]:
# get all CxG cell types


def get_ancestry_mapping(url, type="cell_type"):
    response = urllib.request.urlopen(url)
    data = json.loads(response.read())
    tot = set()
    ntot = set()
    for i, j in data.items():
        tot |= set(j)
        ntot |= set([i])
    # for each cell type, get all its ancestors
    ancestors = {}
    for val in tot | ntot:
        if type=="cell_type":
            ancestors[val] = set(_ancestors(val)) - set([val, 'Thing'])
        elif type=="tissue":
            ancestors[val] = set(_ancestors_ti(val)) - set([val, 'Thing'])
        elif type=="ancestry":
            ancestors[val] = set(_ancestors_ac(val)) - set([val, 'Thing'])
        else:
            raise ValueError("type must be 'cell_type' or 'tissue'")
    full_ancestors = set()
    for val in ancestors.values():
        full_ancestors |= set(val)

    # remove the things that are not in CxG
    full_ancestors = full_ancestors & set(ancestors.keys())

    # if a cell type is not an ancestor then it is a leaf
    leafs = tot - full_ancestors
    full_ancestors = full_ancestors - leafs
    # for each ancestor, make a dict of groupings of leafs that predict it
    groupings = {}
    for val in full_ancestors:
        groupings[val] = set()
    for leaf in leafs:
        for ancestor in ancestors[leaf]:
            if ancestor in full_ancestors:
                groupings[ancestor].add(leaf)

    return groupings, full_ancestors, leafs



In [19]:
df.columns

Index(['self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id',
       'assay_ontology_term_id', 'dataset_id', 'tissue_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data'],
      dtype='object')

In [63]:
tot

{'HsapDv:0000002',
 'HsapDv:0000015',
 'HsapDv:0000021',
 'HsapDv:0000023',
 'HsapDv:0000024',
 'HsapDv:0000025',
 'HsapDv:0000026',
 'HsapDv:0000027',
 'HsapDv:0000028',
 'HsapDv:0000029',
 'HsapDv:0000030',
 'HsapDv:0000046',
 'HsapDv:0000047',
 'HsapDv:0000048',
 'HsapDv:0000049',
 'HsapDv:0000050',
 'HsapDv:0000051',
 'HsapDv:0000052',
 'HsapDv:0000053',
 'HsapDv:0000054',
 'HsapDv:0000055',
 'HsapDv:0000056',
 'HsapDv:0000057',
 'HsapDv:0000058',
 'HsapDv:0000059',
 'HsapDv:0000060',
 'HsapDv:0000062',
 'HsapDv:0000063',
 'HsapDv:0000068',
 'HsapDv:0000080',
 'HsapDv:0000081',
 'HsapDv:0000082',
 'HsapDv:0000083',
 'HsapDv:0000084',
 'HsapDv:0000085',
 'HsapDv:0000086',
 'HsapDv:0000087',
 'HsapDv:0000088',
 'HsapDv:0000089',
 'HsapDv:0000090',
 'HsapDv:0000091',
 'HsapDv:0000092',
 'HsapDv:0000093',
 'HsapDv:0000094',
 'HsapDv:0000095',
 'HsapDv:0000096',
 'HsapDv:0000097',
 'HsapDv:0000098',
 'HsapDv:0000099',
 'HsapDv:0000100',
 'HsapDv:0000101',
 'HsapDv:0000102',
 'HsapDv:000

In [61]:
ancestors = {}
tot = set(df['development_stage_ontology_term_id'].unique())
for val in tot:
    ancestors[val] = set(_ancestors_dv(val)) - set([val, 'Thing'])
full_ancestors = set()
for val in ancestors.values():
    full_ancestors |= set(val)

# remove the things that are not in CxG
full_ancestors = full_ancestors & set(ancestors.keys())

# if a cell type is not an ancestor then it is a leaf
leafs = tot - full_ancestors
full_ancestors = full_ancestors - leafs
# for each ancestor, make a dict of groupings of leafs that predict it
groupings = {}
for val in full_ancestors:
    groupings[val] = set()
for leaf in leafs:
    for ancestor in ancestors[leaf]:
        if ancestor in full_ancestors:
            groupings[ancestor].add(leaf)

In [62]:
len(leafs), len(full_ancestors), len(groupings)

(172, 0, 0)

In [60]:
groupings, full_ancestors, leafs = get_ancestry_mapping(url, type="cell_type")

In [61]:
len(leafs), len(full_ancestors), len(groupings)

(387, 260, 260)

In [10]:
url = "https://raw.githubusercontent.com/chanzuckerberg/single-cell-data-portal/main/frontend/src/components/common/Filter/descendant_mappings/tissue_descendants.json"

In [11]:
groupings, full_ancestors, leafs = get_ancestry_mapping(data, type="tissue")

In [55]:
len(leafs), len(full_ancestors), len(groupings)

(250, 29, 29)

In [12]:
groupings

{'UBERON:0003528': {'UBERON:0002023',
  'UBERON:0002435',
  'UBERON:0002810',
  'UBERON:0002811',
  'UBERON:0006514',
  'UBERON:0007628',
  'UBERON:0010225',
  'UBERON:0016530',
  'UBERON:0016538',
  'UBERON:0016540'},
 'UBERON:0002081': {'UBERON:0002078', 'UBERON:0002079'},
 'UBERON:0001870': {'UBERON:0002810', 'UBERON:0002811'},
 'UBERON:0001637': {'UBERON:0001621', 'UBERON:0001624', 'UBERON:0005616'},
 'UBERON:0002385': {'UBERON:0001134', 'UBERON:0004648'},
 'UBERON:0003968': {'UBERON:0001542'},
 'UBERON:0000014': {'UBERON:0001416',
  'UBERON:0001471',
  'UBERON:0001511',
  'UBERON:0001868',
  'UBERON:8300000'},
 'UBERON:0002190': {'UBERON:0014455'},
 'UBERON:0001871': {'UBERON:0002808', 'UBERON:0002809'},
 'UBERON:0002021': {'UBERON:0002807'},
 'UBERON:0002378': {'UBERON:0002382'},
 'UBERON:0002082': {'UBERON:0002080', 'UBERON:0002084'},
 'UBERON:0001630': {'UBERON:0001103',
  'UBERON:0001388',
  'UBERON:0002382',
  'UBERON:0008612'},
 'UBERON:0001085': {'UBERON:0001416', 'UBERON:0

In [58]:
leafs

{'UBERON:0000002',
 'UBERON:0000004',
 'UBERON:0000006',
 'UBERON:0000016',
 'UBERON:0000017',
 'UBERON:0000053',
 'UBERON:0000056',
 'UBERON:0000057',
 'UBERON:0000059',
 'UBERON:0000074',
 'UBERON:0000080',
 'UBERON:0000088 (organoid)',
 'UBERON:0000160',
 'UBERON:0000310 (organoid)',
 'UBERON:0000328',
 'UBERON:0000362',
 'UBERON:0000400',
 'UBERON:0000411',
 'UBERON:0000416',
 'UBERON:0000451',
 'UBERON:0000453',
 'UBERON:0000473',
 'UBERON:0000926',
 'UBERON:0000945',
 'UBERON:0000947',
 'UBERON:0000948',
 'UBERON:0000955',
 'UBERON:0000956',
 'UBERON:0000964',
 'UBERON:0000965',
 'UBERON:0000966',
 'UBERON:0000966 (organoid)',
 'UBERON:0000970',
 'UBERON:0000977',
 'UBERON:0000988',
 'UBERON:0000995',
 'UBERON:0000995 (organoid)',
 'UBERON:0001003',
 'UBERON:0001005',
 'UBERON:0001043',
 'UBERON:0001046',
 'UBERON:0001052',
 'UBERON:0001103',
 'UBERON:0001117',
 'UBERON:0001134',
 'UBERON:0001153',
 'UBERON:0001154',
 'UBERON:0001155',
 'UBERON:0001156',
 'UBERON:0001157',
 'UBER

In [None]:
main_groups = {
    "adipose tissue": "",
    "bladder organ": "",
    "blood": "",
    "bone marrow": "",
    "brain": "",
    "breast": "",
    "esophagus": "",
    "eye": "",
    "embryo": "",
    "fallopian tube": "",
    "gall bladder": "",
    "heart": "",
    "intestine": "",
    "kidney": "",
    "liver": "",
    "lung": "",
    "lymph node": "",
    "musculature of body": "",
    "nose": "",
    "ovary": "",
    "pancreas": "",
    "placenta": "",
    "skin of body": "",
    "spinal cord": "",
    "spleen": "",
    "stomach": "",
    "thymus": "",
    "thyroid gland": "",
    "tongue": "",
    "uterus": "",
}

In [57]:
groupings

{'UBERON:0001902': {'UBERON:0000400', 'UBERON:0008345'},
 'UBERON:0001872': {'UBERON:0002802', 'UBERON:0002803'},
 'UBERON:0002378': {'UBERON:0002382'},
 'UBERON:0003661': {'UBERON:0001388'},
 'UBERON:0001013': {'UBERON:0001348',
  'UBERON:0003428',
  'UBERON:0005406',
  'UBERON:0014454',
  'UBERON:0014455',
  'UBERON:0015143'},
 'UBERON:0000178': {'UBERON:0012168', 'UBERON:0013756'},
 'UBERON:0000991': {'UBERON:0000473', 'UBERON:0002118', 'UBERON:0002119'},
 'UBERON:0002420': {'UBERON:0002023'},
 'UBERON:0000014': {'UBERON:0001416',
  'UBERON:0001471',
  'UBERON:0001511',
  'UBERON:0001868',
  'UBERON:8300000'},
 'UBERON:0002021': {'UBERON:0002807'},
 'UBERON:0007644': {'UBERON:0039167'},
 'UBERON:0000397': {'UBERON:0005636'},
 'UBERON:0001085': {'UBERON:0001416', 'UBERON:0001868'},
 'UBERON:0001870': {'UBERON:0002810', 'UBERON:0002811'},
 'UBERON:0002082': {'UBERON:0002080', 'UBERON:0002084'},
 'UBERON:0000992': {'UBERON:0002118', 'UBERON:0002119'},
 'UBERON:0002190': {'UBERON:001445

In [56]:
groupings['UBERON:0001723']

KeyError: 'UBERON:0001723'

In [40]:
response = urllib.request.urlopen(url)
data = json.loads(response.read())