In [1]:
import json

In [18]:
# Load the annotation document into `cas`; the cells below modify it in place
# and the final cell writes it back out under a new file name.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) because
# open() otherwise falls back to the platform locale, which can mis-decode
# non-ASCII text such as accented author names on some systems.
with open("./data/HMBA_BG_consensus_annotation.json", "r", encoding="utf-8") as json_file:
    cas = json.load(json_file)

Resolve DOIs

In [19]:
# Citation label, exactly as it appears in an annotation's free-text
# "rationale" field, mapped to the DOI URL of the cited work.
# Several keys are alternate spellings that resolve to the same DOI.
# A value of None marks a rationale that has no publication to cite.
reference_doi_dict = {
    "Garma et al. 2024": "https://doi.org/10.1038/s41467-024-50414-w",
    "Agarwal et al. 2020": "https://doi.org/10.1038/s41467-020-17876-0",
    "Kamath et al. 2022": "https://doi.org/10.1038/s41593-022-01061-1",
    "He et al. 2021": "https://doi.org/10.1016/j.cub.2021.10.015",
    "Krienen et al. 2020": "https://doi.org/10.1038/s41586-020-2874-8",
    "Gokce et al. 2016": "https://doi.org/10.1016/j.celrep.2016.06.059",
    "Stanley et al. 2020": "https://doi.org/10.1016/j.neuron.2019.11.004",
    "Saunders et al. 2018": "https://doi.org/10.1016/j.cell.2018.07.028",
    "Märtin et al. 2019": "https://doi.org/10.1016/j.celrep.2019.11.096",
    "Muñoz-Manchado et al. 2018": "https://doi.org/10.1016/j.celrep.2018.07.053",
    "Chen et al. 2021": "https://doi.org/10.1038/s41593-021-00938-x",
    "Jäkel et al. 2019": "https://doi.org/10.1038/s41586-019-0903-2",
    "Siletti et al. 2023": "https://doi.org/10.1126/science.add7046",
    "Marques et al. 2016": "https://doi.org/10.1126/science.aaf6463",
    "Wallace et al. 2017": "https://doi.org/10.1016/j.neuron.2017.03.017",
    "Yao et al. 2023": "https://doi.org/10.1038/s41586-023-06812-z",
    "Miyamoto and Fukuda et al. 2022": "https://doi.org/10.1523/ENEURO.0208-22.2022",
    "Hodge et al. 2019": "https://doi.org/10.1038/s41586-019-1506-7",
    "Lin et al. 2023": "https://doi.org/10.1073/pnas.2216231120",
    "Tran et al 2021": "https://doi.org/10.1016/j.neuron.2021.09.001",
    "Phillips et al 2023": "https://doi.org/10.1016/j.mcn.2023.103849",
    "Tushar et al 2022": "https://doi.org/10.1038/s41593-022-01061-1",
    "Zachary et al 2024": "https://doi.org/10.1101/2024.06.06.597807",
    "Partanen et al. 2022": "https://doi.org/10.3389/fnins.2022.976209",
    "Zachary et al. 2024 bioRxiv": "https://doi.org/10.1101/2024.06.06.597807",
    "Tushar et al. 2022 NNS": "https://doi.org/10.1038/s41593-022-01061-1",
    "Novel": None
}

# Replace each citation-style rationale with a structured DOI list.
for annotation in cas["annotations"]:
    rationale = annotation.get("rationale")
    if not rationale:
        # Nothing to resolve (missing, None or empty rationale).
        continue

    if rationale not in reference_doi_dict:
        # Same exception type as a plain dict lookup, but it names the
        # offending label and cell set so the table above can be extended.
        raise KeyError(
            "No DOI registered for rationale {!r} (cell set {!r}); "
            "add it to reference_doi_dict".format(
                rationale, annotation.get("cell_set_accession")
            )
        )

    doi = reference_doi_dict[rationale]
    if doi is None:
        # No publication to cite (e.g. "Novel"): keep the free-text rationale
        # and do not emit a DOI list, which would otherwise be [None] and
        # serialise as [null].
        continue

    annotation["rationale_dois"] = [doi]
    # The citation now lives in rationale_dois, so the free text is cleared.
    annotation["rationale"] = None


Get all Cell Ontology (CL) term labels

In [20]:
import rdflib
from rdflib.namespace import RDFS

# Fetch and parse the Cell Ontology base file (RDF/XML) from the repository's
# master branch. This needs network access each time the cell runs.
url = "https://github.com/obophenotype/cell-ontology/raw/refs/heads/master/cl-base.owl"
CL_IRI_PREFIX = "http://purl.obolibrary.org/obo/CL_"

g = rdflib.Graph()
g.parse(url, format="xml")

# CURIE ("CL:0000000") -> rdfs:label for every labelled subject whose IRI is
# in the CL namespace. When a subject has several labels, the last one
# yielded by the graph wins.
cl_terms = {
    str(term_iri).replace(CL_IRI_PREFIX, "CL:"): str(label)
    for term_iri, _, label in g.triples((None, RDFS.label, None))
    if str(term_iri).startswith(CL_IRI_PREFIX)
}

# Quick sanity check: show the first ten entries.
for curie, label in list(cl_terms.items())[:10]:
    print(curie, label)

CL:4030044 has_not_completed
CL:4030045 lacks_part
CL:4030046 lacks_plasma_membrane_part
CL:0000000 cell
CL:0000001 primary cultured cell
CL:0000002 obsolete immortal cell line cell
CL:0000003 obsolete native cell
CL:0000004 obsolete cell by organism
CL:0000005 neural crest derived fibroblast
CL:0000006 neuronal receptor cell


Add Cell Ontology (CL) term IDs and labels

In [21]:
# Index every annotation by its accession so parents can be looked up directly.
accession_index = {
    annotation["cell_set_accession"]: annotation
    for annotation in cas["annotations"]
}

# Author field -> number of parent links to follow from the annotation that
# carries the field to the annotation that receives the CL term.
# NOTE(review): this assumes the fields sit on group-level annotations whose
# ancestors are subclass -> class -> neighborhood, in that order - confirm.
CL_FIELD_LEVELS_UP = {
    "CL:ID_group": 0,
    "CL:ID_subclass": 1,
    "CL:ID_class": 2,
    "CL:ID_neighborhood": 3,
}


def _ancestor_annotation(annotation, levels_up):
    """Return the annotation reached by following `levels_up` parent links.

    Args:
        annotation: annotation dict to start from.
        levels_up: number of "parent_cell_set_accession" hops (0 returns
            `annotation` itself).

    Raises:
        KeyError: if a hop points to an accession that is missing from
            `accession_index` (including an absent parent field).
    """
    target = annotation
    for _ in range(levels_up):
        parent_accession = target.get("parent_cell_set_accession")
        if parent_accession not in accession_index:
            raise KeyError(
                "Cell set {!r} has no resolvable parent (parent_cell_set_accession={!r})".format(
                    target.get("cell_set_accession"), parent_accession
                )
            )
        target = accession_index[parent_accession]
    return target


for annotation in cas["annotations"]:
    author_fields = annotation.get("author_annotation_fields")
    if not author_fields:
        continue

    for field_name, levels_up in CL_FIELD_LEVELS_UP.items():
        raw_value = author_fields.get(field_name)
        if not isinstance(raw_value, str):
            # Missing, None or non-text value: nothing to transfer.
            continue

        cl_id = raw_value.strip()
        if not cl_id.startswith("CL:"):
            # Not a CL identifier; leave the author field as it is.
            continue

        if cl_id not in cl_terms:
            # Same exception type as a plain dict lookup, with enough context
            # to find the offending entry.
            raise KeyError(
                "{} = {!r} on cell set {!r} is not a labelled Cell Ontology term".format(
                    field_name, cl_id, annotation.get("cell_set_accession")
                )
            )

        target = _ancestor_annotation(annotation, levels_up)
        target["cell_ontology_term"] = cl_terms[cl_id]
        target["cell_ontology_term_id"] = cl_id
        # The value has been promoted to the structured fields, so it is
        # removed from the author fields.
        del author_fields[field_name]

In [22]:
# Write the updated annotations to a second file, separate from the one that
# was loaded, as 4-space-indented JSON.
with open("./data/HMBA_BG_consensus_annotation_2.json", "w") as out_file:
    out_file.write(json.dumps(cas, indent=4))