In [71]:
import json
from os.path import join
import pandas as pd
import itertools

In [72]:
# Identifier of the image being processed. It decides which channel-column
# suffix is selected below and is used in the name of the term-mapping file
# written at the end of the notebook.
IMAGE_ID = "Repeat_Test-2_Scan1"
# Alternative image; selecting it switches the column suffix to "OME-TIFF".
#IMAGE_ID = "DON218-ND-52yM-T1A"

In [73]:
# Suffix of the "Channel # (...)" column that applies to the selected image.
if IMAGE_ID == "DON218-ND-52yM-T1A":
    COL_SUFFIX = "OME-TIFF"
else:
    COL_SUFFIX = "QPTIFF"

In [74]:
# Load the marker table. The first line of the CSV is skipped, so the second
# line supplies the column names.
markers_all = pd.read_csv(join("data", "Pancreas OMAP Markers.csv"), skiprows=1)
channel_numbers = markers_all[f"Channel # ({COL_SUFFIX})"]
# Keep rows that name a marker, are flagged "Y" in "Any OMAP?", and have a
# channel number for the selected image. A missing channel may be written as
# the "-" placeholder or left blank; blank cells are parsed as NaN, and
# NaN != "-" evaluates to True, so nulls have to be excluded explicitly.
df = markers_all.loc[
    markers_all["Marker name"].notna()
    & (markers_all["Any OMAP?"] == "Y")
    & channel_numbers.notna()
    & (channel_numbers != "-")
]
df.head()

Unnamed: 0,Channel name (OME-TIFF),Channel # (OME-TIFF),Channel name (QPTIFF),Channel # (QPTIFF),Marker name,UniProt ID,OMAP-6?,In-progress OMAP?,Any OMAP?,Cell type(s),CT ID(s),Unnamed: 11,CT/1,CT/1/ID,CT/2,CT/2/ID,CT/3,CT/3/ID
0,CHRA,15,CHGA,28.0,Chromogranin A,P10645,Y,Y,Y,Endocrine,CL:0000169; CL:0000171; CL:0000173; CL:0002275...,,Pancreas exocrine glandular cell,CL:1001599,Pancreatic acinar cell,CL:0002064,,
1,GCG,6,GCG,18.0,Glucagon,P01275,Y,Y,Y,Endocrine (α),CL:0000171,,Pancreatic endocrine cell,CL:0008024,Type B pancreatic cell,CL:0000169,,
2,CPEP,13,C-Pep,22.0,C-peptide,P01308,Y,Y,Y,Endocrine (β),CL:0000169,,Pancreatic endocrine cell,CL:0008024,Pancreatic A cell,CL:0000171,,
3,PPY,20,PPY,20.0,Pancreatic polypeptide,P01298,Y,Y,Y,Endocrine (γ),CL:0002275,,Pancreatic endocrine cell,CL:0008024,Pancreatic D cell,CL:0000173,,
4,SST,29,SST,5.0,Somatostatin,P61278,Y,Y,Y,Endocrine (δ),CL:0000173,,Pancreatic endocrine cell,CL:0008024,Pancreatic PP cell,CL:0002275,,


In [75]:
# Narrow the filtered table to the two columns used to build the feature-set tree.
marker_cols = df.loc[:, ["Marker name", "UniProt ID"]]
marker_cols.head()

Unnamed: 0,Marker name,UniProt ID
0,Chromogranin A,P10645
1,Glucagon,P01275
2,C-peptide,P01308
3,Pancreatic polypeptide,P01298
4,Somatostatin,P61278


In [76]:
# For each non-empty row, generate a feature set containing the row's items.
# Map the items to UniProt identifiers using the `uniprot:` prefix,
# as proposed at https://github.com/vitessce/vitessce/issues/1390

In [77]:
# Skeleton of the feature-sets tree. Its layout follows this schema:
# https://github.com/hubmapconsortium/vitessce/blob/d5f63aa1d08aa61f6b20f6ad6bbfba5fceb6b5ef/src/schemas/cell_sets.schema.json
def init_feature_sets_tree():
    """Return a new, empty feature-sets document.

    The result is a dict whose "datatype" is "feature", whose "version" is
    "0.1.3", and whose "tree" is a fresh empty list, so every call yields an
    object that can be filled in independently of earlier ones.
    """
    skeleton = dict(datatype="feature", version="0.1.3")
    skeleton["tree"] = list()
    return skeleton

In [78]:
# Start from an empty feature-sets document; its "tree" list is filled in below.
tree = init_feature_sets_tree()

In [79]:
# Install the single top-level "Markers" group; its children start out empty.
tree["tree"] = [dict(name="Markers", term=None, children=[])]

In [80]:
# Build one group node per named marker, with one leaf per UniProt ID listed
# for that marker. Each leaf carries the ID as its name and `uniprot:<ID>` as
# its term.
marker_nodes = []
for _, row in marker_cols.iterrows():
    row_name = row["Marker name"]
    if row_name == "-":
        # "-" marks a row without a marker name; nothing to emit for it.
        continue
    raw_ids = row["UniProt ID"]
    # A blank UniProt cell is read as NaN (a float), which has no .split();
    # treat it as "no identifiers" instead of crashing.
    if pd.isnull(raw_ids):
        uniprot_ids = []
    else:
        uniprot_ids = [token.strip() for token in str(raw_ids).split(",")]
    marker_nodes.append({
        "name": row_name,
        "term": None,
        "children": [
            {
                "name": uniprot_id,
                "term": f"uniprot:{uniprot_id}",
                "set": [],
            }
            for uniprot_id in uniprot_ids
            # Drop empty tokens and the "-" placeholder so that no bogus
            # `uniprot:-` term ends up in the tree.
            if uniprot_id and uniprot_id != "-"
        ],
    })
# Assign instead of appending in place, so that re-running this cell does not
# duplicate every marker under the "Markers" group.
tree["tree"][0]["children"] = marker_nodes
    

In [81]:
# Display the assembled feature-set tree for inspection.
tree

{'datatype': 'feature',
 'version': '0.1.3',
 'tree': [{'name': 'Markers',
   'term': None,
   'children': [{'name': 'Chromogranin A',
     'term': None,
     'children': [{'name': 'P10645', 'term': 'uniprot:P10645', 'set': []}]},
    {'name': 'Glucagon',
     'term': None,
     'children': [{'name': 'P01275', 'term': 'uniprot:P01275', 'set': []}]},
    {'name': 'C-peptide',
     'term': None,
     'children': [{'name': 'P01308', 'term': 'uniprot:P01308', 'set': []}]},
    {'name': 'Pancreatic polypeptide',
     'term': None,
     'children': [{'name': 'P01298', 'term': 'uniprot:P01298', 'set': []}]},
    {'name': 'Somatostatin',
     'term': None,
     'children': [{'name': 'P61278', 'term': 'uniprot:P61278', 'set': []}]},
    {'name': 'Platelet endothelial cell adhesion molecule (PECAM-1)',
     'term': None,
     'children': [{'name': 'P16284', 'term': 'uniprot:P16284', 'set': []}]},
    {'name': 'α-Smooth muscle actin (ACTA2)',
     'term': None,
     'children': [{'name': 'P62736'

In [82]:
# Serialize the feature-set tree to feature_sets.json inside the "data" directory.
with open(join("data", "feature_sets.json"), "w") as out_file:
    out_file.write(json.dumps(tree))

In [83]:
# Keep only the UniProt IDs and the cell-type IDs; these two columns drive the
# term mapping built below.
os_fs_df = df.loc[:, ["UniProt ID", "CT ID(s)"]]
os_fs_df.head()

Unnamed: 0,UniProt ID,CT ID(s)
0,P10645,CL:0000169; CL:0000171; CL:0000173; CL:0002275...
1,P01275,CL:0000171
2,P01308,CL:0000169
3,P01298,CL:0002275
4,P61278,CL:0000173


In [84]:
# Then, create a mapping between cell sets and feature sets,
# as proposed at https://github.com/vitessce/vitessce/issues/1517

In [85]:
# Pair every UniProt ID on a row with every cell-type ID on the same row.
# Each pair becomes {"featureTerm": "uniprot:<UniProt ID>", "obsTerm": "<cell-type ID>"}.
os_fs_term_id_mapping = []
for fs_row, os_row in zip(os_fs_df["UniProt ID"].tolist(), os_fs_df["CT ID(s)"].tolist()):
    # Blank cells are read as NaN (a float) and cannot be split, so both
    # columns are null-checked before tokenizing.
    if pd.isnull(fs_row) or pd.isnull(os_row):
        continue
    feature_ids = [token.strip() for token in str(fs_row).split(",")]
    obs_ids = [token.strip() for token in str(os_row).split(";")]
    os_fs_term_id_mapping.extend(
        {"featureTerm": f"uniprot:{feature_id}", "obsTerm": obs_id}
        for feature_id, obs_id in itertools.product(feature_ids, obs_ids)
        # "-" is the placeholder for "no UniProt ID"; empty tokens come from
        # stray separators. Neither yields a usable term.
        if feature_id not in ("", "-") and obs_id != ""
    )

In [86]:
# Display the feature-term / obs-term pairs for inspection.
os_fs_term_id_mapping

[{'featureTerm': 'uniprot:P10645', 'obsTerm': 'CL:0000169'},
 {'featureTerm': 'uniprot:P10645', 'obsTerm': 'CL:0000171'},
 {'featureTerm': 'uniprot:P10645', 'obsTerm': 'CL:0000173'},
 {'featureTerm': 'uniprot:P10645', 'obsTerm': 'CL:0002275'},
 {'featureTerm': 'uniprot:P10645', 'obsTerm': 'CL:0005019'},
 {'featureTerm': 'uniprot:P01275', 'obsTerm': 'CL:0000171'},
 {'featureTerm': 'uniprot:P01308', 'obsTerm': 'CL:0000169'},
 {'featureTerm': 'uniprot:P01298', 'obsTerm': 'CL:0002275'},
 {'featureTerm': 'uniprot:P61278', 'obsTerm': 'CL:0000173'},
 {'featureTerm': 'uniprot:P16284', 'obsTerm': 'CL:0000071'},
 {'featureTerm': 'uniprot:P16284', 'obsTerm': 'CL:0002144'},
 {'featureTerm': 'uniprot:P62736', 'obsTerm': 'CL:0000669'},
 {'featureTerm': 'uniprot:P62736', 'obsTerm': 'CL:0002410'},
 {'featureTerm': 'uniprot:P62736', 'obsTerm': 'CL:0000359'},
 {'featureTerm': 'uniprot:P08670', 'obsTerm': 'CL:0008019'},
 {'featureTerm': 'uniprot:P08727', 'obsTerm': 'CL:0002079'},
 {'featureTerm': 'unipro

In [87]:
# Serialize the term mapping to "<IMAGE_ID>.term_mapping.json" inside the "data" directory.
with open(join("data", f"{IMAGE_ID}.term_mapping.json"), "w") as out_file:
    out_file.write(json.dumps(os_fs_term_id_mapping))