In [1]:
# use python 3
import glob
import yaml
import pandas as pd 
from functools import reduce
from dotenv import load_dotenv
from os import getenv
load_dotenv()

# %%
# folder where endcluster directory is
all_files = glob.glob("endclusts/*")

# %%
# read in each cluster file as dataframe
li = []
for file in all_files:
     li.append(pd.read_csv(file, header=None, sep="\t"))


# %%
# merge into single frame with saved cluster identity
df = pd.concat(li, 
                axis=0,
                # ignore_index=True,
                keys=[f.split("/")[-1].split(".")[0] for f in all_files],
                names=["cluster", "orig_idx"]).reset_index()
df.columns = ['cluster', 'orig_idx', "CellID"]

In [2]:
df.head()

Unnamed: 0,cluster,orig_idx,CellID
0,tier0_clust0tier1_clust2tier2_clust4tier3_clust5,0,p014_T0D_ILE_LPS_3p_AGTTGGTTCATAACCG-1
1,tier0_clust0tier1_clust2tier2_clust4tier3_clust5,1,p014_T0D_ILE_LPS_3p_ATTATCCCAGACAGGT-1
2,tier0_clust0tier1_clust2tier2_clust4tier3_clust5,2,p014_T0D_ILE_LPS_3p_CCTAAAGCAGACGCAA-1
3,tier0_clust0tier1_clust2tier2_clust4tier3_clust5,3,p014_T0D_ILE_LPS_3p_CGGGTCAGTTGCTCCT-1
4,tier0_clust0tier1_clust2tier2_clust4tier3_clust5,4,p014_T0D_ILE_LPS_3p_CTCGGAGTCTCTTATG-1


In [3]:
# %%
# probably don't need this line (had to do with how I ran tiered clustering)
# cluster column should look like this "T0C0_T1C0" ...

# df.cluster = df.cluster.str[5:]
df.cluster = df.cluster\
    .str.split("tier")\
    .str.join("_T")\
    .str.split("_clust")\
    .str.join("C")\
    .str[1:]

In [4]:
df.head()

Unnamed: 0,cluster,orig_idx,CellID
0,T0C0_T1C2_T2C4_T3C5,0,p014_T0D_ILE_LPS_3p_AGTTGGTTCATAACCG-1
1,T0C0_T1C2_T2C4_T3C5,1,p014_T0D_ILE_LPS_3p_ATTATCCCAGACAGGT-1
2,T0C0_T1C2_T2C4_T3C5,2,p014_T0D_ILE_LPS_3p_CCTAAAGCAGACGCAA-1
3,T0C0_T1C2_T2C4_T3C5,3,p014_T0D_ILE_LPS_3p_CGGGTCAGTTGCTCCT-1
4,T0C0_T1C2_T2C4_T3C5,4,p014_T0D_ILE_LPS_3p_CTCGGAGTCTCTTATG-1


In [5]:
# %%
# get cluster paths down to iterated level
# exchanges "_" for "."
df["tier0ident"] = df.cluster.str.split("_").str[:1].str.join(".") # not interesting (all the same)
df["tier1ident"] = df.cluster.str.split("_").str[:2].str.join(".") # Major cell types
df["tier2ident"] = df.cluster.str.split("_").str[:3].str.join(".") 
df["tier3ident"] = df.cluster.str.split("_").str[:4].str.join(".")
df["tier4ident"] = df.cluster.str.split("_").str[:5].str.join(".")
df["tierNident"] = df.cluster.str.replace("_", ".")  # endclusters # used for cell subsets

In [6]:
# %%
# lookit! 
df

Unnamed: 0,cluster,orig_idx,CellID,tier0ident,tier1ident,tier2ident,tier3ident,tier4ident,tierNident
0,T0C0_T1C2_T2C4_T3C5,0,p014_T0D_ILE_LPS_3p_AGTTGGTTCATAACCG-1,T0C0,T0C0.T1C2,T0C0.T1C2.T2C4,T0C0.T1C2.T2C4.T3C5,T0C0.T1C2.T2C4.T3C5,T0C0.T1C2.T2C4.T3C5
1,T0C0_T1C2_T2C4_T3C5,1,p014_T0D_ILE_LPS_3p_ATTATCCCAGACAGGT-1,T0C0,T0C0.T1C2,T0C0.T1C2.T2C4,T0C0.T1C2.T2C4.T3C5,T0C0.T1C2.T2C4.T3C5,T0C0.T1C2.T2C4.T3C5
2,T0C0_T1C2_T2C4_T3C5,2,p014_T0D_ILE_LPS_3p_CCTAAAGCAGACGCAA-1,T0C0,T0C0.T1C2,T0C0.T1C2.T2C4,T0C0.T1C2.T2C4.T3C5,T0C0.T1C2.T2C4.T3C5,T0C0.T1C2.T2C4.T3C5
3,T0C0_T1C2_T2C4_T3C5,3,p014_T0D_ILE_LPS_3p_CGGGTCAGTTGCTCCT-1,T0C0,T0C0.T1C2,T0C0.T1C2.T2C4,T0C0.T1C2.T2C4.T3C5,T0C0.T1C2.T2C4.T3C5,T0C0.T1C2.T2C4.T3C5
4,T0C0_T1C2_T2C4_T3C5,4,p014_T0D_ILE_LPS_3p_CTCGGAGTCTCTTATG-1,T0C0,T0C0.T1C2,T0C0.T1C2.T2C4,T0C0.T1C2.T2C4.T3C5,T0C0.T1C2.T2C4.T3C5,T0C0.T1C2.T2C4.T3C5
...,...,...,...,...,...,...,...,...,...
139337,T0C0_T1C1_T2C12,926,p048_T0D_ILE_LPS_3p_TGCTACCCAAGCGCTC-1,T0C0,T0C0.T1C1,T0C0.T1C1.T2C12,T0C0.T1C1.T2C12,T0C0.T1C1.T2C12,T0C0.T1C1.T2C12
139338,T0C0_T1C1_T2C12,927,p048_T0D_ILE_LPS_3p_TGGACGCGTCTCCATC-1,T0C0,T0C0.T1C1,T0C0.T1C1.T2C12,T0C0.T1C1.T2C12,T0C0.T1C1.T2C12,T0C0.T1C1.T2C12
139339,T0C0_T1C1_T2C12,928,p048_T0D_ILE_LPS_3p_TGGCCAGAGGTAAACT-1,T0C0,T0C0.T1C1,T0C0.T1C1.T2C12,T0C0.T1C1.T2C12,T0C0.T1C1.T2C12,T0C0.T1C1.T2C12
139340,T0C0_T1C1_T2C12,929,p048_T0D_ILE_LPS_3p_TGGGCGTTCTGAGGGA-1,T0C0,T0C0.T1C1,T0C0.T1C1.T2C12,T0C0.T1C1.T2C12,T0C0.T1C1.T2C12,T0C0.T1C1.T2C12


In [7]:
# %%
# read in yaml file (had to change structer a little) (removes clusters: tag)
with open(r"20200509_allcells_CD14.yml", "r+") as fl:
    tree = {"T0C0": yaml.load(fl, Loader=yaml.FullLoader)}

In [8]:
# %%
# function 
def get_attr_path(tree, keypath, attr, species_prefix="CD"):
    """ from nested dictionary returns attr at each level as single string
    """
    if len(keypath) == 1:
        return species_prefix
    try:
        name = reduce(dict.__getitem__, keypath, tree)[attr]
        return ".".join([get_attr_path(tree, keypath[:-1], attr), name])
    except KeyError: 
		# if key error assumes we have gone too far in the tree 
		# i.e. into patient effects that I didn't annotate.
		# returns from tier up until it succeeds
        return ".".join([get_attr_path(tree, keypath[:-1], attr)])

In [9]:
# %%
df["tier0name"] = df.tier0ident.apply(lambda p: get_attr_path(tree, p.split("."), "name"))
df["tier1name"] = df.tier1ident.apply(lambda p: get_attr_path(tree, p.split("."), "name"))
df["tier2name"] = df.tier2ident.apply(lambda p: get_attr_path(tree, p.split("."), "name"))
df["tier3name"] = df.tier3ident.apply(lambda p: get_attr_path(tree, p.split("."), "name"))
df["tier4name"] = df.tier4ident.apply(lambda p: get_attr_path(tree, p.split("."), "name"))
df["tierNname"] = df.tierNident.apply(lambda p: get_attr_path(tree, p.split("."), "name"))

In [10]:
# %%
# QC checks to make sure there are no NA values
df.isna().sum()

cluster       0
orig_idx      0
CellID        0
tier0ident    0
tier1ident    0
tier2ident    0
tier3ident    0
tier4ident    0
tierNident    0
tier0name     0
tier1name     0
tier2name     0
tier3name     0
tier4name     0
tierNname     0
dtype: int64

In [14]:
# checks that the default value "Mm" is not an exact match anywhere except tier0name
df.apply(func=lambda x: x.str.match("^CD$"), axis=1).sum()

cluster            0.0
orig_idx           0.0
CellID             0.0
tier0ident         0.0
tier1ident         0.0
tier2ident         0.0
tier3ident         0.0
tier4ident         0.0
tierNident         0.0
tier0name     139342.0
tier1name          0.0
tier2name          0.0
tier3name          0.0
tier4name          0.0
tierNname          0.0
dtype: float64

In [11]:
df.loc[df.tier1name == "CD.Epith_Paneth", ].head()

Unnamed: 0,cluster,orig_idx,CellID,tier0ident,tier1ident,tier2ident,tier3ident,tier4ident,tierNident,tier0name,tier1name,tier2name,tier3name,tier4name,tierNname
14361,T0C0_T1C11,0,p011_T0D_ILE_LPS_3p_ACGAGGAGTTAAGGGC-1,T0C0,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,CD,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth
14362,T0C0_T1C11,1,p011_T0D_ILE_LPS_3p_AGGGATGAGGCCGAAT-1,T0C0,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,CD,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth
14363,T0C0_T1C11,2,p011_T0D_ILE_LPS_3p_ATTGGTGAGTCCGGTC-1,T0C0,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,CD,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth
14364,T0C0_T1C11,3,p011_T0D_ILE_LPS_3p_ATTTCTGCAAAGCAAT-1,T0C0,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,CD,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth
14365,T0C0_T1C11,4,p011_T0D_ILE_LPS_3p_CACACTCCAGATGGCA-1,T0C0,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,T0C0.T1C11,CD,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth,CD.Epith_Paneth


In [12]:
# %%
# save to wherever 
df.to_csv(getenv("CD_MAPPED_TSV"), sep="\t", index=False, quoting=0)