In [1]:
# use python 3
import glob
import yaml
import pandas as pd 
from functools import reduce
from dotenv import load_dotenv
from os import getenv
load_dotenv()

# %%
# folder where endcluster directory is
all_files = glob.glob("curated_endclusts/*")

# %%
# read in each cluster file as dataframe
li = []
for file in all_files:
     li.append(pd.read_csv(file, header=None, sep="\t"))


# %%
# merge into single frame with saved cluster identity
df = pd.concat(li, 
                axis=0,
                # ignore_index=True,
                keys=[f.split("/")[-1].split(".")[0] for f in all_files],
                names=["cluster", "orig_idx"]).reset_index()
df.columns = ['cluster', 'orig_idx', "CellID"]

In [2]:
df.head()

Unnamed: 0,cluster,orig_idx,CellID
0,tier0_clust0tier1_clust4tier2_clust0tier3_clus...,0,p049_T0D_ILE_LPS_3p_AAGACCTTCCCATTTA-1
1,tier0_clust0tier1_clust4tier2_clust0tier3_clus...,1,p049_T0D_ILE_LPS_3p_ACGGGCTTCAAACCAC-1
2,tier0_clust0tier1_clust4tier2_clust0tier3_clus...,2,p049_T0D_ILE_LPS_3p_ACTGAGTAGTTCGATC-1
3,tier0_clust0tier1_clust4tier2_clust0tier3_clus...,3,p049_T0D_ILE_LPS_3p_AGCTTGACAGCCTGTG-1
4,tier0_clust0tier1_clust4tier2_clust0tier3_clus...,4,p049_T0D_ILE_LPS_3p_AGTTGGTTCACTGGGC-1


In [3]:
df.shape

(115569, 3)

In [4]:
# %%
# probably don't need this line (had to do with how I ran tiered clustering)
# cluster column should look like this "T0C0_T1C0" ...

# df.cluster = df.cluster.str[5:]
df.cluster = df.cluster\
    .str.split("tier")\
    .str.join("_T")\
    .str.split("_clust")\
    .str.join("C")\
    .str[1:]

In [5]:
df.head()

Unnamed: 0,cluster,orig_idx,CellID
0,T0C0_T1C4_T2C0_T3C5_T4C3,0,p049_T0D_ILE_LPS_3p_AAGACCTTCCCATTTA-1
1,T0C0_T1C4_T2C0_T3C5_T4C3,1,p049_T0D_ILE_LPS_3p_ACGGGCTTCAAACCAC-1
2,T0C0_T1C4_T2C0_T3C5_T4C3,2,p049_T0D_ILE_LPS_3p_ACTGAGTAGTTCGATC-1
3,T0C0_T1C4_T2C0_T3C5_T4C3,3,p049_T0D_ILE_LPS_3p_AGCTTGACAGCCTGTG-1
4,T0C0_T1C4_T2C0_T3C5_T4C3,4,p049_T0D_ILE_LPS_3p_AGTTGGTTCACTGGGC-1


In [6]:
# %%
# get cluster paths down to iterated level
# exchanges "_" for "."
df["tier0ident"] = df.cluster.str.split("_").str[:1].str.join(".") # not interesting (all the same)
df["tier1ident"] = df.cluster.str.split("_").str[:2].str.join(".") # Major cell types
df["tier2ident"] = df.cluster.str.split("_").str[:3].str.join(".") 
df["tier3ident"] = df.cluster.str.split("_").str[:4].str.join(".")
df["tier4ident"] = df.cluster.str.split("_").str[:5].str.join(".")
df["tierNident"] = df.cluster.str.replace("_", ".")  # endclusters # used for cell subsets

In [7]:
# %%
# lookit! 
df

Unnamed: 0,cluster,orig_idx,CellID,tier0ident,tier1ident,tier2ident,tier3ident,tier4ident,tierNident
0,T0C0_T1C4_T2C0_T3C5_T4C3,0,p049_T0D_ILE_LPS_3p_AAGACCTTCCCATTTA-1,T0C0,T0C0.T1C4,T0C0.T1C4.T2C0,T0C0.T1C4.T2C0.T3C5,T0C0.T1C4.T2C0.T3C5.T4C3,T0C0.T1C4.T2C0.T3C5.T4C3
1,T0C0_T1C4_T2C0_T3C5_T4C3,1,p049_T0D_ILE_LPS_3p_ACGGGCTTCAAACCAC-1,T0C0,T0C0.T1C4,T0C0.T1C4.T2C0,T0C0.T1C4.T2C0.T3C5,T0C0.T1C4.T2C0.T3C5.T4C3,T0C0.T1C4.T2C0.T3C5.T4C3
2,T0C0_T1C4_T2C0_T3C5_T4C3,2,p049_T0D_ILE_LPS_3p_ACTGAGTAGTTCGATC-1,T0C0,T0C0.T1C4,T0C0.T1C4.T2C0,T0C0.T1C4.T2C0.T3C5,T0C0.T1C4.T2C0.T3C5.T4C3,T0C0.T1C4.T2C0.T3C5.T4C3
3,T0C0_T1C4_T2C0_T3C5_T4C3,3,p049_T0D_ILE_LPS_3p_AGCTTGACAGCCTGTG-1,T0C0,T0C0.T1C4,T0C0.T1C4.T2C0,T0C0.T1C4.T2C0.T3C5,T0C0.T1C4.T2C0.T3C5.T4C3,T0C0.T1C4.T2C0.T3C5.T4C3
4,T0C0_T1C4_T2C0_T3C5_T4C3,4,p049_T0D_ILE_LPS_3p_AGTTGGTTCACTGGGC-1,T0C0,T0C0.T1C4,T0C0.T1C4.T2C0,T0C0.T1C4.T2C0.T3C5,T0C0.T1C4.T2C0.T3C5.T4C3,T0C0.T1C4.T2C0.T3C5.T4C3
...,...,...,...,...,...,...,...,...,...
115564,T0C0_T1C3_T2C1_T3C0_T4C3,57,p035_T0D_ILE_LPS_3p_ACGGAGATCTTTCCTC-1,T0C0,T0C0.T1C3,T0C0.T1C3.T2C1,T0C0.T1C3.T2C1.T3C0,T0C0.T1C3.T2C1.T3C0.T4C3,T0C0.T1C3.T2C1.T3C0.T4C3
115565,T0C0_T1C3_T2C1_T3C0_T4C3,58,p043_T0D_ILE_LPS_3p_ACGGAGAAGCAATATG-1,T0C0,T0C0.T1C3,T0C0.T1C3.T2C1,T0C0.T1C3.T2C1.T3C0,T0C0.T1C3.T2C1.T3C0.T4C3,T0C0.T1C3.T2C1.T3C0.T4C3
115566,T0C0_T1C3_T2C1_T3C0_T4C3,59,p043_T0D_ILE_LPS_3p_CAACTAGAGCTAAGAT-1,T0C0,T0C0.T1C3,T0C0.T1C3.T2C1,T0C0.T1C3.T2C1.T3C0,T0C0.T1C3.T2C1.T3C0.T4C3,T0C0.T1C3.T2C1.T3C0.T4C3
115567,T0C0_T1C3_T2C1_T3C0_T4C3,60,p050_T0D_ILE_LPS_3p_CGTGAGCGTCGTCTTC-1,T0C0,T0C0.T1C3,T0C0.T1C3.T2C1,T0C0.T1C3.T2C1.T3C0,T0C0.T1C3.T2C1.T3C0.T4C3,T0C0.T1C3.T2C1.T3C0.T4C3


In [13]:
# %%
# read in yaml file (had to change structer a little) (removes clusters: tag)
with open(r"20200416_allcells_FGID13.yml", "r+") as fl:
    tree = {"T0C0": yaml.load(fl, Loader=yaml.FullLoader)}

In [14]:
df.tierNident.unique()[0]

'T0C0.T1C4.T2C0.T3C5.T4C3'

In [15]:
def get_next_node(d, k):
    if d.get(k, False): # if key is annotated
        if not d.get("isend", False): # if 
            d_next = d.get(k)
            d_next["path"] = ".".join([d.get("path"), d_next.get("name")])
            return d_next
    return d

In [16]:
node = reduce(get_next_node, df.tier2ident.unique()[0].split("."), tree['T0C0'])
node["path"]

'FG.IGA_IGK_Plsma.Plsma_1'

In [17]:
# %%
df["tier0name"] = df.tier0ident.apply(lambda p: reduce(get_next_node, p.split("."), tree["T0C0"])["path"])
df["tier1name"] = df.tier1ident.apply(lambda p: reduce(get_next_node, p.split("."), tree["T0C0"])["path"])
df["tier2name"] = df.tier2ident.apply(lambda p: reduce(get_next_node, p.split("."), tree["T0C0"])["path"])
df["tier3name"] = df.tier3ident.apply(lambda p: reduce(get_next_node, p.split("."), tree["T0C0"])["path"])
df["tier4name"] = df.tier4ident.apply(lambda p: reduce(get_next_node, p.split("."), tree["T0C0"])["path"])
df["tierNname"] = df.tierNident.apply(lambda p: reduce(get_next_node, p.split("."), tree["T0C0"])["path"])

In [18]:
# %%
# QC checks to make sure there are no NA values
df.isna().sum()

cluster       0
orig_idx      0
CellID        0
tier0ident    0
tier1ident    0
tier2ident    0
tier3ident    0
tier4ident    0
tierNident    0
tier0name     0
tier1name     0
tier2name     0
tier3name     0
tier4name     0
tierNname     0
dtype: int64

In [15]:
# checks that the default value "Mm" is not an exact match anywhere except tier0name
df.apply(func=lambda x: x.str.match("^FG$"), axis=1).sum()

cluster            0.0
orig_idx           0.0
CellID             0.0
tier0ident         0.0
tier1ident         0.0
tier2ident         0.0
tier3ident         0.0
tier4ident         0.0
tierNident         0.0
tier0name     115569.0
tier1name          0.0
tier2name          0.0
tier3name          0.0
tier4name          0.0
tierNname          0.0
dtype: float64

In [32]:
df.CellID.duplicated().sum()

0

In [19]:
df.tier1ident.unique()

array(['T0C0.T1C4', 'T0C0.T1C2', 'T0C0.T1C7', 'T0C0.T1C0', 'T0C0.T1C8',
       'T0C0.T1C5', 'T0C0.T1C6', 'T0C0.T1C1', 'T0C0.T1C9', 'T0C0.T1C11',
       'T0C0.T1C10', 'T0C0.T1C3'], dtype=object)

In [20]:
df.loc[df.tier1name == "FG.Bclls", :].tierNname.value_counts()

FG.Bclls.Bclls_noncycling.CD27_memory                                                    10114
FG.Bclls.Bclls_noncycling.FCER2_IGHD_follicular_transitioning                             8140
FG.Bclls.Bclls_cycling.proliferating_DarkZone                                             3544
FG.Bclls.Bclls_cycling.tcll_doublets                                                      1614
FG.Bclls.Bclls_cycling.GerminalCenter_HLA-DQB1_CD40_CD79B.DarkZone_CXCR4_AICDA_Kappa      1108
FG.Bclls.Bclls_cycling.GerminalCenter_HLA-DQB1_CD40_CD79B.GerminalCenter_Ribosomal         935
FG.Bclls.Bclls_cycling.GerminalCenter_HLA-DQB1_CD40_CD79B.LightZone_HLA_CD83               902
FG.Bclls.Bclls_cycling.GerminalCenter_HLA-DQB1_CD40_CD79B.p034_dominated                   620
FG.Bclls.Bclls_cycling.GerminalCenter_HLA-DQB1_CD40_CD79B.DarkZone_CXCR4_AICDA_Lambda      618
FG.Bclls.Bclls_noncycling.p050_dominated_SPILLOVER                                         612
FG.Bclls.Bclls_noncycling.p035_dominated          

In [21]:
df.head()

Unnamed: 0,cluster,orig_idx,CellID,tier0ident,tier1ident,tier2ident,tier3ident,tier4ident,tierNident,tier0name,tier1name,tier2name,tier3name,tier4name,tierNname
0,T0C0_T1C4_T2C0_T3C5_T4C3,0,p049_T0D_ILE_LPS_3p_AAGACCTTCCCATTTA-1,T0C0,T0C0.T1C4,T0C0.T1C4.T2C0,T0C0.T1C4.T2C0.T3C5,T0C0.T1C4.T2C0.T3C5.T4C3,T0C0.T1C4.T2C0.T3C5.T4C3,FG,FG.IGA_IGK_Plsma,FG.IGA_IGK_Plsma.Plsma_1,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique
1,T0C0_T1C4_T2C0_T3C5_T4C3,1,p049_T0D_ILE_LPS_3p_ACGGGCTTCAAACCAC-1,T0C0,T0C0.T1C4,T0C0.T1C4.T2C0,T0C0.T1C4.T2C0.T3C5,T0C0.T1C4.T2C0.T3C5.T4C3,T0C0.T1C4.T2C0.T3C5.T4C3,FG,FG.IGA_IGK_Plsma,FG.IGA_IGK_Plsma.Plsma_1,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique
2,T0C0_T1C4_T2C0_T3C5_T4C3,2,p049_T0D_ILE_LPS_3p_ACTGAGTAGTTCGATC-1,T0C0,T0C0.T1C4,T0C0.T1C4.T2C0,T0C0.T1C4.T2C0.T3C5,T0C0.T1C4.T2C0.T3C5.T4C3,T0C0.T1C4.T2C0.T3C5.T4C3,FG,FG.IGA_IGK_Plsma,FG.IGA_IGK_Plsma.Plsma_1,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique
3,T0C0_T1C4_T2C0_T3C5_T4C3,3,p049_T0D_ILE_LPS_3p_AGCTTGACAGCCTGTG-1,T0C0,T0C0.T1C4,T0C0.T1C4.T2C0,T0C0.T1C4.T2C0.T3C5,T0C0.T1C4.T2C0.T3C5.T4C3,T0C0.T1C4.T2C0.T3C5.T4C3,FG,FG.IGA_IGK_Plsma,FG.IGA_IGK_Plsma.Plsma_1,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique
4,T0C0_T1C4_T2C0_T3C5_T4C3,4,p049_T0D_ILE_LPS_3p_AGTTGGTTCACTGGGC-1,T0C0,T0C0.T1C4,T0C0.T1C4.T2C0,T0C0.T1C4.T2C0.T3C5,T0C0.T1C4.T2C0.T3C5.T4C3,T0C0.T1C4.T2C0.T3C5.T4C3,FG,FG.IGA_IGK_Plsma,FG.IGA_IGK_Plsma.Plsma_1,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique,FG.IGA_IGK_Plsma.Plsma_1.IGKV_unique


In [22]:
# %%
# save to wherever 
df.to_csv(getenv("FG_MAPPED_TSV"), sep="\t", index=False, quoting=0)