In [1]:
import matplotlib.pyplot as plt
import snapatac2 as snap
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import silhouette_score
import glob
import warnings
import os
warnings.filterwarnings("ignore")


In [14]:
# Summary of the L3 sub-clustering round, one row per L2 cell type
# (columns in the displayed head: celltype.L2, best_resolution, n_clusters, n_cells).
l3_summary_csv = "/data2st1/junyi/output/atac0627/iterative/l3/l3_summary.csv"
df_l3_summary = pd.read_csv(l3_summary_csv, index_col=0)

In [15]:
# Preview the first five rows of the L3 summary table.
df_l3_summary.head(n=5)

Unnamed: 0,celltype.L2,best_resolution,n_clusters,n_cells
0,AMY Strip2_Penk GABA,0.3,2,1129
1,Microglia-3,0.3,2,1956
2,AMY Lhx6_Maf GABA,0.6,3,1872
3,AMY Hgf_Satb2 Glut,0.4,2,2104
4,AMY Zbtb7c_Vwa5b1 Glut,0.3,2,5122


In [18]:
# Summary of the L4 sub-clustering round, one row per L3 cell type
# (columns in the displayed head: celltype.L3, best_resolution, n_clusters, n_cells).
l4_summary_csv = "/data2st1/junyi/output/atac0627/iterative/l4/l4_summary.csv"
df_l4_summary = pd.read_csv(l4_summary_csv, index_col=0)

In [19]:
# Preview the first five rows of the L4 summary table.
df_l4_summary.head(n=5)

Unnamed: 0,celltype.L3,best_resolution,n_clusters,n_cells
0,AMY Foxp2_Penk GABA-0,0.4,2,1161
1,AMY Foxp2_Penk GABA-1,0.7,3,635
2,PFC Lamp5 GABA-0,0.6,2,618
3,PFC Lamp5 GABA-1,0.5,2,451
4,AMY Zfhx4_Gfra1 GABA-0,0.6,3,2457


In [20]:
# Paths of the per-cell L3 annotation CSVs (every file in l3/ whose name ends in "l3.csv").
# NOTE: despite the `df_` prefix this is a list of path strings, not a DataFrame.
# glob.glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the row order of the concatenated table (and of the CSVs written from it) reproducible.
df_l3metas = sorted(glob.glob("/data2st1/junyi/output/atac0627/iterative/l3/*l3.csv"))

In [21]:
# Paths of the per-cell L4 annotation CSVs (every file in l4/ whose name ends in "l4.csv").
# NOTE: despite the `df_` prefix this is a list of path strings, not a DataFrame.
# glob.glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the row order of the concatenated table (and of the CSVs written from it) reproducible.
df_l4metas = sorted(glob.glob("/data2st1/junyi/output/atac0627/iterative/l4/*l4.csv"))

In [22]:
# Assign the four column labels positionally.
# NOTE(review): the head() displayed earlier already shows exactly these names,
# so this looks like a no-op kept for safety; confirm before removing.
l4_summary_columns = ['celltype.L3', 'best_resolution', 'n_clusters', 'n_cells']
df_l4_summary.columns = l4_summary_columns

In [23]:
# Average number of cells per cluster for each L3 type (n_cells / n_clusters).
df_l4_summary['cellpclust'] = df_l4_summary['n_cells'].div(df_l4_summary['n_clusters'])

In [25]:
# Load every per-cell L3 annotation table and stack them with a single concat.
# Growing a frame with pd.concat inside a loop re-copies all accumulated rows on
# every iteration (quadratic) and seeds the result with an empty frame.
l3_frames = [pd.read_csv(path, index_col=0) for path in df_l3metas]
# pd.concat raises ValueError on an empty list; keep the previous behaviour of
# yielding an empty DataFrame when no files were found.
df_l3 = pd.concat(l3_frames, axis=0) if l3_frames else pd.DataFrame()

In [26]:
# Load every per-cell L4 annotation table and stack them with a single concat.
# Growing a frame with pd.concat inside a loop re-copies all accumulated rows on
# every iteration (quadratic) and seeds the result with an empty frame.
l4_frames = [pd.read_csv(path, index_col=0) for path in df_l4metas]
# pd.concat raises ValueError on an empty list; keep the previous behaviour of
# yielding an empty DataFrame when no files were found.
df_l4 = pd.concat(l4_frames, axis=0) if l4_frames else pd.DataFrame()

In [27]:
# Preview the first five rows of the stacked L4 annotations (index = cell barcode).
df_l4.head(n=5)

Unnamed: 0,celltype.L4
MC37A_AMY:AAACGAATCGAGAACG-1,AMY Meis1_Ebf1 Glut-0-2
MC37A_AMY:AAAGATGTCCATCGAA-1,AMY Meis1_Ebf1 Glut-0-2
MC37A_AMY:AAAGGATAGCCTTTGA-1,AMY Meis1_Ebf1 Glut-0-0
MC37A_AMY:AAAGGGCGTAAGCCTT-1,AMY Meis1_Ebf1 Glut-0-1
MC37A_AMY:AACAAAGCAACGTCGC-1,AMY Meis1_Ebf1 Glut-0-0


In [28]:
# Attach the L4 label to every L3-annotated cell, matching on the barcode index.
# how='left' keeps cells that have no L4 label; their celltype.L4 is NaN.
# validate= makes pandas raise MergeError if a barcode is repeated on either side.
# Without it, a duplicated barcode would silently multiply rows and inflate the
# per-cluster cell counts derived from this table.
df_merge = df_l3.merge(
    df_l4,
    left_index=True,
    right_index=True,
    how='left',
    validate='one_to_one',
)

In [31]:
# Number of cells in every (L3, L4) label pair, flattened to a table with a 'count' column.
# groupby drops NaN keys by default, so cells without an L4 label are not counted here.
l3_l4_sizes = df_merge.groupby(['celltype.L3', 'celltype.L4']).size()
grouped = l3_l4_sizes.reset_index(name='count')

In [32]:
# Keep only the L4 clusters that contain fewer than 200 cells.
is_small_cluster = grouped['count'].lt(200)
filtered = grouped.loc[is_small_cluster]


In [33]:
# Persist the table of small (<200 cells) L4 clusters, without the integer index.
small_clusters_csv = "/data2st1/junyi/output/atac0627/iterative/l3l4_SMALL.csv"
filtered.to_csv(small_clusters_csv, index=False)

In [34]:
# For each L3 type, the number of distinct values in every other column
# (at this point that is the number of distinct L4 labels).
cells_by_l3 = df_merge.groupby(['celltype.L3'])
df_ncluster = cells_by_l3.nunique()

In [36]:
# For each L3 type, how many of its L4 clusters appear in the small-cluster table.
# .count() tallies non-null entries per column, so each remaining column
# (celltype.L4 and count) gets its own tally; in the output they are identical.
small_by_l3 = filtered.groupby(['celltype.L3'])
df_l200_cluster = small_by_l3.count()
df_l200_cluster


Unnamed: 0_level_0,celltype.L4,count
celltype.L3,Unnamed: 1_level_1,Unnamed: 2_level_1
AMY Ccdc3_Acvr1c Glut-1,1,1
AMY Foxp2_Penk GABA-1,1,1
AMY Lhx6_Maf GABA-0,2,2
AMY Lhx6_Maf GABA-2,1,1
AMY Lhx6_Ostm1 GABA-0,1,1
AMY Lhx6_Ostm1 GABA-1,1,1
AMY Meis1_Ebf1 Glut-0,1,1
AMY Meis1_Ebf1 Glut-1,1,1
AMY Rspo2_Tfap2d Glut-0,1,1
AMY Rspo2_Tfap2d Glut-1,1,1


In [37]:
# Put the small-cluster tally next to the total number of L4 clusters of the same
# L3 type; both tables are indexed by celltype.L3.
df_l200_summary = df_l200_cluster.merge(
    right=df_ncluster,
    how='left',
    left_index=True,
    right_index=True,
)

In [38]:
# Relabel the three merged columns positionally.
# NOTE(review): the first two columns both come from the .count() of the
# small-cluster table and hold the same numbers in the displayed result, so the
# 'n_clus<=200' label does not describe a separate "<= 200" tally; confirm intent.
l200_summary_labels = ['n_clus<200', 'n_clus<=200', 'n_clus']
df_l200_summary.columns = l200_summary_labels

In [39]:
# Show, per L3 type, the number of small (<200 cells) L4 clusters next to the
# total number of L4 clusters (n_clus).
df_l200_summary

Unnamed: 0_level_0,n_clus<200,n_clus<=200,n_clus
celltype.L3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AMY Ccdc3_Acvr1c Glut-1,1,1,2
AMY Foxp2_Penk GABA-1,1,1,3
AMY Lhx6_Maf GABA-0,2,2,3
AMY Lhx6_Maf GABA-2,1,1,2
AMY Lhx6_Ostm1 GABA-0,1,1,2
AMY Lhx6_Ostm1 GABA-1,1,1,2
AMY Meis1_Ebf1 Glut-0,1,1,4
AMY Meis1_Ebf1 Glut-1,1,1,2
AMY Rspo2_Tfap2d Glut-0,1,1,3
AMY Rspo2_Tfap2d Glut-1,1,1,2


In [None]:
# Hippo Mossy cell 0 is problematic

In [40]:
# Hand-curated merge map for L4 labels: key = L4 label to dissolve, value = the
# L4 label that absorbs it. Every pair stays within the same L3 parent (the two
# labels differ only in the final "-N" suffix). 30 entries are active.
refine_map = {
    # AMY
    'AMY Ccdc3_Acvr1c Glut-1-1': 'AMY Ccdc3_Acvr1c Glut-1-0',
    'AMY Foxp2_Penk GABA-1-2': 'AMY Foxp2_Penk GABA-1-1',
    'AMY Lhx6_Maf GABA-0-2': 'AMY Lhx6_Maf GABA-0-1',
    'AMY Lhx6_Maf GABA-2-1': 'AMY Lhx6_Maf GABA-2-0',
    'AMY Lhx6_Ostm1 GABA-1-1': 'AMY Lhx6_Ostm1 GABA-1-0',
    'AMY Meis1_Ebf1 Glut-0-3': 'AMY Meis1_Ebf1 Glut-0-2',
    'AMY Meis1_Ebf1 Glut-1-1': 'AMY Meis1_Ebf1 Glut-1-0',
    'AMY Rspo2_Tfap2d Glut-0-2': 'AMY Rspo2_Tfap2d Glut-0-0',
    'AMY Rspo2_Tfap2d Glut-1-1': 'AMY Rspo2_Tfap2d Glut-1-0',
    'AMY Sox6_Ephb1 GABA-0-2': 'AMY Sox6_Ephb1 GABA-0-1',
    'AMY Trh_Met Glut-1-1': 'AMY Trh_Met Glut-1-0',
    'AMY Vgll3_Pde11a Glut-0-2': 'AMY Vgll3_Pde11a Glut-0-0',
    'AMY Zfhx4_Gfra1 GABA-0-2': 'AMY Zfhx4_Gfra1 GABA-0-1',
    # Astrocyte
    'Astrocyte-4-0-2': 'Astrocyte-4-0-0',
    # HPF
    'HPF CA2 Glut-0-3': 'HPF CA2 Glut-0-2',
    'HPF CA2 Glut-0-1': 'HPF CA2 Glut-0-0',
    'HPF CA2 Glut-1-1': 'HPF CA2 Glut-1-0',
    'HPF CA3 Glut-1-1': 'HPF CA3 Glut-1-0',
    'HPF DG GC Glut-1-3': 'HPF DG GC Glut-1-1',
    'HPF DG GC Glut-1-2': 'HPF DG GC Glut-1-0',
    # Microglia / MOL / OPC
    'Microglia-1-0-2': 'Microglia-1-0-0',
    'Microglia-3-1-1': 'Microglia-3-1-0',
    'MOL-2-2-2': 'MOL-2-2-1',
    'MOL-4-0-1': 'MOL-4-0-0',
    'MOL-4-1-2': 'MOL-4-1-1',
    'OPC-1-1-2': 'OPC-1-1-0',
    # PFC
    'PFC L5 NP Glut-1-2': 'PFC L5 NP Glut-1-1',
    'PFC Lamp5 GABA-1-1': 'PFC Lamp5 GABA-1-0',
    'PFC Pvalb GABA-1-2': 'PFC Pvalb GABA-1-1',
    'PFC Vip GABA-0-2': 'PFC Vip GABA-0-1',
    # Disabled entries, kept for reference only. NOTE(review): these use label
    # names that do not occur among the active entries (presumably from an
    # earlier annotation round); confirm before deleting.
    # 'AMY Dach1 Glut-1-2': 'AMY Dach1 Glut-1-1',
    # 'AMY Dach1 Glut-2-1': 'AMY Dach1 Glut-2-0',
    # 'PFC L2/3 IT-1-1': 'PFC L2/3 IT-1-0',
    # 'AMY Sim1 Glut-0-1':'AMY Sim1 Glut-0-0',
    # 'AMY Sim1 Glut-0-2':'AMY Sim1 Glut-0-0',
    # 'AMY Slc17a6 Glut-1-4':'AMY Slc17a6 Glut-1-0',
    # 'AMY Slc17a6 Glut-1-5':'AMY Slc17a6 Glut-1-2',
    # 'AMY Sox6 GABA-1-2':'AMY Sox6 GABA-1-1',
    # 'AMY Zfp536 GABA-0-1':'AMY Zfp536 GABA-0-0',
    # 'Hippo CA1-v pyramidal-0-1':'Hippo CA1-v pyramidal-0-0',
    # 'Hippo CA1-v pyramidal-1-1':'Hippo CA1-v pyramidal-1-0',
    # 'Hippo CA3-d pyramidal-1-1':'Hippo CA3-d pyramidal-1-0',
    # 'Hippo DG GC-1-3':'Hippo DG GC-1-0',
    # 'Hippo DG GC-1-4':'Hippo DG GC-1-0',
    # 'Hippo Lamp5 GABA-0-1':'Hippo Lamp5 GABA-0-0',
    # 'Hippo Pvalb GABA-0-1':'Hippo Pvalb GABA-0-0',
    # 'Hippo Sst GABA-0-1':'Hippo Sst GABA-0-0',
    # 'Hippo Sst GABA-0-2':'Hippo Sst GABA-0-0',
    # 'PFC L2/3 IT-1-1':'PFC L2/3 IT-1-0',
    # 'PFC L6 CT-0-8':'PFC L6 CT-0-0',
    # 'PFC L6 IT-0-2':'PFC L6 IT-0-0',
    # 'PFC Lamp5 GABA-0-1':'PFC Lamp5 GABA-0-0',
    # 'PFC NP-1-1':'PFC NP-1-0',
    # 'PFC Sst GABA-0-1':'PFC Sst GABA-0-0'
}

In [41]:
# Snapshot the L4 labels as they came out of the merge, before any filling or remapping.
# NOTE: re-running this cell after the later cells would snapshot the already-refined labels.
df_merge['celltype.L4.raw'] = df_merge['celltype.L4'].copy()



In [42]:
# Cells without an L4 label get "<their L3 label>-0" as their L4 label.
l4_fallback = df_merge['celltype.L3'] + "-0"
df_merge['celltype.L4'] = df_merge['celltype.L4'].fillna(l4_fallback)

In [43]:
# Display the merged table (the notebook shows only its first and last rows).
# celltype.L4.raw is empty for cells whose L4 label was filled from the L3 label.
df_merge

Unnamed: 0,celltype.L3,celltype.L4,celltype.L4.raw
MC50B_PFC:AAGATAGCAGGTAACG-1,PFC L6b Glut-0,PFC L6b Glut-0-0,
MC50B_PFC:ACTAGGTAGTGTGTAA-1,PFC L6b Glut-1,PFC L6b Glut-1-0,
MC50B_PFC:AGGCGTCCAATTCTCT-1,PFC L6b Glut-0,PFC L6b Glut-0-0,
MC50B_PFC:AGGCGTCTCCATGTTT-1,PFC L6b Glut-1,PFC L6b Glut-1-0,
MC50B_PFC:CCCGTTATCCTCCTGA-1,PFC L6b Glut-1,PFC L6b Glut-1-0,
...,...,...,...
MW65A_AMY:TTTGGTTAGCAATAAC-1,AMY Zbtb7c_Vwa5b1 Glut-0,AMY Zbtb7c_Vwa5b1 Glut-0-1,AMY Zbtb7c_Vwa5b1 Glut-0-1
MW65A_AMY:TTTGGTTAGTACCTCA-1,AMY Zbtb7c_Vwa5b1 Glut-1,AMY Zbtb7c_Vwa5b1 Glut-1-0,AMY Zbtb7c_Vwa5b1 Glut-1-0
MW65A_AMY:TTTGGTTCATTATGGC-1,AMY Zbtb7c_Vwa5b1 Glut-1,AMY Zbtb7c_Vwa5b1 Glut-1-0,AMY Zbtb7c_Vwa5b1 Glut-1-0
MW65A_AMY:TTTGGTTGTACGCAAG-1,AMY Zbtb7c_Vwa5b1 Glut-1,AMY Zbtb7c_Vwa5b1 Glut-1-0,AMY Zbtb7c_Vwa5b1 Glut-1-0


In [44]:
# Collapse the hand-picked L4 clusters into their merge targets.
# Labels that are not keys of refine_map are left unchanged.
l4_labels = df_merge['celltype.L4']
df_merge['celltype.L4'] = l4_labels.replace(to_replace=refine_map)

In [45]:
# Number of distinct L3 labels (a missing value, if any, counts as one label).
df_merge['celltype.L3'].nunique(dropna=False)

133

In [46]:
# Number of distinct L4 labels after merging (a missing value, if any, counts as one label).
df_merge['celltype.L4'].nunique(dropna=False)

212

In [50]:
# Cells per final L4 label, written as a two-column table (celltype.L4, count).
l4_counts = df_merge.groupby(['celltype.L4']).size().reset_index(name='count')
l4_counts.to_csv("/data2st1/junyi/output/atac0627/iterative/L4_count.csv", index=False)

In [51]:
# Cells per L3 label, written as a two-column table (celltype.L3, count).
l3_counts = df_merge.groupby(['celltype.L3']).size().reset_index(name='count')
l3_counts.to_csv("/data2st1/junyi/output/atac0627/iterative/L3_count.csv", index=False)

In [48]:
# Save the per-cell annotation table (barcode index plus all of df_merge's columns).
annotated_csv = "/data2st1/junyi/output/atac0627/iterative/annotated_l3l4.csv"
df_merge.to_csv(annotated_csv)