In [1]:
import numpy as np
import pandas as pd
from scipy.stats import binom


## number of cells analyzed in each individual and technology

In [2]:
datanames = ["HLCA4_P2_10x_with_postprocessing_lung","HLCA4_P3_10x_with_postprocessing_lung","HLCA_smartseq_P2_with_postprocessing_shared","HLCA_smartseq_P3_with_postprocessing_shared"]
datatypes = ["10x","10x","ss2","ss2"]
individuals = ["P2","P3","P2","P3"]
ind_dict = {x : 0 for x in individuals}
tech_dict = {x : 0 for x in datatypes}
ontologies = set()
total = 0
tech_med_dict = {x : [] for x in datatypes}
for i in range(len(datanames)):
  df = pd.read_parquet("/scratch/groups/horence/JuliaO/single_cell/SZS_pipeline2/data/{}.pq".format(datanames[i]),columns=["cell","free_annotation","compartment","geneR1A_uniq"])
#   df["cell_gene"] = df["cell"] + df["geneR1A_uniq"]
  
#   df = df.drop_duplicates("cell_gene")
  df["ontology"] = df["compartment"] + df["free_annotation"]
  num_cells = df["cell"].nunique()
  print("{:,} cells {} {}, {} ontologies".format(num_cells,datatypes[i],individuals[i],df["ontology"].nunique()))
  print("median genes per cell: {:,}".format(df.groupby("cell")["geneR1A_uniq"].nunique().median()))
  tech_med_dict[datatypes[i]] += list(df.groupby("cell")["geneR1A_uniq"].nunique())
  ontologies = set.union(ontologies, set(df["ontology"].unique()))
  ind_dict[individuals[i]] += num_cells
  tech_dict[datatypes[i]] += num_cells
  total += num_cells
print()
for key, val in ind_dict.items():
  print("{}: {:,}".format(key,val))
print()
for key, val in tech_dict.items():
  print("{}: {:,}".format(key,val)) 
print()
print("total cells: {:,}".format(total))
print("total number ontologies:",len(ontologies))

print()

for key, value in tech_med_dict.items():
  print("{}: {:,}".format(key,np.median(value)))

28,793 cells 10x P2, 39 ontologies
median genes per cell: 931.0
24,676 cells 10x P3, 50 ontologies
median genes per cell: 883.0
4,217 cells ss2 P2, 31 ontologies
median genes per cell: 1,513.0
2,836 cells ss2 P3, 33 ontologies
median genes per cell: 1,819.0

P2: 33,010
P3: 27,512

10x: 53,469
ss2: 7,053

total cells: 60,522
total number ontologies: 57

10x: 912.0
ss2: 1,660.0


## Number of cells/genes with computable SpliZ

In [3]:
datanames = ["HLCA4_P2_10x_with_postprocessing_lung","HLCA4_P3_10x_with_postprocessing_lung","HLCA_smartseq_P2_with_postprocessing_shared","HLCA_smartseq_P3_with_postprocessing_shared"]
datatypes = ["10x","10x","ss2","ss2"]
individuals = ["P2","P3","P2","P3"]
tech_med_dict = {x : [] for x in datatypes}
tech_genes = {x : set() for x in datatypes}
for i in range(len(datanames)):
  df = pd.read_parquet("/scratch/PI/horence/JuliaO/single_cell/SZS_pipeline2/scripts/output/rijk_zscore/{}_sym_SVD_normdonor_S_0.1_z_0.0_b_5.pq".format(datanames[i]),columns=["cell","geneR1A_uniq"])
  print(datanames[i],df.groupby("cell")["geneR1A_uniq"].nunique().median())
  tech_med_dict[datatypes[i]] += list(df.groupby("cell")["geneR1A_uniq"].nunique())
  ser = df.groupby("geneR1A_uniq")["cell"].nunique()
  ser = ser[ser >= 10]
  print("{:,} genes".format(ser.shape[0]))
  tech_genes[datatypes[i]].update(ser.index)
for key, value in tech_med_dict.items():
  print("median genes per cell {}: {:,}".format(key,np.median(value)))
  
for key, value in tech_genes.items():
  print("num genes >= 10 {}: {}".format(key, len(value)))

HLCA4_P2_10x_with_postprocessing_lung 60.0
1,599 genes
HLCA4_P3_10x_with_postprocessing_lung 74.0
1,415 genes
HLCA_smartseq_P2_with_postprocessing_shared 782.0
10,876 genes
HLCA_smartseq_P3_with_postprocessing_shared 881.0
9,795 genes
median genes per cell 10x: 67.0
median genes per cell ss2: 833.0
num genes >= 10 10x: 1754
num genes >= 10 ss2: 11640


## called genes by the SpliZ, SpliZVD

In [5]:
inpath = "/scratch/PI/horence/JuliaO/single_cell/SZS_pipeline2/scripts/output/variance_adjusted_permutations/"
datanames = ["HLCA4_P2_10x_with_postprocessing_lung","HLCA4_P3_10x_with_postprocessing_lung","HLCA_smartseq_P2_with_postprocessing_shared","HLCA_smartseq_P3_with_postprocessing_shared"]
# datanames = ["HLCA4_P2_10x_with_postprocessing_lung","HLCA4_P3_10x_with_postprocessing_lung"]

# suffixes = ["","_shuffle","_lungimmuneMacrophage_10"]

# datanames = ["TSP1_10x_with_postprocessing_nopanc_cellann","TSP2_10x_rerun_with_postprocessing_3prime_cellann","TS_pilot_smartseq_with_postprocessing_nopanc_cellann","TSP2_SS2_RUN1_RUN2_cellann"]
suffixes = [""]

In [6]:
z_cols = ["scZ","svd_z0"]
out_dict = {x : [] for x in z_cols}
out_dict_frac = {x : [] for x in z_cols}
out_dict["dataname"] = []
out_dict_frac["dataname"] = []
eigen_thresh = .9
gene = "PPP1R12A"
out_string = ""
for dataname in datanames:
  out_string += dataname + "\n"
  for suffix in suffixes:
    out_dict["dataname"].append(dataname + suffix)
    out_dict_frac["dataname"].append(dataname + suffix)
    print(dataname + suffix)
    df = pd.read_csv("{}{}{}_pvals_100_S_0.1_z_0.0_b_5.tsv".format(inpath,dataname,suffix),sep="\t")
    
    df["f0+f1"] = df["f0"] + df["f1"]

    for z_col in z_cols:
      print(z_col)
      if z_col == "svd_z1":
        out_dict[z_col].append(df[(df["perm_pval_adj_" + z_col] < 0.05) & (df["f0"] < eigen_thresh)]["geneR1A_uniq"].nunique())
        out_dict_frac[z_col].append(df[(df["perm_pval_adj_" + z_col] < 0.05) & (df["f0"] < eigen_thresh)]["geneR1A_uniq"].nunique()/df["geneR1A_uniq"].nunique())
        display(df[(df["perm_pval_adj_" + z_col] < 0.05) & (df["f0"] < eigen_thresh)].sort_values("max_abs_median_" + z_col).tail(10)[["geneR1A_uniq","num_onts","perm_pval_adj_" + z_col, "max_abs_median_" + z_col]])
#         df[(df["perm_pval_adj_" + z_col] < 0.05) & (df["f0"] < eigen_thresh)].to_csv("{}{}{}{}_sig.tsv".format(outpath,dataname,suffix,z_col),sep="\t",index=False)
        out_string += "{} {} in {}\n".format(z_col,gene,gene in df[(df["perm_pval_adj_" + z_col] < 0.05) & (df["f0"] < eigen_thresh)]["geneR1A_uniq"].unique())

      elif z_col == "svd_z2":
        out_dict[z_col].append(df[(df["perm_pval_adj_" + z_col] < 0.05) & (df["f0+f1"] < eigen_thresh)]["geneR1A_uniq"].nunique())
        out_dict_frac[z_col].append(df[(df["perm_pval_adj_" + z_col] < 0.05) & (df["f0+f1"] < eigen_thresh)]["geneR1A_uniq"].nunique()/df["geneR1A_uniq"].nunique())
        display(df[(df["perm_pval_adj_" + z_col] < 0.05) & (df["f0+f1"] < eigen_thresh)].sort_values("max_abs_median_" + z_col).tail(10)[["geneR1A_uniq","num_onts","perm_pval_adj_" + z_col, "max_abs_median_" + z_col]])
#         print(gene," in",gene in df[(df["perm_pval_adj_" + z_col] < 0.05) & (df["f0+f1"] < eigen_thresh)]["geneR1A_uniq"].unique())
#         df[(df["perm_pval_adj_" + z_col] < 0.05) & (df["f0+f1"] < eigen_thresh)].to_csv("{}{}{}{}_sig.tsv".format(outpath,dataname,suffix,z_col),sep="\t",index=False)

        out_string += "{} {} in {}\n".format(z_col,gene,gene in df[(df["perm_pval_adj_" + z_col] < 0.05) & (df["f0+f1"] < eigen_thresh)]["geneR1A_uniq"].unique())

      else:
        out_dict[z_col].append(df[df["perm_pval_adj_" + z_col] < 0.05]["geneR1A_uniq"].nunique())
        out_string += "{} {} in {}\n".format(z_col,gene,gene in df[(df["perm_pval_adj_" + z_col] < 0.05) ]["geneR1A_uniq"].unique())
#         df[(df["perm_pval_adj_" + z_col] < 0.05) ].to_csv("{}{}{}{}_sig.tsv".format(outpath,dataname,suffix,z_col),sep="\t",index=False)

#         print(gene," in",gene in df[df["perm_pval_adj_" + z_col] < 0.05]["geneR1A_uniq"].unique())
        out_dict_frac[z_col].append(df[df["perm_pval_adj_" + z_col] < 0.05]["geneR1A_uniq"].nunique()/df["geneR1A_uniq"].nunique())
        print("top 100 genes",list(df[df["perm_pval_adj_" + z_col] < 0.05].sort_values("max_abs_median_" + z_col)[["geneR1A_uniq","num_onts","perm_pval_adj_" + z_col, "max_abs_median_" + z_col]]["geneR1A_uniq"].tail(100)))
        display(df[df["perm_pval_adj_" + z_col] < 0.05].sort_values("max_abs_median_" + z_col).tail(10)[["geneR1A_uniq","num_onts","perm_pval_adj_" + z_col, "max_abs_median_" + z_col]])
out_df = pd.DataFrame.from_dict(out_dict)
out_frac = pd.DataFrame.from_dict(out_dict_frac)
                

HLCA4_P2_10x_with_postprocessing_lung
scZ
top 100 genes ['C19orf33', 'SNHG6', 'NIFK', 'LYAR', 'RARRES1', 'PSMB2', 'HCST', 'RPL5', 'VIM', 'RPL27A', 'YIF1A', 'CMC1', 'BLVRB', 'MICOS10-NBL1', 'CCL20', 'CD63', 'JPT1', 'TXNDC17', 'LRRFIP1', 'S100A13', 'GMFG', 'SRSF5', 'CAST', 'YBX1', 'TMEM147', 'CYBA', 'HSP90AA1', 'HNRNPDL', 'UQCRB', 'SERF2-C15ORF63', 'GSPT1', 'ZBTB8OS', 'NDUFB3', 'CARD16', 'CP', 'NAP1L1', 'LGALS3', 'ATP5MC3', 'ITM2B', 'CALM1', 'RPAIN', 'CKLF-CMTM1', 'SRSF7', 'RPL41', 'MZT2B', 'COX4I1', 'AREG', 'ATP5MG', 'KRTCAP2', 'TMC5', 'DECR1', 'ATP5PO', 'PPFIBP1', 'HSP90B1', 'PLIN2', 'ECH1', 'CTSC', 'ARPC3', 'HNRNPA1', 'ARPC2', 'ELOC', 'RBM39', 'ARHGAP18', 'AUP1', 'SOD2', 'RPS7', 'CIRBP', 'ETHE1', 'CALD1', 'ARID4B', 'DNMT1', 'PQBP1', 'AKAP13', 'ISG20', 'PRMT1', 'SEC61G', 'ARL6IP1', 'SOD1', 'RWDD1', 'CAPG', 'LUC7L3', 'LTA4H', 'MORF4L1', 'CHCHD2', 'NASP', 'LMNA', 'LRRFIP2', 'REEP3', 'RPL17', 'CD47', 'RPN2', 'NOP56', 'THYN1', 'SNHG8', 'SCGB1A1', 'PPP1R12A', 'MYL6', 'RPS24', 'LMO7', 'ATP5F

Unnamed: 0,geneR1A_uniq,num_onts,perm_pval_adj_scZ,max_abs_median_scZ
51,RPN2,12,0.0,0.781146
2,NOP56,4,0.0,0.810319
147,THYN1,2,0.0,0.89172
27,SNHG8,13,0.0,0.907007
46,SCGB1A1,16,0.0,0.99725
11,PPP1R12A,2,0.0,1.054181
36,MYL6,39,0.0,1.258683
59,RPS24,39,0.0,1.372058
95,LMO7,2,0.0,1.874982
142,ATP5F1C,22,0.0,2.648506


svd_z0
top 100 genes ['RPL10', 'RPS25', 'MORF4L1', 'RAB2A', 'ATP5PD', 'RPL24', 'HLA-DQA1', 'CALD1', 'RPL23', 'FCER1G', 'ITM2B', 'VIM', 'RPL30', 'PTMA', 'PSMB2', 'NDUFAF3', 'SAT1', 'HPGD', 'MYL6', 'UXT', 'PSMA3', 'GAS5', 'IFI30', 'S100A13', 'PSME2', 'COX7C', 'RPN2', 'PRDX5', 'PLIN2', 'FABP4', 'TPM2', 'LTA4H', 'LGALS1', 'NDUFB8', 'NAP1L1', 'ARHGAP18', 'RPS13', 'RPS8', 'RPL28', 'NASP', 'CAST', 'RPS27A', 'ATP5F1C', 'RBM39', 'RPS15A', 'RACK1', 'BPIFB1', 'POLR2I', 'RPSA', 'PFDN5', 'LRRFIP1', 'RPS12', 'AREG', 'PSMB7', 'RPL27A', 'DEK', 'MGST3', 'RPL35A', 'HSP90AA1', 'DNAJC15', 'RNF130', 'LGALS3', 'SELENOH', 'ATP5MC3', 'IGFBP7', 'RPS19', 'CALM1', 'RPS6', 'RPL10A', 'S100A6', 'CYBA', 'TPT1', 'DECR1', 'TBCA', 'S100A9', 'CLU', 'RPLP2', 'TYROBP', 'RPS3', 'BLOC1S1-RDH5', 'RPS7', 'CD63', 'TXN', 'KRTCAP2', 'POMP', 'SCGB1A1', 'RPS4X', 'MARCO', 'ANXA1', 'LCN2', 'MT1E', 'RPL12', 'RPS2', 'UBA52', 'RPS14', 'SCGB3A2', 'RPL13A', 'RPL17', 'FTL', 'SCGB3A1']


Unnamed: 0,geneR1A_uniq,num_onts,perm_pval_adj_svd_z0,max_abs_median_svd_z0
571,MT1E,18,0.0,5.671791
731,RPL12,39,0.0,5.708614
754,RPS2,39,0.0,5.95762
144,UBA52,39,0.0,6.878262
56,RPS14,39,0.0,7.027029
779,SCGB3A2,5,0.0,7.084992
78,RPL13A,39,0.0,7.735757
79,RPL17,39,0.0,8.841513
169,FTL,39,0.0,9.722942
5,SCGB3A1,14,0.0,46.593897


HLCA4_P3_10x_with_postprocessing_lung
scZ
top 100 genes ['HEXB', 'NDUFS5', 'RPS27L', 'CIRBP', 'KMT2E', 'RPL3', 'ISG20', 'PLEKHA5', 'NDUFA4', 'SRSF5', 'NDUFB8', 'CLU', 'AREG', 'SEC61G', 'CD74', 'RPS3', 'LY96', 'HNRNPDL', 'TPT1', 'ATP5MC3', 'GMFG', 'SNRPB', 'ATP5PD', 'CIAO2B', 'MGST3', 'S100A13', 'RPS7', 'HNRNPA1', 'FAM183A', 'TYROBP', 'DPP7', 'KYNU', 'MAP4K1', 'EIF3G', 'TMEM258', 'ERGIC3', 'CAST', 'LTA4H', 'PLTP', 'BLVRB', 'ARL6IP1', 'HCST', 'GSTP1', 'ERLEC1', 'PSMB1', 'DECR1', 'TMEM147', 'TREM1', 'EMP3', 'CHCHD2', 'MNDA', 'SOD1', 'MICOS10-NBL1', 'COX5B', 'CAPG', 'RPL38', 'WFDC21P', 'LYAR', 'FTL', 'EMP2', 'NDUFAF3', 'JPT1', 'LGALS3', 'CMC1', 'ARPC2', 'MSRA', 'CD63', 'IFT57', 'COX4I1', 'KRTCAP2', 'ITM2B', 'AUP1', 'ATP5PO', 'NDUFS2', 'SCGB1A1', 'DBI', 'SRSF7', 'CRNDE', 'ATP5MG', 'UQCRB', 'PILRA', 'LMNA', 'DAD1', 'THYN1', 'METTL26', 'RPL4', 'RPN2', 'ARPC3', 'RWDD1', 'FYB1', 'NUPR1', 'CD55', 'PRMT1', 'RPS24', 'CD47', 'MYL6', 'PPP1R12A', 'CIR1', 'ANXA1', 'ATP5F1C']


Unnamed: 0,geneR1A_uniq,num_onts,perm_pval_adj_scZ,max_abs_median_scZ
43,NUPR1,10,0.0,1.083976
153,CD55,4,0.0,1.159876
50,PRMT1,6,0.0,1.175785
96,RPS24,43,0.0,1.182912
156,CD47,20,0.0,1.209776
32,MYL6,48,0.0,1.321093
48,PPP1R12A,3,0.0,1.744816
181,CIR1,3,0.0,1.777411
176,ANXA1,30,0.0,2.085256
159,ATP5F1C,29,0.0,2.890178


svd_z0
top 100 genes ['RPL7A', 'LYAR', 'VIM', 'PTMA', 'GMFG', 'PPP1R12A', 'NAP1L1', 'UBE2D2', 'MRPL52', 'PSMA3', 'SOD2', 'ARPC2', 'MYL6', 'RPL30', 'EIF3K', 'DEK', 'RPL10', 'ATP5PD', 'RPS15A', 'RPL23', 'PRDX5', 'HSPB11', 'RAB2A', 'RPL24', 'AREG', 'CAST', 'CIRBP', 'CLU', 'APRT', 'LGALS1', 'GAS5', 'RPS13', 'CYBA', 'TPRKB', 'MGST3', 'FABP4', 'RPL18', 'RPL28', 'TPM2', 'SCGB1A1', 'HSP90AA1', 'S100A13', 'ATP5F1C', 'RACK1', 'NDUFB8', 'NDUFA4', 'KYNU', 'FCGRT', 'RPL35A', 'RPS8', 'MICOS10-NBL1', 'GSTO1', 'RPS23', 'TPT1', 'RPL4', 'NOP53', 'RPS27L', 'BPIFB1', 'RPL27A', 'NASP', 'ITM2B', 'RPS12', 'ANXA1', 'CD63', 'AGR2', 'TBCA', 'S100A9', 'KRTCAP2', 'RPS19', 'PIP', 'SCGB3A2', 'RPS7', 'TYROBP', 'LMNA', 'GSTP1', 'IGFBP7', 'SELENOH', 'CXCL17', 'RPLP2', 'DBI', 'FXYD3', 'EMP2', 'RPSA', 'BLOC1S1-RDH5', 'MARCO', 'TXN', 'RPS3', 'CAPG', 'LCN2', 'RPS14', 'RPL12', 'RPS6', 'UBA52', 'RPL13A', 'RPL17', 'POMP', 'RPS4X', 'RPL13', 'unknown_chr22_22900000', 'FTL']


Unnamed: 0,geneR1A_uniq,num_onts,perm_pval_adj_svd_z0,max_abs_median_svd_z0
692,RPL12,49,0.0,6.911378
102,RPS6,49,0.0,6.999754
75,UBA52,48,0.0,7.170003
7,RPL13A,49,0.0,8.008816
8,RPL17,47,0.0,8.23839
633,POMP,26,0.0,8.728288
101,RPS4X,49,0.0,9.233596
6,RPL13,48,0.0,10.601019
858,unknown_chr22_22900000,2,0.032482,10.65632
158,FTL,48,0.0,10.935376


HLCA_smartseq_P2_with_postprocessing_shared
scZ
top 100 genes ['COG8', 'ENDOV', 'PRKCH', 'BAALC', 'DPH3', 'CYGB', 'HP1BP3', 'BNIP2', 'MYO6', 'ADGRF5', 'RRBP1', 'CALD1', 'RPL23AP82', 'PRKCE', 'KIAA1217', 'EOGT', 'RWDD3', 'TSTD3', 'CASP3', 'ELK3', 'CTSC', 'PARD3', 'PTK2', 'MSH3', 'HDGFL3', 'CTNND1', 'SPECC1', 'TAGLN', 'unknown_chr12_56200000', 'SCLT1', 'RPP38', 'MCAT', 'FAM13A', 'PLD3', 'TNPO3', 'SLC20A2', 'DGUOK', 'DISP1', 'MNAT1', 'STIMATE', 'MYLK', 'TPD52L2', 'CHD9', 'SOCS2', 'ADD3', 'BCL6', 'C1orf52', 'LOC105376392', 'TP53TG1', 'RNMT', 'unknown_chr8_15800000', 'BOLA3', 'CFLAR-AS1', 'RNF19A', 'ZNF326', 'SNX14', 'L3HYPDH', 'ZNF271P', 'INF2', 'ABHD5', 'EBPL', 'BIN1', 'TTC17', 'RCSD1', 'CD4', 'TXNDC11', 'PNKP', 'TBC1D17', 'unknown_chr2_85700000', 'RASSF1', 'STXBP1', 'IMMP1L', 'MRPL33', 'TNPO1', 'TOMM34', 'MPRIP', 'PRPF6', 'BUB3', 'PNPLA8', 'ADAR', 'EXOC7', 'RPL17-C18orf32', 'TMC5', 'AHNAK', 'SPACA9', 'HTRA1', 'OAS3', 'FBLN5', 'CBX5', 'RNASEH1', 'MGAT1', 'ANXA6', 'PHYKPL', 'LRIF1', 'RGS3'

Unnamed: 0,geneR1A_uniq,num_onts,perm_pval_adj_scZ,max_abs_median_scZ
512,MGAT1,7,0.042142,2.911853
296,ANXA6,9,0.0,3.00247
252,PHYKPL,14,0.0,3.073023
260,LRIF1,4,0.0,3.186091
65,RGS3,7,0.0,3.272411
148,NUDT16L1,9,0.0,3.333638
410,ACTA2,7,0.0,3.508714
492,unknown_chr10_76600000,4,0.0,3.737628
457,ABO,4,0.0,3.992588
44,HYAL2,13,0.0,4.046205


svd_z0
top 100 genes ['LUC7L', 'IDH3G', 'UBE2D2', 'FAM49B', 'SGK1', 'AK2', 'MGLL', 'OSCP1', 'LMBR1', 'TBXAS1', 'PMP22', 'GRAMD2B', 'PSAP', 'BPIFB1', 'TYW3', 'CFDP1', 'ECHDC2', 'HSP90AA1', 'FAM219B', 'STARD3NL', 'PRPF40A', 'EIF4A1', 'CLUAP1', 'RAB13', 'FMC1-LUC7L2', 'IRF9', 'CD55', 'PCNP', 'FRMD4B', 'unknown_chr10_76600000', 'LRRC23', 'IQCG', 'ANXA11', 'VRK2', 'GTF3A', 'MYL6', 'HSPB11', 'LINC00680', 'ARFGAP2', 'EZR', 'UBE2D3', 'IMMP2L', 'STX7', 'DNAJC1', 'UFD1', 'NOSTRIN', 'CRBN', 'FDPS', 'REV3L', 'LAP3', 'IFNAR2', 'NUP50', 'ETFA', 'FCGRT', 'LMNA', 'DNAJC17', 'IK', 'RGS3', 'CLK1', 'TCF7L2', 'NTHL1', 'ENY2', 'PSME2', 'RCSD1', 'RNF130', 'TMEM59', 'ASL', 'TNFSF10', 'SERPING1', 'APLP2', 'MRPS21', 'LRIF1', 'NACA', 'RACK1', 'XRCC6', 'TPM1', 'IL32', 'COPS3', 'RPLP0', 'ABO', 'HPGD', 'CD4', 'CAPG', 'PSMA3', 'SPACA9', 'CNOT2', 'KIF5B', 'COMMD9', 'CFAP36', 'S100A6', 'HYAL2', 'ACTA2', 'ANXA1', 'ANXA4', 'CALM2', 'PRR13', 'EIF3K', 'PSME1', 'CTSH', 'SCGB3A1']


Unnamed: 0,geneR1A_uniq,num_onts,perm_pval_adj_svd_z0,max_abs_median_svd_z0
44,HYAL2,13,0.0,5.042499
410,ACTA2,7,0.0,5.071773
1492,ANXA1,16,0.0,5.144976
1496,ANXA4,12,0.0,5.469234
1892,CALM2,20,0.0,5.666718
4823,PRR13,14,0.0,5.709547
2660,EIF3K,16,0.0,6.146071
4869,PSME1,18,0.0,6.26206
2331,CTSH,10,0.0,7.476342
5331,SCGB3A1,5,0.0,11.678761


HLCA_smartseq_P3_with_postprocessing_shared
scZ
top 100 genes ['RAI14', 'HDGF', 'SIGIRR', 'KIF13A', 'RNASE4', 'LRRC23', 'PRMT2', 'ITGA1', 'unknown_chr12_79800000', 'MPST', 'FAM228B', 'PHACTR4', 'SFTA1P', 'PTPRE', 'UTRN', 'unknown_chr5_150400000', 'RCAN1', 'S100A16', 'CMC1', 'CC2D2A', 'CTNND1', 'NFAT5', 'HNRNPF', 'ANKRD10', 'C12orf76', 'KIAA1217', 'RRBP1', 'SGMS1', 'unknown_chr10_78000000', 'ZNF207', 'XAF1', 'LAMP2', 'PLEKHA1', 'WWOX', 'ARHGEF1', 'ANAPC13', 'unknown_chr12_54300000', 'SYNE1', 'DHRS4', 'HIRIP3', 'PLD3', 'DGUOK', 'OSBPL1A', 'RERE', 'NDUFV3', 'PSMA3-AS1', 'SORBS2', 'LPIN1', 'GOLGA8B', 'KNOP1', 'AP2M1', 'APOL1', 'MPDU1', 'TMEM164', 'ZNF271P', 'CCZ1P-OR7E38P', 'CALD1', 'SOCS2', 'CLIC5', 'ACTA2', 'PKIG', 'ITGAE', 'PDE4D', 'RPARP-AS1', 'CDHR3', 'PSPC1', 'DENND10', 'METTL17', 'SPTBN1', 'ADD3', 'LOC105376392', 'MRPL33', 'unknown_chr12_56200000', 'NBAS', 'RNF146', 'PARP2', 'unknown_chr10_26800000', 'PEX19', 'SVIL', 'LIMCH1', 'ARL4A', 'HOOK2', 'ANXA6', 'TLE5', 'IL16', 'RIPK2', 'EXO

Unnamed: 0,geneR1A_uniq,num_onts,perm_pval_adj_scZ,max_abs_median_scZ
114,RGS3,10,0.0,2.807419
82,FGF7,3,0.0,2.902732
190,AKAP12,4,0.0,2.929596
45,USP12,4,0.0,3.055775
189,CORO1B,11,0.0,3.330518
136,MGAT1,6,0.0,3.331783
23,LRIF1,3,0.0,3.440675
231,DDX39A,8,0.0,3.752157
77,FBLN5,6,0.0,4.415848
237,CCDC50,4,0.0,5.067508


svd_z0
top 100 genes ['ZDHHC6', 'PTMA', 'MTIF3', 'CHPT1', 'EIF4A2', 'RPS5', 'LRRC23', 'SIGIRR', 'ARHGAP15', 'SVIL', 'RARRES1', 'EXOC7', 'MFF', 'CFLAR', 'HAT1', 'CREM', 'UFSP2', 'UBA52', 'FAM204A', 'ARHGAP18', 'COPZ2', 'GOLGA2', 'CD44', 'SPECC1', 'CRYZL1', 'RPS24', 'RAD23A', 'PSAP', 'CYBA', 'MCUB', 'PRPF40A', 'HNRNPUL2-BSCL2', 'STXBP6', 'FGF7', 'UBA3', 'MORF4L1', 'RNASET2', 'LRRFIP1', 'ASL', 'CAV1', 'PCMTD1', 'VPS29', 'FCGRT', 'TMEM87A', 'AREG', 'LMNA', 'USP12', 'ELN', 'KRAS', 'MICOS10-NBL1', 'MYL6', 'CORO1B', 'CCT4', 'PPP1R7', 'RPS3', 'CFDP1', 'LAMTOR4', 'FTO', 'PPHLN1', 'NSMCE1', 'CD55', 'PRMT2', 'CARHSP1', 'WARS1', 'TNFSF10', 'LIMCH1', 'TPM1', 'FDPS', 'GUSB', 'SAT1', 'SNX5', 'NOL3', 'STARD3NL', 'RTN4', 'TMEM59', 'LRIF1', 'GUK1', 'IFT57', 'PSME1', 'NSMCE4A', 'FBLN5', 'ACOT9', 'MGAT1', 'RPLP0', 'BST2', 'RACK1', 'GAPDH', 'IL32', 'CAPG', 'RPL15', 'ACTA2', 'RGS3', 'GSTP1', 'GSN', 'EIF3K', 'CCDC50', 'CTSH', 'ARPC2', 'SERPING1', 'S100A6']


Unnamed: 0,geneR1A_uniq,num_onts,perm_pval_adj_svd_z0,max_abs_median_svd_z0
221,ACTA2,9,0.0,4.36199
114,RGS3,10,0.0,4.539948
2426,GSTP1,18,0.0,4.76293
19,GSN,16,0.0,4.887717
2034,EIF3K,17,0.0,4.945893
237,CCDC50,4,0.0,5.079606
1745,CTSH,9,0.0,5.128431
1083,ARPC2,17,0.0,5.224307
4350,SERPING1,16,0.0,6.088493
4246,S100A6,19,0.0,7.091399


In [7]:
out_df[["dataname"] + z_cols]

Unnamed: 0,dataname,scZ,svd_z0
0,HLCA4_P2_10x_with_postprocessing_lung,210,133
1,HLCA4_P3_10x_with_postprocessing_lung,219,141
2,HLCA_smartseq_P2_with_postprocessing_shared,607,647
3,HLCA_smartseq_P3_with_postprocessing_shared,248,366


In [8]:
out_frac[["dataname"] + z_cols]

Unnamed: 0,dataname,scZ,svd_z0
0,HLCA4_P2_10x_with_postprocessing_lung,0.231533,0.146637
1,HLCA4_P3_10x_with_postprocessing_lung,0.25406,0.163573
2,HLCA_smartseq_P2_with_postprocessing_shared,0.088691,0.094535
3,HLCA_smartseq_P3_with_postprocessing_shared,0.044278,0.065345


## Intersection of genes called as significant

In [17]:
genes = ["CD47","TPM2","PPP1R12A"]
# genes = ["CD47"]
suffix = ""
z_cols = ["scZ","svd_z0"]
for dataname in datanames:
  print(dataname)
#   out_string += dataname + "\n"

#   out_dict["dataname"].append(dataname + suffix)
#   out_dict_frac["dataname"].append(dataname + suffix)
#   print(dataname + suffix)
  df = pd.read_csv("{}{}{}_pvals_100_S_0.1_z_0.0_b_5.tsv".format(inpath,dataname,suffix),sep="\t")
  print("genes shared by SpliZ and SpliZVD: {}".format(df[(df["perm_pval_adj_scZ"] < 0.05) & (df["perm_pval_adj_svd_z0"] < 0.05)]["geneR1A_uniq"].nunique()))
  for gene in genes:
    print("gene: {}".format(gene))
    for z_col in z_cols:
      print("{}: {}".format(z_col,df[df["geneR1A_uniq"] == gene]["perm_pval_adj_"+ z_col].iloc[0]))
      df = df.sort_values("max_abs_median_" + z_col,ascending=False)
      df.reset_index(inplace=True,drop=True)
      print("rank: {}".format(df[df["geneR1A_uniq"] == gene].index[0]))
    print()
  #   break

HLCA4_P2_10x_with_postprocessing_lung
genes shared by SpliZ and SpliZVD: 89
gene: CD47
scZ: 0.0
rank: 12
svd_z0: 0.0
rank: 252

gene: TPM2
scZ: 0.0
rank: 266
svd_z0: 0.0
rank: 121

gene: PPP1R12A
scZ: 0.0
rank: 4
svd_z0: 0.0
rank: 387

HLCA4_P3_10x_with_postprocessing_lung
genes shared by SpliZ and SpliZVD: 90
gene: CD47
scZ: 0.0
rank: 5
svd_z0: 0.0
rank: 304

gene: TPM2
scZ: 0.0
rank: 204
svd_z0: 0.0
rank: 82

gene: PPP1R12A
scZ: 0.0
rank: 3
svd_z0: 0.0
rank: 164

HLCA_smartseq_P2_with_postprocessing_shared
genes shared by SpliZ and SpliZVD: 338
gene: CD47
scZ: nan
rank: 5256
svd_z0: nan
rank: 2531

gene: TPM2
scZ: nan
rank: 5065
svd_z0: 0.05282115869017637
rank: 245

gene: PPP1R12A
scZ: nan
rank: 2909
svd_z0: nan
rank: 3250

HLCA_smartseq_P3_with_postprocessing_shared
genes shared by SpliZ and SpliZVD: 169
gene: CD47
scZ: nan
rank: 3904
svd_z0: nan
rank: 670

gene: TPM2
scZ: nan
rank: 1687
svd_z0: nan
rank: 2376

gene: PPP1R12A
scZ: nan
rank: 1504
svd_z0: 0.0
rank: 2006



In [18]:
for individual in ["P2","P3"]:
  print(individual)
  tenx_df =  pd.read_csv("{}HLCA4_{}_10x_with_postprocessing_lung_pvals_100_S_0.1_z_0.0_b_5.tsv".format(inpath,individual),sep="\t")
  ss2_df =  pd.read_csv("{}HLCA_smartseq_{}_with_postprocessing_shared_pvals_100_S_0.1_z_0.0_b_5.tsv".format(inpath,individual),sep="\t")
  tenx_sig = set(tenx_df[(tenx_df["perm_pval_adj_scZ"] < 0.05) | (tenx_df["perm_pval_adj_svd_z0"] < 0.05)]["geneR1A_uniq"].unique())
  ss2_sig = set(ss2_df[(ss2_df["perm_pval_adj_scZ"] < 0.05) | (ss2_df["perm_pval_adj_svd_z0"] < 0.05)]["geneR1A_uniq"].unique())
  print("number significant 10x: {:,}".format(len(tenx_sig)))
  print("fraction significant 10x: {}".format(len(tenx_sig)/tenx_df.shape[0]))
  print("number signfiicant ss2: {:,}".format(len(ss2_sig)))
  print("fraction significant ss2: {}".format(len(ss2_sig)/ss2_df.shape[0]))
  
  print("{} intersection: {}".format(individual,len(tenx_sig.intersection(ss2_sig))))
  print("pvalue: {}".format(1 - binom.cdf(len(tenx_sig.intersection(ss2_sig)),tenx_df.shape[0],(len(tenx_sig)/tenx_df.shape[0])*(len(ss2_sig)/ss2_df.shape[0]))))
# 1 - binom.cdf(get_num_genes(df,True,True,True), df.shape[0], p_sig[0]*p_sig[1]*p_sig[2])

P2
number significant 10x: 254
fraction significant 10x: 0.28004410143329656
number signfiicant ss2: 916
fraction significant ss2: 0.13383985973115137
P2 intersection: 71
pvalue: 4.137484244104428e-09
P3
number significant 10x: 270
fraction significant 10x: 0.31322505800464034
number signfiicant ss2: 445
fraction significant ss2: 0.07945009819675058
P3 intersection: 46
pvalue: 8.570848222255734e-07


## Concordance 10x vs 10x

In [11]:
df_P2 =  pd.read_csv("{}HLCA4_P2_10x_with_postprocessing_lung_pvals_100_S_0.1_z_0.0_b_5.tsv".format(inpath),sep="\t")
df_P3 =  pd.read_csv("{}HLCA4_P3_10x_with_postprocessing_lung_pvals_100_S_0.1_z_0.0_b_5.tsv".format(inpath),sep="\t")


In [12]:
P2_set = set(df_P2[df_P2["perm_pval_adj_scZ"] < 0.05]["geneR1A_uniq"].unique())
P3_set = set(df_P3[df_P3["perm_pval_adj_scZ"] < 0.05]["geneR1A_uniq"].unique())
print("spliz intersection: {}".format(len(P2_set.intersection(P3_set))))

spliz intersection: 137


In [13]:
P2_set = set(df_P2[df_P2["perm_pval_adj_svd_z0"] < 0.05]["geneR1A_uniq"].unique())
P3_set = set(df_P3[df_P3["perm_pval_adj_svd_z0"] < 0.05]["geneR1A_uniq"].unique())
print("splizVD intersection: {}".format(len(P2_set.intersection(P3_set))))

splizVD intersection: 83


In [14]:
P2_set = set(df_P2[(df_P2["perm_pval_adj_scZ"] < 0.05) | (df_P2["perm_pval_adj_svd_z0"] < 0.05)]["geneR1A_uniq"].unique())
P3_set = set(df_P3[(df_P3["perm_pval_adj_scZ"] < 0.05) | (df_P3["perm_pval_adj_svd_z0"] < 0.05) ]["geneR1A_uniq"].unique())
print("either intersection: {}".format(len(P2_set.intersection(P3_set))))

either intersection: 178


In [15]:
print("pvalue: {}".format(1 - binom.cdf(len(P2_set.intersection(P3_set)),df_P2.shape[0],(len(P2_set)/df_P2.shape[0])*(len(P3_set)/df_P3.shape[0]))))


pvalue: 1.1102230246251565e-16
