In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
ref_map, test_map = 986625, 1038115
ref_file = f"sc_node_attributes_{ref_map}.csv"
test_file = f"sc_node_attributes_{test_map}.csv"
reg_node_df = pd.read_csv(Path.cwd() / ref_file)
sc_node_df = pd.read_csv(Path.cwd() / test_file)

In [3]:
print(reg_node_df.columns)
print(sc_node_df.columns)

Index(['CD_AnnotatedMembers', 'CD_AnnotatedMembers_Overlap',
       'CD_AnnotatedMembers_Pvalue', 'CD_AnnotatedMembers_Size',
       'CD_CommunityName', 'CD_Labeled', 'CD_MemberList',
       'CD_MemberList_LogSize', 'CD_MemberList_Size', 'CORUM_FDRs',
       'CORUM_jaccard_indexes', 'CORUM_overlap_genes', 'CORUM_terms',
       'GO_CC_descriptions', 'GO_CC_FDRs', 'GO_CC_jaccard_indexes',
       'GO_CC_overlap_genes', 'GO_CC_terms', 'HiDeF_persistence', 'HPA_FDRs',
       'HPA_jaccard_indexes', 'HPA_overlap_genes', 'HPA_terms', 'name',
       'represents', 'selected', 'shared name'],
      dtype='object')
Index(['CD_AnnotatedMembers', 'CD_AnnotatedMembers_Overlap',
       'CD_AnnotatedMembers_Pvalue', 'CD_AnnotatedMembers_Size',
       'CD_CommunityName', 'CD_Labeled', 'CD_MemberList',
       'CD_MemberList_LogSize', 'CD_MemberList_Size', 'CORUM_FDRs',
       'CORUM_jaccard_indexes', 'CORUM_overlap_genes', 'CORUM_terms',
       'GO_CC_descriptions', 'GO_CC_FDRs', 'GO_CC_jaccard_indexes',

In [4]:
# sort both by CD_MemberList_Size
reg_node_df.sort_values(by=['CD_MemberList_Size'], inplace=True, ascending=False)
sc_node_df.sort_values(by=['CD_MemberList_Size'], inplace=True, ascending=False)

In [5]:
reg_node_df.head()
reg_node_df.index = range(len(reg_node_df))

In [6]:
sc_node_df.head()
sc_node_df.index = range(len(sc_node_df))

In [7]:
sc_node_df.loc[1]

CD_AnnotatedMembers                                                          NaN
CD_AnnotatedMembers_Overlap                                                  0.0
CD_AnnotatedMembers_Pvalue                                                   0.0
CD_AnnotatedMembers_Size                                                       0
CD_CommunityName                                                             NaN
CD_Labeled                                                                 False
CD_MemberList                  NPRL3 MSTO1 CDC123 ASB1 TOM1L2 RPS14 XPNPEP3 S...
CD_MemberList_LogSize                                                     11.573
CD_MemberList_Size                                                          3047
CORUM_FDRs                                                                   NaN
CORUM_jaccard_indexes                                                        NaN
CORUM_overlap_genes                                                          NaN
CORUM_terms                 

In [8]:
sc_genes = sc_node_df.iloc[0]["CD_MemberList"].split(" ")
reg_genes = reg_node_df.iloc[0]["CD_MemberList"].split(" ")
print(len(sc_genes), len(reg_genes), len(set(sc_genes) & set(reg_genes)))
print(len(sc_node_df), len(reg_node_df))

4971 4971 4971
334 320


In [9]:
!python hypergeom_test.py --infname $test_map --refname $ref_map --w_root

100%|█████████████████████████████████████████| 334/334 [00:08<00:00, 38.32it/s]
output_file = 1038115_986625_hypergeom.csv
=== finished analyze_hidef_output ===


In [10]:
tar_df, ref_df = reg_node_df, sc_node_df
enrich_to_ref_df = pd.read_csv(Path.cwd() / f"{test_map}_{ref_map}_hypergeom.csv")
enrich_to_ref_df.drop(columns=['Unnamed: 0'], inplace=True)
print(len(enrich_to_ref_df), len(enrich_to_ref_df.columns))
enrich_to_ref_df.head()

334 320


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,310,311,312,313,314,315,316,317,318,319
0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
num_enriched = np.sum(enrich_to_ref_df < 1.0, axis=1)
convert_index = [x for x, y in ref_df.iterrows()]
print(f"{np.sum(num_enriched)}\n")
for i in tar_df.index:
    enriched_indices = np.where(enrich_to_ref_df.loc[i] < 1.0)[0]
    enriched_indices = [convert_index[x] for x in enriched_indices]
    members = set(tar_df.loc[i]["CD_MemberList"].split(" "))
    if len(enriched_indices) > 0:
        print(i, enriched_indices)
        print("Node Memberlist:", members)
        # print("Ref Annotions:", ref_df.loc[enriched_indices]["Annotation"].values)
        print("Ref Annotions:", ref_df.iloc[enriched_indices]["GO_CC_descriptions"].values)
        # print("Ref Annotions:", ref_df.loc[enriched_indices]["corum_terms"].values)
        print("Ref Memberlist:", ref_df.iloc[enriched_indices]["CD_MemberList"].values)
        matched_members = []
        for match in ref_df.loc[enriched_indices]["CD_MemberList"].values:
            matched_members.append(members & set(match.split(" ")))
        print("Memberlist Intersection:", matched_members)
        print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
        print()

187

0 [0]
Node Memberlist: {'TEN1', 'SSBP2', 'CAMLG', 'RAB14', 'FYTTD1', 'POMT2', 'STARD3', 'BIRC6', 'COX15', 'WDR37', 'PRMT2', 'NKTR', 'OLFML3', 'ACOT9', 'SRSF11', 'MRPL44', 'PIN1', 'ACD', 'GOLPH3L', 'FOXN3', 'RPAP2', 'ALDH16A1', 'CUEDC2', 'ATP6V1C2', 'BOD1L1', 'UFSP2', 'FAF1', 'FKBP15', 'SPRYD4', 'CWC25', 'APOOL', 'LETM1', 'ATXN2L', 'SHARPIN', 'RBM7', 'OPTN', 'GTF3C2', 'CNNM4', 'MCM10', 'DCAKD', 'DAAM1', 'ISX', 'TPT1', 'ATP6V1C1', 'DPYSL4', 'CAPZA2', 'PRKAR2B', 'SULT2B1', 'SAMHD1', 'NAB2', 'OSBPL11', 'SBDS', 'PSMC3', 'MAB21L3', 'YTHDC2', 'OSTF1', 'MED14', 'ZNF608', 'NAA35', 'C3orf18', 'LRBA', 'ALG2', 'PIK3R4', 'INPP5F', 'PPP1R12A', 'SMOC1', 'SNX11', 'DARS2', 'PLK4', 'HDAC5', 'ABCE1', 'NMNAT2', 'APLF', 'KIDINS220', 'TMEM186', 'PIM1', 'FCF1', 'ARIH1', 'CSNK1D', 'PIGB', 'NOS1AP', 'EEF1A2', 'ATP2C1', 'RRP7A', 'CDR2L', 'ZDHHC13', 'MBIP', 'TMEM245', 'ACO1', 'VPS13D', 'CYB5B', 'VASP', 'RNF2', 'EPHA2', 'FAM20B', 'MAST1', 'ANKMY2', 'UBE4A', 'LAMB3', 'RGS12', 'DAPK3', 'NNT', 'PALM', 'POM121',

  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("FDR:", enrich_to_ref_df.loc[i][enriched_indices].values)
  print("F