In [1]:
import pathlib
import pandas as pd

## Major-Type and Sub-Type Pairwise Marker Genes

In [2]:
# Load the major-type pairwise marker table; the first CSV column is used as the row index.
major_type_pairwise = pd.read_csv(
    'MajorTypePairwiseDEG/TotalPairwiseMarker.csv',
    index_col=0,
)

In [3]:
# Collect the TotalPairwiseMarker.csv found in each sub-directory of SubTypePairwiseDEG/
# and stack them into a single table.
sub_type_dir = pathlib.Path('SubTypePairwiseDEG/')
# Sort the glob results: directory iteration order depends on the filesystem, so an
# unsorted list would let the row order of the concatenated table vary between runs.
sub_type_paths = sorted(sub_type_dir.glob('*/TotalPairwiseMarker.csv'))
if not sub_type_paths:
    # Same exception type pd.concat would raise on an empty list, but with a message
    # that says what is actually missing.
    raise ValueError(
        f'No TotalPairwiseMarker.csv files found under {sub_type_dir}/*/')

# Each file is read with its first column as the index, then all are concatenated row-wise.
records = [pd.read_csv(path, index_col=0) for path in sub_type_paths]
sub_type_pairwise = pd.concat(records)

In [4]:
# Write both pairwise marker tables to msgpack files in the working directory.
# NOTE(review): these calls emit a warning recommending pyarrow (see the captured cell
# output below). `to_msgpack` appears to be deprecated and is reportedly removed in
# pandas >= 1.0 — confirm the pinned pandas version, or migrate to another on-disk
# format together with whatever reads these `.msg` files.
major_type_pairwise.to_msgpack('MajorTypePairwiseMarker.msg')
sub_type_pairwise.to_msgpack('SubTypePairwiseMarker.msg')

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  """Entry point for launching an IPython kernel.
It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  


In [5]:
# Distinct values of the `gene_id` column in each of the two pairwise marker tables.
major_type_marker, sub_type_marker = (
    table['gene_id'].unique()
    for table in (major_type_pairwise, sub_type_pairwise)
)

In [6]:
# Union of sub-type and major-type marker gene ids.
# `Index.union` is the explicit set operation; the `|` operator on an Index was
# deprecated as a set-union alias and is a logical operation in pandas >= 2.0.
total_marker_gene = pd.Index(sub_type_marker).union(pd.Index(major_type_marker))

In [7]:
# Gene annotation table read from a tab-separated file, indexed by its `gene_id`
# column (the index is renamed to `gene` right after loading).
# NOTE(review): absolute, user-specific path — this cell only runs where that file exists.
gene_meta = pd.read_csv(
    '/home/hanliu/project/mouse_rostral_brain/study/ClusterMethylMarker/gencode.vM22.annotation.gene.flat.filtered_white_genes.tsv.gz',
    index_col='gene_id',
    sep='\t')
gene_meta.index.name = 'gene'
# Reverse lookup: gene name -> gene id. `Series.items()` replaces `iteritems()`, which
# was removed in pandas 2.0. If a name occurs more than once, the last id wins.
gene_name_to_id = {name: gene_id for gene_id, name in gene_meta['gene_name'].items()}
# Lookup from the part of the id before the first '.' to the full id.
gene_idbase_to_id = {gene_id.split('.')[0]: gene_id for gene_id in gene_meta.index}

In [8]:
# Restrict the annotation table to the marker genes and save that subset as CSV.
marker_meta = gene_meta.loc[total_marker_gene]
marker_meta.to_csv('TotalUniquePairwiseMarker.csv')

# Marker gene ids truncated at the first '.', in the same order as marker_meta's index.
marker_id_bases = marker_meta.index.map(lambda gene_id: gene_id.split('.', 1)[0])

## Overlap with SynGO

In [9]:
import json

# Parse the two SynGO JSON files (`bp_...` and `cc_...`) into Python objects.
syngo_bp = json.loads(
    pathlib.Path('/home/hanliu/ref/SynGO/bp_syngo_to_mouse_gene.json').read_text())
syngo_cc = json.loads(
    pathlib.Path('/home/hanliu/ref/SynGO/cc_syngo_to_mouse_gene.json').read_text())

In [10]:
# Pool every gene listed under any key of the two SynGO mappings into one
# de-duplicated collection.
syngo_genes = set()
for gene_lists in (syngo_bp, syngo_cc):
    for genes in gene_lists.values():
        syngo_genes.update(genes)

syngo_genes = pd.Index(syngo_genes)

# SynGO genes that are also marker genes (compared on the id part before the first '.').
# `Index.intersection` is the explicit set operation; the `&` operator on an Index was
# deprecated as a set-intersection alias and is a logical operation in pandas >= 2.0.
hit_syngo_genes = syngo_genes.intersection(marker_id_bases)
# SynGO genes absent from the markers, kept in syngo_genes order via a vectorised mask.
miss_syngo_genes = syngo_genes[~syngo_genes.isin(marker_id_bases)]


In [11]:
# Number of SynGO genes found among the markers vs. not found.
len(hit_syngo_genes), len(miss_syngo_genes)

(839, 285)

In [27]:
# Write the full marker gene ids whose '.'-stripped base is a SynGO hit, one per line.
with open('TotalHitSynGOID.txt', 'w') as f:
    is_syngo_hit = marker_meta.index.map(lambda i: i.split('.')[0] in hit_syngo_genes)
    hit_syngo_ids = marker_meta.index[is_syngo_hit]
    f.writelines(f'{g}\n' for g in hit_syngo_ids)

## Overlap with TF

In [14]:
# Load the marker annotation table from 'TotalUniquePairwiseMarker.csv', using its
# first column as the index.
marker_meta = pd.read_csv(
    'TotalUniquePairwiseMarker.csv',
    index_col=0,
)

In [15]:
# First column of the header-less TFCat file, taken as an Index of TF gene ids.
mouse_tf_gene = pd.read_csv('/home/hanliu/ref/TFGene/TFCat/Mouse.RIKEN.TFGene.EnsemblID.txt',
                            header=None, index_col=0).index
tfclass_tf_gene = pd.read_csv('/home/hanliu/ref/TFGene/TFClass/TFClass.with_mouse_gene_id.csv')
# An `EnsemblID` cell may hold several comma-separated ids; non-string cells are skipped.
tf_class_ids = pd.Index({
    g
    for gs in tfclass_tf_gene['EnsemblID']
    if isinstance(gs, str)
    for g in gs.split(',')
})
# `Index.union` is the explicit set union; the `|` operator on an Index was deprecated
# as a set-union alias and is a logical operation in pandas >= 2.0.
_total_idbase = tf_class_ids.union(mouse_tf_gene)
# Keep the gene_meta ids whose part before the first '.' is a known TF id. The Index is
# masked directly instead of materialising a filtered copy of the whole DataFrame.
total_ids = gene_meta.index[
    gene_meta.index.map(lambda i: i.split('.')[0]).isin(_total_idbase)]


In [21]:
# Marker gene ids that also appear in the TF id list, kept in marker_meta's index order.
is_tf = marker_meta.index.isin(total_ids)
hit_tfs = marker_meta.index[is_tf]
# Number of TF markers vs. number of TF ids overall.
len(hit_tfs), len(total_ids)

(1247, 1920)

In [22]:
# Write the TF marker gene ids to a text file, one id per line.
with open('TotalHitTFID.txt', 'w') as f:
    f.writelines(f'{g}\n' for g in hit_tfs)