In [1]:
import numpy as np
import os
import sys
import collections
import matplotlib.pyplot as plt
import gzip
import pybedtools
from pybedtools import BedTool
import warnings

sys.path.append('/home/camiel/chromograph/')
import chromograph
from chromograph.peak_calling.utils import *

import cytograph as cg
from typing import *

import logging
# Configure the root logger once for the whole notebook: INFO level,
# "HH:MM:SS LEVEL    message" lines.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s',
)
# Handle on the root logger (the same object the logging.info(...) calls use).
logger = logging.getLogger()

In [2]:
# Base directory of the cistrome data used in this notebook.
cdir = '/data/proj/scATAC/chromograph/cistromes/cistrome_hg38'
# The motif directory is whatever entry os.listdir(cdir) returns at index 2.
# NOTE(review): os.listdir returns entries in arbitrary order, and the last cell of
# this notebook saves 'cismotifs_merge.bed' into this same directory, so index 2
# may resolve to a different entry on another machine or on a re-run.
# TODO confirm the intended sub-directory and name it explicitly.
motifdir = os.path.join(cdir, os.listdir(cdir)[2])

In [11]:
# Every file in the motif directory; the text before the first '.' of a file
# name is used as the TF identifier.
files = os.listdir(motifdir)
TFs = np.unique([fname.split('.')[0] for fname in files])

# A TF is kept when a bed file with reliability grade A, B or C exists for it.
valid = []
for TF in TFs:
    if any(f"{TF}.{grade}.bed" in files for grade in ('A', 'B', 'C')):
        valid.append(TF)

# Everything else is reported as discarded.
discarded = [name for name in TFs if name not in valid]
logging.info(f"TFs kept: {len(valid)}   TFs discarded: {len(discarded)}")

14:09:53 INFO     TFs kept: 372   TFs discarded: 72


In [12]:
from pybedtools.featurefuncs import *

def add_gene(f, gene: str):
    """
    Set the name of a feature and hand the feature back.

    Written to be used with ``BedTool.each``.

    Args:
        f:      Feature whose ``name`` attribute is overwritten (modified in place).
        gene:   Value to store as the feature name.

    Returns:
        The same feature object ``f``, after the update.
    """
    f.name = gene
    return f

def add_reliability(f, x: int):
    """
    Store a reliability value in the 'score' attribute of a feature.

    Expected mapping from reliability grade to value:
        A -> 3
        B -> 2
        C -> 1
        D -> 0

    Args:
        f:  Feature whose ``score`` attribute is overwritten (modified in place).
        x:  Reliability value to store.

    Returns:
        The same feature object ``f``, after the update.
    """
    f.score = x
    return f

In [None]:
## Merge beds to reference of TF binding sites

# Reliability grade (file-name suffix) -> value written to the bed 'score' column.
rel_dict = {'A': 3, 'B': 2, 'C': 1, 'D': 0}

rel = {}     # TF -> number of graded bed files that were merged for it
parts = []   # one annotated BedTool per input file, concatenated once below

# Iterate over `valid` (TFs that have at least one A/B/C file) rather than all
# TFs: the merged result is the same, but the progress counter now matches the
# "out of len(valid)" denominator in the log message.
for i, TF in enumerate(valid, start=1):
    for x in ['A', 'B', 'C']:
        if f"{TF}.{x}.bed" not in files:
            continue
        cis = BedTool(os.path.join(motifdir, f"{TF}.{x}.bed"))
        # Pad every feature to at least 5 fields, then write the TF name (text
        # before the first '_') and the reliability score; saveas() materialises
        # the lazy each() chain into a temp file.
        cis = cis.each(extend_fields, 5).each(add_gene, TF.split('_')[0]).each(add_reliability, rel_dict[x]).saveas()
        parts.append(cis)
        rel[TF] = rel.get(TF, 0) + 1
    if i % 10 == 0:
        logging.info(f"Finished {i} out of {len(valid)}")

if not parts:
    raise RuntimeError(f"No A/B/C bed files found in {motifdir}; nothing to merge")

# Concatenate once at the end. Calling cat()+saveas() inside the loop rewrites
# the whole growing file for every input, which makes the cell quadratic.
# postmerge=False keeps individual features instead of merging overlaps.
if len(parts) == 1:
    M = parts[0]  # cat() needs at least one other BedTool
else:
    M = parts[0].cat(*parts[1:], postmerge=False).saveas()

logging.info(f"Total length: {len(M)} for {len(rel)} Motifs ")
M.head()

16:28:26 INFO     Finished 10 out of 372
16:29:29 INFO     Finished 20 out of 372
16:30:50 INFO     Finished 30 out of 372
16:33:05 INFO     Finished 40 out of 372
16:35:45 INFO     Finished 50 out of 372
16:38:42 INFO     Finished 60 out of 372
16:42:26 INFO     Finished 70 out of 372
16:47:39 INFO     Finished 80 out of 372
16:52:08 INFO     Finished 90 out of 372
16:58:06 INFO     Finished 100 out of 372


In [None]:
# Write the merged TF binding-site reference to disk.
merged_bed_path = '/data/proj/scATAC/chromograph/cistromes/cistrome_hg38/cismotifs_merge.bed'
M.saveas(merged_bed_path)