In [1]:
import numpy as np
import os
import sys
import collections
import matplotlib.pyplot as plt
import gzip
import pybedtools
from pybedtools import BedTool
import warnings

sys.path.append('/home/camiel/chromograph/')
import chromograph
from chromograph.peak_calling.utils import *

import cytograph as cg
from typing import *

import logging
# Configure the root logger once for the whole notebook: INFO level,
# messages prefixed with a wall-clock timestamp (HH:MM:SS) and a
# left-aligned, 8-character level name.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format='%(asctime)s %(levelname)-8s %(message)s')
# Handle on the root logger (same object that logging.info() writes to).
logger = logging.getLogger()

In [2]:
# Root folder of the hg38 cistrome collection.
cdir = '/data/proj/scATAC/chromograph/cistromes/cistrome_hg38'

# The cistrome and motif sub-folders are picked by their position in the
# directory listing (second and third entry respectively).
# NOTE(review): os.listdir() returns entries in arbitrary order, so these
# positional picks are not guaranteed to be stable across machines or
# filesystems -- consider naming the sub-folders explicitly.
_cdir_entries = os.listdir(cdir)
cisdir = os.path.join(cdir, _cdir_entries[1])
motifdir = os.path.join(cdir, _cdir_entries[2])

In [4]:
# Inventory of the files in the cistrome folder and the unique name
# prefixes (text before the first '.') found there.
files = os.listdir(cisdir)
TFs = np.unique([x.split('.')[0] for x in files])

# NOTE(review): `genes` is not defined in any cell visible in this notebook;
# it must already exist in the kernel for this cell to run. Define it
# explicitly above so the notebook survives "Restart & Run All".

# Set copy of the listing for O(1) filename lookups; `files` itself stays a
# list because later cells use it.
_available = set(files)

# Keep every gene that has at least one graded BED file (A, B or C);
# order and duplicates of `genes` are preserved.
valid = [
    gene for gene in genes
    if any(f"{gene}.{grade}.bed" in _available for grade in ('A', 'B', 'C'))
]

# Everything that did not make it into `valid`, in the original order.
_kept = set(valid)
discarded = [x for x in genes if x not in _kept]
logging.info(f"TFs kept: {len(valid)}   TFs discarded: {len(discarded)}")

11:12:49 INFO     Genes kept: 496   Genes discarded: 103


In [6]:
from pybedtools.featurefuncs import *

def add_gene(f, gene):
    """
    Label a feature with a gene name.

    Overwrites the ``name`` attribute of ``f`` with ``gene`` and hands the
    same (mutated) feature back, so the function can be used as a
    per-feature callback.
    """
    f.name = gene
    return f

def add_reliability(f, x):
    """
    Store a reliability value in the ``score`` attribute of a feature.

    ``x`` is written to ``f.score`` unchanged and the same (mutated)
    feature is returned. Callers are expected to pass the numeric
    equivalent of the reliability grade:

        A -> 3
        B -> 2
        C -> 1
        D -> 0
    """
    f.score = x
    return f

In [None]:
## Merge beds to reference of TF binding sites
#
# For every TF in `valid`, each available graded BED file (A/B/C) is loaded,
# its features are tagged with the TF name and a numeric reliability score,
# and the result is concatenated into one combined BedTool `M`.

# Reliability grade -> numeric score written into the BED 'score' column.
rel_dict = {'A': 3, 'B': 2, 'C': 1, 'D': 0}

rel = {}   # TF -> number of graded BED files that were merged for it
i = 0      # number of TFs processed so far (kept defined even if `valid` is empty)

# Start from an explicit empty state. Relying on a NameError to detect the
# first iteration meant that (a) re-running this cell appended to the `M`
# left over from the previous run, and (b) any real failure inside
# `M.cat(...)` was swallowed and silently reset `M` to the current file.
M = None

# NOTE(review): filenames are checked against `files` (the listing of
# `cisdir`) but are read from `motifdir` -- confirm this is the intended
# directory.
# NOTE(review): BedTool.cat() post-merges overlapping features by default
# (postmerge=True); this may discard the name/score columns added below --
# confirm, and pass postmerge=False if per-TF annotations must survive.

for i, TF in enumerate(valid, start=1):
    for x in ['A','B','C']:
        if f"{TF}.{x}.bed" in files:
            cis = BedTool(os.path.join(motifdir, f"{TF}.{x}.bed")).saveas()
            # Pad to 5 fields, then fill in name (TF) and score (reliability).
            cis = cis.each(extend_fields, 5).each(add_gene, TF).each(add_reliability, rel_dict[x]).saveas()
            # First file seeds the collection; later ones are concatenated.
            # Errors from cat() now propagate instead of being swallowed.
            M = cis if M is None else M.cat(cis).saveas()
            rel[TF] = rel.get(TF, 0) + 1
    if i%10 == 0:
        logging.info(f"Finished {i} out of {len(valid)}")

# Report 0 rather than crashing when nothing was merged.
total_length = len(M) if M is not None else 0
logging.info(f"Total length: {total_length} for {len(rel)} Motifs ")

11:13:32 INFO     Finished 10 out of 496
11:14:40 INFO     Finished 20 out of 496
11:15:35 INFO     Finished 30 out of 496
11:16:52 INFO     Finished 40 out of 496
11:18:06 INFO     Finished 50 out of 496
11:18:31 INFO     Finished 60 out of 496
11:19:09 INFO     Finished 70 out of 496
11:20:10 INFO     Finished 80 out of 496
11:21:14 INFO     Finished 90 out of 496
11:22:59 INFO     Finished 100 out of 496
11:24:15 INFO     Finished 110 out of 496
11:26:09 INFO     Finished 120 out of 496
11:28:28 INFO     Finished 130 out of 496
11:31:43 INFO     Finished 140 out of 496
11:33:22 INFO     Finished 150 out of 496
11:34:43 INFO     Finished 160 out of 496
11:36:37 INFO     Finished 170 out of 496
11:39:01 INFO     Finished 180 out of 496
11:40:35 INFO     Finished 190 out of 496
11:42:00 INFO     Finished 200 out of 496
11:43:32 INFO     Finished 210 out of 496
11:45:48 INFO     Finished 220 out of 496
11:48:26 INFO     Finished 230 out of 496
11:51:18 INFO     Finished 240 out of 496
1