In [3]:
# necessary functions
# cr: CRCmapper
def parseTable(fn, sep, header = False,excel = False):
    r'''Read a delimited text file into a nested list such that table[row][col].

    Parameters:
        fn     -- path of the file to read
        sep    -- column delimiter handed to str.split (e.g. '\t')
        header -- when true, the first line is skipped
        excel  -- accepted for backward compatibility only; carriage-return
                  line endings are already normalised by text-mode reading,
                  so the flag no longer changes the result

    Returns a list of rows, each row being a list of column strings.
    An empty file yields an empty list.

    example call:
    table = parseTable('file.txt','\t')
    '''
    # Text mode uses universal newlines, so '\n', '\r\n' and bare '\r' all
    # arrive as '\n'.  Only that terminator is stripped, which keeps the last
    # character of a final line that has no trailing newline.
    with open(fn) as fh:
        lines = [line.rstrip('\n') for line in fh]

    if header:
        lines = lines[1:]

    return [line.split(sep) for line in lines]


def unParseTable(table, output, sep):
    r'''Write a table (as produced by parseTable) to a text file, one row per line.

    Parameters:
        table  -- iterable of rows
        output -- path of the file to create or overwrite
        sep    -- delimiter placed between the columns of a row; when it is
                  empty each row is written as str(row) instead of being joined

    example call unParseTable(table, 'table.txt', '\t') for a tab del file
    '''
    # the context manager closes the file even when a row fails to serialise
    with open(output,'w') as fh_out:
        if len(sep) == 0:
            for row in table:
                fh_out.write(str(row))
                fh_out.write('\n')
        else:
            for row in table:
                fh_out.write(sep.join([str(x) for x in row]))
                fh_out.write('\n')
    

'''uniquify function by Peter Bengtsson Used under a creative commons license
sourced from  here: http://www.peterbe.com/plog/uniqifiers-benchmark
'''
def uniquify(seq, idfun=None):
    '''Return the items of seq in first-seen order with later duplicates dropped.

    Two items count as duplicates when idfun(item) gives equal, hashable keys;
    without idfun the item itself is the key.
    '''
    keyOf = idfun if idfun is not None else (lambda value: value)
    alreadySeen = set()
    unique = []
    for element in seq:
        key = keyOf(element)
        if key not in alreadySeen:
            alreadySeen.add(key)
            unique.append(element)
    return unique


def makeSearchLocus(locus,upSearch,downSearch):
    '''Return a new Locus that widens locus by upSearch / downSearch bases.

    The new locus keeps the chromosome, strand and ID of the input.
    '''
    strand = locus.sense()
    # for a '-' locus the start is lowered by downSearch and the end raised by
    # upSearch; every other strand uses the opposite assignment
    if strand == '-':
        leftPad, rightPad = downSearch, upSearch
    else:
        leftPad, rightPad = upSearch, downSearch
    return Locus(locus.chr(), locus.start() - leftPad, locus.end() + rightPad, strand, locus.ID())


def importRefseq(refseqFile, returnMultiples = False):
    '''Load a tab-delimited refseq table and index its rows by column 1.

    Returns (refseqTable, refseqDict), where refseqDict maps each value of
    column 1 to the list of row indices in refseqTable holding it.  With
    returnMultiples == True a third element is returned: the keys that occur
    on more than one row.
    '''
    refseqTable = parseTable(refseqFile,'\t')

    refseqDict = {}
    # row 0 is skipped; counting from 1 keeps the stored indices aligned with refseqTable
    for rowIndex, row in enumerate(refseqTable[1:], 1):
        refseqDict.setdefault(row[1], []).append(rowIndex)

    multiples = [key for key, rowIndices in refseqDict.items() if len(rowIndices) > 1]

    if returnMultiples == True:
        return refseqTable,refseqDict,multiples
    return refseqTable,refseqDict

    
def getTSSs(geneList,refseqTable,refseqDict):
    '''Return the transcription start sites of the requested transcripts as a list of ints.

    Parameters:
        geneList    -- keys to look up through refseqFromKey; when empty,
                       every row of refseqTable is used
        refseqTable -- table as returned by importRefseq
        refseqDict  -- key -> row-index mapping as returned by importRefseq

    For each row the strand is read from column 3: '+' rows contribute
    column 4, '-' rows contribute column 5, and rows with any other strand
    value are skipped.

    A real list is returned (not a lazy map object) so callers can index it,
    take its length and iterate it more than once.
    '''
    if len(geneList) == 0:
        refseq = refseqTable
    else:
        refseq = refseqFromKey(geneList,refseqDict,refseqTable)
    TSS = []
    for line in refseq:
        if line[3] == '+':
            TSS.append(int(line[4]))
        elif line[3] == '-':
            TSS.append(int(line[5]))
    return TSS


def refseqFromKey(refseqKeyList,refseqDict,refseqTable):
    '''Return, for every key of refseqKeyList present in refseqDict, the first
    refseqTable row registered for that key; unknown keys are skipped.
    '''
    return [refseqTable[refseqDict[key][0]] for key in refseqKeyList if key in refseqDict]


def makeStartDict(annotFile,geneList = None):
    '''makes a dictionary keyed by refseq ID that contains information about
    chrom/start/stop/strand/common name

    Parameters:
        annotFile -- path of the annotation table; it is only parsed when its
                     name contains 'refseq' (case-insensitive)
        geneList  -- IDs to include, or the path of a tab-delimited file whose
                     first column holds them; empty/None selects every ID in
                     the annotation

    Returns {ID: {'sense', 'chr', 'start', 'end', 'name'}} where 'start' and
    'end' are lists of ints.  IDs missing from the annotation are skipped, and
    an annotation file that is not recognised as refseq gives an empty dict.
    '''
    if geneList is None:
        geneList = []

    if isinstance(geneList, str):
        geneList = parseTable(geneList,'\t')
        geneList = [line[0] for line in geneList]

    # bound up front so the function always has something to return
    startDict = {}

    if 'REFSEQ' in annotFile.upper():
        refseqTable,refseqDict = importRefseq(annotFile)
        if len(geneList) == 0:
            geneList = refseqDict.keys()
        for gene in geneList:
            if gene not in refseqDict:
                continue
            # only the first row registered for an ID is used
            row = refseqTable[refseqDict[gene][0]]
            startDict[gene] = {}
            startDict[gene]['sense'] = row[3]
            startDict[gene]['chr'] = row[2]
            # list() keeps this working whether getTSSs returns a list or an iterator
            startDict[gene]['start'] = list(getTSSs([gene],refseqTable,refseqDict))
            # the end is the opposite coordinate column from the one used as TSS
            if startDict[gene]['sense'] == '+':
                startDict[gene]['end'] = [int(row[5])]
            else:
                startDict[gene]['end'] = [int(row[4])]
            startDict[gene]['name'] = row[12]
    return startDict


def makeTSSLocus(gene,startDict,upstream,downstream):
    '''Build a Locus around the first recorded start of gene in startDict.

    The window reaches upstream / downstream bases from that start; for a '-'
    gene the two distances swap sides.  Genes whose strand is not '-' produce
    a '+' locus.  The gene key is used as the locus ID.
    '''
    info = startDict[gene]
    tss = info['start'][0]
    if info['sense'] == '-':
        return Locus(info['chr'], tss - downstream, tss + upstream, '-', gene)
    return Locus(info['chr'], tss - upstream, tss + downstream, '+', gene)
    
    
def fetchSeq(directory,chrom,start,end,UCSC=False,lineBreaks=True,header = True):
    '''function that fetches a sequence from a genome directory
    directory that contains individual chrom fasta files

    Parameters:
        directory  -- prefix joined directly with chrom + '.fa' (so it must
                      already end with a path separator)
        chrom      -- chromosome name, i.e. the fasta file stem
        start, end -- coordinates of the requested span
        UCSC       -- when true, seeking starts one character later than otherwise
        lineBreaks -- when true, offsets account for one newline after every
                      50 sequence characters and newlines are removed from the result
        header     -- when true, the length of the first line is skipped

    Returns the characters read as a string.
    '''
    fn = directory + chrom + '.fa'
    headerOffset = 0
    nStart = 0
    nEnd = 0
    # the context manager guarantees the handle is released; previously the
    # close() call sat after the return statement and never ran
    with open(fn,'r') as fh:
        if header:
            headerOffset = len(fh.readline())
        if lineBreaks:
            # number of newline characters that precede each coordinate
            nStart = int((start-1)/50)
            nEnd = int((end-1)/50)
        if UCSC:
            fh.seek((start+nStart+headerOffset))
        else:
            fh.seek((start-1+nStart+headerOffset))
        span = ((end+nEnd-1)-(start+nStart-1))
        read = fh.read(span)

    if lineBreaks:
        read = read.replace('\n','')

    return read

    
class Locus:
    '''Genomic interval on one chromosome with a strand ('+', '-' or '.'), an ID and a score.

    start and end are converted to int and stored in ascending order, so the
    two coordinates may be given in either order.  Equality and hashing use
    only chromosome, coordinates and strand; ID and score are ignored.
    '''
    # class-level tables shared by all instances: one stored string per chromosome / strand
    __chrDict = dict()
    __senseDict = {'+':'+', '-':'-', '.':'.'}

    def __init__(self,chr,start,end,sense,ID='',score=0):
        low, high = sorted((int(start), int(end)))
        self._chr = self.__chrDict.setdefault(chr, chr)
        # a strand other than '+', '-' or '.' raises KeyError here
        self._sense = self.__senseDict[sense]
        self._start = low
        self._end = high
        self._ID = ID
        self._score = score

    # ---- plain accessors ----
    def ID(self):
        return self._ID

    def chr(self):
        return self._chr

    def start(self):
        return self._start

    def end(self):
        return self._end

    def len(self):
        # both end coordinates are counted
        return self._end - self._start + 1

    def score(self):
        return self._score

    def coords(self):
        return [self._start,self._end]

    def sense(self):
        return self._sense

    def getAntisenseLocus(self):
        '''Return a Locus on the opposite strand (without ID or score); an unstranded locus returns itself.'''
        if self._sense == '.':
            return self
        opposite = {'+':'-', '-':'+'}[self._sense]
        return Locus(self._chr, self._start, self._end, opposite)

    def _strandCompatible(self,otherLocus):
        # strands are compatible when either locus is unstranded or both agree
        return (self._sense == '.'
                or otherLocus.sense() == '.'
                or self.sense() == otherLocus.sense())

    # ---- interval relations ----
    def overlaps(self,otherLocus):
        '''True when both loci share a chromosome, have compatible strands and intersect.'''
        if self.chr() != otherLocus.chr():
            return False
        if not self._strandCompatible(otherLocus):
            return False
        return self.start() <= otherLocus.end() and otherLocus.start() <= self.end()

    def contains(self,otherLocus):
        '''True when otherLocus lies entirely inside this locus on a compatible strand.'''
        if self.chr() != otherLocus.chr():
            return False
        if not self._strandCompatible(otherLocus):
            return False
        return self.start() <= otherLocus.start() and otherLocus.end() <= self.end()

    def overlapsAntisense(self,otherLocus):
        return self.getAntisenseLocus().overlaps(otherLocus)

    def containsAntisense(self,otherLocus):
        return self.getAntisenseLocus().contains(otherLocus)

    # ---- identity ----
    def __hash__(self):
        return self._start + self._end

    def __eq__(self,other):
        if self.__class__ != other.__class__:
            return False
        mine = (self.chr(), self.start(), self.end(), self.sense())
        theirs = (other.chr(), other.start(), other.end(), other.sense())
        return mine == theirs

    def __ne__(self,other):
        return not self.__eq__(other)

    # ---- text forms ----
    def __str__(self):
        span = '-'.join(str(position) for position in self.coords())
        return ''.join([self.chr(), '(', self.sense(), '):', span])

    def plotStr(self):
        span = '-'.join(str(position) for position in self.coords())
        return ''.join([self.chr(), ':', self.sense(), ':', span])

    def checkRep(self):
        pass

    def gffLine(self):
        '''Return the locus as a nine-element list with the ID in positions 1 and 8.'''
        return [self.chr(), self.ID(), '', self.start(), self.end(), '', self.sense(), '', self.ID()]
    
    
class LocusCollection:
    '''Set of loci indexed by chromosome, strand and fixed-size coordinate window.

    Every locus is registered under each window index its span touches, so a
    query only inspects loci that share a window with it.  Unstranded ('.')
    loci are registered on both strands.

    All query methods return plain lists (snapshots), so callers may index
    them and may add or remove loci while iterating over a result.
    '''

    def __init__(self,loci,windowSize):
        # chr+strand key -> window index -> list of loci touching that window
        self.__chrToCoordToLoci = dict()
        # used as an insertion-ordered set of all loci
        self.__loci = dict()
        self.__winSize = windowSize
        for lcs in loci: self.__addLocus(lcs)

    def __addLocus(self,lcs):
        # a locus equal to one already stored is ignored
        if lcs in self.__loci:
            return
        self.__loci[lcs] = None
        if lcs.sense()=='.': chrKeyList = [lcs.chr()+'+', lcs.chr()+'-']
        else: chrKeyList = [lcs.chr()+lcs.sense()]
        for chrKey in chrKeyList:
            windows = self.__chrToCoordToLoci.setdefault(chrKey, dict())
            for n in self.__getKeyRange(lcs):
                windows.setdefault(n, []).append(lcs)

    def __getKeyRange(self,locus):
        # window indices covered by the locus, first to last inclusive
        start = int(locus.start() / self.__winSize)
        end = int(locus.end() / self.__winSize) + 1
        return range(start,end)

    def __len__(self): return len(self.__loci)

    def append(self,new): self.__addLocus(new)

    def extend(self,newList):
        for lcs in newList: self.__addLocus(lcs)

    def hasLocus(self,locus):
        return locus in self.__loci

    def remove(self,old):
        '''Remove a locus; raises ValueError when it is not in the collection.'''
        if old not in self.__loci: raise ValueError("requested locus isn't in collection")
        del self.__loci[old]
        if old.sense()=='.': senseList = ['+','-']
        else: senseList = [old.sense()]
        for k in self.__getKeyRange(old):
            for sense in senseList:
                self.__chrToCoordToLoci[old.chr()+sense][k].remove(old)

    def getWindowSize(self): return self.__winSize

    def getLoci(self):
        '''Return a list snapshot of all loci in insertion order.'''
        return list(self.__loci)

    def getChrList(self):
        '''Return the distinct chromosome names (index keys without their strand character).'''
        return list(dict.fromkeys(k[:-1] for k in self.__chrToCoordToLoci))

    def __subsetHelper(self,locus,sense):
        # Collect every stored locus that shares a window with `locus` on the
        # strands selected by `sense`; callers still apply the exact test.
        sense = sense.lower()
        if sense not in ('sense','antisense','both'):
            raise ValueError("sense command invalid: '"+sense+"'.")
        if locus.sense()=='.' or sense=='both':
            strands = ['+','-']
        elif sense=='sense':
            strands = [s for s in ('+','-') if s==locus.sense()]
        else:
            strands = [s for s in ('+','-') if s!=locus.sense()]
        matches = dict()
        for s in strands:
            windows = self.__chrToCoordToLoci.get(locus.chr()+s)
            if windows is None:
                continue
            for n in self.__getKeyRange(locus):
                for lcs in windows.get(n, ()):
                    matches[lcs] = None
        return list(matches)

    def __filterMatches(self,locus,sense,senseTest,antisenseTest):
        # Shared body of getOverlap / getContained / getContainers.  `sense` is
        # lower-cased here as well, so a mixed-case value that passes validation
        # in __subsetHelper also selects the matching branch below.
        sense = sense.lower()
        candidates = self.__subsetHelper(locus,sense)
        realMatches = dict()
        if sense=='sense' or sense=='both':
            for lcs in candidates:
                if senseTest(lcs): realMatches[lcs] = None
        if sense=='antisense' or sense=='both':
            for lcs in candidates:
                if antisenseTest(lcs): realMatches[lcs] = None
        return list(realMatches)

    def getOverlap(self,locus,sense='sense'):
        '''Return the stored loci that overlap locus ('sense', 'antisense' or 'both').'''
        return self.__filterMatches(locus,sense,
                                    lambda lcs: lcs.overlaps(locus),
                                    lambda lcs: lcs.overlapsAntisense(locus))

    def getContained(self,locus,sense='sense'):
        '''Return the stored loci that lie entirely inside locus.'''
        return self.__filterMatches(locus,sense,
                                    lambda lcs: locus.contains(lcs),
                                    lambda lcs: locus.containsAntisense(lcs))

    def getContainers(self,locus,sense='sense'):
        '''Return the stored loci that entirely contain locus.'''
        return self.__filterMatches(locus,sense,
                                    lambda lcs: lcs.contains(locus),
                                    lambda lcs: lcs.containsAntisense(locus))

    def stitchCollection(self,stitchWindow=1,sense='both'):
        '''Merge loci lying within stitchWindow of each other into a new collection.

        Each resulting locus gets the ID '<count>_<first ID>_lociStitched',
        where count is the number of loci merged into it.  With sense 'both'
        merged loci become unstranded.
        '''
        sense = sense.lower()
        locusList = self.getLoci()
        # working copy that is consumed while stitching; 500 is only the bucket size
        oldCollection = LocusCollection(locusList,500)
        stitchedCollection = LocusCollection([],500)
        for locus in locusList:
            # already absorbed into an earlier stitched locus
            if not oldCollection.hasLocus(locus):
                continue
            oldCollection.remove(locus)
            overlappingLoci = oldCollection.getOverlap(Locus(locus.chr(),locus.start()-stitchWindow,locus.end()+stitchWindow,locus.sense(),locus.ID()),sense)
            stitchTicker = 1
            # keep growing until the widened locus reaches no further loci
            while len(overlappingLoci) > 0:
                stitchTicker+=len(overlappingLoci)
                overlapCoords = locus.coords()
                for overlappingLocus in overlappingLoci:
                    overlapCoords+=overlappingLocus.coords()
                    oldCollection.remove(overlappingLocus)
                if sense == 'both':
                    locus = Locus(locus.chr(),min(overlapCoords),max(overlapCoords),'.',locus.ID())
                else:
                    locus = Locus(locus.chr(),min(overlapCoords),max(overlapCoords),locus.sense(),locus.ID())
                overlappingLoci = oldCollection.getOverlap(Locus(locus.chr(),locus.start()-stitchWindow,locus.end()+stitchWindow,locus.sense()),sense)
            # NOTE: when nothing was merged, `locus` is still the object stored in
            # this collection, so its ID is rewritten in place
            locus._ID = '%s_%s_lociStitched' % (stitchTicker,locus.ID())
            stitchedCollection.append(locus)
        return stitchedCollection

In [4]:
from subprocess import call
import networkx as nx
from math import log

In [5]:
# Input files and fixed parameters shared by the analysis cells below.

## Annotation table used to build refseqToNameDict
annotationFile = './CRCmapper/CRCmapper_package/annotation/hg19_refseq_NM.ucsc'
annotTable = parseTable(annotationFile, '\t')

## refseqToNameDict: column 1 of each annotation row -> upper-cased column 12
## (row 0 of the table is skipped)
refseqToNameDict = {}
for line in annotTable[1:]:
    gid = line[1]
    genename = line[12].upper()
    refseqToNameDict[gid] = genename
# print(dict(list(refseqToNameDict.items())[0:5]))

## SuperTable: ROSE "AllEnhancers" table, later passed to createSuperLoci
superFile = './ROSE/2.rose_Beta_narrow.05_broad.1_incltss/Beta_peaks_AllEnhancers.table.txt'
superTable = parseTable(superFile, '\t')

## Disabled alternative: use a TAD table in place of the SuperTable
# tadFile = './hg19.GSE63525_GM12878_50K.TopDom_10.bed'
# tadTable = parseTable(tadFile, '\t')
# for i in range(1, len(tadTable)+1):
#     tadTable[i-1].append(f'hg19_TAD_{i}')
### columns: chr start end type name
## 3139

## enhancerNumber (disabled here)
## enhancerNumber = 500

## expressionTable (disabled alternatives)
#expressionFile = './CRCmapper/2-2.crc_Beta_narrow.05_broad.1_incltss_cutoff500/matrix.gff'
# expressionFile = './CRCmapper/1-1.crc_HI-32_K27ac_incltss/matrix.gff'
#expressionTable = parseTable(expressionFile, '\t')

## TF list: column 0 -> TFlist, column 1 -> TFlistGene
TFfile = './CRCmapper/CRCmapper_package/TFlist_NMid_hg.txt'
TFtable = parseTable(TFfile, '\t')
TFlist = [line[0] for line in TFtable]
TFlistGene = [line[1] for line in TFtable]

## subpeaks: peak file path
subpeaks = './ChIP-seq/Meissner/macs2_Beta_narrow.05_broad.1/Beta_peaks.broadPeak'
# subpeaks = './crup/islets.HI-32.CRUP.singleEnh.bed'

## genomeDirectory: FASTA files (DNA sequences)
genomeDirectory = './CRCmapper/hg19/'

## motifExtension
motifExtension = 500

## expCutoff: top 2/3 of the genes are considered to be expressed (disabled here)
## expCutoff = 33

## Motif-name conversion table; example entry: Transfac.v$ZIC2_01 ZIC2
motifConvertFile = './CRCmapper/CRCmapper_package/MotifDictionary.txt'
# motifDatabase = parseTable(motifConvertFile, '\t')

## PWM file (motifDatabaseFile)
PWMfile = './CRCmapper/CRCmapper_package/VertebratePWMs.txt'

In [85]:
'''
# 2-2. Beta_narrow.05_broad.1_incltss_cutoff500_ext500
## enhancerNumber
Enumber = 500
## motifExtension
motifExtension = 500
## working directory
projectFolder = f'./CRCmapper/2-2-TRAP.crc_Beta_narrow.05_broad.1_incltss_cutoff{Enumber}_ext{motifExtension}/'
projectName = 'Beta'
## SuperTable
superFile = './ROSE/2.rose_Beta_narrow.05_broad.1_incltss/Beta_peaks_AllEnhancers.table.txt'
superTable = parseTable(superFile, '\t')
## subpeak
subpeaks = './ChIP-seq/Meissner/macs2_Beta_narrow.05_broad.1/Beta_peaks.broadPeak'
## expressionTable
expressedNMfile = f'./CRCmapper/2-2.crc_Beta_narrow.05_broad.1_incltss_cutoff{Enumber}_ext{motifExtension}/Beta_EXPRESSED_TRANSCRIPTS.txt'
'''

#'''
# Active configuration: 2-2. Beta_narrow.05_broad.1_incltss_cutoff500_ext0
# (the neighbouring triple-quoted blocks are alternative configurations kept
#  inert as string literals; the commented-out quote markers enable this one)
## enhancerNumber: passed to createSuperLoci as Enumber
Enumber = 500
## motifExtension
motifExtension = 0
## working directory: prefix for the output files written by later cells
projectFolder = f'./CRCmapper/2-2-TRAP.crc_Beta_narrow.05_broad.1_incltss_cutoff{Enumber}_ext{motifExtension}/'
projectName = 'Beta'
## SuperTable
superFile = './ROSE/2.rose_Beta_narrow.05_broad.1_incltss/Beta_peaks_AllEnhancers.table.txt'
superTable = parseTable(superFile, '\t')
## subpeak
subpeaks = './ChIP-seq/Meissner/macs2_Beta_narrow.05_broad.1/Beta_peaks.broadPeak'
## expressionTable
# NOTE(review): this path hard-codes ext500 although motifExtension is 0 here;
# presumably the expressed-transcript list of the ext500 run is reused - confirm
expressedNMfile = f'./CRCmapper/2-2.crc_Beta_narrow.05_broad.1_incltss_cutoff{Enumber}_ext500/Beta_EXPRESSED_TRANSCRIPTS.txt'
#'''


'''# 1-1. CRUP_HI-32_K27ac_incltss_cutoffSE_ext500
## enhancerNumber
Enumber = 'SE'
## motifExtension
motifExtension = 500
## working directory
projectFolder = f'./CRCmapper/1-1-TRAP.crc_CRUP_HI-32_K27ac_incltss_cutoff{Enumber}_ext{motifExtension}/'
projectName = 'CRUP_HI-32_K27ac'
## SuperTable
superFile = './ROSE/1-1.rose_CRUP_HI-32_K27ac_incltss/islets_AllEnhancers.table.txt'
superTable = parseTable(superFile, '\t')
## subpeak
subpeaks = './crup/islets.HI-32.CRUP.singleEnh.bed'
## expressionTable
expressedNMfile = f'./CRCmapper/1-1.crc_CRUP_HI-32_K27ac_incltss_cutoff{Enumber}_ext{motifExtension}/H3K27ac_HI-32_EXPRESSED_TRANSCRIPTS.txt'
'''

'''
# 1-1. CRUP_HI-32_K27ac_incltss_cutoffSE_ext0
## enhancerNumber
Enumber = 'SE'
## motifExtension
motifExtension = 0
## working directory
projectFolder = f'./CRCmapper/1-1-TRAP.crc_CRUP_HI-32_K27ac_incltss_cutoff{Enumber}_ext{motifExtension}/'
projectName = 'CRUP_HI-32_K27ac'
## SuperTable
superFile = './ROSE/1-1.rose_CRUP_HI-32_K27ac_incltss/islets_AllEnhancers.table.txt'
superTable = parseTable(superFile, '\t')
## subpeak
subpeaks = './crup/islets.HI-32.CRUP.singleEnh.bed'
## expressionTable
expressedNMfile = f'./CRCmapper/1-1.crc_CRUP_HI-32_K27ac_incltss_cutoff{Enumber}_ext{motifExtension}/H3K27ac_HI-32_EXPRESSED_TRANSCRIPTS.txt'
'''

'''
# 2-1. macs2_HI-32_K27ac_narrow.05_broad.05_incltss_cutoffSE_ext0
## enhancerNumber
Enumber = 'SE'
## motifExtension
motifExtension = 0
## working directory
projectFolder = f'./CRCmapper/2-1-TRAP.crc_macs2_HI-32_K27ac_narrow.05_broad.05_incltss_cutoff{Enumber}_ext{motifExtension}/'
projectName = 'macs2_HI-32_K27ac'
## SuperTable
superFile = './ROSE/2-1.rose_macs2_HI-32_K27ac_narrow.05_broad.05_incltss/H3K27ac_HI-32_peaks_AllEnhancers.table.txt'
superTable = parseTable(superFile, '\t')
## subpeak
subpeaks = './ChIP-seq/Meissner/macs2_HI-32_K27ac_narrow.05_broad.05/H3K27ac_HI-32_peaks.broadPeak'
## expressionTable
expressedNMfile = f'./CRCmapper/2-1.crc_macs2_HI-32_K27ac_narrow.05_broad.05_incltss_cutoff{Enumber}_ext{motifExtension}/H3K27ac_HI-32_EXPRESSED_TRANSCRIPTS.txt'
'''

'''
# 2-1. macs2_HI-32_K27ac_narrow.05_broad.05_incltss_cutoffSE_ext500
## enhancerNumber
Enumber = 'SE'
## motifExtension
motifExtension = 500
## working directory
projectFolder = f'./CRCmapper/2-1-TRAP.crc_macs2_HI-32_K27ac_narrow.05_broad.05_incltss_cutoff{Enumber}_ext{motifExtension}/'
projectName = 'macs2_HI-32_K27ac'
## SuperTable
superFile = './ROSE/2-1.rose_macs2_HI-32_K27ac_narrow.05_broad.05_incltss/H3K27ac_HI-32_peaks_AllEnhancers.table.txt'
# superTable = parseTable(superFile, '\t')
## subpeak
subpeaks = './ChIP-seq/Meissner/macs2_HI-32_K27ac_narrow.05_broad.05/H3K27ac_HI-32_peaks.broadPeak'
## expressionTable
expressedNMfile = f'./CRCmapper/2-1.crc_macs2_HI-32_K27ac_narrow.05_broad.05_incltss_cutoff{Enumber}_ext0/H3K27ac_HI-32_EXPRESSED_TRANSCRIPTS.txt'
'''

'''
# 6-1.crc_DE_narrow.05_broad.05_exclNMtss_cutoffSE_ext500
## enhancerNumber
Enumber = 'SE'
## motifExtension
motifExtension = 500
## working directory
projectFolder = f'./CRCmapper/6-1-TRAP.crc_DE_narrow.05_broad.05_exclNMtss_cutoff{Enumber}_ext{motifExtension}/'
projectName = 'DE'
## SuperTable
superFile = './ROSE/6.rose_DE_narrow.05_broad.05_exclNMtss/DE_peaks_AllEnhancers.table.txt'
superTable = parseTable(superFile, '\t')
## subpeak
subpeaks = './ChIP-seq/Meissner/macs2_DE_narrow.05_broad.05/DE_peaks.broadPeak'
## expressionTable
expressedNMfile = f'./CRCmapper/6-1.crc_DE_narrow.05_broad.05_exclNMtss_cutoff{Enumber}_ext{motifExtension}/DE_H3K27ac_EXPRESSED_TRANSCRIPTS.txt'
'''

'''
# 6-1.crc_DE_narrow.05_broad.05_exclNMtss_cutoffSE_ext0
## enhancerNumber
Enumber = 'SE'
## motifExtension
motifExtension = 0
## working directory
projectFolder = f'./CRCmapper/6-1-TRAP.crc_DE_narrow.05_broad.05_exclNMtss_cutoff{Enumber}_ext{motifExtension}/'
projectName = 'DE'
## SuperTable
superFile = './ROSE/6.rose_DE_narrow.05_broad.05_exclNMtss/DE_peaks_AllEnhancers.table.txt'
superTable = parseTable(superFile, '\t')
## subpeak
subpeaks = './ChIP-seq/Meissner/macs2_DE_narrow.05_broad.05/DE_peaks.broadPeak'
## expressionTable
expressedNMfile = f'./CRCmapper/6-1.crc_DE_narrow.05_broad.05_exclNMtss_cutoff{Enumber}_ext500/DE_H3K27ac_EXPRESSED_TRANSCRIPTS.txt'
'''

'''
# 3-1.crc_CRUP_HI-32_K27ac_incltss_peakCutoff.8_cutoffSE_ext500
## enhancerNumber
Enumber = 'SE'  # 668
## motifExtension
motifExtension = 500
## working directory
projectFolder = f'./CRCmapper/3-1-TRAP.crc_CRUP_HI-32_K27ac_incltss_peakCutoff.8_cutoff{Enumber}_ext{motifExtension}/'
projectName = 'CRUP_HI-32_K27ac'
## SuperTable
superFile = './ROSE/3-1.rose_CRUP_HI-32_K27ac_incltss_peakCutoff.8/islets_AllEnhancers.table.txt'
superTable = parseTable(superFile, '\t')
## subpeak
subpeaks = './crup/islets.HI-32.CRUP.singleEnh_peakCutoff.8.bed'
## expressionTable
expressedNMfile = f'./CRCmapper/3-1.crc_CRUP_HI-32_K27ac_incltss_peakCutoff.8_cutoff{Enumber}_ext500/H3K27ac_HI-32_EXPRESSED_TRANSCRIPTS.txt'
'''

'''
# 3-1.crc_CRUP_HI-32_K27ac_incltss_peakCutoff.8_cutoffSE_ext0
## enhancerNumber
Enumber = 'SE'  # 668
## motifExtension
motifExtension = 0
## working directory
projectFolder = f'./CRCmapper/3-1-TRAP.crc_CRUP_HI-32_K27ac_incltss_peakCutoff.8_cutoff{Enumber}_ext{motifExtension}/'
projectName = 'CRUP_HI-32_K27ac'
## SuperTable
superFile = './ROSE/3-1.rose_CRUP_HI-32_K27ac_incltss_peakCutoff.8/islets_AllEnhancers.table.txt'
superTable = parseTable(superFile, '\t')
## subpeak
subpeaks = './crup/islets.HI-32.CRUP.singleEnh_peakCutoff.8.bed'
## expressionTable
expressedNMfile = f'./CRCmapper/3-1.crc_CRUP_HI-32_K27ac_incltss_peakCutoff.8_cutoff{Enumber}_ext{motifExtension}/H3K27ac_HI-32_EXPRESSED_TRANSCRIPTS.txt'
'''

"\n# 3-1.crc_CRUP_HI-32_K27ac_incltss_peakCutoff.8_cutoffSE_ext0\n## enhancerNumber\nEnumber = 'SE'  # 668\n## motifExtension\nmotifExtension = 0\n## working directory\nprojectFolder = f'./CRCmapper/3-1-TRAP.crc_CRUP_HI-32_K27ac_incltss_peakCutoff.8_cutoff{Enumber}_ext{motifExtension}/'\nprojectName = 'CRUP_HI-32_K27ac'\n## SuperTable\nsuperFile = './ROSE/3-1.rose_CRUP_HI-32_K27ac_incltss_peakCutoff.8/islets_AllEnhancers.table.txt'\nsuperTable = parseTable(superFile, '\t')\n## subpeak\nsubpeaks = './crup/islets.HI-32.CRUP.singleEnh_peakCutoff.8.bed'\n## expressionTable\nexpressedNMfile = f'./CRCmapper/3-1.crc_CRUP_HI-32_K27ac_incltss_peakCutoff.8_cutoff{Enumber}_ext{motifExtension}/H3K27ac_HI-32_EXPRESSED_TRANSCRIPTS.txt'\n"

## 1. createSuperLoci

In [7]:
def createSuperLoci(superTable, Enumber='SE'):
    '''
    Turn rows of a ROSE enhancer table into a list of unstranded Locus objects.

    The first six rows of superTable are skipped.  With Enumber == 'SE' only
    rows whose last column is '1' are used; otherwise the first int(Enumber)
    rows after the skipped ones are used.  Each locus takes its ID from
    column 0, chromosome/start/end from columns 1-3 and its score from
    column 6 minus column 7.
    '''

    print('CREATING SUPER-ENHANCER LOCUS COLLECTION')

    if Enumber == 'SE':
        selectedRows = [row for row in superTable[6:] if row[-1] == '1']
    else:
        selectedRows = superTable[6:6+int(Enumber)]

    return [
        Locus(row[1], row[2], row[3], '.', row[0], (float(row[6])-float(row[7])))
        for row in selectedRows
    ]

In [81]:
# Build the enhancer loci for the active configuration, then show the first
# five loci and the total count as a quick check.
# superLoci = createSuperLoci(superTable, 500)
superLoci = createSuperLoci(superTable, Enumber)
print(*superLoci[:5])
len(superLoci)

CREATING SUPER-ENHANCER LOCUS COLLECTION
chr16(.):46386031-46435529 chr1(.):121458818-121485538 chr9_gl000199_random(.):30876-169007 chr4(.):49093615-49157018 chr10(.):42355279-42409601


500

## 2. createExpressionDict

In [106]:
# Load the list of expressed transcripts (one entry per line) from an existing
# output file instead of recomputing it with createExpressionDict.
# expressedNM = createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expressionTable)
# NOTE: this assignment replaces any expressedNMfile value set by the configuration cell
expressedNMfile = './CRCmapper/2-2.crc_Beta_narrow.05_broad.1_incltss_cutoff500_ext500/Beta_EXPRESSED_TRANSCRIPTS.txt'
# expressedNMfile = './CRCmapper/1-1.crc_CRUP_HI-32_K27ac_incltss_cutoffSE_ext500/H3K27ac_HI-32_EXPRESSED_TRANSCRIPTS.txt'
with open(expressedNMfile, "r") as f:
    expressedNM = [line.rstrip('\n') for line in f]
print(len(expressedNM))
print(expressedNMfile)

14739
./CRCmapper/2-2.crc_Beta_narrow.05_broad.1_incltss_cutoff500_ext500/Beta_EXPRESSED_TRANSCRIPTS.txt


## 3. findCanidateTFs

In [93]:
def findCanidateTFs(annotationFile, superLoci, expressedNM, TFlist, refseqToNameDict, projectFolder, projectName, writeAssignments=False):
    '''
    Assign each super-enhancer to its closest expressed transcript and collect
    the transcripts that are transcription factors.

    For every super-enhancer, the expressed transcripts whose TSS falls in the
    window from 1 Mb before the enhancer start to 1 Mb after its end are
    considered; the one whose TSS is nearest to the enhancer centre is chosen
    (the first candidate wins a tie).

    Parameters:
        annotationFile   -- annotation path handed to makeStartDict
        superLoci        -- iterable of super-enhancer Locus objects
        expressedNM      -- IDs of the transcripts considered expressed
        TFlist           -- IDs of the transcripts that are TFs (hashable values)
        refseqToNameDict -- transcript ID -> gene name
        projectFolder, projectName -- prefix of the optional output files
        writeAssignments -- when true, also write
                            <prefix>_SE_ASSIGNMENT_TRANSCRIPT.txt and
                            <prefix>_SE_ASSIGNMENT_GENE.txt

    Returns a dictionary keyed by TF transcript ID that points to the list of
    super-enhancer loci assigned to it.
    '''

    print('FINDING CANIDATE TFs')

    startDict = makeStartDict(annotationFile)

    # Find the location of the TSS of all transcripts (NMid) considered expressed
    tssLoci = [makeTSSLocus(geneID, startDict, 0, 0) for geneID in expressedNM]
    tssCollection = LocusCollection(tssLoci, 50)

    # built once so the per-enhancer membership test does not rescan the list
    tfSet = set(TFlist)

    seAssignment = []
    seAssignmentGene = []
    TFandSuperDict = {}

    for superEnh in superLoci:

        seCenter = (superEnh.start() + superEnh.end()) / 2

        # Find all expressed transcripts whose TSS lies in the 1 Mb search window
        searchLocus = Locus(superEnh.chr(), superEnh.start()-1000000, superEnh.end()+1000000, '.')
        allEnhancerGenes = [locus.ID() for locus in tssCollection.getOverlap(searchLocus)]

        # Find the transcript that is closest to the center
        if allEnhancerGenes:
            closestGene = min(allEnhancerGenes,
                              key=lambda geneID: abs(seCenter - startDict[geneID]['start'][0]))
        else:
            closestGene = ''

        seAssignment.append([superEnh.chr(), superEnh.start(), superEnh.end(), closestGene])

        # Select the transcript if it is a TF, and allow for a TF to have multiple SEs
        if closestGene in tfSet:
            TFandSuperDict.setdefault(closestGene, []).append(superEnh)

        # Convert the assigned transcript ID to its gene name
        if closestGene != '':
            geneName = refseqToNameDict[closestGene]
            seAssignmentGene.append([superEnh.chr(), superEnh.start(), superEnh.end(), geneName])

    if writeAssignments:
        # Output the list of SE-assigned transcripts (NMids)
        seAssignmentFile = projectFolder + projectName + '_SE_ASSIGNMENT_TRANSCRIPT.txt'
        unParseTable(seAssignment, seAssignmentFile, '\t')

        # Output the list of SE-assigned genes
        seAssignmentGeneFile = projectFolder + projectName + '_SE_ASSIGNMENT_GENE.txt'
        unParseTable(seAssignmentGene, seAssignmentGeneFile, '\t')

    print('Number of canidate TFs:', len(TFandSuperDict))

    return TFandSuperDict


In [94]:
# Candidate TFs for the active configuration: TF transcript ID -> assigned super-enhancer loci
TFandSuperDict = findCanidateTFs(annotationFile, superLoci, expressedNM, TFlist, refseqToNameDict, projectFolder, projectName)

FINDING CANIDATE TFs
Number of canidate TFs: 65


In [29]:
# NOTE(review): this cell unpacks two return values, but findCanidateTFs as
# defined in this file returns only TFandSuperDict (the second value is
# commented out in its return statement) - looks like a stale cell; confirm
# before re-running.
TFandSuperDict, seAssignmentGene = findCanidateTFs(annotationFile, superLoci, expressedNM, TFlist, refseqToNameDict, projectFolder, projectName)

FINDING CANIDATE TFs
Number of canidate TFs: 63


In [452]:
def formatOutput(TFandSuperDict, refseqToNameDict, projectName, projectFolder):

    '''
    takes as input the dictionary mapping TFs to all proximal super-enhancers
    writes a tab-delimited file that lists each candidate TF
    and gives the coordinates of the super-enhancers around them

    Parameters:
        TFandSuperDict   -- TF transcript ID -> list of super-enhancer loci
        refseqToNameDict -- transcript ID -> gene name
        projectName, projectFolder -- the table is written to
                            projectFolder + projectName + '_CANIDATE_TF_AND_SUPER_TABLE.txt'

    A (gene name, chr, start, end) combination is written only once, even when
    several transcripts of the same gene point to the same super-enhancer.

    Returns 1.
    '''

    print('CREATE CANDIDATE TFs AND SE TABLE')

    output = [['TF_refseq', 'TF_name', 'chr', 'start', 'stop', 'SuperID', 'Super_Load' ]]

    # set membership keeps the duplicate check constant-time per row
    used = set()

    for gene, superEnhancers in TFandSuperDict.items():
        for superEnh in superEnhancers:
            geneName = refseqToNameDict[gene]
            check = (geneName, superEnh.chr(), superEnh.start(), superEnh.end())
            if check in used:
                continue
            used.add(check)
            output.append([gene, geneName,
                           superEnh.chr(), superEnh.start(), superEnh.end(),
                           superEnh.ID(), superEnh.score()])

    # Output the list of SE-assigned TFs and the associated super-enhancer loci
    outputname = projectFolder + projectName + '_CANIDATE_TF_AND_SUPER_TABLE.txt'

    unParseTable(output, outputname, '\t')

    return 1

In [453]:
# Write the candidate-TF / super-enhancer table into the project folder
formatOutput(TFandSuperDict, refseqToNameDict, projectName, projectFolder)

CREATE CANDIDATE TFs AND SE TABLE


1

In [96]:
# Upper-cased gene names of the candidate TF transcripts (may contain repeats)
candidateGenes = [refseqToNameDict[x].upper() for x in TFandSuperDict.keys()]

In [97]:
# Drop repeated gene names while keeping first-seen order, then show the count
candidateGenes = uniquify(candidateGenes)
len(candidateGenes)

65

## 4. TRAP

In [19]:
import re

In [724]:
# create TRAP matrix: candidateMotifs.fa
# Copies every motif record (a '>V...' header plus the lines up to the next '>')
# whose header carries a '/gene' annotation and mentions one of the candidate
# genes into projectFolder + projectName + '_canidateMotifs.fa'.

# Both patterns are loop-invariant, so compile them once.
geneTag = re.compile('/gene', flags=re.IGNORECASE)
# Names are escaped so a regex metacharacter inside a gene symbol is taken literally.
# NOTE(review): the alternation is unanchored, so a candidate name matches anywhere
# in the header line (including as a substring of a longer symbol) - confirm this
# is the intended matching rule.
candidatePattern = re.compile('|'.join(re.escape(g) for g in candidateGenes), flags=re.IGNORECASE)

with open('/project/ngsvin/bin/TRAP/Data/merged_matrices.TFP_2022.2.fa', 'r') as f:
    lines = f.readlines()

# 'with' guarantees the output file is closed even if writing fails part-way.
with open(projectFolder+projectName+'_canidateMotifs.fa', 'w') as mat:
    for i, header in enumerate(lines):
        if not header.startswith('>V'):
            continue
        if not geneTag.search(header) or not candidatePattern.search(header):
            continue
        mat.write(header)
        # Copy the record body; the bound is checked before indexing so a
        # matching header on the very last line no longer raises IndexError.
        j = i + 1
        while j < len(lines) and not lines[j].startswith('>'):
            mat.write(lines[j])
            j += 1


In [20]:
# Build motif2TFdict: gene symbol -> list of motif names, taken from the
# '/name=' and '/gene=' fields of every '>V...' header that has a '/gene' tag.
with open('/project/ngsvin/bin/TRAP/Data/merged_matrices.TFP_2022.2.fa', 'r') as f:
    motif2TFdict = {}
    lines = f.readlines()

geneTag = re.compile('/gene', flags=re.IGNORECASE)
for header in lines:
    if not header.startswith('>V'):
        continue
    if not geneTag.search(header):
        continue
    # Header fields are separated by ' /'; each annotation looks like key=value.
    fields = header.replace('\n', '').split(' /')
    name = [field for field in fields if field.startswith('name')][0].split('=')[1]
    gene = [field for field in fields if field.startswith('gene')][0].split('=')[1]
    motif2TFdict.setdefault(gene, []).append(name)
# 1490 TF with 7353 motifs


In [478]:
# Sanity check: total motifs available for the candidate genes; candidates
# without any motif in motif2TFdict are printed by name.
count = 0
TFcount = 0
for candidate in candidateGenes:
    motifs = motif2TFdict.get(candidate)
    if motifs is None:
        print(candidate)
        continue
    count += len(motifs)
    TFcount += 1
print(f'# of total motifs: {count}')
print(f'# of TF that has motifs: {TFcount}')

SIM1
MYT1
# of total motifs: 484
# of TF that has motifs: 63


## 5. generateSubpeakFASTA
TRAP_matrix + subpeaks

In [98]:
def generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, motifExtension):
    '''
    collects, for every TF, the extended and stitched super-enhancer constituents

    subpeaks is the path of a tab-separated file whose first three columns are
    chr, start, end. For each region of each TF in TFandSuperDict the overlapping
    constituents are widened by motifExtension on both sides (makeSearchLocus),
    merged with stitchCollection() and stored.

    The sequence of every resulting subpeak is fetched with fetchSeq and BED/FASTA
    output names are prepared, but the unParseTable calls that would write them
    are currently disabled, so nothing is written to disk.

    returns subpeakDict: TF key -> list of stitched subpeak loci
    '''

    print('MAKE FASTA')

    constituentRows = parseTable(subpeaks, '\t')
    constituentLoci = []
    for row in constituentRows:
        constituentLoci.append(Locus(row[0], int(row[1]), int(row[2]), '.'))
    constituents = LocusCollection(constituentLoci, 50)

    subpeakDict = {}
    subpeakBED = []
    for gene, regions in TFandSuperDict.items():
        stitchedPeaks = []
        subpeakDict[gene] = stitchedPeaks
        for region in regions:
            widened = [makeSearchLocus(hit, motifExtension, motifExtension)
                       for hit in constituents.getOverlap(region)]
            stitched = LocusCollection(widened, 50).stitchCollection()
            for locus in stitched.getLoci():
                subpeakBED.append([locus.chr(), locus.start(), locus.end()])
                stitchedPeaks.append(locus)

    print(f'# of subpeaks: {len(subpeakBED)}')

    # NOTE(review): any non-zero extension is labelled 'ext500' regardless of its value.
    # bedfilename and outname are only consumed by the disabled writes below.
    bedSuffix = '_subpeaks_ext500.bed' if motifExtension != 0 else '_subpeaks_ext0.bed'
    bedfilename = projectFolder + projectName + bedSuffix

    fasta = []
    for gene, peaks in subpeakDict.items():
        for subpeak in peaks:
            title = '|'.join([gene, subpeak.chr(), str(subpeak.start()), str(subpeak.end())])
            sequence = fetchSeq(genomeDirectory, subpeak.chr(), int(subpeak.start()+1), int(subpeak.end()+1))
            fasta.extend(['>' + title, sequence.upper()])

    # Output name for the fasta file of extended SE constituents
    fastaSuffix = '_SUBPEAKS.fa' if motifExtension != 0 else '_SUBPEAKS_ext0.fa'
    outname = projectFolder + projectName + fastaSuffix

    # File output is disabled:
    # unParseTable(subpeakBED, bedfilename, '\t')
    # unParseTable(fasta, outname, '')

    return subpeakDict


In [226]:
# subpeakDict: TF refseq (NM) ID -> SE constituents, widened by motifExtension and stitched
print(f'motifExtension: {motifExtension}')
subpeakDict = generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, motifExtension)
# variant without extension:
# subpeakDict = generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, 0)

motifExtension: 500
MAKE FASTA
# of subpeaks: 296


In [99]:
# subpeakDict0: TF refseq (NM) ID -> SE constituents, stitched without extension.
# NOTE: the call passes a literal 0, so the printed motifExtension only matches
# the run when the variable itself is 0.
print(f'motifExtension: {motifExtension}')
subpeakDict0 = generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, 0)
# subpeakDict = generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, 0)

motifExtension: 0
MAKE FASTA
# of subpeaks: 405


In [None]:
# run TRAP on shell
# Inputs: the TRAP binary, the genome FASTA (-s), the normalisation parameter
# file (-norm), and the per-project motif matrix / subpeak BED, which share the
# ${folder} prefix.
TRAP='/project/ngsvin/bin/TRAP/TRAPv1.04'
genome='/project/NeuralNet/CRC/ChIP-seq/mapping/bowtie2/hg19/hg19.fa'
norm='/project/ngsvin/bin/TRAP/Data/hg19.TSSup.TFP_2022.2.GEVparams'
folder='/project/NeuralNet/CRC/CRCmapper/2-2-TRAP.crc_Beta_narrow.05_broad.1_incltss_cutoff500_ext500/Beta_'
matrix=${folder}'canidateMotifs.fa'
region=${folder}'subpeaks_ext500.bed'
# Two runs are started in the background (&), each redirecting stdout to its own
# result file; the second adds '-w 5000' (presumably a 5 kb window - confirm
# against the TRAP usage text).
$TRAP -thread 20 -s $genome -norm $norm -matrix $matrix -region $region -gene > ${folder}TRAPresults_ext500.bed &
$TRAP -thread 10 -s $genome -norm $norm -matrix $matrix -region $region -gene -w 5000 > ${folder}TRAPresults_ext500_win5k.bed &

In [729]:
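# Expected number of TRAP result lines per run = (# of subpeaks) * (# of motifs).
# Format of each entry below:
#   <run>: (# of peaks longer than 5 kb)/(# of all peaks) peaks * (# of motifs) motifs
#          (of [# of TFs that have motifs]/[# of unique candidate TFs]/[# of candidate TFs] TF) = # of lines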
# 2-2-TRAP.crc_Beta_narrow.05_broad.1_incltss_cutoff500_ext500: 2/296 peaks * 484 motifs (of 63/65/65 TF) = 143264 lines
# 2-2-TRAP.crc_Beta_narrow.05_broad.1_incltss_cutoff500_ext0: 0/405 peaks * 484 motifs (of 63/65/65 TF) = 196020 lines
# 1-1-TRAP.crc_CRUP_HI-32_K27ac_incltss_cutoffSE_ext500: 225/784 peaks * 653 motifs (of 83/94/97 TF) = 511952 lines
# 1-1-TRAP.crc_CRUP_HI-32_K27ac_incltss_cutoffSE_ext0: 2/1783 peaks * 653 motifs (of 83/94/97 TF) = 1164299 lines
# 2-1-TRAP.crc_macs2_HI-32_K27ac_narrow.05_broad.05_incltss_cutoffSE_ext0: 0/1069 peaks * 967 motifs (of 136/150/156 TF) = 1033723 lines
# 2-1-TRAP.crc_macs2_HI-32_K27ac_narrow.05_broad.05_incltss_cutoffSE_ext500: 1/746 peaks * 967 motifs (of 136/150/156 TF) = 721382 lines
# 6-1-TRAP.crc_DE_narrow.05_broad.05_excltss_cutoffSE_ext500: 25/188 peaks * 391 motifs (of 61/69/69 TF) = 73508 lines
# 6-1-TRAP.crc_DE_narrow.05_broad.05_excltss_cutoffSE_ext0: 4/333 peaks * 391 motifs (of 61/69/69 TF) = 130203 lines
# 3-1-TRAP.crc_CRUP_HI-32_K27ac_incltss_peakCutoff.8_cutoffSE_ext500: 82/341 peaks * 411 motifs (of 55/62/63 TF) = 140151 lines
# 3-1-TRAP.crc_CRUP_HI-32_K27ac_incltss_peakCutoff.8_cutoffSE_ext0: 0/691 peaks * 411 motifs (of 55/62/63 TF) = 284001 lines

## 6. find autoTF

In [48]:
import pandas as pd
import numpy as np
from math import ceil, log
import networkx as nx
from networkx.algorithms.clique import find_cliques_recursive

In [100]:
# posNMdict: 'chr:start-end' label of every subpeak -> gene name of the TF it
# was assigned to. A label that shows up for a second TF is printed and the
# later TF overwrites the earlier one.
posNMdict = {}
for NM, peaks in subpeakDict0.items():
    TF = refseqToNameDict[NM]
    for peak in peaks:
        peakPos = f'{peak.chr()}:{peak.start()}-{peak.end()}'
        if peakPos in posNMdict:
            print(peakPos)
        posNMdict[peakPos] = TF
dict(list(posNMdict.items())[:5])
len(posNMdict)

405

In [46]:
def getKey(dict, value):
    '''returns the list of keys whose mapped value contains `value`
    (tested with `in`, so a string value matches on substrings and a
    list value matches on membership); keys keep the dictionary's order
    '''
    matches = []
    for key in dict:
        if value in dict[key]:
            matches.append(key)
    return matches

In [286]:
# Reverse lookup: refseq IDs whose gene name contains 'INSM1'.
getKey(refseqToNameDict, 'INSM1')

['NM_002196']

In [101]:
# Load the TRAP result table for the current extension; the file's first line
# is skipped and the five columns are named explicitly.
trapColumns = ['peak_chr', 'peak_start', 'peak_end', 'motif', 'p_value']
TRAPfile = f'{projectFolder}{projectName}_TRAPresults_ext{motifExtension}.bed'
TRAPdf = pd.read_table(TRAPfile, sep='\t', skiprows=1, header=None, names=trapColumns)
TRAPdf = TRAPdf.astype({'peak_start': int, 'peak_end': int, 'p_value': float})
TRAPdf.head(5)

Unnamed: 0,peak_chr,peak_start,peak_end,motif,p_value
0,chr1,156459251,156459696,V$AFP1_Q6#ZFHX3,0.370081
1,chr1,156459251,156459696,V$ARID3A_02#ARID3A,0.837649
2,chr1,156459251,156459696,V$ARID3A_04#ARID3A,0.813306
3,chr1,156459251,156459696,V$ATF4_02#ATF4,0.343902
4,chr1,156459251,156459696,V$ATF4_03#ATF4,0.511825


In [102]:
# Build the 'chr:start-end' label of every row from the first three columns.
# The start is shifted by -1 so the label matches the keys of posNMdict.
# Vectorised string concatenation replaces the row-wise apply, which indexed a
# label-indexed row by position (x[0], x[1], x[2]) - deprecated in recent pandas -
# and is slow on tables of this size.
chrCol, startCol, endCol = TRAPdf.columns[:3]
TRAPdf['peak'] = (
    TRAPdf[chrCol].astype(str)
    + ':' + (TRAPdf[startCol] - 1).astype(str)
    + '-' + TRAPdf[endCol].astype(str)
)
# 'motif' has the form '<motif name>#<gene symbol>'.
TRAPdf[['motif_name', 'motif_gene']] = TRAPdf.motif.str.split('#', expand=True)
# Plain dictionary lookup on purpose: a peak missing from posNMdict raises
# KeyError instead of silently becoming NaN.
TRAPdf['peak_gene'] = TRAPdf['peak'].apply(lambda peakLabel: posNMdict[peakLabel])
TRAPdf = TRAPdf[['peak', 'peak_gene', 'motif_name', 'motif_gene', 'p_value']]
TRAPdf.head()

Unnamed: 0,peak,peak_gene,motif_name,motif_gene,p_value
0,chr1:156459250-156459696,MEF2D,V$AFP1_Q6,ZFHX3,0.370081
1,chr1:156459250-156459696,MEF2D,V$ARID3A_02,ARID3A,0.837649
2,chr1:156459250-156459696,MEF2D,V$ARID3A_04,ARID3A,0.813306
3,chr1:156459250-156459696,MEF2D,V$ATF4_02,ATF4,0.343902
4,chr1:156459250-156459696,MEF2D,V$ATF4_03,ATF4,0.511825


In [51]:
# Number of distinct peak labels present in the TRAP results.
len(uniquify(TRAPdf.peak.values))

691

In [52]:
# Show the rows whose p-value is exactly 0.0.
TRAPdf.query("p_value == 0.0")

Unnamed: 0,peak,peak_gene,motif_name,motif_gene,p_value
121382,chr8:144522000-144523100,MAFA,V$PLAGL2_06,PLAGL2,0.0
244230,chr20:38648400-38649500,MAFB,V$NKX61_02,NKX6-1,0.0


In [None]:
# Spot-check one TF: rank all hits of its motifs by ascending p-value (the index
# is reset, so it is the 0-based rank), then keep the hits on its own peaks.
gene = 'NKX2-2'
TRAPdf[TRAPdf.motif_gene == gene].sort_values(by = 'p_value', ignore_index = True).query("peak_gene == @gene").head()


Unnamed: 0,peak,peak_gene,motif_name,motif_gene,p_value
150,chr20:21504000-21505100,NKX2-2,V$NKX2B_Q3,NKX2-2,0.030906
342,chr20:21487600-21489800,NKX2-2,V$NKX2B_Q3_01,NKX2-2,0.067742
357,chr20:21516200-21517300,NKX2-2,V$NKX22_06,NKX2-2,0.070837
366,chr20:21498700-21499800,NKX2-2,V$NKX22_05,NKX2-2,0.07184
395,chr20:21487600-21489800,NKX2-2,V$NKX22_02,NKX2-2,0.076473


In [465]:
# Find auto-regulated TFs and their candidate targets.
# For every candidate TF all hits of its motifs are ranked by ascending p-value.
# The TF is auto-regulated when its best hit on one of its own peaks lies within
# the top `percent` of that list; the peak genes within the same top slice become
# its targets (pairDict: (TF, target) -> smallest non-zero p-value).
#percent = 0.01
#percent = 0.02
percent = 0.04
#percent = 0.05
autoTF = set()
pairDict = {}
count = 0
for TF in candidateGenes:
    TFdf = TRAPdf[TRAPdf.motif_gene == TF].sort_values(by='p_value', ignore_index=True)
    listLen = TFdf.shape[0]
    if listLen == 0:
        continue

    cutoff = ceil(listLen * percent)
    selfdf = TFdf.query("peak_gene == @TF")
    if selfdf.empty:
        # The TF has motifs but no hit on its own peaks: it cannot be
        # auto-regulated (indexing selfdf here used to raise IndexError).
        continue

    # The index was reset after sorting, so index label + 1 is the 1-based rank.
    selfRank = selfdf.index[0] + 1
    selfP = selfdf['p_value'].values[0]
    # A p-value of exactly 0.0 is not trusted; fall back to the next own hit.
    if (selfP == 0.0) and (selfdf.shape[0] > 1):
        selfRank = selfdf.index[1] + 1
        selfP = selfdf['p_value'].values[1]
    print(TF, listLen, selfRank, selfP, sep=',')

    if (selfRank > cutoff) or (selfP == 0.0):
        continue

    autoTF.add(TF)
    pairdf = TFdf.iloc[:cutoff]
    # NOTE(review): this counts distinct peak genes in the top slice, including
    # any whose only hits have p == 0.0 and therefore never become an edge.
    count += len(uniquify(pairdf.peak_gene))

    # Rows are in ascending p-value order, so the first non-zero value seen for
    # a pair is its smallest one; setdefault keeps that first value.
    for TF2, pValue in zip(pairdf.peak_gene, pairdf.p_value):
        if pValue == 0.0:
            continue
        pairDict.setdefault((TF, TF2), pValue)
print(f'len(autoTF): {len(autoTF)}')
print(f'len(edges): {count}')

NFIC,5528,6,0.00240281
PAX6,6910,24,0.00368251
MEIS2,6910,52,0.00571461
TEAD1,6219,13,0.0036393
FOXP4,691,45,0.0296982
MEF2D,4837,62,0.00825383
SREBF1,9674,74,0.00911766
MAFB,6910,86,0.0095147
EHF,6910,40,0.00888677
ZKSCAN1,5528,75,0.00885855
NKX2-2,4146,151,0.0309056
XBP1,6219,352,0.0671234
JUNB,6910,192,0.0121691
ISL1,6219,237,0.0440116
MNT,1382,157,0.106692
RREB1,4146,250,0.0543085
FOXK1,7601,228,0.0244845
FOXA2,8983,3,0.000757403
PDX1,12438,13,0.00101526
RARA,11747,239,0.0156828
RFX2,7601,15,0.00238422
MEIS1,5528,1,6.79177e-05
ARID3A,2073,84,0.0649778
RFX3,8292,29,0.00382963
GLIS2,3455,93,0.0241344
TFEB,2764,174,0.0625573
ZBTB17,2073,243,0.158525
JUND,9674,7,0.00106999
CREB3L2,2073,47,0.0226889
INSM1,691,203,0.275894
MNX1,3455,38,0.00952651
HSF4,2764,39,0.0103314
BHLHE40,8292,256,0.0523247
MLXIP,691,40,0.0564629
PLAGL2,4146,41,0.00783373
ZNF34,1382,3,0.00226769
ARID5B,1382,77,0.0399475
KLF9,3455,44,0.0262111
SOX13,1382,58,0.02244
RXRA,10365,129,0.00845888
VEZF1,2073,95,0.0547808
HE

In [466]:
# One weighted edge per (regulator, target) pair; the weight is -ln(p-value),
# so smaller p-values give larger weights.
edgeList = [(source, target, -log(pValue)) for (source, target), pValue in pairDict.items()]
print(len(edgeList))
print(edgeList[:10])

1822
[('NFIC', 'RORC', 6.640480016308019), ('NFIC', 'RARA', 6.357301332839101), ('NFIC', 'RFX2', 6.1278296652285285), ('NFIC', 'SOX13', 6.1133610314059075), ('NFIC', 'JUND', 6.063828809638116), ('NFIC', 'NFIC', 6.031116393185708), ('NFIC', 'ZKSCAN1', 5.932659933443242), ('NFIC', 'MAFB', 5.917734646414269), ('NFIC', 'MEF2D', 5.808046924546965), ('NFIC', 'RXRA', 5.787929593914829)]


In [469]:
# Full regulatory graph: every autoTF as a node plus every weighted
# (regulator, target, weight) edge from edgeList.
graph = nx.DiGraph()
graph.add_nodes_from(autoTF)
graph.add_weighted_edges_from(edgeList)

# G is the auto-regulatory sub-network: the self-loop of every autoTF member and
# every pair of autoTF members that are connected in both directions.
autoRegNodeList = set()
autoRegEdgeList = set()
G = nx.DiGraph(name='Beta_narrow.05_broad.1_incltss_cutoff500_ext0_TRAP_noTAD')

for n in autoTF:
    for m in autoTF:
        if n == m:
            # The self-loop is read without a has_edge check, so a member of
            # autoTF without a (n, n) edge in graph raises KeyError here.
            autoRegNodeList.add(n)
            autoRegEdgeList.add((n, m, graph.edges[n, m]['weight']))
        else:
            if graph.has_edge(n,m) and graph.has_edge(m,n):
                autoRegNodeList.add(n)
                autoRegEdgeList.add((n, m, graph.edges[n, m]['weight']))
                autoRegEdgeList.add((m, n, graph.edges[m, n]['weight']))
                
G.add_nodes_from(autoRegNodeList)
G.add_weighted_edges_from(autoRegEdgeList)
# NOTE(review): G is a DiGraph; confirm that the installed networkx version
# accepts directed input for find_cliques_recursive.
cliques = nx.find_cliques_recursive(G)
cliqueList = list(cliques)

# Score every clique by the mean weight of the edges visited below.
cliqueRanking = []

for clique in cliqueList:
    cliqueScore = 0
    edgeNum = 0
    # The double loop visits every ordered pair. For n != m both directions are
    # added on each visit, so each cross edge enters the sum (and edgeNum) twice
    # while each self-loop enters once.
    # NOTE(review): confirm this relative weighting is intended.
    for n in clique:
        for m in clique:
            if n == m:
                # continue;
                cliqueScore += G.edges[n, m]['weight']
                edgeNum += 1
            else:
                cliqueScore += G.edges[n, m]['weight']
                cliqueScore += G.edges[m, n]['weight']
                edgeNum += 2
                
    # (members, mean edge weight, clique size)
    cliqueRanking.append((clique, cliqueScore/edgeNum, len(clique)))
    
# Highest mean weight first; the cell displays the top five.
sortCliqueRanking = sorted(cliqueRanking, reverse=True, key=lambda x:x[1])
sortCliqueRanking[:5]

[(['EHF',
   'PAX6',
   'NKX6-1',
   'FOXA2',
   'PDX1',
   'MAFB',
   'RFX3',
   'MEIS2',
   'MEIS1',
   'NKX2-2'],
  5.7086623925828075,
  10),
 (['EHF',
   'PAX6',
   'NKX6-1',
   'FOXA2',
   'PDX1',
   'MAFB',
   'ZKSCAN1',
   'NKX2-2',
   'JUND',
   'MEIS2',
   'MEIS1'],
  5.707895075315044,
  11),
 (['EHF',
   'NFIC',
   'NKX6-1',
   'FOXA2',
   'PDX1',
   'MAFB',
   'ZKSCAN1',
   'NKX2-2',
   'JUND',
   'MEIS2',
   'MEIS1'],
  5.673364891786511,
  11),
 (['EHF', 'NFIC', 'HNF1A', 'MAFB', 'FOXK1', 'CREB3L2', 'ZKSCAN1', 'NKX2-2'],
  5.642290096236783,
  8),
 (['EHF',
   'PAX6',
   'RARA',
   'MAFB',
   'MEIS1',
   'MEIS2',
   'ZKSCAN1',
   'SOX9',
   'NKX2-2',
   'PDX1'],
  5.63561196007308,
  10)]

In [470]:
# Flatten the auto-regulatory network into [source, target, weight] rows.
edges = [[source, target, attrs["weight"]] for source, target, attrs in G.edges(data=True)]
len(edges)

941

In [471]:
# Write the auto-regulated TF list, the ranked cliques and the edge table.
projectFolder = './CRCmapper/2-2-TRAP.crc_Beta_narrow.05_broad.1_incltss_cutoff500_ext0/'

# Percentage label used in the file names. The 'g' format drops floating-point
# noise (0.29 * 100 -> '29', not the '28' that int() truncation gave) and keeps
# fractional percentages ('1.5', '0.5') instead of collapsing them onto a whole
# number or printing values such as 0.7000000000000001.
percentLabel = f'{percent * 100:g}'
autoTFFile = projectFolder + projectName + f'_AUTOREG_top{percentLabel}%.txt'
cliqueFile = projectFolder + projectName + f'_CRC_SCORES_top{percentLabel}%.txt'
edgeFile = projectFolder + projectName + f'_EDGES_top{percentLabel}%.txt'

# An empty separator makes unParseTable write one str(item) per line.
unParseTable(sorted(autoTF), autoTFFile, '')
unParseTable(sortCliqueRanking, cliqueFile, '\t')
unParseTable(edges, edgeFile, '\t')