In [1]:
%run ontologyPackage/ontologySTATanalysis.ipynb

In [None]:
import pandas as pd 
import numpy as np
from numpy import random, array
import time 

import rpy2.robjects as robjects

import os
from goatools.obo_parser import GODag

pd.set_option('display.max_colwidth', None) # Values in columns won't be shortned | None over -1
pd.set_option('chained_assignment',None) # Disabling chained assignments 

geneColID = ["chromosome","source","type","start","end","score","strand","phase","gene_symbol","gene_ensID","length","entrezid"]
geneAnnotationDF = pd.read_csv('entrez_id/geneAnnotationsDF_Selected_entrezID.csv', sep=',', comment='#', low_memory=False, header=0, names=geneColID)

chromosomeColID = ['chromosome','source','type','start','end','score','strand','phase']
chromosomesDF = pd.read_csv('chromosomesDF.csv', sep=',', comment='#', low_memory=False, header=0, names=chromosomeColID)

In [3]:
import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests as padjust
from statistics import median, stdev

In [73]:
import segment_liftover

In [76]:
help(segment_liftover)

Help on package segment_liftover:

NAME
    segment_liftover

PACKAGE CONTENTS
    segmentLiftover

FILE
    /Users/joshyk/venv/lib/python3.7/site-packages/segment_liftover/__init__.py




In [4]:
# GO Term information
if not os.path.exists('go-basic.obo'):
    !wget http://geneontology.org/ontology/go-basic.obo
goDATA = GODag('go-basic.obo', optional_attrs=['relationship'])

go-basic.obo: fmt(1.2) rel(2019-07-01) 47,413 GO Terms; optional_attrs(relationship)


In [5]:
#Dropping unneccesary stuff and resetting index from 0
# geneAnnotationDF = geneAnnotationDF[geneAnnotationDF.gene_symbol != 'CTD-2207O23.3']
geneAnnotationDF = geneAnnotationDF.sort_values(by=['chromosome', 'start'])
chromosomesDF = chromosomesDF.sort_values(by=['chromosome'])
chromosomesDF = chromosomesDF.reset_index()
geneAnnotationDF = geneAnnotationDF.reset_index()
chromosomesDF = chromosomesDF.drop(columns=['score', 'strand', 'phase', 'index'])
geneAnnotationDF = geneAnnotationDF.drop(columns=['score', 'phase', 'index'])

In [6]:
# Making a list of DataFrames to be used in addwindow tss so that if - gene the start is its end and its end is its start 
u = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','MT','X','Y'] # Chromosome ids 
geneDFL = []
for c in u:
    geneDF = geneAnnotationDF.loc[geneAnnotationDF['chromosome'] == c].copy()
    for i, row in geneDF.iterrows():
        if row['strand'] == '-':
            start = row['end']
            end = row['start']
            geneDF.at[i, 'start'] = start
            geneDF.at[i, 'end'] = end
    geneDF = geneDF.sort_values(by=['start'])
    geneDF = geneDF.reset_index()
    geneDF = geneDF.drop(columns=['index'])
    geneDFL.append(geneDF)

In [7]:
# Making a list of DataFrames to be used in future functions in var geneDFL
u = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','MT','X','Y']
geneDFLBODY = []
for cc in u:
    geneDFbod = geneAnnotationDF.loc[geneAnnotationDF['chromosome'] == cc].copy()
    geneDFbod = geneDFbod.sort_values(by=['start'])
    geneDFbod = geneDFbod.reset_index()
    geneDFbod = geneDFbod.drop(columns=['index'])
    geneDFLBODY.append(geneDFbod)

In [8]:
# probability that a randomly chosen gene is in a certain chromosome. Ordered from 1 - MT,X,Y | As shown chromosome 1 has highest prob of selection
probabilityL = [0.08069467597786323, 0.07850260775517634, 0.06427388269957329, 0.06165457287524136, 0.05884231002379223, 0.05536363751707386, 0.05164908592141782, 0.0470440371698401, 0.044858119022639725, 0.043367989830914035, 0.043785860460049814, 0.04319875643982657, 0.037069107410936775, 0.034696265410969616, 0.033058580434273406, 0.029281523923645837, 0.026986378244674356, 0.026051531764496164, 0.018999829086634144, 0.02088839915494138, 0.015140187376094325, 0.016471877735809576, 5.370214538878023e-06, 0.05057780526426391, 0.017537607961181506]

In [9]:
# Generates random sites
dl = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25'] # 23, 24, 25
chrD = {'1': [1, 248956422],'2': [1, 242193529],'3': [1, 198295559],'4': [1, 190214555],'5': [1, 181538259],'6': [1, 170805979],'7': [1, 159345973],'8': [1, 145138636],'9': [1, 138394717],'10': [1, 133797422],'11': [1, 135086622],'12': [1, 133275309],'13': [1, 114364328],'14': [1, 107043718],'15': [1, 101991189],'16': [1, 90338345],'17': [1, 83257441],'18': [1, 80373285],'19': [1, 58617616],'20': [1, 64444167],'21': [1, 46709983],'22': [1, 50818468],'23': [1, 16569],'24': [1, 156040895],'25': [2781480, 56887902]}
def nRandSites(data):
    nSite = data[0]
    rand = data[1] 
    sitesbyC =[[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]

    random.seed(rand)
        
    for site in range(nSite):
        chrN = random.choice(dl, p=probabilityL)
        randnum = round(random.uniform(chrD[chrN][0], chrD[chrN][1]))
        sitesbyC[int(chrN) - 1].append(randnum)
    for l in sitesbyC:
        l.sort()
    return array([array(l) for l in sitesbyC])

def nRandSitesSim(nSite, nSim):
    totalsites = [nSite] * nSim
    chunks = []
    for i in totalsites:
        chunks.append([i])
    for l in chunks:
        l.append(random.randint(100000))
    
    pool = Pool(5)
    result = pool.map(nRandSites, chunks)
    pool.close()
    return(result) 

In [10]:
##### Adds midpoint in gene and upper and lower bound values to geneDFLtss based on size of window or nearby genes. Adding around geneTSS.
def addWindowTSS(window):
    chrD = {'1': [1, 248956422],'2': [1, 242193529],'3': [1, 198295559],'4': [1, 190214555],'5': [1, 181538259],'6': [1, 170805979],'7': [1, 159345973],'8': [1, 145138636],'9': [1, 138394717],'10': [1, 133797422],'11': [1, 135086622],'12': [1, 133275309],'13': [1, 114364328],'14': [1, 107043718],'15': [1, 101991189],'16': [1, 90338345],'17': [1, 83257441],'18': [1, 80373285],'19': [1, 58617616],'20': [1, 64444167],'21': [1, 46709983],'22': [1, 50818468],'23': [1, 16569],'24': [1, 156040895],'25': [2781480, 56887902]} 
    geneWindowDFL = []
    for i in range(len(geneDFL)):
        df = geneDFL[i].copy()
        geneWindowDFL.append(df)
    outList = []
    chrIDNUM = 0
    for dfG in geneWindowDFL: # ordered by chromosome 1- MT,X,Y
        chrIDNUM += 1 
        dfIndexMax = len(dfG) - 1   
        for i, row in dfG.iterrows():
            start = row['start']
            
            dfG.at[i, 'midBody'] = (((row['length'] / 2)) + start) # Middle of gene body 
            if i == 0 or i == dfIndexMax: # Case for start and end genes
                if i == 0:
                    dfG.at[i, 'lowB'] = max(start - window, chrD[str(chrIDNUM)][0]) # chrD[chrIDNUM][0] = start val of the chromosome
                    dfG.at[i, 'upperB'] = min(start + window, ((dfG.iat[i+1, 3] - start) / 2) + start)
                else: # index max case 
                    dfG.at[i, 'lowB'] = max(start - window, start - ((start - dfG.iat[i-1, 3]) / 2)) # dfG.iat[i-1, 3] = start of gene below
                    dfG.at[i, 'upperB'] = min(start + window, chrD[str(chrIDNUM)][1]) # chrD[chrIDNUM][1] = end val of chromosome 
            else: # i > 0 or i < len(geneDF) - 1 
                dfG.at[i, 'lowB'] = max(start - window, start - ((start - dfG.iat[i-1, 3]) / 2)) # dfG.iat[i-1, 3] = start of gene below
                dfG.at[i, 'upperB'] = min(start + window, ((dfG.iat[i+1, 3] - start) / 2) + start) # Start of the gene above position [i,3]
        dfG['midBody'] = dfG['midBody'].astype(int)
        dfG['lowB'] = dfG['lowB'].astype(int)
        dfG['upperB'] = dfG['upperB'].astype(int)
#         dfG['domainLEN'] = dfG['upperB'] - dfG['lowB']
        LBandUB = []
        LBandUB.append(dfG.lowB.values)
        LBandUB.append(dfG.upperB.values)
        outList.append(LBandUB)
    
    return(outList)

In [77]:
viewer = addWindowTSS20(100000)

In [80]:
viewer[0]

Unnamed: 0,chromosome,source,type,start,end,strand,gene_symbol,gene_ensID,length,entrezid,midBody,lowB,upperB,domainLEN
0,1,havana,gene,11869,14409,+,DDX11L1,ENSG00000223972,2541,100287102,13139,1,40480,40479
1,1,ensembl_havana,gene,69091,70008,+,OR4F5,ENSG00000186092,918,79501,69550,40480,169091,128611
2,1,ensembl_havana,gene,923928,944581,+,SAMD11,ENSG00000187634,20654,148398,934255,823928,941618,117690
3,1,ensembl_havana,gene,959309,944204,-,NOC2L,ENSG00000188976,15106,26155,966862,941618,959948,18330
4,1,ensembl_havana,gene,960587,965715,+,KLHL17,ENSG00000187961,5129,339451,963151,959948,963542,3594
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2199,1,ensembl_havana,gene,248755739,248718649,-,LYPD8,ENSG00000259823,37091,646627,248774284,248719021,248791186,72165
2200,1,ensembl_havana,gene,248826633,248810446,-,SH3BP5L,ENSG00000175137,16188,80851,248834727,248791186,248832421,41235
2201,1,ensembl_havana,gene,248838210,248849517,+,ZNF672,ENSG00000171161,11308,79894,248843864,248832421,248848677,16256
2202,1,ensembl_havana,gene,248859144,248850006,-,ZNF692,ENSG00000171163,9139,55657,248863713,248848677,248882670,33993


In [11]:
##### Adds midpoint in gene and upper and lower bound values to geneDFLtss based on size of window or nearby genes. Adding around geneTSS.
def addWindowTSS20(window):
    chrD = {'1': [1, 248956422],'2': [1, 242193529],'3': [1, 198295559],'4': [1, 190214555],'5': [1, 181538259],'6': [1, 170805979],'7': [1, 159345973],'8': [1, 145138636],'9': [1, 138394717],'10': [1, 133797422],'11': [1, 135086622],'12': [1, 133275309],'13': [1, 114364328],'14': [1, 107043718],'15': [1, 101991189],'16': [1, 90338345],'17': [1, 83257441],'18': [1, 80373285],'19': [1, 58617616],'20': [1, 64444167],'21': [1, 46709983],'22': [1, 50818468],'23': [1, 16569],'24': [1, 156040895],'25': [2781480, 56887902]} 
    geneWindowDFL = []
    for i in range(len(geneDFL)):
        df = geneDFL[i].copy()
        geneWindowDFL.append(df)
    chrIDNUM = 0
    for dfG in geneWindowDFL: # ordered by chromosome 1- MT,X,Y
        chrIDNUM += 1 
        dfIndexMax = len(dfG) - 1   
        for i, row in dfG.iterrows():
            start = row['start']
            
            dfG.at[i, 'midBody'] = (((row['length'] / 2)) + start) # Middle of gene body 
            if i == 0 or i == dfIndexMax: # Case for start and end genes
                if i == 0:
                    dfG.at[i, 'lowB'] = max(start - window, chrD[str(chrIDNUM)][0]) # chrD[chrIDNUM][0] = start val of the chromosome
                    dfG.at[i, 'upperB'] = min(start + window, ((dfG.iat[i+1, 3] - start) / 2) + start)
                else: # index max case 
                    dfG.at[i, 'lowB'] = max(start - window, start - ((start - dfG.iat[i-1, 3]) / 2)) # dfG.iat[i-1, 3] = start of gene below
                    dfG.at[i, 'upperB'] = min(start + window, chrD[str(chrIDNUM)][1]) # chrD[chrIDNUM][1] = end val of chromosome 
            else: # i > 0 or i < len(geneDF) - 1 
                dfG.at[i, 'lowB'] = max(start - window, start - ((start - dfG.iat[i-1, 3]) / 2)) # dfG.iat[i-1, 3] = start of gene below
                dfG.at[i, 'upperB'] = min(start + window, ((dfG.iat[i+1, 3] - start) / 2) + start) # Start of the gene above position [i,3]
        dfG['midBody'] = dfG['midBody'].astype(int)
        dfG['lowB'] = dfG['lowB'].astype(int)
        dfG['upperB'] = dfG['upperB'].astype(int)
        dfG['domainLEN'] = dfG['upperB'] - dfG['lowB']
    
    return(geneWindowDFL)

In [12]:
# Adds midpoint in gene and upper and lower bound values to geneDFL based on size of window or nearby genes. Adding around geneBODY.
def addWindowBODY(window):
    chrD = {'1': [1, 248956422],'2': [1, 242193529],'3': [1, 198295559],'4': [1, 190214555],'5': [1, 181538259],'6': [1, 170805979],'7': [1, 159345973],'8': [1, 145138636],'9': [1, 138394717],'10': [1, 133797422],'11': [1, 135086622],'12': [1, 133275309],'13': [1, 114364328],'14': [1, 107043718],'15': [1, 101991189],'16': [1, 90338345],'17': [1, 83257441],'18': [1, 80373285],'19': [1, 58617616],'20': [1, 64444167],'21': [1, 46709983],'22': [1, 50818468],'23': [1, 16569],'24': [1, 156040895],'25': [2781480, 56887902]} 
    chrIDNUM = 0
    geneWindowDFL = []
    for i in range(len(geneDFLBODY)):
        df = geneDFLBODY[i].copy()
        geneWindowDFL.append(df)
    
    outList = []
    for dfG in geneWindowDFL: # ordered by chromosome 1- MT,X,Y
        chrIDNUM += 1 
        dfIndexMax = len(dfG) - 1         
        for i, row in dfG.iterrows():
            start = row['start']
            end = row['end']
            dfG.at[i, 'midBody'] = (((row['length'] / 2)) + row['start']) # Middle of gene body 
            
            if i == 0 or i == dfIndexMax: # Case for start and end sites
                if i == 0:
                    dfG.at[i, 'lowB'] = max(start - window, chrD[str(chrIDNUM)][0]) # end of the gene below position
                    dfG.at[i, 'upperB'] = min(end + window, ((dfG.iat[i+1, 3] - end) / 2) + end)
                else: # index max case 
                    dfG.at[i, 'lowB'] = max(start - window, start - (abs(start - dfG.iat[i-1, 4]) / 2)) 
                    dfG.at[i, 'upperB'] = min(row['end'] + window, chrD[str(chrIDNUM)][1])
            else: # i > 0 or i < len(geneDF) - 1 
                dfG.at[i, 'lowB'] = max(start - window, start - (abs(start - dfG.iat[i-1, 4]) / 2)) # end of the gene below position [i,4]
                dfG.at[i, 'upperB'] = min(end + window, ((dfG.iat[i+1, 3] - end) / 2) + end) # Start of the gene above position [i,3]
        dfG['midBody'] = dfG['midBody'].astype(int)
        dfG['lowB'] = dfG['lowB'].astype(int)
        dfG['upperB'] = dfG['upperB'].astype(int)
#         dfG['domainLEN'] = dfG['upperB'] - dfG['lowB']
        LBandUB = []
        LBandUB.append(dfG.lowB.values)
        LBandUB.append(dfG.upperB.values)
        outList.append(LBandUB)
    
    return(outList)

In [13]:
# Adds midpoint in gene and upper and lower bound values to geneDFL based on size of window or nearby genes. Adding around geneBODY.
def addWindowBODY20(window):
    chrD = {'1': [1, 248956422],'2': [1, 242193529],'3': [1, 198295559],'4': [1, 190214555],'5': [1, 181538259],'6': [1, 170805979],'7': [1, 159345973],'8': [1, 145138636],'9': [1, 138394717],'10': [1, 133797422],'11': [1, 135086622],'12': [1, 133275309],'13': [1, 114364328],'14': [1, 107043718],'15': [1, 101991189],'16': [1, 90338345],'17': [1, 83257441],'18': [1, 80373285],'19': [1, 58617616],'20': [1, 64444167],'21': [1, 46709983],'22': [1, 50818468],'23': [1, 16569],'24': [1, 156040895],'25': [2781480, 56887902]} 
    chrIDNUM = 0
    geneWindowDFL = []
    for i in range(len(geneDFLBODY)):
        df = geneDFLBODY[i].copy()
        geneWindowDFL.append(df)
    
    for dfG in geneWindowDFL: # ordered by chromosome 1- MT,X,Y
        chrIDNUM += 1 
        dfIndexMax = len(dfG) - 1         
        for i, row in dfG.iterrows():
            start = row['start']
            end = row['end']
            
            dfG.at[i, 'midBody'] = (((row['length'] / 2)) + row['start']) # Middle of gene body 
            
            if i == 0 or i == dfIndexMax: # Case for start and end sites
                if i == 0:
                    dfG.at[i, 'lowB'] = max(start - window, chrD[str(chrIDNUM)][0]) # end of the gene below position
                    dfG.at[i, 'upperB'] = min(end + window, ((dfG.iat[i+1, 3] - end) / 2) + end)
                else: # index max case 
                    dfG.at[i, 'lowB'] = max(start - window, start - (abs(start - dfG.iat[i-1, 4]) / 2)) 
                    dfG.at[i, 'upperB'] = min(row['end'] + window, chrD[str(chrIDNUM)][1])
            else: # i > 0 or i < len(geneDF) - 1 
                dfG.at[i, 'lowB'] = max(start - window, start - (abs(start - dfG.iat[i-1, 4]) / 2)) # end of the gene below position [i,4]
                dfG.at[i, 'upperB'] = min(end + window, ((dfG.iat[i+1, 3] - end) / 2) + end) # Start of the gene above position [i,3]
        dfG['midBody'] = dfG['midBody'].astype(int)
        dfG['lowB'] = dfG['lowB'].astype(int)
        dfG['upperB'] = dfG['upperB'].astype(int)
        dfG['domainLEN'] = dfG['upperB'] - dfG['lowB']
    
    return(geneWindowDFL)

In [14]:
def findSMLdomain(windowDF):
    windowLens = windowDF
    allVALS = []
    for df in windowLens:
        values = list(df.domainLEN.values)
        allVALS += values
    allVALS = sorted(allVALS)
    cutoff = int(len(allVALS) / 3)
    small = allVALS[:cutoff][-1]
    medium = allVALS[cutoff:cutoff*2][-1]
    return [small, medium]

In [15]:
# Creating list of all the entrezids 
entrezIDLtss = []
for geneDF in geneDFL:
    entrezIDLtss.append(geneDF.entrezid.values)

In [16]:
entrezIDLbody = []
for geneDF in geneDFLBODY:
    entrezIDLbody.append(geneDF.entrezid.values)

In [53]:
windDF=addWindowTSS20(100000)

In [70]:
windDF[0]

Unnamed: 0,chromosome,source,type,start,end,strand,gene_symbol,gene_ensID,length,entrezid,midBody,lowB,upperB,domainLEN
0,1,havana,gene,11869,14409,+,DDX11L1,ENSG00000223972,2541,100287102,13139,1,40480,40479
1,1,ensembl_havana,gene,69091,70008,+,OR4F5,ENSG00000186092,918,79501,69550,40480,169091,128611
2,1,ensembl_havana,gene,923928,944581,+,SAMD11,ENSG00000187634,20654,148398,934255,823928,941618,117690
3,1,ensembl_havana,gene,959309,944204,-,NOC2L,ENSG00000188976,15106,26155,966862,941618,959948,18330
4,1,ensembl_havana,gene,960587,965715,+,KLHL17,ENSG00000187961,5129,339451,963151,959948,963542,3594
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2199,1,ensembl_havana,gene,248755739,248718649,-,LYPD8,ENSG00000259823,37091,646627,248774284,248719021,248791186,72165
2200,1,ensembl_havana,gene,248826633,248810446,-,SH3BP5L,ENSG00000175137,16188,80851,248834727,248791186,248832421,41235
2201,1,ensembl_havana,gene,248838210,248849517,+,ZNF672,ENSG00000171161,11308,79894,248843864,248832421,248848677,16256
2202,1,ensembl_havana,gene,248859144,248850006,-,ZNF692,ENSG00000171163,9139,55657,248863713,248848677,248882670,33993


In [71]:
#21338
locusF = []
for df in windDF:
    chromo = df.chromosome.values
    lowB = df.lowB.values
    upB = df.upperB.values
    entrez = df.entrezid.values
    
    locus = []
    for i in range(len(chromo)):
        locus.append([chromo[i], lowB[i], upB[i], entrez[i]])
    locusF.append(locus)
    

In [72]:
for lch in locusF:
    for l in lch:
        print(l)

['1', 1, 40480, 100287102]
['1', 40480, 169091, 79501]
['1', 823928, 941618, 148398]
['1', 941618, 959948, 26155]
['1', 959948, 963542, 339451]
['1', 963542, 974295, 84069]
['1', 974295, 991132, 84808]
['1', 991132, 1000655, 57801]
['1', 1000655, 1010630, 9636]
['1', 1010630, 1039928, 375790]
['1', 1039928, 1067020, 100288175]
['1', 1067020, 1095334, 401934]
['1', 1095334, 1145122, 54991]
['1', 1145122, 1176719, 254173]
['1', 1176719, 1193123, 100506376]
['1', 1193123, 1210414, 8784]
['1', 1210414, 1223084, 7293]
['1', 1223084, 1232148, 51150]
['1', 1232148, 1239493, 126792]
['1', 1239493, 1260303, 388581]
['1', 1260303, 1277160, 118424]
['1', 1277160, 1294501, 6339]
['1', 1294501, 1309088, 126789]
['1', 1309088, 1317150, 116983]
['1', 1317150, 1324723, 54973]
['1', 1324723, 1328035, 80772]
['1', 1328035, 1340332, 83756]
['1', 1340332, 1355563, 1855]
['1', 1355563, 1368636, 54587]
['1', 1368636, 1387411, 54998]
['1', 1387411, 1403320, 81669]
['1', 1403320, 1414541, 55052]
['1', 1414541

['1', 207033561, 207043673, 5208]
['1', 207043673, 207070911, 55432]
['1', 207070911, 207096552, 725]
['1', 207096552, 207204262, 722]
['1', 207221508, 207387869, 1604]
['1', 207387869, 207475188, 1380]
['1', 207475188, 207570630, 1378]
['1', 207570630, 207698585, 1379]
['1', 207698585, 207831729, 4179]
['1', 207831729, 208011402, 947]
['1', 208144320, 208344320, 5362]
['1', 209483717, 209618091, 57172]
['1', 209618091, 209663943, 3914]
['1', 209663943, 209680799, 50486]
['1', 209680799, 209705151, 3290]
['1', 209705151, 209740078, 101930114]
['1', 209740078, 209770295, 80342]
['1', 209770295, 209795367, 148304]
['1', 209795367, 209817091, 3664]
['1', 209817091, 209883090, 27042]
['1', 209883090, 210038174, 255928]
['1', 210132799, 210233423, 56256]
['1', 210233423, 210281149, 574036]
['1', 210281149, 210428252, 55733]
['1', 211034115, 211196246, 3756]
['1', 211196246, 211292496, 55758]
['1', 211292496, 211409766, 7188]
['1', 211409766, 211535829, 343035]
['1', 211535829, 211627186, 77

['2', 227552736, 227675666, 56918]
['2', 227675666, 227765927, 80704]
['2', 227765927, 227842448, 6364]
['2', 227842448, 227971054, 164781]
['2', 228081645, 228281645, 80309]
['2', 229171285, 229371285, 55022]
['2', 229614558, 229814558, 92737]
['2', 229822302, 229922770, 130888]
['2', 229922770, 229996119, 9320]
['2', 229996119, 230136054, 151473]
['2', 230136054, 230214419, 11262]
['2', 230214419, 230276456, 3431]
['2', 230276456, 230371563, 93349]
['2', 230371563, 230515942, 6672]
['2', 230612845, 230788742, 51719]
['2', 230788742, 230875569, 81618]
['2', 230875569, 230923783, 151484]
['2', 230923783, 230978593, 9290]
['2', 230978593, 231016805, 130560]
['2', 231016805, 231047177, 257407]
['2', 231047177, 231090991, 5707]
['2', 231090991, 231161832, 3357]
['2', 231161832, 231297044, 80210]
['2', 231297044, 231439592, 93010]
['2', 231439592, 231507068, 4691]
['2', 231507068, 231561698, 10316]
['2', 231561698, 231649898, 165100]
['2', 231649898, 231744283, 5757]
['2', 231744283, 23178

['4', 17698071, 17796135, 27146]
['4', 17796135, 17810830, 54876]
['4', 17810830, 17910902, 64151]
['4', 17921876, 18121876, 254251]
['4', 20153260, 20322724, 9353]
['4', 20322724, 20492189, 100505893]
['4', 20596282, 20796282, 133015]
['4', 21482096, 21682096, 105374516]
['4', 21848799, 22048799, 80333]
['4', 22416054, 22616054, 166647]
['4', 23804089, 24004089, 10891]
['4', 24484550, 24684550, 1665]
['4', 24689912, 24885058, 6649]
['4', 24885058, 25005541, 91050]
['4', 25005541, 25095660, 55203]
['4', 25095660, 25160552, 51091]
['4', 25160552, 25197319, 55300]
['4', 25197319, 25273380, 55300]
['4', 25273380, 25344999, 29063]
['4', 25344999, 25477213, 29945]
['4', 25555301, 25755301, 10568]
['4', 25761830, 25862795, 389203]
['4', 25862795, 25963760, 23231]
['4', 26063455, 26263455, 3516]
['4', 26390462, 26533449, 886]
['4', 26533449, 26676437, 55296]
['4', 26757678, 26957678, 57620]
['4', 30620415, 30820415, 5099]
['4', 36144116, 36244312, 439933]
['4', 36244312, 36263065, 116984]
['4

['5', 172547894, 172706230, 54492]
['5', 172706230, 172802735, 1843]
['5', 172802735, 172896502, 57222]
['5', 172896502, 172959060, 51121]
['5', 172959060, 172971574, 100268168]
['5', 172971574, 173020054, 8992]
['5', 173020054, 173100397, 153222]
['5', 173100397, 173189899, 662]
['5', 173189899, 173282430, 1482]
['5', 173282430, 173429503, 8614]
['5', 173479643, 173598151, 285593]
['5', 173598151, 173716660, 91272]
['5', 173788280, 173931029, 80315]
['5', 173931029, 174009691, 133491]
['5', 174009691, 174145604, 51617]
['5', 174624533, 174824533, 4488]
['5', 175344208, 175460635, 1812]
['5', 175460635, 175567546, 94081]
['5', 175567546, 175727170, 3274]
['5', 175727170, 175896310, 10814]
['5', 175934680, 176047684, 84321]
['5', 176047684, 176061355, 202134]
['5', 176061355, 176150194, 100996385]
['5', 176150194, 176300167, 375484]
['5', 176300167, 176363718, 57179]
['5', 176363718, 176377107, 285598]
['5', 176377107, 176388861, 192286]
['5', 176388861, 176402772, 51491]
['5', 17640277

['7', 48004350, 48032315, 256979]
['7', 48032315, 48062069, 136288]
['7', 48062069, 48130043, 7378]
['7', 48130043, 48271458, 154664]
['7', 49673661, 49873661, 375567]
['7', 49996036, 50108682, 100130988]
['7', 50108682, 50212726, 11055]
['7', 50212726, 50404124, 10320]
['7', 50431759, 50537147, 100129427]
['7', 50537147, 50553996, 63979]
['7', 50553996, 50665457, 1644]
['7', 50693462, 50893462, 2887]
['7', 51216818, 51416818, 23242]
['7', 52935642, 53135642, 285877]
['7', 54442325, 54549647, 222008]
['7', 54549647, 54656970, 285878]
['7', 54659974, 54859974, 23480]
['7', 54919021, 55103977, 1956]
['7', 55103977, 55277191, 100507500]
['7', 55277191, 55465448, 55915]
['7', 55472988, 55643120, 81552]
['7', 55643120, 55788020, 360132]
['7', 55788020, 55875132, 346288]
['7', 55875132, 55919634, 349075]
['7', 55919634, 55951806, 2631]
['7', 55951806, 56001711, 51373]
['7', 56001711, 56051617, 5723]
['7', 56051617, 56057816, 908]
['7', 56057816, 56078499, 25870]
['7', 56078499, 56099786, 526

['9', 96401019, 96434265, 195828]
['9', 96434265, 96535015, 22927]
['9', 96535015, 96637566, 8555]
['9', 96637566, 96716716, 195827]
['9', 96716716, 96826876, 22869]
['9', 96826876, 96901966, 158431]
['9', 96901966, 96971009, 441457]
['9', 96971009, 97026675, 84278]
['9', 97026675, 97060794, 1515]
['9', 97060794, 97181945, 340508]
['9', 97207304, 97359627, 100499483]
['9', 97359627, 97456565, 23424]
['9', 97456565, 97567377, 7111]
['9', 97567377, 97633600, 158427]
['9', 97633600, 97634838, 4686]
['9', 97634838, 97666704, 105376171]
['9', 97666704, 97775305, 7507]
['9', 97775305, 97887912, 2304]
['9', 97887912, 97933713, 51531]
['9', 97933713, 97964108, 55363]
['9', 97964108, 98020050, 10541]
['9', 98020050, 98087975, 54187]
['9', 98087975, 98155926, 9830]
['9', 98155926, 98224180, 7464]
['9', 98224180, 98355721, 55357]
['9', 98609197, 98753081, 9568]
['9', 98753081, 98802332, 203286]
['9', 98802332, 98840057, 79695]
['9', 98840057, 98907797, 102724684]
['9', 98907797, 99023608, 1306]
[

['11', 45913047, 45920216, 9409]
['11', 45920216, 46021399, 120071]
['11', 46021399, 46199419, 51317]
['11', 46199419, 46305283, 90993]
['11', 46305283, 46356830, 8525]
['11', 46356830, 46383682, 4192]
['11', 46383682, 46486608, 1132]
['11', 46494125, 46605826, 55626]
['11', 46605826, 46617718, 9776]
['11', 46617718, 46659262, 283254]
['11', 46659262, 46700716, 392]
['11', 46700716, 46709999, 79797]
['11', 46709999, 46782744, 2147]
['11', 46782744, 46846360, 9793]
['11', 46846360, 46882527, 100507401]
['11', 46882527, 46927665, 4038]
['11', 46927665, 47036689, 79096]
['11', 47077125, 47181784, 84364]
['11', 47181784, 47200454, 29763]
['11', 47200454, 47231382, 1643]
['11', 47231382, 47248603, 10062]
['11', 47248603, 47259033, 53]
['11', 47259033, 47270635, 8567]
['11', 47270635, 47312406, 101928943]
['11', 47312406, 47365639, 4607]
['11', 47365639, 47392854, 6688]
['11', 47392854, 47416802, 91252]
['11', 47416802, 47437825, 5702]
['11', 47437825, 47507257, 5913]
['11', 47507257, 475653

['12', 80607498, 80712205, 4618]
['12', 80712205, 80816912, 4617]
['12', 80836414, 80937169, 79611]
['12', 80937169, 81037925, 8825]
['12', 81659553, 81859553, 8499]
['12', 82258497, 82358651, 84190]
['12', 82358651, 82458805, 29080]
['12', 82586880, 82786880, 160335]
['12', 84813615, 84974946, 55117]
['12', 84974946, 85036295, 144448]
['12', 85036295, 85136314, 84125]
['12', 85180107, 85380107, 8092]
['12', 85736570, 85855432, 9182]
['12', 85855432, 85974295, 4922]
['12', 86738904, 86938904, 25834]
['12', 87933846, 88033941, 91298]
['12', 88033941, 88088126, 160419]
['12', 88088126, 88142256, 80184]
['12', 88142256, 88242296, 160418]
['12', 88480851, 88680851, 4254]
['12', 89253271, 89439038, 1848]
['12', 89439038, 89525415, 8693]
['12', 89525415, 89526143, 282809]
['12', 89526143, 89617781, 100528030]
['12', 89617781, 89809300, 490]
['12', 90855176, 90980101, 196477]
['12', 90980101, 91031504, 1833]
['12', 91031504, 91084907, 11081]
['12', 91084907, 91147477, 4060]
['12', 91147477, 9

['15', 53912904, 54112904, 440279]
['15', 55097067, 55258013, 51187]
['15', 55258013, 55319036, 9488]
['15', 55319036, 55319137, 5873]
['15', 55319137, 55363835, 101928527]
['15', 55363835, 55408529, 9236]
['15', 55408529, 55458391, 145788]
['15', 55458391, 55548590, 161582]
['15', 55548590, 55666018, 26108]
['15', 55666018, 55843090, 283659]
['15', 55893746, 56093746, 4734]
['15', 56143266, 56243637, 64864]
['15', 56243637, 56344009, 374618]
['15', 56365137, 56565137, 55329]
['15', 56818571, 56918597, 54816]
['15', 56918597, 57018623, 6938]
['15', 57275967, 57475967, 84952]
['15', 57491908, 57591924, 145781]
['15', 57591924, 57649285, 100820829]
['15', 57649285, 57806629, 81488]
['15', 58038169, 58238169, 366]
['15', 58310569, 58454217, 3990]
['15', 58454217, 58498300, 8854]
['15', 58498300, 58598735, 101928694]
['15', 58649978, 58760585, 102]
['15', 58760585, 58818183, 54629]
['15', 58818183, 58899414, 54778]
['15', 58899414, 59019365, 79811]
['15', 59019365, 59155950, 9133]
['15', 5

['17', 28629827, 28653772, 9703]
['17', 28653772, 28662140, 6830]
['17', 28662140, 28687021, 6388]
['17', 28687021, 28715141, 147011]
['17', 28715141, 28718911, 83871]
['17', 28718911, 28722645, 6147]
['17', 28722645, 28726916, 284086]
['17', 28726916, 28735959, 116238]
['17', 28735959, 28799461, 9618]
['17', 28799461, 28855085, 26284]
['17', 28855085, 28876455, 55731]
['17', 28876455, 28900375, 2319]
['17', 28900375, 28914673, 147015]
['17', 28914673, 28938394, 101927018]
['17', 28938394, 28951142, 51268]
['17', 28951142, 28979105, 57649]
['17', 28979105, 29042648, 124925]
['17', 29042648, 29129634, 9220]
['17', 29129634, 29213637, 399687]
['17', 29213637, 29270490, 1411]
['17', 29270490, 29342291, 57532]
['17', 29342291, 29478258, 57551]
['17', 29478258, 29566594, 90313]
['17', 29566594, 29578453, 116236]
['17', 29578453, 29591911, 124930]
['17', 29591911, 29608480, 28964]
['17', 29608480, 29722907, 84940]
['17', 29829200, 29929738, 374786]
['17', 29929738, 30022898, 85464]
['17', 30

['19', 22615290, 22709215, 57615]
['19', 22709215, 22808154, 7652]
['19', 22808154, 22917688, 646864]
['19', 22917688, 23039193, 388523]
['19', 23039193, 23162800, 100129543]
['19', 23162800, 23322975, 440519]
['19', 23322975, 23495560, 7644]
['19', 23587220, 23723055, 171392]
['19', 23723055, 23836883, 148213]
['19', 23836883, 23936403, 730087]
['19', 23936403, 23995667, 100505851]
['19', 23995667, 24098415, 9534]
['19', 24098415, 24263425, 100101266]
['19', 29113541, 29313541, 7386]
['19', 29426499, 29565258, 342865]
['19', 29565258, 29634536, 10775]
['19', 29634536, 29690422, 79156]
['19', 29690422, 29763843, 83636]
['19', 29763843, 29867771, 898]
['19', 29867771, 30023644, 8725]
['19', 30128290, 30328290, 9745]
['19', 31249547, 31449547, 57616]
['19', 32245594, 32375568, 22847]
['19', 32375568, 32405551, 147991]
['19', 32405551, 32493314, 400684]
['19', 32493314, 32628237, 9141]
['19', 32628237, 32676002, 388531]
['19', 32676002, 32684279, 84079]
['19', 32684279, 32705857, 390916]


['22', 18094125, 18130115, 51807]
['22', 18130115, 18249899, 11274]
['22', 18418161, 18522659, 728226]
['22', 18522659, 18527479, 100996401]
['22', 18527479, 18552885, 728229]
['22', 18552885, 18594943, 728233]
['22', 18594943, 18711919, 85376]
['22', 18743399, 18845105, 2679]
['22', 18845105, 18876419, 102725072]
['22', 18876419, 18921290, 8214]
['22', 18921290, 18992464, 5625]
['22', 18992464, 19085414, 25786]
['22', 19085414, 19126631, 9993]
['22', 19126631, 19137746, 23617]
['22', 19137746, 19147483, 8220]
['22', 19147483, 19164556, 2928]
['22', 19164556, 19235273, 6576]
['22', 19235273, 19361809, 8218]
['22', 19361809, 19439676, 64976]
['22', 19439676, 19447841, 7290]
['22', 19447841, 19463723, 128977]
['22', 19463723, 19479337, 7353]
['22', 19479337, 19503502, 8318]
['22', 19503502, 19621004, 7122]
['22', 19621004, 19718704, 5413]
['22', 19718704, 19739824, 2812]
['22', 19739824, 19805799, 6899]
['22', 19805799, 19854917, 79680]
['22', 19854917, 19898273, 54584]
['22', 19898273, 

In [59]:
len(geneDFL[8])

817

In [42]:
testmappedFile = adaptiveEnrichmentAnalysis(FileSites, 100000, method='TSS')

In [51]:
mapped = geneReadSites(FileSites, addWindowTSS(100000), method='TSS')

In [52]:
len(mapped)

16477

In [43]:
len(testmappedFile)

17091

In [44]:
adjMappedFile = reFABSlistC(testmappedFile)
c=1
print('done')

R[write to console]: Loading required package: ROntoTools

R[write to console]: Loading required package: graph

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colnames, dirname, do.call,
    duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
    lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
  

done


In [45]:
len(adjMappedFile)

17091

In [46]:
for l in adjMappedFile:
    print(l)
print('...Done...')

['GO:0019452', 0.5458889669720169]
['GO:0016363', 0.5984754684113482]
['GO:0071670', 0.793783169682258]
['GO:0003333', 0.26722887708084037]
['GO:0036332', 0.9372979052381686]
['GO:0050646', 0.9372979052381686]
['GO:1901684', 0.5458889669720169]
['GO:0008506', 0.771795808340334]
['GO:0051791', 0.8785273631638588]
['GO:0008626', 0.5582928278548038]
['GO:0052907', 0.8271448663876793]
['GO:0000398', 0.9228874947412256]
['GO:0009949', 0.9372979052381686]
['GO:0019556', 0.46805919465503487]
['GO:0033291', 0.9372979052381686]
['GO:0006513', 0.38863522700236397]
['GO:0000012', 0.9684899743270888]
['GO:0003996', 0.9211450450615867]
['GO:0090534', 0.9372979052381686]
['GO:0006539', 0.5458889669720169]
['GO:0071298', 0.9372979052381686]
['GO:2000367', 0.8271448663876793]
['GO:0015990', 0.9575177285717068]
['GO:0050730', 0.4019154758858719]
['GO:0004018', 0.9372979052381686]
['GO:0033567', 0.8271448663876793]
['GO:0051066', 0.9372979052381686]
['GO:0004534', 0.9852423908350101]
['GO:0060980', 0.93

['GO:0015362', 0.8271448663876793]
['GO:0031530', 0.8785273631638588]
['GO:0047840', 0.5458889669720169]
['GO:0007265', 0.5142158359428395]
['GO:0070324', 0.5582928278548038]
['GO:0005671', 0.9659069678514302]
['GO:1990408', 0.8785273631638588]
['GO:1903348', 0.8785273631638588]
['GO:0009101', 0.37847927995794206]
['GO:0034134', 0.8477478477097452]
['GO:1904379', 0.6841686299914919]
['GO:1905242', 0.9372979052381686]
['GO:0039003', 0.878519099587672]
['GO:0030379', 0.8271448663876793]
['GO:0032836', 0.37847927995794206]
['GO:0004677', 0.9999846078024368]
['GO:0070997', 0.5659065700409542]
['GO:0071795', 0.8271448663876793]
['GO:0000802', 0.9372979052381686]
['GO:0007442', 0.7717667617733613]
['GO:0005952', 0.6467634684585888]
['GO:0097149', 0.8785273631638588]
['GO:1905502', 0.9948352792295165]
['GO:0043982', 0.8581465487627244]
['GO:0071035', 0.9897179765995912]
['GO:0034059', 0.878519099587672]
['GO:1990130', 0.8234418571878966]
['GO:0008607', 0.9701211027838668]
['GO:0008206', 0.766

['GO:1990184', 0.684148517980588]
['GO:0032277', 0.9372979052381686]
['GO:0042558', 0.9372979052381686]
['GO:0017018', 0.46805919465503487]
['GO:0016046', 0.793783169682258]
['GO:0016493', 0.7446681115176372]
['GO:0001824', 0.7697231364717168]
['GO:0090082', 0.793783169682258]
['GO:0008401', 0.9788261038318602]
['GO:0061044', 0.5458889669720169]
['GO:0035948', 0.9063546621566538]
['GO:0097150', 0.815407023110089]
['GO:0002857', 0.8271448663876793]
['GO:0019863', 0.6301470419815071]
['GO:0002432', 0.8271448663876793]
['GO:0015789', 0.9960684473124783]
['GO:0061578', 0.4963678623077092]
['GO:0000932', 0.7600646358751844]
['GO:0043116', 0.32022200435042963]
['GO:0000276', 0.6780373586086896]
['GO:1903724', 0.771795808340334]
['GO:0102521', 0.8785273631638588]
['GO:0014704', 0.3888160383312572]
['GO:0102294', 0.684148517980588]
['GO:0010793', 0.8234418571878966]
['GO:0097084', 0.6780373586086896]
['GO:0051878', 0.9372979052381686]
['GO:0048846', 0.5659065700409542]
['GO:0002460', 0.5458889

['GO:0072229', 0.9372979052381686]
['GO:0004879', 0.5355458414003162]
['GO:0030003', 0.684148517980588]
['GO:0097116', 0.9948352792295165]
['GO:0005309', 0.5458889669720169]
['GO:0051561', 0.8709247525723423]
['GO:0019811', 0.8271448663876793]
['GO:0008356', 0.5956239345781401]
['GO:0051302', 0.7477115131107983]
['GO:0018016', 0.9372979052381686]
['GO:1904306', 0.970141214782146]
['GO:0048643', 0.8581465487627244]
['GO:0001829', 0.826718181557963]
['GO:0030618', 0.941173976618978]
['GO:0061845', 0.8271448663876793]
['GO:1901625', 0.9372979052381686]
['GO:0032876', 0.5659065700409542]
['GO:0043654', 0.5659065700409542]
['GO:0032732', 0.771795808340334]
['GO:0150025', 0.9372979052381686]
['GO:0006677', 0.793783169682258]
['GO:0070198', 0.7805487701430781]
['GO:0032270', 0.3116116862566418]
['GO:0004689', 0.9912349954641154]
['GO:2000646', 0.793783169682258]
['GO:0001838', 0.878519099587672]
['GO:0048568', 0.5231775145544317]
['GO:2000329', 0.684148517980588]
['GO:0036378', 0.968489974327

['GO:0000062', 0.7993452319848534]
['GO:0099546', 0.8271448663876793]
['GO:0030866', 0.11783863633144259]
['GO:0005151', 0.9960684473124783]
['GO:0030134', 0.536158393214023]
['GO:2000249', 0.5589251153416228]
['GO:0004611', 0.5458889669720169]
['GO:0006186', 0.9372979052381686]
['GO:0048333', 0.40375265769498797]
['GO:0001729', 0.8785273631638588]
['GO:0045131', 0.5458889669720169]
['GO:0033168', 0.8271448663876793]
['GO:0004647', 0.8234418571878966]
['GO:0032680', 0.5582928278548038]
['GO:0002161', 0.5781552334450144]
['GO:1900452', 0.6780373586086896]
['GO:0030851', 0.5582928278548038]
['GO:0033745', 0.771795808340334]
['GO:0140078', 0.8234418571878966]
['GO:0072189', 0.968819015839567]
['GO:0090306', 0.9063546621566538]
['GO:0040036', 0.6841686299914919]
['GO:0097530', 0.8271448663876793]
['GO:0004356', 0.9372979052381686]
['GO:0050829', 0.8016410972031307]
['GO:0042803', 0.017423766248900113]
['GO:0031550', 0.9372979052381686]
['GO:0048669', 0.878519099587672]
['GO:0035665', 0.545

['GO:0042659', 0.9372979052381686]
['GO:0008457', 0.9372979052381686]
['GO:0060327', 0.9960684473124783]
['GO:0047150', 0.684148517980588]
['GO:0006290', 0.9948352792295165]
['GO:0001921', 0.41281419029460187]
['GO:2000566', 0.8785273631638588]
['GO:0016594', 0.7805487701430781]
['GO:1903441', 0.41281419029460187]
['GO:2001140', 0.8785273631638588]
['GO:0033622', 0.24329090267209133]
['GO:0097136', 0.6539923313468676]
['GO:0042282', 0.9372979052381686]
['GO:1902685', 0.6841686299914919]
['GO:1903937', 0.878519099587672]
['GO:2000080', 0.9372979052381686]
['GO:0001885', 0.793783169682258]
['GO:0019089', 0.9372979052381686]
['GO:1900369', 0.9372979052381686]
['GO:0016495', 0.8271448663876793]
['GO:0016056', 0.5582928278548038]
['GO:0006090', 0.4352787640996767]
['GO:0022848', 0.34312912570569604]
['GO:0044549', 0.5458889669720169]
['GO:0120193', 0.6841686299914919]
['GO:0047322', 0.684148517980588]
['GO:0010909', 0.8234186188738093]
['GO:0004331', 0.9130716649984782]
['GO:1902817', 0.998

['GO:1905912', 0.9372979052381686]
['GO:0015272', 0.3201937593422591]
['GO:1903898', 0.9982072323507039]
['GO:0002248', 0.6539923313468676]
['GO:0031477', 0.8271448663876793]
['GO:0043063', 0.8271448663876793]
['GO:0071386', 0.771795808340334]
['GO:0051286', 0.771795808340334]
['GO:0004332', 0.5686497808084049]
['GO:0070326', 0.6780182219080516]
['GO:0047275', 0.878519099587672]
['GO:0045990', 0.5458889669720169]
['GO:0061196', 0.5458889669720169]
['GO:2000653', 0.6301470419815071]
['GO:1903804', 0.9372979052381686]
['GO:0005165', 0.9372979052381686]
['GO:0102488', 0.9063546621566538]
['GO:0035552', 0.8785273631638588]
['GO:0036085', 0.8271448663876793]
['GO:1902444', 0.5458889669720169]
['GO:0043988', 0.8234418571878966]
['GO:0072734', 0.8234418571878966]
['GO:0072562', 0.9439012300430639]
['GO:0003920', 0.9960684473124783]
['GO:0006491', 0.3429082260765215]
['GO:0070681', 0.771795808340334]
['GO:0019676', 0.9372979052381686]
['GO:0090210', 0.684148517980588]
['GO:0005893', 0.68414851

['GO:0050861', 0.9479095889685825]
['GO:1901987', 0.9948352792295165]
['GO:2000764', 0.9372979052381686]
['GO:1903508', 0.5611690801400108]
['GO:0043084', 0.9852423908350101]
['GO:0072558', 0.9912349954641154]
['GO:0035854', 0.9960684473124783]
['GO:0033081', 0.9063546621566538]
['GO:0006693', 0.45072700015980893]
['GO:0008045', 0.4019154758858719]
['GO:0035693', 0.8271448663876793]
['GO:0006654', 0.06406476203747365]
['GO:0048638', 0.878519099587672]
['GO:0031904', 0.26722887708084037]
['GO:0060160', 0.9948352792295165]
['GO:0034587', 0.38523872824652083]
['GO:0055002', 0.9372979052381686]
['GO:0071665', 0.8271448663876793]
['GO:0002605', 0.771795808340334]
['GO:0007042', 0.3201937593422591]
['GO:0033204', 0.5781552334450144]
['GO:0019002', 0.771795808340334]
['GO:0061304', 0.6780182219080516]
['GO:1900280', 0.8271448663876793]
['GO:1900745', 0.3559757199545807]
['GO:0032795', 0.9063546621566538]
['GO:0008192', 0.9372979052381686]
['GO:0051972', 0.968819015839567]
['GO:0003281', 0.288

['GO:1902902', 0.7993452319848534]
['GO:0003990', 0.793783169682258]
['GO:0071806', 0.6780373586086896]
['GO:0030177', 0.1626720540259305]
['GO:0060169', 0.8271448663876793]
['GO:0051457', 0.6216177954502556]
['GO:1903763', 0.9960684473124783]
['GO:0050913', 0.8215648268486694]
['GO:0120009', 0.8234418571878966]
['GO:0035087', 0.8234418571878966]
['GO:0060430', 0.5071397236834014]
['GO:1903540', 0.8271448663876793]
['GO:1905123', 0.6841686299914919]
['GO:0004398', 0.8271448663876793]
['GO:1900827', 0.9372979052381686]
['GO:0072061', 0.8234186188738093]
['GO:1902455', 0.8785273631638588]
['GO:1904182', 0.5458889669720169]
['GO:1905580', 0.9372979052381686]
['GO:0099159', 0.793783169682258]
['GO:1903799', 0.7665330003646615]
['GO:0051054', 0.8785273631638588]
['GO:0030007', 0.3116116862566418]
['GO:0071287', 0.5002220744378443]
['GO:0030299', 0.5582928278548038]
['GO:0004357', 0.9960684473124783]
['GO:0004832', 0.2979599182976811]
['GO:0099580', 0.878519099587672]
['GO:0001816', 0.630688

['GO:0034693', 0.8271448663876793]
['GO:0031094', 0.9948352792295165]
['GO:0044389', 0.8861177306998541]
['GO:0007224', 0.35126617043987013]
['GO:1990573', 0.022777720834370314]
['GO:0031088', 0.9063546621566538]
['GO:1903251', 0.9912349954641154]
['GO:0032819', 0.6539923313468676]
['GO:0010955', 0.951538688253635]
['GO:0004801', 0.8271448663876793]
['GO:0090009', 0.9666015550139212]
['GO:0044530', 0.771795808340334]
['GO:0099185', 0.8785273631638588]
['GO:2000057', 0.8271448663876793]
['GO:0003712', 0.7592824771368394]
['GO:0060729', 0.6780182219080516]
['GO:0050812', 0.5458889669720169]
['GO:0032490', 0.9960684473124783]
['GO:0016702', 0.32022200435042963]
['GO:0090410', 0.8271448663876793]
['GO:0021798', 0.7805487701430781]
['GO:0043320', 0.8234418571878966]
['GO:0046452', 0.8234186188738093]
['GO:1902914', 0.9372979052381686]
['GO:0090083', 0.9960684473124783]
['GO:0050663', 0.8778907462738719]
['GO:1903507', 0.4556513240267781]
['GO:0018243', 0.6841686299914919]
['GO:0022851', 0.9

['GO:0031593', 0.2112127684990751]
['GO:0090267', 0.9232799440834389]
['GO:0042791', 0.6243066247639706]
['GO:0044783', 0.9372979052381686]
['GO:0030099', 0.7697231364717168]
['GO:0004139', 0.9372979052381686]
['GO:0001047', 0.7697231364717168]
['GO:0070971', 0.9137375429072796]
['GO:0034374', 0.45968275522407964]
['GO:0005000', 0.5582928278548038]
['GO:1903896', 0.45968275522407964]
['GO:0022850', 0.8215648268486694]
['GO:0050714', 0.40588984710684095]
['GO:0034136', 0.5659065700409542]
['GO:0042613', 0.3233625732134469]
['GO:0086052', 0.9372979052381686]
['GO:0071110', 0.9372979052381686]
['GO:0042567', 0.6780182219080516]
['GO:1990584', 0.6841686299914919]
['GO:0036037', 0.793783169682258]
['GO:0034244', 0.8087023491607079]
['GO:0035651', 0.9852423908350101]
['GO:0090209', 0.6841686299914919]
['GO:0031380', 0.9960684473124783]
['GO:0043629', 0.8271448663876793]
['GO:0000701', 0.6841686299914919]
['GO:1903387', 0.8271448663876793]
['GO:0015730', 0.9372979052381686]
['GO:0046658', 0.0

['GO:0003257', 0.9701211027838668]
['GO:0005673', 0.9701211027838668]
['GO:1902661', 0.900237841306141]
['GO:0032777', 0.8234418571878966]
['GO:0030629', 0.8271448663876793]
['GO:0018283', 0.9701211027838668]
['GO:0048075', 0.5458889669720169]
['GO:0009398', 0.9372979052381686]
['GO:0002866', 0.8785273631638588]
['GO:0097177', 0.46805919465503487]
['GO:0048169', 0.6347571325148222]
['GO:0017108', 0.8778907462738719]
['GO:0000961', 0.9960684473124783]
['GO:0042668', 0.9372979052381686]
['GO:0006477', 0.5071397236834014]
['GO:0000822', 0.8785273631638588]
['GO:0090025', 0.9372979052381686]
['GO:0015709', 0.5658566597136231]
['GO:0090301', 0.5458889669720169]
['GO:0060024', 0.8234418571878966]
['GO:0034397', 0.5458889669720169]
['GO:0036042', 0.7805487701430781]
['GO:1990038', 0.9372979052381686]
['GO:0051265', 0.5458889669720169]
['GO:0005138', 0.7697231364717168]
['GO:0030515', 0.7446681115176372]
['GO:0045651', 0.3116116862566418]
['GO:0030219', 0.7446681115176372]
['GO:0019779', 0.937

In [47]:
analysis = getgoINFO(adjMappedFile, 100000, type='TSS')

In [48]:
len(analysis)

17091

In [49]:
for l in analysis:
    print(l)
print('...done...')

['GO:0019452', 13304.0, 13304, 0, 1, 'L-cysteine catabolic process to taurine', 'biological_process']
['GO:0016363', 80678.23148148147, 65321.5, 60854.26543825854, 109, 'nuclear matrix', 'cellular_component']
['GO:0071670', 88632.0, 88632.0, 87569.51799570442, 2, 'smooth muscle cell chemotaxis', 'biological_process']
['GO:0003333', 92153.51515151515, 74137, 54837.98906786769, 35, 'amino acid transmembrane transport', 'biological_process']
['GO:0036332', 181988.0, 181988, 0, 1, 'placental growth factor-activated receptor activity', 'molecular_function']
['GO:0050646', 114166.0, 114166, 0, 1, '5-oxo-6E,8Z,11Z,14Z-icosatetraenoic acid binding', 'molecular_function']
['GO:1901684', 21399.0, 21399, 0, 1, 'arsenate ion transmembrane transport', 'biological_process']
['GO:0008506', 101663.5, 85396.5, 72815.49637497043, 4, 'sucrose:proton symporter activity', 'molecular_function']
['GO:0051791', 91275.5, 92641.5, 32973.7999276193, 4, 'medium-chain fatty acid metabolic process', 'biological_pro

['GO:0020037', 71154.85820895522, 56567.0, 53518.47585733852, 135, 'heme binding', 'molecular_function']
['GO:0035445', 99378.0, 99378, 0, 1, 'borate transmembrane transport', 'biological_process']
['GO:0005749', 48516.4, 28951, 46950.80533707596, 5, 'mitochondrial respiratory chain complex II, succinate dehydrogenase complex (ubiquinone)', 'cellular_component']
['GO:0002520', 134674.0, 147923.5, 76552.57074281263, 4, 'immune system development', 'biological_process']
['GO:0051276', 90450.6, 100005, 58850.07169166649, 15, 'chromosome organization', 'biological_process']
['GO:0032098', 92014.42857142857, 81092, 74687.84118997582, 7, 'regulation of appetite', 'biological_process']
['GO:0006265', 83023.44444444444, 84193, 49487.19496524103, 9, 'DNA topological change', 'biological_process']
['GO:0000254', 73091.66666666667, 40757, 56377.218469283616, 3, 'C-4 methylsterol oxidase activity', 'molecular_function']
['GO:0048499', 28374.0, 28308.5, 11200.619179313258, 4, 'synaptic vesicle memb

['GO:0032331', 122457.8, 122276.0, 68022.41860600465, 20, 'negative regulation of chondrocyte differentiation', 'biological_process']
['GO:0031859', 13545.0, 13545, 0, 1, 'platelet activating factor receptor binding', 'molecular_function']
['GO:0097305', 176155.0, 176155, 0, 1, 'response to alcohol', 'biological_process']
['GO:2000360', 45706.5, 26927.0, 46164.16943546874, 4, 'negative regulation of binding of sperm to zona pellucida', 'biological_process']
['GO:0031410', 78784.30522088353, 66271, 58222.89825902398, 251, 'cytoplasmic vesicle', 'cellular_component']
['GO:0047057', 81381.0, 81381.0, 103438.40837909291, 2, 'vitamin-K-epoxide reductase (warfarin-sensitive) activity', 'molecular_function']
['GO:0005597', 59999.0, 59999, 0, 1, 'collagen type XVI trimer', 'cellular_component']
['GO:0001094', 81602.72727272728, 69164, 71193.72702294902, 11, 'TFIID-class transcription factor complex binding', 'molecular_function']
['GO:0004668', 63831.6, 64832, 8614.213504435562, 5, 'protein-ar

['GO:0002196', 29342.5, 29342.5, 14814.594172639358, 2, 'Ser-tRNA(Ala) hydrolase activity', 'molecular_function']
['GO:2001070', 127378.5, 127378.5, 59827.5976494126, 2, 'starch binding', 'molecular_function']
['GO:0060754', 90492.33333333333, 78319.0, 70812.05612511662, 6, 'positive regulation of mast cell chemotaxis', 'biological_process']
['GO:0099192', 100990.0, 100990, 0, 1, 'cerebellar Golgi cell to granule cell synapse', 'cellular_component']
['GO:0001519', 110980.5, 110980.5, 125892.58421567174, 2, 'peptide amidation', 'biological_process']
['GO:1902373', 83995.66666666667, 93469, 56033.855242463316, 3, 'negative regulation of mRNA catabolic process', 'biological_process']
['GO:0032258', 71469.0, 75553, 59424.347308153076, 3, 'protein localization by the Cvt pathway', 'biological_process']
['GO:0032435', 71564.58064516129, 41571, 65430.812798850966, 31, 'negative regulation of proteasomal ubiquitin-dependent protein catabolic process', 'biological_process']
['GO:0048306', 76621

['GO:0048286', 115638.44117647059, 116426.0, 60333.28151957821, 34, 'lung alveolus development', 'biological_process']
['GO:0140007', 75498.2, 29418, 84533.58790267925, 5, 'KICSTOR complex', 'cellular_component']
['GO:0042666', 119351.0, 119351, 0, 1, 'negative regulation of ectodermal cell fate specification', 'biological_process']
['GO:0030971', 85661.38596491228, 84274, 66349.21463383679, 57, 'receptor tyrosine kinase binding', 'molecular_function']
['GO:1901016', 125380.83333333333, 119546.0, 57568.48105835924, 6, 'regulation of potassium ion transmembrane transporter activity', 'biological_process']
['GO:0006910', 109892.44444444444, 107931, 57002.01124765492, 77, 'phagocytosis, recognition', 'biological_process']
['GO:0004980', 126331.33333333333, 155091, 91503.42710703972, 3, 'melanocyte-stimulating hormone receptor activity', 'molecular_function']
['GO:0102341', 200000.0, 200000, 0, 1, '3-oxo-lignoceroyl-CoA reductase activity', 'molecular_function']
['GO:0004977', 123858.4, 14

['GO:0006409', 59968.14705882353, 57173.0, 32463.26302250942, 34, 'tRNA export from nucleus', 'biological_process']
['GO:0010716', 66653.75, 60526.5, 56626.27755953944, 5, 'negative regulation of extracellular matrix disassembly', 'biological_process']
['GO:0022029', 178248.75, 183016.5, 26295.240550018425, 4, 'telencephalon cell migration', 'biological_process']
['GO:0044091', 28156.0, 28156.0, 11354.72069229358, 2, 'membrane biogenesis', 'biological_process']
['GO:0010533', 9957.0, 9957, 0, 1, 'regulation of activation of Janus kinase activity', 'biological_process']
['GO:1990349', 50421.4, 35673, 45489.72646983052, 5, 'gap junction-mediated intercellular transport', 'biological_process']
['GO:0098855', 122545.0, 136947, 85569.86349761228, 3, 'HCN channel complex', 'cellular_component']
['GO:0070182', 44620.58823529412, 40933, 38222.443317144876, 19, 'DNA polymerase binding', 'molecular_function']
['GO:0015787', 82547.0, 82547.0, 34590.24952208353, 2, 'UDP-glucuronic acid transmembra

['GO:0090425', 200000.0, 200000, 0, 1, 'acinar cell differentiation', 'biological_process']
['GO:0043966', 77158.80952380953, 55303.0, 63845.41555833506, 42, 'histone H3 acetylation', 'biological_process']
['GO:0034389', 54831.21052631579, 46256, 40105.47222509243, 19, 'lipid droplet organization', 'biological_process']
['GO:0002383', 29058.0, 29058, 0, 1, 'immune response in brain or nervous system', 'biological_process']
['GO:1990460', 90979.33333333333, 109868, 55167.02722037262, 3, 'leptin receptor binding', 'molecular_function']
['GO:0030091', 104395.0, 132151, 82891.22825426102, 5, 'protein repair', 'biological_process']
['GO:0005891', 104526.85185185185, 95145, 69347.9781928698, 27, 'voltage-gated calcium channel complex', 'cellular_component']
['GO:0009264', 112164.0, 112164.0, 3149.453603404883, 2, 'deoxyribonucleotide catabolic process', 'biological_process']
['GO:0055003', 89138.21052631579, 81097, 54982.264203588566, 19, 'cardiac myofibril assembly', 'biological_process']
[

['GO:0008781', 82043.0, 82043.0, 19965.867073583355, 2, 'N-acylneuraminate cytidylyltransferase activity', 'molecular_function']
['GO:1903597', 38478.0, 38478, 0, 1, 'negative regulation of gap junction assembly', 'biological_process']
['GO:0023019', 114646.65, 114113.5, 68044.3930906991, 20, 'signal transduction involved in regulation of gene expression', 'biological_process']
['GO:0006397', 72003.17763157895, 46472.5, 60042.5090424052, 152, 'mRNA processing', 'biological_process']
['GO:0007616', 95342.5625, 80871.0, 69923.44848657017, 32, 'long-term memory', 'biological_process']
['GO:1902103', 68366.0, 68366, 0, 1, 'negative regulation of metaphase/anaphase transition of meiotic cell cycle', 'biological_process']
['GO:0000824', 100562.0, 100562, 0, 1, 'inositol tetrakisphosphate 3-kinase activity', 'molecular_function']
['GO:0008429', 76960.54545454546, 79001, 43181.96037551708, 11, 'phosphatidylethanolamine binding', 'molecular_function']
['GO:0008659', 73472.0, 73472, 0, 1, '(3R)-

['GO:0140206', 200000.0, 200000, 0, 1, 'dipeptide import across plasma membrane', 'biological_process']
['GO:0072160', 152475.5, 152475.5, 59411.818862074906, 2, 'nephron tubule epithelial cell differentiation', 'biological_process']
['GO:0071801', 57550.333333333336, 49528, 14313.667745666493, 3, 'regulation of podosome assembly', 'biological_process']
['GO:0052597', 88478.0, 88478, 0, 1, 'diamine oxidase activity', 'molecular_function']
['GO:0106003', 165195.0, 165195, 0, 1, 'amyloid-beta complex', 'cellular_component']
['GO:0001527', 90954.1, 84946.5, 72492.41209403306, 10, 'microfibril', 'cellular_component']
['GO:0070940', 81496.8, 66555, 64762.3904762633, 5, 'dephosphorylation of RNA polymerase II C-terminal domain', 'biological_process']
['GO:0034148', 100522.0, 100522, 0, 1, 'negative regulation of toll-like receptor 5 signaling pathway', 'biological_process']
['GO:0008020', 75187.93333333333, 43562, 62259.409144856705, 15, 'G protein-coupled photoreceptor activity', 'molecular

['GO:0016180', 62447.92307692308, 66044, 36871.48505159857, 13, 'snRNA processing', 'biological_process']
['GO:0035042', 86144.0, 86144, 0, 1, 'fertilization, exchange of chromosomal proteins', 'biological_process']
['GO:0010925', 73603.0, 73603, 0, 1, 'positive regulation of inositol-polyphosphate 5-phosphatase activity', 'biological_process']
['GO:0042732', 47655.5, 47655.5, 53913.356531568315, 2, 'D-xylose metabolic process', 'biological_process']
['GO:0071855', 26006.0, 26006, 0, 1, 'neuropeptide receptor binding', 'molecular_function']
['GO:2000552', 66691.5, 59566.0, 44769.82500524209, 4, 'negative regulation of T-helper 2 cell cytokine production', 'biological_process']
['GO:0044305', 105724.54166666667, 106925.0, 65872.37052650221, 24, 'calyx of Held', 'cellular_component']
['GO:0070062', 70792.7183908046, 56290.5, 54999.316931579284, 2163, 'extracellular exosome', 'cellular_component']
['GO:0032886', 120138.33333333333, 127499, 61470.4538640313, 9, 'regulation of microtubule-b

In [17]:
# CLOSEST Genes potentionally regulated by sites in window with respect to the genes body. k = 1 
def geneReadSites(needSortBSL, geneWindow, method='TSS'): # geneWindow = windows lists
    bsL = []           # Sort sites smallest to largest 
    for l in needSortBSL:
        bsL.append(sorted(l))
    
    chromosomeI = 0    # 0 == chromosome group 1          
    mappedEntrezG = [] # Output list of entrez ids 
    
    if method == 'TSS':
        entrezIDL = entrezIDLtss.copy()
    elif method == 'BODY':
        entrezIDL = entrezIDLbody.copy()
    elif method != 'TSS'and method != 'BODY':
        return 'method must be TSS or BODY'
    
    for bounds in geneWindow:
        lowB = bounds[0]
        upperB = bounds[1]
        geneIDS = entrezIDL[chromosomeI]
        sitesL = bsL[chromosomeI]
        
        for site in sitesL:
            for i in range(len(lowB)):
                if site < lowB[i]:
                    lowB = lowB[i:] # Getting rid of the lower geneWindows as it has been mapped. 
                    upperB = upperB[i:]
                    geneIDS = geneIDS[i:]
                    break
                if lowB[i] <= site and upperB[i] > site:
                    mappedEntrezG.append(geneIDS[i])
                    
        chromosomeI += 1   
    return mappedEntrezG

In [18]:
# Given a list of lists with go term and its odds-ratio,p-val return a list of just GO terms 
def getGOfromAnalysis(goAnalysis):
    goTermL = []
    for key in goAnalysis:
        goTermL.append(key)
    return goTermL

In [19]:
# Given original analysis and simulated analysis. Adds count to go term in original list if the random go term has a lower p-val and higher odds ratio 
def compareGOAnalysis(origAnalysis, counters):
    for k in counters.keys():
        if origAnalysis[k][2] != 'NA':
            if (counters[k][0] < origAnalysis[k][2]): #  (simAnalysis[k][0] > origAnalysis[k][0]) and 
                origAnalysis[k][5] += 1
        if origAnalysis[k][3] != 'NA':
            if (counters[k][1] < origAnalysis[k][3]): 
                origAnalysis[k][6] += 1
        if origAnalysis[k][4] != 'NA':
            if (counters[k][2] < origAnalysis[k][4]): 
                origAnalysis[k][7] += 1
    return origAnalysis 

In [20]:
# converts analysis to list format and divides by nSim to get refabs p value
def convertAnalysistoFormat(analysis, nSim):
    analist = []
    for k in analysis.keys():
        val = analysis[k]
        outval = val[:2]
        app = val[5:]
        pval = []
        for i in app:
            p = (i + 1) / (nSim + 1)
            pval.append(p)
        nas = val[2:5]
        for i in range(3):
            if nas[i] == 'NA':
                pval[i] = 'NA'
        outval += pval
        analist.append([k, outval])
    return analist

In [21]:
r = robjects.r

In [22]:
# Combines the three different p-vals together \ Format: pvals = [x,y,z]
def reFABScalc(pvals):    
    r['source']("SMLmetapADJ.R")
    return r("reFABSp(c(" + str(pvals)[1:-1] + "))")   

In [23]:
# Combines the S M & L, pvals for each GO term into one from the adaptiveEnrichmentAnalysis() function || adaptiveEnrichmentAnalysis(randsites, 100000, method='TSS')
# Replaces 1.0 with 0.9999999 & removes 'NA' for genes without s/m/l
def reFABSlistC(enrichmentA):
    r['source']("SMLmetapADJ.R")
    goTerms = []
    pvals = []
    for GO in enrichmentA:
        pvals.append([GO[1][1],GO[2][1],GO[3][1]])
        goTerms.append(GO[0])
    adjPval = []
    for l in pvals:
        newL = []
        for ind in range(len(l)):
            if type(l[ind]) != str:
                if l[ind] > 0.9999999:
                    newL.append(0.9999999)
                else:
                    newL.append(l[ind])
        adjPval.append(newL)
#     return adjPval
    
    stringreturn = "reFABSp(list("
    for ind in range(len(adjPval)):
        stringreturn  += "c("+ str(adjPval[ind])[1:-1] +"), "   
    output = []
    data = r(stringreturn[:-2] + "))") 
    for goID in range(len(goTerms)):
        output.append([goTerms[goID], data[goID]])
    return output



In [31]:
testing=adaptiveEnrichmentAnalysis(randsites, 10000, method='TSS')

In [44]:
combined = reFABSlistC(testing)

In [24]:
File = pd.read_csv('polyenrich/SmLgzFiles/Converted Data-hg38 Site/Large Data/largeDatasetConverted.csv')

In [25]:
chrlabel = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrMT','chrX','chrY']




In [26]:
File

Unnamed: 0.1,Unnamed: 0,Chromo,Start,End,Site
0,0,chr1,225474969,225475318,225475143
1,1,chr16,55465105,55465447,55465276
2,2,chr10,64041215,64041557,64041386
3,3,chr5,139262670,139262962,139262816
4,4,chr17,43760373,43760708,43760540
...,...,...,...,...,...
75630,75638,chr8,80890024,80890264,80890144
75631,75639,chr10,74891915,74892155,74892035
75632,75640,chr6,112148183,112148423,112148303
75633,75641,chr2,214326179,214326419,214326299


In [27]:
len(File[File.Chromo.isin(chrlabel)])

75635

In [28]:
chrList = list(File['Chromo'].values)
SiteLoc = list(File['Site'].values)

In [29]:
len(SiteLoc)

75635

In [30]:
FileData = []
for item in range(len(SiteLoc)):
    FileData.append([chrList[item], SiteLoc[item]])

In [31]:
len(FileData)

75635

In [32]:
FileSites = []
for x in range(25):
    FileSites.append([])

In [33]:
for item in range(len(FileData)):
    FileSites[chrlabel.index(FileData[item][0])].append(FileData[item][1])

        

In [34]:
c = 0 
for i in FileSites:
    c+=len(i)
c

75635

In [35]:
len(FileSites)

25

In [36]:
# Gets dinfo for GO terms. (AVG domain, Median domain length, Domain length stdev, # genes, goname, go namespace) || TSS only ATM 
def getgoINFO(data, window, type='TSS'):
    output = []
    pos = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','MT','X','Y']
    
    if type == 'TSS':
        windDF = addWindowTSS20(window)
    elif type == 'BODY':
        windDF = addWindowBODY20(window)
    else:
        raise InputError("Incorrect option")
    
    for item in data:
        goterm = item[0]
        add = []
        add.append(goterm)
        try:
            genes = go2gene[goterm]
        except KeyError:
            output.append(add)
            continue 
        
        geneDom = []
        for gene in genes:
            try:
                chromo = geneAnnotationDF.loc[geneAnnotationDF['entrezid'] == gene].chromosome.values[0]   
            except IndexError:
                continue
            
            try:
                geneDomain = int(windDF[pos.index(chromo)].loc[windDF[pos.index(chromo)]['entrezid'] == gene].domainLEN.values[0])
            except ValueError:
                continue
            
            geneDom.append(geneDomain)
            
        if len(geneDom) == 0:
            output.append(add)
            continue 
            
        domainAVG = sum(geneDom)/len(geneDom)
        
        medianValue = median(geneDom)
        if len(geneDom) > 1:
            standardDev = stdev(geneDom)
        else:
            standardDev = 0
        
        add.append(domainAVG)
        add.append(medianValue)  
        add.append(standardDev)  
        add.append(len(go2gene[goterm]))
        try:
            add.append(goinfo.nodes[goterm]['name'])
            add.append(goinfo.nodes[goterm]['namespace'])
        except KeyError:
            add.append('NA')
            add.append('NA')
            
        output.append(add)
        
    return output
        
            
    

In [37]:
# Compares original analysis with simulated analysis for nSim
def simulation(GOlist, geneL, nSim, nSites, origAnalysis, method, SMLcutoff, entrezDomains): # GOList is the new go list that we analyze through since we don't want whole GO list, geneL is windowDF
    sitesL = nRandSitesSim(nSites, nSim)
    outputAnalysis = origAnalysis # We will keep updating this dictionary and return when all sims are done 
    
    for sim in range(nSim):
        mapped = geneReadSites(sitesL[sim], geneL, method)
        counters = SMLcounterFAST(GOlist, mapped, SMLcutoff, entrezDomains)
        outputAnalysis = compareGOAnalysis(outputAnalysis, counters)
        
    return sorted(convertAnalysistoFormat(outputAnalysis, nSim), key = lambda x: x[1][1])

In [38]:
# First run that sets up for the simulation
def firstRun(userSitesL, window, method='TSS', simN=100): # userSites must be ordered by chromosome number (List of lists) # Default simN = 10000
    if method == 'TSS':
        geneL = addWindowTSS(window)
        windowDF = addWindowTSS20(window)
    elif method == 'BODY': 
        geneL = addWindowBODY(window)
        windowDF = addWindowBODY20(window)
    else:
        return 'Only TSS or BODY method allowed!'
    
    geneClassification = findSMLdomain(windowDF)
    
    mappedGenes = geneReadSites(userSitesL, geneL, method)
    
    numMapped = len(mappedGenes)
    numsites = 0
    for l in userSitesL:
        numsites += len(l)
    
    entrezDomains = getEntrezDomain(windowDF)
#     return mappedGenes, entrezDomains
    goAnalysis = conductAnalysisFIRST(mappedGenes, geneClassification, entrezDomains)
    newGOlist = getGOfromAnalysis(goAnalysis)
    
    # estimate num sites to sample: numSitesSamp
    nBSL = nRandSitesSim(numsites,5)
    nPrime2 = 0 # Total num genes mapped to figure out nPrime value eventually for estimation
    for BSL in nBSL:
        nPrime2 += len(geneReadSites(BSL, geneL, method))
    nPrime = int(nPrime2 / 5)
    numSitesSamp = int((numMapped * numsites) / nPrime) # Num mapped genes * num sites inputted divided by nPrime 
    
    return simulation(newGOlist, geneL, simN, numSitesSamp, goAnalysis, method, geneClassification, entrezDomains) # GeneL is list of genes with window bounds 

In [39]:
def inANOTB(a,b):
    a_set = set(a) 
    b_set = set(b) 
    return len(list(set(a_set) - set(b_set)))
    

In [40]:
# First run that sets up for the simulation
def adaptiveEnrichmentAnalysis(userSitesL, window, method='TSS'): # userSites must be ordered by chromosome number (List of lists) # Default simN = 10000
    if method == 'TSS':
        geneL = addWindowTSS(window)
        windowDF = addWindowTSS20(window)
    elif method == 'BODY': 
        geneL = addWindowBODY(window)
        windowDF = addWindowBODY20(window)
    else:
        return 'Only TSS or BODY method allowed!'
    
    geneClassification = findSMLdomain(windowDF)
    smallbound = geneClassification[0]
    largebound = geneClassification[1]
    
    mappedGenes = geneReadSites(userSitesL, geneL, method)

    entrezDomains = getEntrezDomain(windowDF)
    smallGenesMAPPED = []# in mapped
    mediumGenesMAPPED = []
    largeGenesMAPPED = []
    for entrez in mappedGenes:
        geneLEN = entrezDomains[entrez]
        if geneLEN < smallbound:
            smallGenesMAPPED.append(entrez)
        elif geneLEN > largebound:
            largeGenesMAPPED.append(entrez)
        else:
            mediumGenesMAPPED.append(entrez)
    
    smallGenesAll = []# in All
    mediumGenesAll = []
    largeGenesAll = []
    
    everygene = []
    for l in windowDF:
        everygene += list(l.entrezid.values)
    for entrez in everygene:
        geneLEN = entrezDomains[entrez]
        if geneLEN < smallbound:
            smallGenesAll.append(entrez)
        elif geneLEN > largebound:
            largeGenesAll.append(entrez)
        else:
            mediumGenesAll.append(entrez)
            
    goAssocGenes = getOntologyID(mappedGenes)
    go2pvals = []
    
    for go in goAssocGenes:
        associated2GO = go2gene[go]
        smallASSOCgo = common_member(smallGenesAll, associated2GO)
        mediumASSOCgo = common_member(mediumGenesAll, associated2GO)
        largeASSOCgo = common_member(largeGenesAll, associated2GO)
        
        if len(smallASSOCgo) > 0:
            A = len(common_member(smallASSOCgo, smallGenesMAPPED))
            B = len(smallASSOCgo) - A
            C = inANOTB(smallGenesMAPPED, smallASSOCgo)
            D = len(smallGenesAll) - A - B - C
            smallODDpval = stats.fisher_exact([[A, B], [C, D]], alternative='greater')
        else:
            smallODDpval = ('NA','NA')
        
        if len(mediumASSOCgo) > 0:
            E = len(common_member(mediumASSOCgo, mediumGenesMAPPED))
            F = len(mediumASSOCgo) - E
            G = inANOTB(mediumGenesMAPPED, mediumASSOCgo)
            H = len(mediumGenesAll) - E - F - G
            mediumODDpval = stats.fisher_exact([[E, F], [G, H]], alternative='greater')
        else:
            mediumODDpval = ('NA','NA')
        
        if len(largeASSOCgo) > 0: 
            I = len(common_member(largeASSOCgo, largeGenesMAPPED))
            J = len(largeASSOCgo) - I
            K = inANOTB(largeGenesMAPPED, largeASSOCgo)
            L = len(largeGenesAll) - I - J - K 
            largeODDpval = stats.fisher_exact([[I, J], [K, L]], alternative='greater')
        else:
            largeODDpval = ('NA','NA')

        go2pvals.append([go,smallODDpval, mediumODDpval, largeODDpval])

            
    return go2pvals
    

In [41]:
def adjP(dataSET):
    GOdataSet = dataSET
    goGrp = [[],[],[],[],[],[],[],[],[],[]]
    numAssGrp = [[1], [2], [3], [4], [5, 6], [7, 8], [9, 10, 11, 12], [13, 14, 15, 16, 17, 18], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 160, 161, 163, 164, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183, 185, 187, 188, 189, 190, 191, 193, 194, 196, 198, 200, 201, 202, 203, 204, 205, 206, 208, 210, 211, 212, 213, 214, 216, 217, 218, 219, 221, 225, 226, 229, 232, 234, 235, 236, 238, 239, 240, 242, 244, 249, 250, 251, 252, 254, 262, 263, 264, 265, 268, 272, 273, 278, 280, 282, 285, 290, 295, 297, 301, 307, 308, 310, 311, 312, 313, 315, 324, 329, 330, 334, 343, 344, 347, 354, 360, 362, 364, 366, 368, 371, 374, 375, 385, 387, 395, 397, 405, 409, 411, 413, 420, 424, 427, 431, 439, 441, 457, 458, 467, 474, 481, 483, 491, 496, 502, 504, 505, 524, 529, 537, 540, 547, 548, 552, 565, 578, 605, 610, 611, 667, 670, 689, 703, 706, 812, 816, 859, 927, 960, 973, 980, 1059, 1132, 1134, 1150, 1227, 1304, 1381, 1388, 1456, 1473, 1546, 1778, 1911, 1987, 2163, 2292, 3168, 3637, 4438, 4482, 5026, 5627, 9691]]
    numGO = [5184, 2895, 1743, 1241, 1551, 1018, 1219, 1028, 1020, 1275] # Number of GO terms in each group above.
    
    for go in GOdataSet:
        numAss = len(go2gene[go[0]])
        for i in range(len(numAssGrp)):
            if numAss in numAssGrp[i]:
                goGrp[i].append(go)
    
    pos = 0
    for GOdata in goGrp:
        fishersP = []
#         refabsP = []
        for l in GOdata:
            fishersP.append(l[1][1])
#             refabsP.append(l[1][2])
         
        if len(fishersP) < numGO[pos]:
            numAppend = numGO[pos] - len(fishersP)
            fishersP += [1] * numAppend
#             refabsP += [1] * numAppend
        pos += 1         
        
        reject, fishersPadj, alphacSidak, alphacBonf = padjust(fishersP, method='fdr_bh', is_sorted=False)
#         reject2, refabsPadj, alphacSidak2, alphacBonf2 = padjust(refabsP, method='fdr_bh', is_sorted=False)
        correctedfishers = []
#         correctedrefabs = []
        for i in range(len(GOdata)):
            correctedfishers.append(float(fishersPadj[i]))
#             cohttp://localhost:8890/notebooks/GOfunctionalAnalysis.ipynb#rrectedrefabs.append(float(refabsPadj[i]))
        
        for i in range(len(GOdata)):
            GOdata[i][1].append(correctedfishers[i])
#             GOdata[i][1].append(correctedrefabs[i])
    groupedGO = []
    for i in goGrp:
        for j in i:
            groupedGO.append(j)
    groupedGO = sorted(groupedGO, key = lambda x: x[1][1])
    refabsP = []
    for go in groupedGO:
        refabsP.append(go[1][2])
    reject2, refabsPadj, alphacSidak2, alphacBonf2 = padjust(refabsP, method='fdr_bh', is_sorted=False)
    
    for i in range(len(groupedGO)):
        groupedGO[i][1].append(refabsPadj[i])
    
    return groupedGO # go: [odds, fishersP, refabsP, correctedfishers, correctedrefabs]

In [27]:
# geneAnnotationDF = pd.read_csv('entrez_id/geneAnnotationsDF_Selected_entrezID.csv', sep=',', comment='#', low_memory=False, header=0, names=geneColID)
linSites = pd.read_excel('bindingSites2Test.xlsx', header=0)