In [1]:
# Execute the companion notebook in this kernel's namespace so that its definitions are available below.
# NOTE(review): names used later but never defined in the visible cells (e.g. Pool,
# adaptiveEnrichmentAnalysis, reFABSlistC, getgoINFO, go2gene) presumably come from it -- TODO confirm.
%run ontologyPackage/ontologySTATanalysis.ipynb

In [2]:
import pandas as pd 
import numpy as np
from numpy import random, array
import time 

import rpy2.robjects as robjects

import os
from goatools.obo_parser import GODag

pd.set_option('display.max_colwidth', None) # Never truncate cell contents when displaying frames (None replaces the older -1 setting)
pd.set_option('chained_assignment',None) # Silence pandas' chained-assignment (SettingWithCopy) warning

# Gene annotation table: header=0 discards the header row stored in the file and
# names= substitutes the column labels below; lines starting with '#' are skipped.
geneColID = ["chromosome","source","type","start","end","score","strand","phase","gene_symbol","gene_ensID","length","entrezid"]
geneAnnotationDF = pd.read_csv('entrez_id/geneAnnotationsDF_Selected_entrezID.csv', sep=',', comment='#', low_memory=False, header=0, names=geneColID)

# Chromosome table, read with the same conventions as the gene table.
chromosomeColID = ['chromosome','source','type','start','end','score','strand','phase']
chromosomesDF = pd.read_csv('chromosomesDF.csv', sep=',', comment='#', low_memory=False, header=0, names=chromosomeColID)

In [3]:
import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests as padjust
from statistics import median, stdev

In [4]:
# GO term information: download the ontology file once (skipped when it is already in the
# working directory), then parse it into a GODag that also keeps the 'relationship' attribute.
# The '!wget' line is an IPython shell escape, so this cell only runs inside IPython/Jupyter
# and needs wget plus network access on the first run.
if not os.path.exists('go-basic.obo'):
    !wget http://geneontology.org/ontology/go-basic.obo
goDATA = GODag('go-basic.obo', optional_attrs=['relationship'])

go-basic.obo: fmt(1.2) rel(2019-07-01) 47,413 GO Terms; optional_attrs(relationship)


In [5]:
# Order both tables, renumber their rows from 0 and discard the columns that are not used later.
# (A filter removing gene_symbol 'CTD-2207O23.3' used to sit here and is currently disabled.)
# NOTE: this cell overwrites its own inputs, so re-running it without reloading the CSVs
# fails because the dropped columns no longer exist.
geneAnnotationDF = (
    geneAnnotationDF
    .sort_values(by=['chromosome', 'start'])
    .reset_index()
    .drop(columns=['score', 'phase', 'index'])
)
chromosomesDF = (
    chromosomesDF
    .sort_values(by=['chromosome'])
    .reset_index()
    .drop(columns=['score', 'strand', 'phase', 'index'])
)

In [6]:
# Per-chromosome gene tables used by the TSS-based window functions.
# For genes on the '-' strand the 'start' and 'end' values are exchanged, so that 'start'
# always holds the transcription start site; each table is then sorted by that 'start'
# and renumbered from 0 (the window functions rely on the 0..n-1 row numbering).
u = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','MT','X','Y'] # Chromosome ids
geneDFL = []
for c in u:
    geneDF = geneAnnotationDF.loc[geneAnnotationDF['chromosome'] == c].copy()
    minusStrand = geneDF['strand'] == '-'
    # Vectorised swap of the two columns for all minus-strand rows at once.
    # .to_numpy() is required: assigning a DataFrame would re-align by column label and undo the swap.
    geneDF.loc[minusStrand, ['start', 'end']] = geneDF.loc[minusStrand, ['end', 'start']].to_numpy()
    geneDF = geneDF.sort_values(by=['start']).reset_index(drop=True)
    geneDFL.append(geneDF)

In [7]:
# Per-chromosome gene tables used by the gene-body window functions (stored in geneDFLBODY).
# Coordinates are kept as annotated (no strand swap); each table is sorted by 'start'
# and renumbered from 0.
u = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','MT','X','Y']
geneDFLBODY = []
for cc in u:
    onChromosome = geneAnnotationDF['chromosome'] == cc
    geneDFbod = (
        geneAnnotationDF.loc[onChromosome]
        .copy()
        .sort_values(by=['start'])
        .reset_index()
        .drop(columns=['index'])
    )
    geneDFLBODY.append(geneDFbod)

In [8]:
# Probability that a randomly chosen gene lies on each chromosome, ordered 1-22, MT, X, Y (chromosome 1 has the highest selection probability).
probabilityL = [0.08069467597786323, 0.07850260775517634, 0.06427388269957329, 0.06165457287524136, 0.05884231002379223, 0.05536363751707386, 0.05164908592141782, 0.0470440371698401, 0.044858119022639725, 0.043367989830914035, 0.043785860460049814, 0.04319875643982657, 0.037069107410936775, 0.034696265410969616, 0.033058580434273406, 0.029281523923645837, 0.026986378244674356, 0.026051531764496164, 0.018999829086634144, 0.02088839915494138, 0.015140187376094325, 0.016471877735809576, 5.370214538878023e-06, 0.05057780526426391, 0.017537607961181506]

In [9]:
# Random site generation.
# dl holds the chromosome labels as numeric strings ('23', '24', '25' stand for MT, X, Y);
# chrD maps each label to the [first, last] coordinate that may be drawn on that chromosome.
dl = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25'] # 23, 24, 25
chrD = {'1': [1, 248956422],'2': [1, 242193529],'3': [1, 198295559],'4': [1, 190214555],'5': [1, 181538259],'6': [1, 170805979],'7': [1, 159345973],'8': [1, 145138636],'9': [1, 138394717],'10': [1, 133797422],'11': [1, 135086622],'12': [1, 133275309],'13': [1, 114364328],'14': [1, 107043718],'15': [1, 101991189],'16': [1, 90338345],'17': [1, 83257441],'18': [1, 80373285],'19': [1, 58617616],'20': [1, 64444167],'21': [1, 46709983],'22': [1, 50818468],'23': [1, 16569],'24': [1, 156040895],'25': [2781480, 56887902]}
def nRandSites(data):
    """Draw random genomic sites and group them by chromosome.

    Parameters
    ----------
    data : sequence
        ``data[0]`` is the number of sites to draw, ``data[1]`` the seed passed to
        ``numpy.random.seed`` (a single argument so the function can be used with ``Pool.map``).

    Returns
    -------
    numpy.ndarray
        1-D object array with one entry per label in ``dl``; each entry is a NumPy array
        of that chromosome's site positions in ascending order.

    Notes
    -----
    Reseeds NumPy's global random generator. Chromosomes are chosen with the
    module-level weights ``probabilityL``; positions are uniform within ``chrD``.
    """
    nSite, rand = data[0], data[1]
    sitesbyC = [[] for _ in dl]

    random.seed(rand)

    for _ in range(nSite):
        chrN = random.choice(dl, p=probabilityL)
        first, last = chrD[chrN]
        sitesbyC[int(chrN) - 1].append(round(random.uniform(first, last)))

    # The per-chromosome arrays have different lengths. Wrapping such a ragged list with
    # array(...) raises ValueError on NumPy >= 1.24, so fill an object array explicitly.
    grouped = np.empty(len(sitesbyC), dtype=object)
    for idx, sites in enumerate(sitesbyC):
        grouped[idx] = array(sorted(sites))
    return grouped

def nRandSitesSim(nSite, nSim):
    """Run ``nSim`` independent random-site simulations of ``nSite`` sites each.

    Every simulation gets its own seed, drawn from NumPy's global generator with
    ``random.randint(100000)``, and is executed by ``nRandSites`` in a pool of 5 workers.

    Parameters
    ----------
    nSite : int
        Number of sites per simulation.
    nSim : int
        Number of simulations.

    Returns
    -------
    list
        The ``nRandSites`` results, in simulation order.
    """
    # One [nSite, seed] work item per simulation; seeds are drawn in simulation order.
    chunks = [[nSite, random.randint(100000)] for _ in range(nSim)]

    # NOTE(review): Pool is not imported in the visible cells; presumably it is
    # multiprocessing.Pool provided by the %run notebook -- TODO confirm.
    pool = Pool(5)
    try:
        result = pool.map(nRandSites, chunks)
    finally:
        # Always release the workers, also when a simulation raises.
        pool.close()
        pool.join()
    return result

In [10]:
def addWindowTSS(window):
    """Compute the window bounds around every gene TSS, chromosome by chromosome.

    Works on the global ``geneDFL`` (one DataFrame per chromosome, ordered 1-22, MT, X, Y,
    rows sorted by ``start``, which holds the TSS). For each gene:

    * ``lowB``  = max(TSS - window, halfway point to the previous gene's TSS); the first
      gene is limited by the chromosome's first coordinate instead.
    * ``upperB`` = min(TSS + window, halfway point to the next gene's TSS); the last
      gene is limited by the chromosome's last coordinate instead.

    Both are truncated to integers. ``geneDFL`` itself is not modified.

    Parameters
    ----------
    window : int or float
        Maximum distance from the TSS covered on either side.

    Returns
    -------
    list
        One ``[lowB, upperB]`` pair of integer NumPy arrays per chromosome.

    Notes
    -----
    Chromosomes with a single gene (both limits come from the chromosome ends) or with
    no genes (empty arrays) are handled; the previous row-by-row version raised on them.
    """
    chrD = {'1': [1, 248956422],'2': [1, 242193529],'3': [1, 198295559],'4': [1, 190214555],'5': [1, 181538259],'6': [1, 170805979],'7': [1, 159345973],'8': [1, 145138636],'9': [1, 138394717],'10': [1, 133797422],'11': [1, 135086622],'12': [1, 133275309],'13': [1, 114364328],'14': [1, 107043718],'15': [1, 101991189],'16': [1, 90338345],'17': [1, 83257441],'18': [1, 80373285],'19': [1, 58617616],'20': [1, 64444167],'21': [1, 46709983],'22': [1, 50818468],'23': [1, 16569],'24': [1, 156040895],'25': [2781480, 56887902]}
    outList = []
    for chrIDNUM, geneDF in enumerate(geneDFL, start=1): # ordered by chromosome 1- MT,X,Y
        chrStart, chrEnd = chrD[str(chrIDNUM)]
        tss = geneDF['start'].to_numpy(dtype=float)
        nGenes = tss.size

        # Default limits are the chromosome ends; inner genes are limited by the
        # halfway point to their neighbours.
        lowLimit = np.full(nGenes, float(chrStart))
        upLimit = np.full(nGenes, float(chrEnd))
        if nGenes > 1:
            halfGap = (tss[1:] - tss[:-1]) / 2
            lowLimit[1:] = tss[1:] - halfGap
            upLimit[:-1] = tss[:-1] + halfGap

        # Convert through pandas so that a non-finite value raises instead of being cast silently.
        lowB = pd.Series(np.maximum(tss - window, lowLimit)).astype(int).to_numpy()
        upperB = pd.Series(np.minimum(tss + window, upLimit)).astype(int).to_numpy()
        outList.append([lowB, upperB])

    return outList

In [11]:
def addWindowTSS20(window):
    """Return copies of the TSS gene tables with window columns added.

    Works on the global ``geneDFL`` (one DataFrame per chromosome, ordered 1-22, MT, X, Y,
    rows sorted by ``start``, which holds the TSS). Each returned copy gains four
    integer columns:

    * ``midBody``   = length / 2 + start
    * ``lowB``      = max(TSS - window, halfway point to the previous gene's TSS); the
      first gene is limited by the chromosome's first coordinate instead.
    * ``upperB``    = min(TSS + window, halfway point to the next gene's TSS); the last
      gene is limited by the chromosome's last coordinate instead.
    * ``domainLEN`` = upperB - lowB

    Parameters
    ----------
    window : int or float
        Maximum distance from the TSS covered on either side.

    Returns
    -------
    list of pandas.DataFrame
        One augmented copy per chromosome; ``geneDFL`` itself is not modified.

    Notes
    -----
    Chromosomes with a single gene or with no genes are handled; the previous
    row-by-row version raised on them.
    """
    chrD = {'1': [1, 248956422],'2': [1, 242193529],'3': [1, 198295559],'4': [1, 190214555],'5': [1, 181538259],'6': [1, 170805979],'7': [1, 159345973],'8': [1, 145138636],'9': [1, 138394717],'10': [1, 133797422],'11': [1, 135086622],'12': [1, 133275309],'13': [1, 114364328],'14': [1, 107043718],'15': [1, 101991189],'16': [1, 90338345],'17': [1, 83257441],'18': [1, 80373285],'19': [1, 58617616],'20': [1, 64444167],'21': [1, 46709983],'22': [1, 50818468],'23': [1, 16569],'24': [1, 156040895],'25': [2781480, 56887902]}
    geneWindowDFL = []
    for chrIDNUM, geneDF in enumerate(geneDFL, start=1): # ordered by chromosome 1- MT,X,Y
        chrStart, chrEnd = chrD[str(chrIDNUM)]
        dfG = geneDF.copy()
        tss = dfG['start'].to_numpy(dtype=float)
        nGenes = tss.size

        # Default limits are the chromosome ends; inner genes are limited by the
        # halfway point to their neighbours.
        lowLimit = np.full(nGenes, float(chrStart))
        upLimit = np.full(nGenes, float(chrEnd))
        if nGenes > 1:
            halfGap = (tss[1:] - tss[:-1]) / 2
            lowLimit[1:] = tss[1:] - halfGap
            upLimit[:-1] = tss[:-1] + halfGap

        # NOTE(review): for '-' strand rows geneDFL stores the TSS (the larger coordinate)
        # in 'start', so start + length/2 falls beyond the gene rather than in its
        # middle -- confirm whether that is intended.
        dfG['midBody'] = (dfG['length'] / 2 + dfG['start']).astype(int)
        dfG['lowB'] = np.maximum(tss - window, lowLimit)
        dfG['upperB'] = np.minimum(tss + window, upLimit)
        dfG['lowB'] = dfG['lowB'].astype(int)
        dfG['upperB'] = dfG['upperB'].astype(int)
        dfG['domainLEN'] = dfG['upperB'] - dfG['lowB']
        geneWindowDFL.append(dfG)

    return geneWindowDFL

In [12]:
def addWindowBODY(window):
    """Compute the window bounds around every gene body, chromosome by chromosome.

    Works on the global ``geneDFLBODY`` (one DataFrame per chromosome, ordered 1-22,
    MT, X, Y, rows sorted by ``start``). For each gene:

    * ``lowB``  = max(start - window, start - |start - previous gene's end| / 2); the
      first gene is limited by the chromosome's first coordinate instead.
    * ``upperB`` = min(end + window, end + (next gene's start - end) / 2); the last
      gene is limited by the chromosome's last coordinate instead.

    Both are truncated to integers. ``geneDFLBODY`` itself is not modified.

    Parameters
    ----------
    window : int or float
        Maximum distance from the gene body covered on either side.

    Returns
    -------
    list
        One ``[lowB, upperB]`` pair of integer NumPy arrays per chromosome.

    Notes
    -----
    Chromosomes with a single gene (both limits come from the chromosome ends) or with
    no genes (empty arrays) are handled; the previous row-by-row version raised on them.
    """
    chrD = {'1': [1, 248956422],'2': [1, 242193529],'3': [1, 198295559],'4': [1, 190214555],'5': [1, 181538259],'6': [1, 170805979],'7': [1, 159345973],'8': [1, 145138636],'9': [1, 138394717],'10': [1, 133797422],'11': [1, 135086622],'12': [1, 133275309],'13': [1, 114364328],'14': [1, 107043718],'15': [1, 101991189],'16': [1, 90338345],'17': [1, 83257441],'18': [1, 80373285],'19': [1, 58617616],'20': [1, 64444167],'21': [1, 46709983],'22': [1, 50818468],'23': [1, 16569],'24': [1, 156040895],'25': [2781480, 56887902]}
    outList = []
    for chrIDNUM, geneDF in enumerate(geneDFLBODY, start=1): # ordered by chromosome 1- MT,X,Y
        chrStart, chrEnd = chrD[str(chrIDNUM)]
        starts = geneDF['start'].to_numpy(dtype=float)
        ends = geneDF['end'].to_numpy(dtype=float)
        nGenes = starts.size

        # Default limits are the chromosome ends; inner genes are limited by the
        # halfway point between their body and the neighbouring gene.
        lowLimit = np.full(nGenes, float(chrStart))
        upLimit = np.full(nGenes, float(chrEnd))
        if nGenes > 1:
            lowLimit[1:] = starts[1:] - np.abs(starts[1:] - ends[:-1]) / 2
            # No abs() here (as in the original): when the next gene starts before this
            # one ends, the limit falls inside this gene's body.
            upLimit[:-1] = (starts[1:] - ends[:-1]) / 2 + ends[:-1]

        # Convert through pandas so that a non-finite value raises instead of being cast silently.
        lowB = pd.Series(np.maximum(starts - window, lowLimit)).astype(int).to_numpy()
        upperB = pd.Series(np.minimum(ends + window, upLimit)).astype(int).to_numpy()
        outList.append([lowB, upperB])

    return outList

In [13]:
def addWindowBODY20(window):
    """Return copies of the gene-body tables with window columns added.

    Works on the global ``geneDFLBODY`` (one DataFrame per chromosome, ordered 1-22,
    MT, X, Y, rows sorted by ``start``). Each returned copy gains four integer columns:

    * ``midBody``   = length / 2 + start
    * ``lowB``      = max(start - window, start - |start - previous gene's end| / 2); the
      first gene is limited by the chromosome's first coordinate instead.
    * ``upperB``    = min(end + window, end + (next gene's start - end) / 2); the last
      gene is limited by the chromosome's last coordinate instead.
    * ``domainLEN`` = upperB - lowB

    Parameters
    ----------
    window : int or float
        Maximum distance from the gene body covered on either side.

    Returns
    -------
    list of pandas.DataFrame
        One augmented copy per chromosome; ``geneDFLBODY`` itself is not modified.

    Notes
    -----
    Chromosomes with a single gene or with no genes are handled; the previous
    row-by-row version raised on them.
    """
    chrD = {'1': [1, 248956422],'2': [1, 242193529],'3': [1, 198295559],'4': [1, 190214555],'5': [1, 181538259],'6': [1, 170805979],'7': [1, 159345973],'8': [1, 145138636],'9': [1, 138394717],'10': [1, 133797422],'11': [1, 135086622],'12': [1, 133275309],'13': [1, 114364328],'14': [1, 107043718],'15': [1, 101991189],'16': [1, 90338345],'17': [1, 83257441],'18': [1, 80373285],'19': [1, 58617616],'20': [1, 64444167],'21': [1, 46709983],'22': [1, 50818468],'23': [1, 16569],'24': [1, 156040895],'25': [2781480, 56887902]}
    geneWindowDFL = []
    for chrIDNUM, geneDF in enumerate(geneDFLBODY, start=1): # ordered by chromosome 1- MT,X,Y
        chrStart, chrEnd = chrD[str(chrIDNUM)]
        dfG = geneDF.copy()
        starts = dfG['start'].to_numpy(dtype=float)
        ends = dfG['end'].to_numpy(dtype=float)
        nGenes = starts.size

        # Default limits are the chromosome ends; inner genes are limited by the
        # halfway point between their body and the neighbouring gene.
        lowLimit = np.full(nGenes, float(chrStart))
        upLimit = np.full(nGenes, float(chrEnd))
        if nGenes > 1:
            lowLimit[1:] = starts[1:] - np.abs(starts[1:] - ends[:-1]) / 2
            # No abs() here (as in the original): when the next gene starts before this
            # one ends, the limit falls inside this gene's body.
            upLimit[:-1] = (starts[1:] - ends[:-1]) / 2 + ends[:-1]

        dfG['midBody'] = (dfG['length'] / 2 + dfG['start']).astype(int) # Middle of gene body
        dfG['lowB'] = np.maximum(starts - window, lowLimit)
        dfG['upperB'] = np.minimum(ends + window, upLimit)
        dfG['lowB'] = dfG['lowB'].astype(int)
        dfG['upperB'] = dfG['upperB'].astype(int)
        dfG['domainLEN'] = dfG['upperB'] - dfG['lowB']
        geneWindowDFL.append(dfG)

    return geneWindowDFL

In [14]:
def findSMLdomain(windowDF):
    """Return the two cut-off lengths that split all domain lengths into thirds.

    Parameters
    ----------
    windowDF : iterable of DataFrame
        Tables carrying a ``domainLEN`` column (as returned by the ``addWindow*20`` functions).

    Returns
    -------
    list
        ``[small, medium]``: the largest length in the lowest third and the largest
        length in the middle third of all pooled, sorted ``domainLEN`` values.

    Raises
    ------
    IndexError
        When fewer than three lengths are supplied (the thirds are empty).
    """
    pooled = sorted(length for frame in windowDF for length in frame.domainLEN.values)
    third = len(pooled) // 3
    lowestThird = pooled[:third]
    middleThird = pooled[third:third * 2]
    return [lowestThird[-1], middleThird[-1]]

In [15]:
# Entrez ids of the TSS gene tables: one array per chromosome, in the row order of geneDFL.
entrezIDLtss = []
for geneDF in geneDFL:
    entrezIDLtss += [geneDF['entrezid'].values]

In [16]:
# Entrez ids of the gene-body tables: one array per chromosome, in the row order of geneDFLBODY.
entrezIDLbody = []
for geneDF in geneDFLBODY:
    entrezIDLbody += [geneDF['entrezid'].values]

In [43]:
# Exploratory run (execution count 43, i.e. executed out of document order).
# NOTE(review): adaptiveEnrichmentAnalysis and FileSites are not defined in the visible
# cells; presumably they come from the %run notebook or a later cell -- TODO confirm.
testmappedFile = adaptiveEnrichmentAnalysis(FileSites, 10000, method='TSS')

In [47]:
# Size check on the enrichment result (the recorded output is 15126).
len(testmappedFile)

15126

In [48]:
# NOTE(review): reFABSlistC is not defined in the visible cells -- TODO confirm its source.
adjMappedFile = reFABSlistC(testmappedFile)
# NOTE(review): 'c' is not used again in the visible cells; possibly leftover scratch state.
c=1
# Completion marker for the long-running call above.
print('done')

done


In [49]:
# Size check: same number of entries as testmappedFile (the recorded output is 15126).
len(adjMappedFile)

15126

In [50]:
# Print every entry of adjMappedFile; per the recorded output each entry is a
# [GO term id, float] pair (presumably an adjusted p-value -- TODO confirm).
# This dumps all entries (15126 in the recorded run) into the notebook output.
for l in adjMappedFile:
    print(l)

['GO:0051284', 0.9999998000000101]
['GO:0017162', 0.2726565042016713]
['GO:0003873', 0.9999998000000101]
['GO:0099068', 0.5101152548959613]
['GO:0070838', 0.9999999]
['GO:1905451', 0.6334502061744569]
['GO:0048807', 0.9999998000000101]
['GO:0004473', 0.5802315580029597]
['GO:0018158', 0.33384979350257027]
['GO:0036257', 0.39456644144419684]
['GO:0001946', 0.9999998000000101]
['GO:0016846', 0.32031590990435577]
['GO:0005899', 0.07527022248675141]
['GO:0060644', 0.04046279031563656]
['GO:0060082', 0.9999999]
['GO:0001100', 0.9999999]
['GO:0007029', 0.9999998000000101]
['GO:0004956', 0.9999998000000101]
['GO:0044795', 0.39456644144419684]
['GO:0010742', 0.9999998000000101]
['GO:0034166', 0.9999999]
['GO:1905206', 0.9999999]
['GO:1903507', 0.5658343434152608]
['GO:0045322', 0.6334502061744569]
['GO:0006526', 0.9999998000000101]
['GO:1990071', 0.6334502061744569]
['GO:0044603', 0.9999999]
['GO:0006333', 0.15139090594028362]
['GO:1901964', 0.9999999]
['GO:0090107', 0.6334502061744569]
['GO:0

['GO:0047718', 0.5802315580029597]
['GO:0052746', 0.9999999]
['GO:0030546', 0.5101152548959613]
['GO:1901016', 0.31403669467145556]
['GO:0006915', 0.042592291307404224]
['GO:0034440', 0.9999999]
['GO:0006543', 0.9999998000000101]
['GO:0010040', 0.46002910715855216]
['GO:0072285', 0.9999999]
['GO:0043416', 0.39456644144419684]
['GO:0005298', 0.9999999]
['GO:1901216', 0.0015870217695059662]
['GO:0002162', 0.27645685428277444]
['GO:1905322', 0.9999999]
['GO:0060611', 0.9999999]
['GO:0001738', 0.9999999]
['GO:0010669', 0.08017040748241694]
['GO:0021773', 0.14022334389157606]
['GO:0004982', 0.33384979350257027]
['GO:0008582', 0.6400214977131269]
['GO:0048007', 0.9999998000000101]
['GO:1902953', 0.46002910715855216]
['GO:0016409', 0.2879632331395145]
['GO:0001872', 0.9999999]
['GO:0050786', 0.6270624261975852]
['GO:0010470', 0.438725516407805]
['GO:0036498', 0.7298215124695785]
['GO:0031021', 0.7716739113310207]
['GO:0000242', 0.33266242996208367]
['GO:0021534', 0.9999999]
['GO:0060028', 0.8

['GO:1990380', 0.16448357909506747]
['GO:0021860', 0.9999998000000101]
['GO:0072557', 0.9999999]
['GO:0034260', 0.7474076542994345]
['GO:0043610', 0.9999999]
['GO:0042500', 0.6547510568217676]
['GO:0005227', 0.16786387521365093]
['GO:0006796', 0.8173167584947394]
['GO:0015833', 0.11900629755412342]
['GO:0097120', 0.438725516407805]
['GO:0048592', 0.9999999]
['GO:0050896', 0.048604360052839794]
['GO:0032347', 0.39456644144419684]
['GO:0044245', 0.41242111748444166]
['GO:0030177', 0.064908913911158]
['GO:0007196', 0.7061727522082444]
['GO:0051124', 0.14300005724974105]
['GO:0003339', 0.39456644144419684]
['GO:0051265', 0.9999999]
['GO:0015252', 0.9999998000000101]
['GO:0071560', 0.0009068170471377432]
['GO:0048312', 0.33666866090254194]
['GO:0035313', 0.9999998000000101]
['GO:0042761', 0.6334502061744569]
['GO:0017053', 0.1850009407387778]
['GO:0000056', 0.9999998000000101]
['GO:0070193', 0.39456644144419684]
['GO:0043495', 0.7205106663030603]
['GO:0070129', 0.8116725480850819]
['GO:2000

['GO:0005509', 0.00022820796909466182]
['GO:0050861', 0.438725516407805]
['GO:0042063', 0.9999998000000101]
['GO:0031644', 0.9999999]
['GO:0060662', 0.07527022248675141]
['GO:0097267', 0.04407336142579305]
['GO:0005391', 0.2821164577883831]
['GO:0032489', 0.11900629755412342]
['GO:0071848', 0.9999999]
['GO:0043666', 0.5561437935592124]
['GO:0006814', 0.010391499646072114]
['GO:0055131', 0.46002910715855216]
['GO:0007379', 0.4096275175349541]
['GO:1904016', 0.9999998000000101]
['GO:0090383', 0.5011896775740428]
['GO:0005198', 0.09997126791244396]
['GO:1901509', 0.6400214977131269]
['GO:0033686', 0.2870714653365012]
['GO:0003829', 0.5802315580029597]
['GO:0048368', 0.33666866090254194]
['GO:0070984', 0.9999999]
['GO:0045717', 0.004090218356492517]
['GO:0120069', 0.9999999]
['GO:1904009', 0.9999999]
['GO:0009115', 0.5101152548959613]
['GO:0043299', 0.9999999]
['GO:0016198', 0.9999999]
['GO:0098894', 0.9999999]
['GO:0097542', 0.38040235980951764]
['GO:0007099', 0.7606897742954726]
['GO:003

['GO:0005863', 0.5802315580029597]
['GO:0042007', 0.9999998000000101]
['GO:0015193', 0.471332492300747]
['GO:0034719', 0.09987347286403789]
['GO:2000675', 0.6400429941256136]
['GO:0042585', 0.9999998000000101]
['GO:0032902', 0.9999999]
['GO:0045795', 0.9999999]
['GO:0005242', 0.36667703181593186]
['GO:0001660', 0.9999999]
['GO:1904862', 0.4986799559613665]
['GO:0061885', 0.6334502061744569]
['GO:2000664', 0.4096275175349541]
['GO:0042543', 0.39456644144419684]
['GO:0003058', 0.9999999]
['GO:1903828', 0.15564904532327395]
['GO:0050689', 0.9999998000000101]
['GO:1900005', 0.9999999]
['GO:0045932', 0.39456644144419684]
['GO:0043207', 0.5101152548959613]
['GO:1903568', 0.9999999]
['GO:0042219', 0.9999999]
['GO:0046068', 0.9999999]
['GO:0046947', 0.7716739113310207]
['GO:0098696', 0.2726565042016713]
['GO:0015793', 0.6400429941256136]
['GO:0033365', 0.9999998000000101]
['GO:0000835', 0.9999999]
['GO:0032971', 0.5101152548959613]
['GO:0072675', 0.22351680892762088]
['GO:0097187', 0.999999800

['GO:0034728', 0.9999998000000101]
['GO:0009952', 0.008695642220369627]
['GO:0043947', 0.9999999]
['GO:0042742', 0.0702085179710345]
['GO:0005858', 0.7716739113310207]
['GO:0070245', 0.46002910715855216]
['GO:0075525', 0.320393233405432]
['GO:0086063', 0.9999999]
['GO:0047374', 0.9999998000000101]
['GO:0019871', 0.9999999]
['GO:1990575', 0.9999999]
['GO:0055118', 0.9999998000000101]
['GO:0034647', 0.7716739113310207]
['GO:0016018', 0.13612178760323465]
['GO:0034669', 0.5101152548959613]
['GO:0032418', 0.9999998000000101]
['GO:0045298', 0.6334502061744569]
['GO:0090527', 0.11900629755412342]
['GO:1903070', 0.7600129365143511]
['GO:0043202', 0.2149134034700795]
['GO:0017018', 0.9999999]
['GO:0043132', 0.9999999]
['GO:0006525', 0.2870714653365012]
['GO:0045792', 0.6270624261975852]
['GO:1904139', 0.5101152548959613]
['GO:0048850', 0.9999999]
['GO:0008184', 0.265172882344801]
['GO:1902239', 0.9999999]
['GO:0009755', 0.2394047918220234]
['GO:0022027', 0.8704154778912997]
['GO:0071045', 0.99

['GO:0010965', 0.39456644144419684]
['GO:1905538', 0.9999998000000101]
['GO:0016428', 0.9999998000000101]
['GO:0050860', 0.21592810294086584]
['GO:2000179', 0.01733920058948745]
['GO:0006688', 0.14938660619690136]
['GO:0050428', 0.2870714653365012]
['GO:0072572', 0.9999999]
['GO:0044805', 0.6736872831676768]
['GO:0034365', 0.566032890745257]
['GO:0004053', 0.9999999]
['GO:0042797', 0.8704154778912997]
['GO:0010021', 0.9999999]
['GO:0030134', 0.33560384240295904]
['GO:0060929', 0.9999999]
['GO:0000278', 0.0699439550021346]
['GO:0032456', 0.9370766944791062]
['GO:0035970', 0.06306655356064168]
['GO:0005046', 0.2870714653365012]
['GO:2000057', 0.9999999]
['GO:0050658', 0.913616984558016]
['GO:0060492', 0.9999999]
['GO:0001894', 0.15885138557689069]
['GO:0045026', 0.33384979350257027]
['GO:0036510', 0.9999998000000101]
['GO:0031826', 0.2602175732775716]
['GO:0007224', 0.1602027704417094]
['GO:0021503', 0.9999999]
['GO:1902388', 0.6919476667895421]
['GO:0007440', 0.9999999]
['GO:0014032', 0

['GO:0048278', 0.24204481554011947]
['GO:0009396', 0.9999998000000101]
['GO:0034657', 0.9999998000000101]
['GO:0036149', 0.2870714653365012]
['GO:0001910', 0.9999999]
['GO:0048753', 0.9999999]
['GO:0031573', 0.16002866488999845]
['GO:0047066', 0.39456644144419684]
['GO:0046978', 0.22162105172320404]
['GO:0003219', 0.33666866090254194]
['GO:0102486', 0.9999999]
['GO:0071951', 0.9999999]
['GO:0030520', 0.05009118394396225]
['GO:0070976', 0.39456644144419684]
['GO:0007016', 0.320393233405432]
['GO:0044262', 0.9999998000000101]
['GO:0005814', 0.05049237838029505]
['GO:1903375', 0.265172882344801]
['GO:0061178', 0.08022932279993798]
['GO:0072034', 0.7600129365143511]
['GO:0070383', 0.16002866488999845]
['GO:0071885', 0.5101152548959613]
['GO:1905460', 0.9999998000000101]
['GO:1902884', 0.5101152548959613]
['GO:0007566', 0.2460136533514431]
['GO:0045178', 0.33266242996208367]
['GO:0031379', 0.9999999]
['GO:0060481', 0.39456644144419684]
['GO:0051355', 0.9999999]
['GO:2000348', 0.9999999]
['G

['GO:0006865', 0.42210658207550544]
['GO:1904058', 0.8704154778912997]
['GO:0097421', 0.3689314276040034]
['GO:0008465', 0.9999999]
['GO:0008336', 0.9999999]
['GO:0007418', 0.2602175732775716]
['GO:0006757', 0.39456644144419684]
['GO:1904440', 0.9999999]
['GO:0032817', 0.9999999]
['GO:0010836', 0.7716739113310207]
['GO:0009582', 0.32031590990435577]
['GO:1903598', 0.7600129365143511]
['GO:0106035', 0.8116725480850819]
['GO:0015742', 0.7716739113310207]
['GO:0048297', 0.9999999]
['GO:0050929', 0.9999999]
['GO:0045822', 0.6334502061744569]
['GO:0051100', 0.9999999]
['GO:0038003', 0.9136655484551223]
['GO:0070573', 0.265172882344801]
['GO:0048101', 0.5802315580029597]
['GO:0006614', 0.9380932042930699]
['GO:0000724', 0.29040040949483503]
['GO:1903580', 0.9999999]
['GO:0006700', 0.320393233405432]
['GO:0031839', 0.9999999]
['GO:0010866', 0.6334502061744569]
['GO:0008330', 0.15568267671393685]
['GO:0060685', 0.9999999]
['GO:0019550', 0.5101152548959613]
['GO:0021889', 0.40003582950780603]
[

['GO:0007076', 0.5612329407703087]
['GO:0035035', 0.4645528913668532]
['GO:0004021', 0.18383695458207186]


In [55]:
# NOTE(review): getgoINFO is not defined in the visible cells -- TODO confirm its source.
# Note that its keyword argument is named 'type' here, whereas
# adaptiveEnrichmentAnalysis above is called with 'method'.
analysis = getgoINFO(adjMappedFile, 10000, type='TSS')

In [56]:
# Size check on the annotated result (the recorded output is 15126).
len(analysis)

15126

In [58]:
# Spot check of one GO term in go2gene (not defined in the visible cells); the recorded
# output is a list of three integers, presumably Entrez gene ids -- TODO confirm.
go2gene['GO:0051284']

[2281, 6262, 205251]

In [57]:
# Print every entry of analysis; per the recorded output each entry is a list starting
# with a GO term id and ending with the term's name and namespace.
# This dumps all entries (15126 in the recorded run) into the notebook output.
for l in analysis:
    print(l)

['GO:0051284', 15740.0, 17157, 5117.803141974103, 3, 'positive regulation of sequestering of calcium ion', 'biological_process']
['GO:0017162', 16767.333333333332, 20000, 5886.371781496646, 9, 'aryl hydrocarbon receptor binding', 'molecular_function']
['GO:0003873', 12975.5, 10895.0, 4697.54208780152, 4, '6-phosphofructo-2-kinase activity', 'molecular_function']
['GO:0099068', 20000.0, 20000.0, 0.0, 2, 'postsynapse assembly', 'biological_process']
['GO:0070838', 20000.0, 20000.0, 0.0, 2, 'divalent metal ion transport', 'biological_process']
['GO:1905451', 18094.8, 20000, 4260.156710732599, 5, 'positive regulation of Fc-gamma receptor signaling pathway involved in phagocytosis', 'biological_process']
['GO:0048807', 15303.0, 15422, 4757.61631492074, 3, 'female genitalia morphogenesis', 'biological_process']
['GO:0004473', 20000.0, 20000, 0.0, 3, 'malate dehydrogenase (decarboxylating) (NADP+) activity', 'molecular_function']
['GO:0018158', 13803.5, 14930.5, 6941.7621442013315, 4, 'protei

['GO:0061149', 20000.0, 20000, 0, 1, 'BMP signaling pathway involved in ureter morphogenesis', 'biological_process']
['GO:0006591', 19357.0, 19357, 0, 1, 'ornithine metabolic process', 'biological_process']
['GO:0000460', 14083.0, 10323, 5191.922797320211, 7, 'maturation of 5.8S rRNA', 'biological_process']
['GO:0000254', 16950.0, 20000, 5282.754963085075, 3, 'C-4 methylsterol oxidase activity', 'molecular_function']
['GO:0044530', 20000.0, 20000, 0.0, 3, 'supraspliceosomal complex', 'cellular_component']
['GO:0005545', 19733.866666666665, 20000, 1030.7299678706672, 15, '1-phosphatidylinositol binding', 'molecular_function']
['GO:0062029', 10388.5, 10388.5, 501.3387078612622, 2, 'positive regulation of stress granule assembly', 'biological_process']
['GO:0060017', 16587.285714285714, 20000, 6355.460243346811, 7, 'parathyroid gland development', 'biological_process']
['GO:0035064', 16950.066666666666, 20000.0, 4317.193202535948, 60, 'methylated histone binding', 'molecular_function']
['

['GO:0035295', 20000.0, 20000.0, 0.0, 2, 'tube development', 'biological_process']
['GO:1990459', 17407.125, 20000.0, 4321.733462975245, 8, 'transferrin receptor binding', 'molecular_function']
['GO:0046822', 20000.0, 20000.0, 0.0, 6, 'regulation of nucleocytoplasmic transport', 'biological_process']
['GO:0061052', 18507.3, 20000.0, 2522.4016796167366, 13, 'negative regulation of cell growth involved in cardiac muscle cell development', 'biological_process']
['GO:0060399', 14631.0, 13597, 4933.940717114465, 3, 'positive regulation of growth hormone receptor signaling pathway', 'biological_process']
['GO:0030395', 15841.0, 17445, 3827.0050953715754, 3, 'lactose binding', 'molecular_function']
['GO:0000033', 9906.666666666666, 10054, 256.0553325617987, 3, 'alpha-1,3-mannosyltransferase activity', 'molecular_function']
['GO:0051427', 17950.8, 19168, 2449.8887117581485, 5, 'hormone receptor binding', 'molecular_function']
['GO:0006626', 15652.833333333334, 17238.5, 4828.479592807753, 30, '

['GO:0060591', 16281.6, 20000, 5103.645491215078, 5, 'chondroblast differentiation', 'biological_process']
['GO:0043517', 16941.75, 20000.0, 4699.37297800075, 12, 'positive regulation of DNA damage response, signal transduction by p53 class mediator', 'biological_process']
['GO:0070836', 18905.0, 20000.0, 2190.0, 4, 'caveola assembly', 'biological_process']
['GO:0039564', 17240.0, 17240, 0, 1, 'suppression by virus of host STAT2 activity', 'biological_process']
['GO:0009749', 15842.558823529413, 19678.5, 5066.682633334745, 69, 'response to glucose', 'biological_process']
['GO:0002081', 19021.25, 20000.0, 1957.5, 4, 'outer acrosomal membrane', 'cellular_component']
['GO:0007026', 17556.04, 20000, 4021.446687035235, 25, 'negative regulation of microtubule depolymerization', 'biological_process']
['GO:0006253', 18053.0, 18053, 0, 1, 'dCTP catabolic process', 'biological_process']
['GO:0035633', 20000.0, 20000.0, 0.0, 2, 'maintenance of permeability of blood-brain barrier', 'biological_pro

['GO:0038002', 20000.0, 20000, 0, 1, 'endocrine signaling', 'biological_process']
['GO:0046886', 20000.0, 20000, 0.0, 3, 'positive regulation of hormone biosynthetic process', 'biological_process']
['GO:0055087', 12254.333333333334, 10033, 6908.253493708329, 3, 'Ski complex', 'cellular_component']
['GO:0001822', 16141.118279569893, 20000, 5117.605165227406, 93, 'kidney development', 'biological_process']
['GO:0051971', 17771.166666666668, 20000.0, 5459.5043883732405, 6, 'positive regulation of transmission of nerve impulse', 'biological_process']
['GO:0033025', 20000.0, 20000, 0, 1, 'regulation of mast cell apoptotic process', 'biological_process']
['GO:0009791', 16609.485714285714, 20000.0, 4719.444900246175, 70, 'post-embryonic development', 'biological_process']
['GO:0005606', 20000.0, 20000, 0.0, 3, 'laminin-1 complex', 'cellular_component']
['GO:0004983', 15693.777777777777, 20000, 5107.463675293682, 9, 'neuropeptide Y receptor activity', 'molecular_function']
['GO:0006702', 16599

['GO:0001573', 15261.0, 15261.0, 6701.958072086098, 2, 'ganglioside metabolic process', 'biological_process']
['GO:0031110', 16992.30769230769, 20000, 4698.465997617652, 13, 'regulation of microtubule polymerization or depolymerization', 'biological_process']
['GO:0006725', 15075.0, 15075.0, 6965.001794687493, 2, 'cellular aromatic compound metabolic process', 'biological_process']
['GO:0006185', 14945.0, 14945, 0, 1, 'dGDP biosynthetic process', 'biological_process']
['GO:0030894', 13421.0, 13421.0, 9304.111026852592, 2, 'replisome', 'cellular_component']
['GO:1990258', 18053.5, 18053.5, 2752.7666991592296, 2, 'histone glutamine methylation', 'biological_process']
['GO:0090502', 14124.898305084746, 15673, 5561.4064200335315, 64, 'RNA phosphodiester bond hydrolysis, endonucleolytic', 'biological_process']
['GO:1990144', 14783.0, 14783.0, 7377.952154900437, 2, 'intrinsic apoptotic signaling pathway in response to hypoxia', 'biological_process']
['GO:1903292', 15535.0, 16545, 5046.382367

['GO:0043179', 20000.0, 20000, 0, 1, 'rhythmic excitation', 'biological_process']
['GO:0051349', 20000.0, 20000.0, 0.0, 2, 'positive regulation of lyase activity', 'biological_process']
['GO:2000427', 13767.0, 14714.5, 6157.981747293507, 6, 'positive regulation of apoptotic cell clearance', 'biological_process']
['GO:0052314', 17634.0, 17634.0, 3346.029288574743, 2, 'phytoalexin metabolic process', 'biological_process']
['GO:0003374', 17504.25, 20000.0, 4991.5, 5, 'dynamin family protein polymerization involved in mitochondrial fission', 'biological_process']
['GO:0030259', 17857.416666666668, 20000.0, 3524.142873440894, 12, 'lipid glycosylation', 'biological_process']
['GO:0008312', 17345.14285714286, 20000, 7024.091766403479, 7, '7S RNA binding', 'molecular_function']
['GO:0005309', 10455.0, 10455, 0, 1, 'creatine:sodium symporter activity', 'molecular_function']
['GO:0004691', 19938.0, 20000.0, 177.260636728331, 10, 'cAMP-dependent protein kinase activity', 'molecular_function']
['G

['GO:0072560', 20000.0, 20000.0, 0.0, 2, 'type B pancreatic cell maturation', 'biological_process']
['GO:0010818', 15843.25, 17834.5, 5227.136572732723, 12, 'T cell chemotaxis', 'biological_process']
['GO:0003707', 17191.46153846154, 20000.0, 4368.044025228551, 52, 'steroid hormone receptor activity', 'molecular_function']
['GO:0008142', 18868.4, 20000, 2530.334523338762, 5, 'oxysterol binding', 'molecular_function']
['GO:1990034', 20000.0, 20000.0, 0.0, 4, 'calcium ion export across plasma membrane', 'biological_process']
['GO:0003190', 20000.0, 20000.0, 0.0, 4, 'atrioventricular valve formation', 'biological_process']
['GO:0002760', 20000.0, 20000, 0, 1, 'positive regulation of antimicrobial humoral response', 'biological_process']
['GO:0030274', 16405.571428571428, 20000, 5074.832143599854, 7, 'LIM domain binding', 'molecular_function']
['GO:0000706', 15032.5, 15032.5, 7025.10587108835, 2, 'meiotic DNA double-strand break processing', 'biological_process']
['GO:0032486', 18972.9, 20

['GO:0031416', 19119.5, 19119.5, 1245.2150416695101, 2, 'NatB complex', 'cellular_component']
['GO:0042643', 20000.0, 20000.0, 0.0, 2, 'actomyosin, actin portion', 'cellular_component']
['GO:0098942', 20000.0, 20000.0, 0.0, 2, 'retrograde trans-synaptic signaling by trans-synaptic protein complex', 'biological_process']
['GO:0055096', 20000.0, 20000, 0.0, 3, 'low-density lipoprotein particle mediated signaling', 'biological_process']
['GO:0051598', 12399.666666666666, 10090, 6748.732498279461, 3, 'meiotic recombination checkpoint', 'biological_process']
['GO:0086001', 20000.0, 20000, 0, 1, 'cardiac muscle cell action potential', 'biological_process']
['GO:0042587', 14355.714285714286, 17718, 6862.387478477291, 7, 'glycogen granule', 'cellular_component']
['GO:0006336', 11848.923076923076, 10494.5, 4968.262174427407, 26, 'DNA replication-independent nucleosome assembly', 'biological_process']
['GO:0060775', 20000.0, 20000.0, 0.0, 2, 'planar cell polarity pathway involved in gastrula med

['GO:0050779', 12733.5, 12733.5, 10276.382850984095, 2, 'RNA destabilization', 'biological_process']
['GO:0001502', 18152.55, 20000.0, 3262.692373437038, 20, 'cartilage condensation', 'biological_process']
['GO:0003099', 20000.0, 20000, 0, 1, 'positive regulation of the force of heart contraction by chemical signal', 'biological_process']
['GO:0060366', 20000.0, 20000, 0, 1, 'lambdoid suture morphogenesis', 'biological_process']
['GO:0055129', 15668.0, 17526.5, 5165.69991385485, 6, 'L-proline biosynthetic process', 'biological_process']
['GO:0070233', 17475.625, 20000.0, 3782.777095411707, 8, 'negative regulation of T cell apoptotic process', 'biological_process']
['GO:0032957', 16854.0, 20000, 5449.031840611688, 3, 'inositol trisphosphate metabolic process', 'biological_process']
['GO:0022414', 18571.85714285714, 20000, 3778.5108366732447, 7, 'reproductive process', 'biological_process']
['GO:0010652', 20000.0, 20000.0, 0.0, 2, 'positive regulation of cell communication by chemical co

In [17]:
# CLOSEST Genes potentionally regulated by sites in window with respect to the genes body. k = 1 
def geneReadSites(needSortBSL, geneWindow, method='TSS'): # geneWindow = windows lists
    """Map binding sites to the entrez ids of the gene windows that contain them.

    Parameters
    ----------
    needSortBSL : list of lists
        Site positions, one inner list per chromosome group (any order). Each
        inner list is sorted ascending into a fresh copy; the input is not mutated.
    geneWindow : list
        One entry per chromosome group; ``entry[0]`` holds the window lower bounds
        and ``entry[1]`` the upper bounds, index-aligned with the entrez ids.
    method : str
        ``'TSS'`` reads ids from the module-level ``entrezIDLtss``, ``'BODY'``
        from ``entrezIDLbody``.

    Returns
    -------
    list
        One entrez id per (site, window) pair with ``lower <= site < upper``.
        For any other ``method`` the string ``'method must be TSS or BODY'`` is
        returned (no exception is raised).
    """
    sortedSites = [sorted(chromSites) for chromSites in needSortBSL]
    mappedEntrezG = []  # Output list of entrez ids

    if method == 'TSS':
        entrezIDL = entrezIDLtss.copy()
    elif method == 'BODY':
        entrezIDL = entrezIDLbody.copy()
    else:
        return 'method must be TSS or BODY'

    for chromosomeI, bounds in enumerate(geneWindow):
        lows, highs = bounds[0], bounds[1]
        ids = entrezIDL[chromosomeI]

        for site in sortedSites[chromosomeI]:
            for i in range(len(lows)):
                if site < lows[i]:
                    # Every remaining window starts after this site: drop the
                    # windows before index i and move on to the next site.
                    # NOTE(review): this presumably relies on the lower bounds being
                    # ascending and on dropped windows not reaching later sites - confirm.
                    lows, highs, ids = lows[i:], highs[i:], ids[i:]
                    break
                if site >= lows[i] and site < highs[i]:
                    mappedEntrezG.append(ids[i])

    return mappedEntrezG

In [18]:
# Given the GO analysis mapping (GO term -> its odds-ratio / p-value data), return a list of just the GO terms (its keys)
def getGOfromAnalysis(goAnalysis):
    """Return a new list with the items produced by iterating ``goAnalysis``.

    For a dict keyed by GO term this is the list of GO terms, in the dict's order.
    """
    return list(goAnalysis)

In [19]:
# Given the original analysis and the counters from one simulation: for each GO term, adds 1 to the tally in slot 5/6/7 of the original entry when simulated counter 0/1/2 is lower than the original value in slot 2/3/4 ('NA' slots are skipped)
def compareGOAnalysis(origAnalysis, counters):
    """Update the per-term tallies of ``origAnalysis`` from one simulation.

    For every GO term in ``counters`` and each of the three positions ``j``:
    when ``origAnalysis[term][2 + j]`` is not ``'NA'`` and
    ``counters[term][j]`` is strictly smaller than it, the tally at
    ``origAnalysis[term][5 + j]`` is incremented.

    ``origAnalysis`` is modified in place and also returned.
    """
    for term, simulated in counters.items():
        entry = origAnalysis[term]
        for j in range(3):
            observed = entry[2 + j]
            if observed != 'NA' and simulated[j] < observed:
                entry[5 + j] += 1
    return origAnalysis

In [20]:
# Converts the analysis to list format and turns each simulation tally into an empirical p-value, (tally + 1) / (nSim + 1)
def convertAnalysistoFormat(analysis, nSim):
    """Turn the tallied analysis dict into a list of ``[term, values]`` rows.

    For each entry, ``values`` is the first two items of the entry followed by
    one value per tally in ``entry[5:]``, computed as ``(tally + 1) / (nSim + 1)``.
    Wherever the matching item in ``entry[2:5]`` is ``'NA'`` the computed value
    is replaced by ``'NA'``. The input entries are not modified.
    """
    rows = []
    for term, entry in analysis.items():
        observed = entry[2:5]
        empirical = [(tally + 1) / (nSim + 1) for tally in entry[5:]]
        for slot in range(3):
            if observed[slot] == 'NA':
                empirical[slot] = 'NA'
        values = entry[:2]
        values += empirical
        rows.append([term, values])
    return rows

In [21]:
# Handle to the embedded R session provided by rpy2; used below to source R files and evaluate R code strings.
r = robjects.r

In [22]:
# Combines the three different p-vals together \ Format: pvals = [x,y,z]
def reFABScalc(pvals):
    """Combine one set of p-values through the R function ``reFABSp``.

    ``SMLmetapADJ.R`` is sourced on every call, then ``reFABSp(c(...))`` is
    evaluated with the items of ``pvals`` spliced in as text (the ``str`` of
    ``pvals`` minus its first and last character). Returns whatever the R call
    yields.
    """
    r['source']("SMLmetapADJ.R")
    rArguments = str(pvals)[1:-1]  # e.g. "[0.1, 0.2]" -> "0.1, 0.2"
    return r(f"reFABSp(c({rArguments}))")

In [23]:
# Combines the S M & L, pvals for each GO term into one from the adaptiveEnrichmentAnalysis() function || adaptiveEnrichmentAnalysis(randsites, 100000, method='TSS')
# Replaces 1.0 with 0.9999999 & removes 'NA' for genes without s/m/l
def reFABSlistC(enrichmentA):
    """Combine the small/medium/large p-values of every GO term into one value.

    Parameters
    ----------
    enrichmentA : iterable
        Rows shaped like the output of ``adaptiveEnrichmentAnalysis``:
        ``[goTerm, small, medium, large]`` where index 1 of each size-class
        item is its p-value, or a string such as ``'NA'`` when that class has
        no genes for the term.

    Returns
    -------
    list
        ``[[goTerm, combined], ...]`` in input order, where ``combined`` is the
        matching element of the R result of ``reFABSp(list(c(...), ...))``.
        An empty input returns ``[]`` without calling R.

    Notes
    -----
    String placeholders are dropped and p-values above 0.9999999 are capped at
    that value. Each p-value is converted to a built-in ``float`` before being
    written into the R expression, so NumPy scalar values (whose repr is not
    valid R under NumPy >= 2) are rendered as plain numbers.
    """
    P_CAP = 0.9999999

    goTerms = []
    cleanedPvals = []
    for GO in enrichmentA:
        goTerms.append(GO[0])
        usable = []
        for sizeClass in (1, 2, 3):
            p = GO[sizeClass][1]
            if isinstance(p, str):  # 'NA' placeholder: class absent for this term
                continue
            p = float(p)
            usable.append(P_CAP if p > P_CAP else p)
        cleanedPvals.append(usable)

    if not goTerms:
        # Nothing to combine; the original string slicing produced an invalid R call here.
        return []

    r['source']("SMLmetapADJ.R")
    rVectors = ", ".join(
        "c(" + ", ".join(repr(p) for p in usable) + ")" for usable in cleanedPvals
    )
    data = r("reFABSp(list(" + rVectors + "))")

    return [[term, data[idx]] for idx, term in enumerate(goTerms)]



In [31]:
# Run the size-stratified enrichment analysis on `randsites` with a window argument of 10000 (TSS method).
# NOTE(review): `adaptiveEnrichmentAnalysis` is defined in a later cell and `randsites` is not defined in the
# visible part of the notebook, so this cell relies on out-of-order execution - confirm before Run All.
testing=adaptiveEnrichmentAnalysis(randsites, 10000, method='TSS')

In [44]:
# Collapse the per-size-class p-values of each GO term in `testing` into one combined value per term.
combined = reFABSlistC(testing)

In [25]:
# Load the converted large binding-site dataset from CSV into a DataFrame.
File = pd.read_csv('polyenrich/SmLgzFiles/Converted Data-hg38 Site/Large Data/largeDatasetConverted.csv')

In [26]:
# Chromosome labels chr1..chr22 followed by chrMT, chrX, chrY; a label's position here is the slot used for per-chromosome lists.
chrlabel = ['chr' + str(number) for number in range(1, 23)] + ['chrMT', 'chrX', 'chrY']




In [27]:
# Display the loaded dataset for a visual check.
File

Unnamed: 0.1,Unnamed: 0,Chromo,Start,End,Site
0,0,chr1,225474969,225475318,225475143
1,1,chr16,55465105,55465447,55465276
2,2,chr10,64041215,64041557,64041386
3,3,chr5,139262670,139262962,139262816
4,4,chr17,43760373,43760708,43760540
...,...,...,...,...,...
75630,75638,chr8,80890024,80890264,80890144
75631,75639,chr10,74891915,74892155,74892035
75632,75640,chr6,112148183,112148423,112148303
75633,75641,chr2,214326179,214326419,214326299


In [28]:
# Count the rows whose chromosome label is one of the labels in chrlabel.
len(File[File.Chromo.isin(chrlabel)])

75635

In [29]:
# Extract the chromosome labels and the site positions as two lists in the same row order.
chrList = list(File['Chromo'].values)
SiteLoc = list(File['Site'].values)

In [30]:
# Sanity check: number of extracted site positions.
len(SiteLoc)

75635

In [31]:
# Pair each site with its chromosome label: [[chromosome, site], ...] in row order.
FileData = [[chrList[row], SiteLoc[row]] for row in range(len(SiteLoc))]

In [32]:
# Sanity check: number of [chromosome, site] pairs.
len(FileData)

75635

In [33]:
# One independent, initially empty site list for each of the 25 chromosome slots.
FileSites = [[] for _ in range(25)]

In [34]:
# Append every site to the list of its chromosome; the slot is the label's position in chrlabel.
# NOTE(review): re-running this cell appends all sites a second time - re-create FileSites first.
for chromo, site in FileData:
    FileSites[chrlabel.index(chromo)].append(site)

        

In [35]:
# Total number of sites over all chromosome slots (compare with len(FileData)).
c = sum(len(chromSites) for chromSites in FileSites)
c

75635

In [36]:
# Sanity check: number of chromosome slots.
len(FileSites)

25

In [54]:
# Gets info for GO terms. (AVG domain length, Median domain length, Domain length stdev, # genes, GO name, GO namespace) || TSS or BODY windows
def getgoINFO(data, window, type='TSS'):
    """Collect domain-length statistics and naming info for GO terms.

    Parameters
    ----------
    data : iterable
        Rows whose first element is a GO term id.
    window : passed unchanged to ``addWindowTSS20`` / ``addWindowBODY20``.
    type : str
        ``'TSS'`` or ``'BODY'``; selects which window builder is used.

    Returns
    -------
    list of lists
        One row per input row. A full row is
        ``[goterm, mean domain length, median domain length, stdev of domain
        length, number of genes associated with the term, GO name, GO namespace]``.
        The stdev is 0 when only one gene contributed; name and namespace are
        ``'NA'`` when the term is missing from ``goinfo``. When the term is not
        in ``go2gene`` or none of its genes yields a domain length, the row is
        just ``[goterm]``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'TSS'`` nor ``'BODY'``.
    """
    output = []
    pos = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','MT','X','Y']

    if type == 'TSS':
        windDF = addWindowTSS20(window)
    elif type == 'BODY':
        windDF = addWindowBODY20(window)
    else:
        # The original raised the undefined name `InputError`, i.e. a NameError.
        raise ValueError("Incorrect option")

    for item in data:
        goterm = item[0]
        add = [goterm]
        try:
            genes = go2gene[goterm]
        except KeyError:
            output.append(add)
            continue

        geneDom = []
        for gene in genes:
            try:
                chromo = geneAnnotationDF.loc[geneAnnotationDF['entrezid'] == gene].chromosome.values[0]
            except IndexError:
                # Gene has no row in the annotation table.
                continue

            try:
                chromDF = windDF[pos.index(chromo)]
                geneDomain = int(chromDF.loc[chromDF['entrezid'] == gene].domainLEN.values[0])
            except (ValueError, IndexError):
                # ValueError: chromosome not in `pos`, or domain length not convertible to int.
                # IndexError: no window row for this gene (`.values[0]` on an empty selection).
                continue

            geneDom.append(geneDomain)

        if len(geneDom) == 0:
            output.append(add)
            continue

        domainAVG = sum(geneDom) / len(geneDom)
        medianValue = median(geneDom)
        # stdev() needs at least two data points.
        standardDev = stdev(geneDom) if len(geneDom) > 1 else 0

        add.append(domainAVG)
        add.append(medianValue)
        add.append(standardDev)
        # Counts every gene associated with the term, not only those with a domain length.
        add.append(len(genes))
        try:
            add.append(goinfo.nodes[goterm]['name'])
            add.append(goinfo.nodes[goterm]['namespace'])
        except KeyError:
            add.append('NA')
            add.append('NA')

        output.append(add)

    return output
        
            
    

In [38]:
# Compares original analysis with simulated analysis for nSim
def simulation(GOlist, geneL, nSim, nSites, origAnalysis, method, SMLcutoff, entrezDomains): # GOList is the new go list that we analyze through since we don't want whole GO list, geneL is windowDF
    """Run ``nSim`` simulations and tally them against the original analysis.

    Site sets come from ``nRandSitesSim(nSites, nSim)``. For each simulation the
    sites are mapped with ``geneReadSites``, counted with ``SMLcounterFAST`` and
    folded into the tallies with ``compareGOAnalysis``.

    ``origAnalysis`` is the object that accumulates the tallies (it is not
    copied). The result is ``convertAnalysistoFormat(...)`` sorted ascending by
    ``row[1][1]``.
    """
    simulatedSites = nRandSitesSim(nSites, nSim)
    tallies = origAnalysis  # same object: updated across all simulations

    for run in range(nSim):
        mappedRun = geneReadSites(simulatedSites[run], geneL, method)
        runCounters = SMLcounterFAST(GOlist, mappedRun, SMLcutoff, entrezDomains)
        tallies = compareGOAnalysis(tallies, runCounters)

    formatted = convertAnalysistoFormat(tallies, nSim)
    return sorted(formatted, key=lambda row: row[1][1])

In [39]:
# First run that sets up for the simulation
def firstRun(userSitesL, window, method='TSS', simN=100): # userSites must be ordered by chromosome number (List of lists)
    """Analyse the user's sites once, then launch the simulation.

    Steps: build the gene windows for ``method``; map the user's sites to genes;
    run ``conductAnalysisFIRST``; estimate how many random sites to draw per
    simulation from 5 pilot draws of ``nRandSitesSim``; call ``simulation``
    with ``simN`` runs and return its result.

    For a ``method`` other than ``'TSS'`` / ``'BODY'`` the string
    ``'Only TSS or BODY method allowed!'`` is returned instead.
    """
    if method == 'TSS':
        geneL, windowDF = addWindowTSS(window), addWindowTSS20(window)
    elif method == 'BODY':
        geneL, windowDF = addWindowBODY(window), addWindowBODY20(window)
    else:
        return 'Only TSS or BODY method allowed!'

    geneClassification = findSMLdomain(windowDF)

    mappedGenes = geneReadSites(userSitesL, geneL, method)
    numMapped = len(mappedGenes)
    numsites = sum(len(chromSites) for chromSites in userSitesL)

    entrezDomains = getEntrezDomain(windowDF)
    goAnalysis = conductAnalysisFIRST(mappedGenes, geneClassification, entrezDomains)
    newGOlist = getGOfromAnalysis(goAnalysis)

    # Estimate the number of sites to sample: average the mapped-gene count over 5 pilot draws.
    pilotDraws = nRandSitesSim(numsites,5)
    pilotMappedTotal = sum(len(geneReadSites(BSL, geneL, method)) for BSL in pilotDraws)
    nPrime = int(pilotMappedTotal / 5)
    # Num mapped genes * num sites inputted divided by nPrime
    numSitesSamp = int((numMapped * numsites) / nPrime)

    # geneL is the list of genes with window bounds
    return simulation(newGOlist, geneL, simN, numSitesSamp, goAnalysis, method, geneClassification, entrezDomains)

In [40]:
def inANOTB(a,b):
    """Return how many distinct elements of ``a`` are absent from ``b``."""
    return len(set(a) - set(b))
    

In [63]:
# Size-stratified enrichment: per GO term, runs a one-sided Fisher's exact test separately on the small, medium and large domain genes
def adaptiveEnrichmentAnalysis(userSitesL, window, method='TSS'): # userSites must be ordered by chromosome number (List of lists)
    """Per-GO-term Fisher's exact tests, run separately for each domain-size class.

    Genes are split into small / medium / large by the domain length in
    ``getEntrezDomain(windowDF)``: below ``findSMLdomain(windowDF)[0]`` is small,
    above ``findSMLdomain(windowDF)[1]`` is large, everything else is medium.
    The split is applied to the genes mapped from ``userSitesL`` and to every
    gene in the window frames.

    Returns
    -------
    list
        ``[go, small, medium, large]`` for each GO term from
        ``getOntologyID(mappedGenes)``. Each size entry is the result of
        ``stats.fisher_exact(..., alternative='greater')`` or ``('NA','NA')``
        when the term has no associated gene in that class.
        For a ``method`` other than ``'TSS'`` / ``'BODY'`` the string
        ``'Only TSS or BODY method allowed!'`` is returned instead.
    """
    if method == 'TSS':
        geneL, windowDF = addWindowTSS(window), addWindowTSS20(window)
    elif method == 'BODY':
        geneL, windowDF = addWindowBODY(window), addWindowBODY20(window)
    else:
        return 'Only TSS or BODY method allowed!'

    geneClassification = findSMLdomain(windowDF)
    smallbound, largebound = geneClassification[0], geneClassification[1]

    mappedGenes = geneReadSites(userSitesL, geneL, method)
    entrezDomains = getEntrezDomain(windowDF)

    def splitBySize(entrezIDs):
        # Partition ids into (small, medium, large) by domain length; duplicates are kept.
        small, medium, large = [], [], []
        for entrez in entrezIDs:
            geneLEN = entrezDomains[entrez]
            if geneLEN < smallbound:
                small.append(entrez)
            elif geneLEN > largebound:
                large.append(entrez)
            else:
                medium.append(entrez)
        return small, medium, large

    def classFisher(assocInClass, mappedInClass, allInClass):
        # 2x2 table for one size class: rows = associated with the term or not,
        # columns = mapped / not mapped.
        if len(assocInClass) == 0:
            return ('NA','NA')
        assocMapped = len(common_member(assocInClass, mappedInClass))
        assocUnmapped = len(assocInClass) - assocMapped
        otherMapped = inANOTB(mappedInClass, assocInClass)
        otherUnmapped = len(allInClass) - assocMapped - assocUnmapped - otherMapped
        return stats.fisher_exact([[assocMapped, assocUnmapped], [otherMapped, otherUnmapped]], alternative='greater')

    smallGenesMAPPED, mediumGenesMAPPED, largeGenesMAPPED = splitBySize(mappedGenes)

    everygene = []
    for chromDF in windowDF:
        everygene += list(chromDF.entrezid.values)
    smallGenesAll, mediumGenesAll, largeGenesAll = splitBySize(everygene)

    go2pvals = []
    for go in getOntologyID(mappedGenes):
        associated2GO = go2gene[go]
        smallASSOCgo = common_member(smallGenesAll, associated2GO)
        mediumASSOCgo = common_member(mediumGenesAll, associated2GO)
        largeASSOCgo = common_member(largeGenesAll, associated2GO)

        smallODDpval = classFisher(smallASSOCgo, smallGenesMAPPED, smallGenesAll)
        mediumODDpval = classFisher(mediumASSOCgo, mediumGenesMAPPED, mediumGenesAll)
        largeODDpval = classFisher(largeASSOCgo, largeGenesMAPPED, largeGenesAll)

        go2pvals.append([go, smallODDpval, mediumODDpval, largeODDpval])

    return go2pvals
    

In [42]:
def adjP(dataSET):
    """Append Benjamini-Hochberg adjusted p-values to every GO entry.

    Each entry is ``[goTerm, values]``; ``values[1]`` and ``values[2]`` are the
    two p-values that get adjusted (the original trailing comment labels them
    fishersP and refabsP).

    1. Entries are grouped by ``len(go2gene[goTerm])`` using the fixed groups in
       ``numAssGrp``.
    2. Within each group, ``values[1]`` is adjusted with ``fdr_bh`` after padding
       the p-value list with 1s up to the group's size in ``numGO``; the adjusted
       value is appended to ``values`` as a float.
    3. All grouped entries are sorted ascending by ``values[1]``; ``values[2]``
       is adjusted with ``fdr_bh`` over the whole list and appended.

    Entries are modified in place. Returns the sorted list with
    go: [odds, fishersP, refabsP, correctedfishers, correctedrefabs].

    NOTE(review): an entry whose association count is not listed in any group of
    ``numAssGrp`` (for example 121) is left out of the result - confirm intended.
    """
    numAssGrp = [[1], [2], [3], [4], [5, 6], [7, 8], [9, 10, 11, 12], [13, 14, 15, 16, 17, 18], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 160, 161, 163, 164, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183, 185, 187, 188, 189, 190, 191, 193, 194, 196, 198, 200, 201, 202, 203, 204, 205, 206, 208, 210, 211, 212, 213, 214, 216, 217, 218, 219, 221, 225, 226, 229, 232, 234, 235, 236, 238, 239, 240, 242, 244, 249, 250, 251, 252, 254, 262, 263, 264, 265, 268, 272, 273, 278, 280, 282, 285, 290, 295, 297, 301, 307, 308, 310, 311, 312, 313, 315, 324, 329, 330, 334, 343, 344, 347, 354, 360, 362, 364, 366, 368, 371, 374, 375, 385, 387, 395, 397, 405, 409, 411, 413, 420, 424, 427, 431, 439, 441, 457, 458, 467, 474, 481, 483, 491, 496, 502, 504, 505, 524, 529, 537, 540, 547, 548, 552, 565, 578, 605, 610, 611, 667, 670, 689, 703, 706, 812, 816, 859, 927, 960, 973, 980, 1059, 1132, 1134, 1150, 1227, 1304, 1381, 1388, 1456, 1473, 1546, 1778, 1911, 1987, 2163, 2292, 3168, 3637, 4438, 4482, 5026, 5627, 9691]]
    numGO = [5184, 2895, 1743, 1241, 1551, 1018, 1219, 1028, 1020, 1275] # Number of GO terms in each group above.
    goGrp = [[] for _ in range(10)]

    # 1. Put each entry into the group matching its number of associated genes.
    for entry in dataSET:
        numAss = len(go2gene[entry[0]])
        for bucket, groupCounts in zip(goGrp, numAssGrp):
            if numAss in groupCounts:
                bucket.append(entry)

    # 2. Adjust values[1] within each group, padded to the group's size.
    for bucket, groupSize in zip(goGrp, numGO):
        fishersP = [entry[1][1] for entry in bucket]
        shortfall = groupSize - len(fishersP)
        if shortfall > 0:
            fishersP += [1] * shortfall

        _, fishersPadj, _, _ = padjust(fishersP, method='fdr_bh', is_sorted=False)
        correctedfishers = [float(fishersPadj[idx]) for idx in range(len(bucket))]
        for entry, corrected in zip(bucket, correctedfishers):
            entry[1].append(corrected)

    # 3. Adjust values[2] over all grouped entries at once.
    groupedGO = sorted((entry for bucket in goGrp for entry in bucket), key=lambda entry: entry[1][1])
    refabsP = [entry[1][2] for entry in groupedGO]
    _, refabsPadj, _, _ = padjust(refabsP, method='fdr_bh', is_sorted=False)

    for idx, entry in enumerate(groupedGO):
        entry[1].append(refabsPadj[idx])

    return groupedGO

In [27]:
# geneAnnotationDF = pd.read_csv('entrez_id/geneAnnotationsDF_Selected_entrezID.csv', sep=',', comment='#', low_memory=False, header=0, names=geneColID)
# Load the binding sites to test from the Excel workbook; the first row is used as the column header.
linSites = pd.read_excel('bindingSites2Test.xlsx', header=0)