In [291]:
%run reFABSwithVectors.ipynb
%run ontologyPackage/ontologySTATanalysis.ipynb

In [292]:
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.stats.multitest import multipletests as padjust

In [293]:
# Given a list of lists with go term and its odds-ratio,p-val return a list of just GO terms 
def getGOfromAnalysis(goAnalysis):
    goTermL = []
    for key in goAnalysis:
        goTermL.append(key)
    return goTermL

In [294]:
# Given a list of GO terms returns a dict with key as GO term and values are gene entrez ids associated with term
def updateGOdict(goL):
    GOdict = { key: go2gene[key] for key in goL }
    return GOdict

In [295]:
# First run that sets up for the simulation
def firstRun(userSitesL, window, method='TSS', simN=100): # userSites must be ordered by chromosome number (List of lists) # Default simN = 10000
    if method == 'TSS':
        geneL = addWindowTSS(window)
    elif method == 'BODY': 
        geneL = addWindowBODY(window)
    else:
        return 'Only TSS or BODY method allowed!'
    
    mappedGenes = geneReadSites(userSitesL, geneL, method)
    numMapped = len(mappedGenes)
    numsites = 0
    for l in userSitesL:
        numsites += len(l)
    
    goAnalysis = conductAnalysisFAST(mappedGenes)
    newGOlist = getGOfromAnalysis(goAnalysis)
    
    # estimate num sites to sample: numSitesSamp
    nBSL = nRandSitesSim(numsites,5)
    nPrime2 = 0 # Total num genes mapped to figure out nPrime value eventually for estimation
    for BSL in nBSL:
        nPrime2 += len(geneReadSites(BSL, geneL, method))
    nPrime = int(nPrime2 / 5)
    numSitesSamp = int((numMapped * numsites) / nPrime) # Num mapped genes * num sites inputted divided by nPrime 
    
    return simulation(newGOlist, geneL, simN, numSitesSamp, goAnalysis, method) # GeneL is list of genes with window bounds 

In [296]:
# Given original analysis and simulated analysis. Adds count to go term in original list if the random go term has a lower p-val and higher odds ratio 
def compareGOAnalysis(origAnalysis, simAnalysis):
    for k in simAnalysis:
        if (simAnalysis[k][1] < origAnalysis[k][1]): #  (simAnalysis[k][0] > origAnalysis[k][0]) and 
            origAnalysis[k][2] += 1
    return origAnalysis        

In [297]:
def convert2List(anlDict):
    out=[]
    for k in anlDict.keys():
        out.append([k,anlDict[k]])
    out.sort(key=lambda x: x[1][1])
    return list(filter(lambda x: (x[1][1] <= 0.05), out))

In [298]:
# converts analysis to list format and divides by nSim to get refabs p value
def convertAnalysistoFormat(analysis, nSim):
    analist = []
    for k in analysis.keys():
        analysis[k][2] = (analysis[k][2] + 1) / (nSim + 1)
        analist.append([k, analysis[k]])
    return analist

In [299]:
# Compares original analysis with simulated analysis for nSim
def simulation(GOlist, geneL, nSim, nSites, origAnalysis, method): # GOList is the new go list that we analyze through since we don't want whole GO list, geneL is windowDF

    sitesL = nRandSitesSim(nSites, nSim)
    outputAnalysis = origAnalysis # We will keep updating this dictionary and return when all sims are done 
    
    for sim in range(nSim):
        mapped = geneReadSites(sitesL[sim], geneL, method)
        randAnalysis = conductAnalysisFAST(mapped, OntologyL=GOlist) # Updating OntologyL to make the calculations quicker 
        outputAnalysis = compareGOAnalysis(outputAnalysis, randAnalysis)
        
    return sorted(convertAnalysistoFormat(outputAnalysis, nSim), key = lambda x: x[1][1])

In [300]:
numAssGrp = [[1], [2], [3], [4], [5, 6], [7, 8], [9, 10, 11, 12], [13, 14, 15, 16, 17, 18], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 160, 161, 163, 164, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183, 185, 187, 188, 189, 190, 191, 193, 194, 196, 198, 200, 201, 202, 203, 204, 205, 206, 208, 210, 211, 212, 213, 214, 216, 217, 218, 219, 221, 225, 226, 229, 232, 234, 235, 236, 238, 239, 240, 242, 244, 249, 250, 251, 252, 254, 262, 263, 264, 265, 268, 272, 273, 278, 280, 282, 285, 290, 295, 297, 301, 307, 308, 310, 311, 312, 313, 315, 324, 329, 330, 334, 343, 344, 347, 354, 360, 362, 364, 366, 368, 371, 374, 375, 385, 387, 395, 397, 405, 409, 411, 413, 420, 424, 427, 431, 439, 441, 457, 458, 467, 474, 481, 483, 491, 496, 502, 504, 505, 524, 529, 537, 540, 547, 548, 552, 565, 578, 605, 610, 611, 667, 670, 689, 703, 706, 812, 816, 859, 927, 960, 973, 980, 1059, 1132, 1134, 1150, 1227, 1304, 1381, 1388, 1456, 1473, 1546, 1778, 1911, 1987, 2163, 2292, 3168, 3637, 4438, 4482, 5026, 5627, 9691]]
numGO = [5184, 2895, 1743, 1241, 1551, 1018, 1219, 1028, 1020, 1275]

In [301]:
# Given output from firstrun function generate data required for fdr-sensitivity graph 
def getGraphData(analysis):
    reFabsPositives = 0 
    # Get total true positives 
    for go in analysis:
        if go[1][2] < 0.05:
            reFabsPositives += 1
    FDRList = []
    SensList = []
    
    FDRc = 0 
    POSc = 0 
    goterm = 1
    for go in analysis:
        if go[1][2] < 0.05:
            FDR = FDRc / goterm
            POSc += 1 
            Sens = POSc / reFabsPositives
            goterm += 1 
            FDRList.append(FDR) 
            SensList.append(Sens) 

        else:
            FDRc += 1 
            FDR = FDRc / goterm
            Sens = POSc / reFabsPositives
            goterm += 1 
            FDRList.append(FDR) 
            SensList.append(Sens)
            
    FDRList = small2largeFDR(FDRList)
    return [FDRList, SensList]

In [302]:
def small2largeFDR(FDRL): # FDR Correction
    length = len(FDRL)
    FDRL.reverse()
    for i in range(length):
        if i == length - 1:
            break 
        if FDRL[i] < FDRL[i+1]:
            FDRL[i+1] = FDRL[i]
    FDRL.reverse()
    return FDRL 

In [303]:
numAssGrp = [[1], [2], [3], [4], [5, 6], [7, 8], [9, 10, 11, 12], [13, 14, 15, 16, 17, 18], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 160, 161, 163, 164, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183, 185, 187, 188, 189, 190, 191, 193, 194, 196, 198, 200, 201, 202, 203, 204, 205, 206, 208, 210, 211, 212, 213, 214, 216, 217, 218, 219, 221, 225, 226, 229, 232, 234, 235, 236, 238, 239, 240, 242, 244, 249, 250, 251, 252, 254, 262, 263, 264, 265, 268, 272, 273, 278, 280, 282, 285, 290, 295, 297, 301, 307, 308, 310, 311, 312, 313, 315, 324, 329, 330, 334, 343, 344, 347, 354, 360, 362, 364, 366, 368, 371, 374, 375, 385, 387, 395, 397, 405, 409, 411, 413, 420, 424, 427, 431, 439, 441, 457, 458, 467, 474, 481, 483, 491, 496, 502, 504, 505, 524, 529, 537, 540, 547, 548, 552, 565, 578, 605, 610, 611, 667, 670, 689, 703, 706, 812, 816, 859, 927, 960, 973, 980, 1059, 1132, 1134, 1150, 1227, 1304, 1381, 1388, 1456, 1473, 1546, 1778, 1911, 1987, 2163, 2292, 3168, 3637, 4438, 4482, 5026, 5627, 9691]]
numGO = [5184, 2895, 1743, 1241, 1551, 1018, 1219, 1028, 1020, 1275] # Number of GO terms in each group above.

In [304]:
def adjP(dataSET):
    GOdataSet = dataSET
    goGrp = [[],[],[],[],[],[],[],[],[],[]]
    numAssGrp = [[1], [2], [3], [4], [5, 6], [7, 8], [9, 10, 11, 12], [13, 14, 15, 16, 17, 18], [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33], [34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 137, 138, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 160, 161, 163, 164, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183, 185, 187, 188, 189, 190, 191, 193, 194, 196, 198, 200, 201, 202, 203, 204, 205, 206, 208, 210, 211, 212, 213, 214, 216, 217, 218, 219, 221, 225, 226, 229, 232, 234, 235, 236, 238, 239, 240, 242, 244, 249, 250, 251, 252, 254, 262, 263, 264, 265, 268, 272, 273, 278, 280, 282, 285, 290, 295, 297, 301, 307, 308, 310, 311, 312, 313, 315, 324, 329, 330, 334, 343, 344, 347, 354, 360, 362, 364, 366, 368, 371, 374, 375, 385, 387, 395, 397, 405, 409, 411, 413, 420, 424, 427, 431, 439, 441, 457, 458, 467, 474, 481, 483, 491, 496, 502, 504, 505, 524, 529, 537, 540, 547, 548, 552, 565, 578, 605, 610, 611, 667, 670, 689, 703, 706, 812, 816, 859, 927, 960, 973, 980, 1059, 1132, 1134, 1150, 1227, 1304, 1381, 1388, 1456, 1473, 1546, 1778, 1911, 1987, 2163, 2292, 3168, 3637, 4438, 4482, 5026, 5627, 9691]]
    numGO = [5184, 2895, 1743, 1241, 1551, 1018, 1219, 1028, 1020, 1275] # Number of GO terms in each group above.
    
    for go in GOdataSet:
        numAss = len(go2gene[go[0]])
        for i in range(len(numAssGrp)):
            if numAss in numAssGrp[i]:
                goGrp[i].append(go)
    
    pos = 0
    for GOdata in goGrp:
        fishersP = []
#         refabsP = []
        for l in GOdata:
            fishersP.append(l[1][1])
#             refabsP.append(l[1][2])
         
        if len(fishersP) < numGO[pos]:
            numAppend = numGO[pos] - len(fishersP)
            fishersP += [1] * numAppend
#             refabsP += [1] * numAppend
        pos += 1         
        
        reject, fishersPadj, alphacSidak, alphacBonf = padjust(fishersP, method='fdr_bh', is_sorted=False)
#         reject2, refabsPadj, alphacSidak2, alphacBonf2 = padjust(refabsP, method='fdr_bh', is_sorted=False)
        correctedfishers = []
#         correctedrefabs = []
        for i in range(len(GOdata)):
            correctedfishers.append(float(fishersPadj[i]))
#             correctedrefabs.append(float(refabsPadj[i]))
        
        for i in range(len(GOdata)):
            GOdata[i][1].append(correctedfishers[i])
#             GOdata[i][1].append(correctedrefabs[i])
    groupedGO = []
    for i in goGrp:
        for j in i:
            groupedGO.append(j)
    groupedGO = sorted(groupedGO, key = lambda x: x[1][1])
    refabsP = []
    for go in groupedGO:
        refabsP.append(go[1][2])
    reject2, refabsPadj, alphacSidak2, alphacBonf2 = padjust(refabsP, method='fdr_bh', is_sorted=False)
    
    for i in range(len(groupedGO)):
        groupedGO[i][1].append(refabsPadj[i])
    
    return groupedGO # go: [odds, fishersP, refabsP, correctedfishers, correctedrefabs]

In [306]:
def addWindowTSS20(window):
    chrD = {'1': [1, 248956422],'2': [1, 133797422],'3': [1, 135086622],'4': [1, 133275309],'5': [1, 114364328],'6': [1, 107043718],'7': [1, 101991189],'8': [1, 90338345],'9': [1, 83257441],'10': [1, 80373285],'11': [1, 58617616],'12': [1, 242193529],'13': [1, 64444167],'14': [1, 46709983],'15': [1, 50818468],'16': [1, 198295559],'17': [1, 190214555],'18': [1, 181538259],'19': [1, 170805979],'20': [1, 159345973],'21': [1, 145138636],'22': [1, 138394717],'23': [1, 16569],'24': [1, 156040895],'25': [2781480, 56887902]}
    geneWindowDFL = geneDFLtss
#     outList = []
    chrIDNUM = 0
    for dfG in geneWindowDFL: # ordered by chromosome 1- MT,X,Y
        chrIDNUM += 1 
        dfIndexMax = len(dfG) - 1   
        for i, row in dfG.iterrows():
            if row['strand'] == '+':
                start = row['start']
            if row['strand'] == '-':
                start = row['end']
            dfG.at[i, 'midBody'] = (((row['length'] / 2)) + start) # Middle of gene body 
            if i == 0 or i == dfIndexMax: # Case for start and end genes
                if i == 0:
                    dfG.at[i, 'lowB'] = max(start - window, chrD[str(chrIDNUM)][0]) # chrD[chrIDNUM][0] = start val of the chromosome
                    dfG.at[i, 'upperB'] = min(start + window, ((dfG.iat[i+1, 3] - start) / 2) + start)
                else: # index max case 
                    dfG.at[i, 'lowB'] = max(start - window, start - ((start - dfG.iat[i-1, 3]) / 2)) # dfG.iat[i-1, 3] = start of gene below
                    dfG.at[i, 'upperB'] = min(start + window, chrD[str(chrIDNUM)][1]) # chrD[chrIDNUM][1] = end val of chromosome 
            else: # i > 0 or i < len(geneDF) - 1 
                dfG.at[i, 'lowB'] = max(start - window, start - ((start - dfG.iat[i-1, 3]) / 2)) # dfG.iat[i-1, 3] = start of gene below
                dfG.at[i, 'upperB'] = min(start + window, ((dfG.iat[i+1, 3] - start) / 2) + start) # Start of the gene above position [i,3]
        dfG['midBody'] = dfG['midBody'].astype(int)
        dfG['lowB'] = dfG['lowB'].astype(int)
        dfG['upperB'] = dfG['upperB'].astype(int)
    
    return geneWindowDFL

In [307]:
# Adds midpoint in gene and upper and lower bound values to geneDFL based on size of window or nearby genes. Adding around geneBODY.
def addWindowBODY20(window):
    geneWindowDFL = geneDFL # body version 
    chrD = {'1': [1, 248956422],'2': [1, 133797422],'3': [1, 135086622],'4': [1, 133275309],'5': [1, 114364328],'6': [1, 107043718],'7': [1, 101991189],'8': [1, 90338345],'9': [1, 83257441],'10': [1, 80373285],'11': [1, 58617616],'12': [1, 242193529],'13': [1, 64444167],'14': [1, 46709983],'15': [1, 50818468],'16': [1, 198295559],'17': [1, 190214555],'18': [1, 181538259],'19': [1, 170805979],'20': [1, 159345973],'21': [1, 145138636],'22': [1, 138394717],'23': [1, 16569],'24': [1, 156040895],'25': [2781480, 56887902]}
    chrIDNUM = 0
    for dfG in geneWindowDFL: # ordered by chromosome 1- MT,X,Y
        chrIDNUM += 1 
        dfIndexMax = len(dfG) - 1         
        for i, row in dfG.iterrows():
            start = row['start']
            end = row['end']
            dfG.at[i, 'midBody'] = (((row['length'] / 2)) + row['start']) # Middle of gene body 
            
            if i == 0 or i == dfIndexMax: # Case for start and end sites
                if i == 0:
                    dfG.at[i, 'lowB'] = max(start - window, chrD[str(chrIDNUM)][0]) # end of the gene below position
                    dfG.at[i, 'upperB'] = min(end + window, ((dfG.iat[i+1, 3] - end) / 2) + end)
                else: # index max case 
                    dfG.at[i, 'lowB'] = max(start - window, start - ((start - dfG.iat[i-1, 4]) / 2)) 
                    dfG.at[i, 'upperB'] = min(row['end'] + window, chrD[str(chrIDNUM)][1])
            else: # i > 0 or i < len(geneDF) - 1 
                dfG.at[i, 'lowB'] = max(start - window, start - ((start - dfG.iat[i-1, 4]) / 2)) # end of the gene below position [i,4]
                dfG.at[i, 'upperB'] = min(end + window, ((dfG.iat[i+1, 3] - end) / 2) + end) # Start of the gene above position [i,3]
        dfG['midBody'] = dfG['midBody'].astype(int)
        dfG['lowB'] = dfG['lowB'].astype(int)
        dfG['upperB'] = dfG['upperB'].astype(int)
    
    return geneWindowDFL

In [336]:
test = addWindowTSS20(10000)[0]

In [339]:
addWindowTSS20(10000)[0].iloc[675:680]

Unnamed: 0,chromosome,source,type,start,end,strand,gene_symbol,gene_ensID,length,entrezid,midBody,lowB,upperB,domainLEN
675,1,ensembl_havana,gene,45750650,45694324,-,IPP,ENSG00000197429,56327,3652,45722487,45691218,45704324,13106
676,1,ensembl_havana,gene,45786987,46036124,+,MAST2,ENSG00000086015,249138,23139,45911556,45776987,45796987,20000
677,1,ensembl_havana,gene,46133036,46040140,-,PIK3R3,ENSG00000117461,92897,8503,46086588,46030140,46050140,20000
679,1,havana,gene,46134531,46139081,+,RP4-533D7.5,ENSG00000227857,4551,101929626,46136806,46134531,46144531,10000
680,1,ensembl_havana,gene,46175073,46185958,+,TSPAN1,ENSG00000117472,10886,10103,46180516,46175073,46175780,707


In [338]:
test.loc[test['upperB'] < test['lowB']]

Unnamed: 0,chromosome,source,type,start,end,strand,gene_symbol,gene_ensID,length,entrezid,midBody,lowB,upperB,domainLEN
678,1,havana,gene,46176488,46043661,-,PIK3R3,ENSG00000278139,132828,8503,46110075,46088348,46053661,-34687
717,1,ensembl_havana,gene,48776969,48727523,-,BEND5,ENSG00000162373,49447,79656,48752246,48752246,48737523,-14723
719,1,havana,gene,49472085,49374201,-,AGBL4-IT1,ENSG00000225623,97885,100874313,49423143,49423143,49384201,-38942
792,1,ensembl_havana,gene,58546802,58415384,-,OMA1,ENSG00000162600,131419,115209,58481093,58481059,58425384,-55675
856,1,havana,gene,71837012,71794232,-,NEGR1-IT1,ENSG00000228853,42781,100852409,71815622,71815622,71804232,-11390
961,1,havana,gene,93775444,93752924,-,RP11-488P3.1,ENSG00000230439,22521,723788,93764184,93764184,93762924,-1260
1606,1,ensembl_havana,gene,163266576,163355764,+,NUF2,ENSG00000143228,89189,83540,163311170,163294183,163276576,-17607
1605,1,havana,gene,163321894,163244505,-,RGS5,ENSG00000232995,77390,8490,163283200,163255540,163254505,-1035
2093,1,ensembl_havana,gene,235328219,235131634,-,ARID4B,ENSG00000054267,196586,51742,235229927,235146545,235141634,-4911
2149,1,ensembl_havana,gene,247078755,247036784,-,ZNF670,ENSG00000277462,41972,93474,247057770,247057769,247046784,-10985


In [332]:
def findSMLdomain(window, method):
    if method == 'TSS':
        windowLens = addWindowTSS20(window)
    elif method == 'BODY':
        windowLens = addWindowBODY20(window)
    else:
        return 'Error in method'
    allVALS = []
    for df in windowLens:
        df['domainLEN'] = df['upperB'] - df['lowB']
        values = list(df.domainLEN.values)
        allVALS += values
    
    return sorted(allVALS)

In [333]:
newnew = findSMLdomain(10000, 'TSS')

In [335]:
newnew

[-108193787,
 -75750888,
 -66992940,
 -63521577,
 -63056665,
 -59249746,
 -57026986,
 -56888465,
 -54704033,
 -54620348,
 -53240610,
 -51148090,
 -49860346,
 -76919,
 -61710,
 -61144,
 -55675,
 -44928,
 -44568,
 -42128,
 -41952,
 -40677,
 -39715,
 -38942,
 -38749,
 -34718,
 -34687,
 -34323,
 -33648,
 -32607,
 -31693,
 -30463,
 -30267,
 -30128,
 -29493,
 -27916,
 -27606,
 -26661,
 -25347,
 -25232,
 -24319,
 -23793,
 -22899,
 -22242,
 -21901,
 -21856,
 -21488,
 -21273,
 -20728,
 -20671,
 -20648,
 -20371,
 -20165,
 -19742,
 -19287,
 -18905,
 -18604,
 -18314,
 -17607,
 -17207,
 -16596,
 -16114,
 -15830,
 -15398,
 -15282,
 -15166,
 -14723,
 -14323,
 -14157,
 -13103,
 -12910,
 -12747,
 -12471,
 -11390,
 -11255,
 -10985,
 -10570,
 -9845,
 -9772,
 -9585,
 -9547,
 -9435,
 -9307,
 -9299,
 -9118,
 -8998,
 -8945,
 -8883,
 -8427,
 -7691,
 -7613,
 -7598,
 -7417,
 -6833,
 -6825,
 -6810,
 -6551,
 -6437,
 -6384,
 -6153,
 -5636,
 -5347,
 -5322,
 -5032,
 -4911,
 -4839,
 -4783,
 -4646,
 -4610,
 -4522,
 -3

In [34]:
def randSitesSimNoMP(nSites, nSims):
    siteslist = []
    for i in range(nSims):
        sites = nRandSites(nSites)
        siteslist.append(sites)
    return siteslist

In [14]:
listofsites = randSitesSimNoMP(10000, 2000)

In [85]:
dataNoMP = firstRun(linSITES, 100000, method='BODY', simN=2000) 

In [129]:
#x
x = ['GO:0090737', 'GO:0046331', 'GO:0038102', 'GO:0042745', 'GO:0051386', 'GO:0001519', 'GO:0036186', 'GO:0061290', 'GO:0048841', 'GO:0052746', 'GO:1905069', 'GO:0090673', 'GO:0071418', 'GO:0060072', 'GO:0051389', 'GO:0003290', 'GO:0001838', 'GO:0090657', 'GO:0006725', 'GO:0099612', 'GO:0071207', 'GO:1902911', 'GO:0034040', 'GO:0045027', 'GO:0000301', 'GO:0032328', 'GO:1900077', 'GO:0021784', 'GO:0014040', 'GO:0052314', 'GO:0098901', 'GO:0014737', 'GO:1902202', 'GO:0008281', 'GO:2001065', 'GO:0021551', 'GO:0021644', 'GO:0004068', 'GO:0021558', 'GO:0097233', 'GO:1905521', 'GO:2000845', 'GO:0045425', 'GO:0070256', 'GO:0070253', 'GO:0007284', 'GO:0050906', 'GO:2000107', 'GO:1901342', 'GO:0055026', 'GO:1990630', 'GO:0097498', 'GO:0021740', 'GO:0062028', 'GO:1903748', 'GO:0010734', 'GO:0099642', 'GO:0004140', 'GO:0004782', 'GO:0060488', 'GO:1905289', 'GO:1901142', 'GO:1904338', 'GO:0097324', 'GO:1990597', 'GO:0035795', 'GO:0005668', 'GO:0071349', 'GO:0072134', 'GO:0004447', 'GO:0045298', 'GO:0060125', 'GO:0021940', 'GO:0006597', 'GO:0007174', 'GO:1904556', 'GO:0042720', 'GO:0048069', 'GO:0036505', 'GO:0017109', 'GO:0060083', 'GO:0099178', 'GO:0039529', 'GO:0007620', 'GO:0044210', 'GO:0000152', 'GO:0032812', 'GO:0003062', 'GO:0048669', 'GO:0052901', 'GO:1905450', 'GO:1902253', 'GO:0003880', 'GO:0019087', 'GO:0016072', 'GO:0008294', 'GO:0036488', 'GO:0072179', 'GO:0071676', 'GO:0071140', 'GO:0062000', 'GO:0007549', 'GO:1990716', 'GO:0072610', 'GO:0042418', 'GO:0004357', 'GO:0060279', 'GO:0015893', 'GO:0004743', 'GO:0042412', 'GO:0034143', 'GO:1904251', 'GO:0097474', 'GO:0099609', 'GO:0042480', 'GO:1990184', 'GO:1990890', 'GO:2000758', 'GO:0034395', 'GO:0009450', 'GO:0016316', 'GO:0032620', 'GO:0031751', 'GO:0021693', 'GO:0003989', 'GO:1902170', 'GO:1902257', 'GO:1902882', 'GO:0021570', 'GO:0002121', 'GO:0060490', 'GO:0004699', 'GO:0006808', 'GO:0031894', 'GO:0002930', 'GO:0045153', 'GO:0031291', 'GO:0043000', 'GO:0010705', 'GO:0032290', 'GO:0140030', 'GO:0060689', 'GO:0032596', 'GO:2001295', 'GO:2001204', 'GO:0043006', 'GO:0031627', 'GO:0072182', 'GO:0031682', 'GO:0044062', 'GO:0097535', 'GO:0098989', 'GO:0043313', 'GO:0042323', 'GO:0003883', 'GO:2000342', 'GO:0009330', 'GO:0097026', 'GO:0002439', 'GO:0002093', 'GO:1902396', 'GO:0097102', 'GO:2000194', 'GO:0019322', 'GO:0097536', 'GO:0070813', 'GO:1901329', 'GO:0045003', 'GO:0004478', 'GO:2000866', 'GO:0046469', 'GO:0031635', 'GO:1903296', 'GO:0038109', 'GO:1902680', 'GO:0004818', 'GO:0040010', 'GO:1903489', 'GO:0045994', 'GO:1905303', 'GO:0010559', 'GO:0047322', 'GO:0060489', 'GO:2000170', 'GO:0000825', 'GO:0003096', 'GO:0071379', 'GO:2000504', 'GO:0043169', 'GO:1903044', 'GO:0097750', 'GO:0033878', 'GO:2000490', 'GO:0008050', 'GO:0006971', 'GO:0140009', 'GO:0033065', 'GO:0042883', 'GO:1902750', 'GO:0015196', 'GO:1905030', 'GO:0097156', 'GO:0034684', 'GO:0001607', 'GO:2000482', 'GO:0070895', 'GO:0032762', 'GO:0042924', 'GO:1904562', 'GO:0021903', 'GO:0001791', 'GO:1903109', 'GO:0004551', 'GO:0097069', 'GO:0030617', 'GO:0043370', 'GO:2000791', 'GO:0019085', 'GO:0001544', 'GO:0014740', 'GO:0004878', 'GO:0030546', 'GO:0007497', 'GO:0098507', 'GO:0004345', 'GO:0015910', 'GO:0071242', 'GO:0000939', 'GO:1901898', 'GO:0072070', 'GO:2000317', 'GO:0018963', 'GO:0070001', 'GO:0046208', 'GO:0032416', 'GO:0036313', 'GO:0004828', 'GO:0004651', 'GO:0006481', 'GO:0006424', 'GO:0050405', 'GO:0097022', 'GO:0004074', 'GO:1904016', 'GO:0006434', 'GO:0097021', 'GO:0038178', 'GO:0032986', 'GO:0021571', 'GO:0072224', 'GO:0046592', 'GO:0021557', 'GO:0051446']



In [130]:
len(x)

253

In [131]:
mapped = geneReadSites(nRandSites([100000, random.randint(100000)]), addWindowBODY(100000), method='BODY')

In [132]:
wind = addWindowBODY20(100000)

In [133]:
def findGeneinDFL(gene, dfl):
    for df in dfl:
        search = df.loc[df['entrezid'] == gene]
        
        if len(search) == 0:
            pass
        else:
            
            lb = search.lowB.values[0]
            up = search.upperB.values[0]
            domain = up - lb
            return domain

In [134]:
analysisList = []
analysisList.append(['GOterm', 'Gene1', 'Domain Len1', 'IsMapped?', 'Gene2', 'Domain Len2', 'IsMapped?'])
for go in x:
    inputs = []
    inputs.append(go)
    genes = go2gene[go]
    inputs.append(genes[0])
    inputs.append(findGeneinDFL(genes[0], wind))
    if genes[0] in mapped:
        inputs.append('Mapped')
    else:
        inputs.append('NOT')
    inputs.append(genes[1])
    inputs.append(findGeneinDFL(genes[1], wind))
    if genes[1] in mapped:
        inputs.append('Mapped')
    else:
        inputs.append('NOT')
    analysisList.append(inputs)

In [135]:
for l in analysisList:
    print(l)

['GOterm', 'Gene1', 'Domain Len1', 'IsMapped?', 'Gene2', 'Domain Len2', 'IsMapped?']
['GO:0090737', 4683, 2724, 'Mapped', 7517, 45156, 'NOT']
['GO:0046331', 3280, 202587, 'NOT', 28514, 13936, 'NOT']
['GO:0038102', 3547, -105393, 'Mapped', 10468, 143474, 'Mapped']
['GO:0042745', 191, 31178, 'Mapped', 7054, 51612, 'NOT']
['GO:0051386', 10479, 86430, 'Mapped', 55816, 375574, 'Mapped']
['GO:0001519', 5066, 421646, 'Mapped', 114770, 13279, 'Mapped']
['GO:0036186', 11031, 246660, 'Mapped', 55198, 2081, 'Mapped']
['GO:0061290', 2625, 140083, 'Mapped', 55366, 17567, 'Mapped']
['GO:0048841', 10371, -336947, 'Mapped', 91584, -325356, 'NOT']
['GO:0052746', 3705, 811, 'NOT', 64768, 107089, 'NOT']
['GO:1905069', 653, 78081, 'Mapped', 655, 82624, 'Mapped']
['GO:0090673', 5175, 70450, 'Mapped', 56999, 12281, 'Mapped']
['GO:0071418', 445, 100593, 'NOT', 7390, 9413, 'NOT']
['GO:0060072', 3778, -568994, 'Mapped', 157855, 351804, 'Mapped']
['GO:0051389', 2048, 257231, 'Mapped', 3480, 437191, 'NOT']
['GO: