In [1]:
import pandas as pd 
import numpy as np
import time 
from heapq import merge 

# Show full cell contents instead of truncating long values. None is the documented
# "unlimited" setting; the older -1 spelling was deprecated in pandas 1.0 and is
# rejected by newer releases.
pd.set_option('display.max_colwidth', None)
# Silence the chained-assignment (SettingWithCopy) warning. The full option key is
# used so the lookup does not rely on pandas' partial-name matching.
pd.set_option('mode.chained_assignment', None)

# Column names applied to the two input tables. Passing header=0 together with
# names= tells read_csv to replace each file's own header row with these names.
geneColID = [
    "chromosome", "source", "type", "start", "end", "score", "strand", "phase",
    "gene_symbol", "gene_ensID", "length", "entrezid",
]
chromosomeColID = [
    'chromosome', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase',
]

# Gene annotations (one row per gene).
geneAnnotationDF = pd.read_csv(
    'entrez_id/geneAnnotationsDF_Selected_entrezID.csv',
    sep=',',
    comment='#',
    header=0,
    names=geneColID,
    low_memory=False,
)
# Chromosome table; its start/end columns are the range random sites are drawn from.
chromosomesDF = pd.read_csv(
    'chromosomesDF.csv',
    sep=',',
    comment='#',
    header=0,
    names=chromosomeColID,
    low_memory=False,
)

In [2]:
# Renumber both tables from 0 and drop the columns that are not used afterwards.
# reset_index() moves the previous (unnamed) index into a column called 'index',
# so that column is dropped together with the unused annotation fields.
chromosomesDF = chromosomesDF.reset_index().drop(columns=['index', 'score', 'strand', 'phase'])
geneAnnotationDF = geneAnnotationDF.reset_index().drop(columns=['index', 'score', 'phase'])

In [3]:
# geneDFL holds one gene DataFrame per chromosome, in the order given by u
# (1-22, MT, X, Y). Each frame is re-indexed from 0; later functions rely on that order.
u = [str(number) for number in range(1, 23)] + ['MT', 'X', 'Y']
geneDFL = []
for c in u:
    geneDF = (
        geneAnnotationDF
        .loc[geneAnnotationDF['chromosome'] == c]
        .reset_index()
        .drop(columns=['index'])
    )
    geneDFL.append(geneDF)

In [4]:
# Probability that a randomly chosen gene lies on each chromosome, listed in the
# order 1-22, MT, X, Y. Chromosome 1 has the highest probability of selection.
probabilityL = [
    0.10328990533320836,    # 1
    0.06298622176398913,    # 2
    0.056378292248570626,   # 3
    0.03875714687412129,    # 4
    0.043959133939450744,   # 5
    0.05164495266660418,    # 6
    0.04658355984628362,    # 7
    0.03463304902052676,    # 8
    0.03828849939075827,    # 9
    0.03786671665573156,    # 10
    0.06411097572406037,    # 11
    0.05136376417658637,    # 12
    0.01762114537444934,    # 13
    0.030180897928578122,   # 14
    0.030837004405286344,   # 15
    0.04199081450932608,    # 16
    0.05844034117536789,    # 17
    0.01405942450089043,    # 18
    0.0687974505576905,     # 19
    0.02746274252507264,    # 20
    0.01241915830911988,    # 21
    0.02230762020807948,    # 22
    0.0007029712250445215,  # MT
    0.04213140875433499,    # X
    0.0031868028868684973,  # Y
]

In [5]:
def nRandSites(n):
    """Draw n random binding sites and group them by chromosome.

    For every site a row of ``chromosomesDF`` is drawn (with replacement),
    weighted by ``probabilityL`` so that chromosomes holding more genes are
    picked more often; the site position is then drawn uniformly between that
    row's ``start`` and ``end`` and rounded to an integer.

    Parameters
    ----------
    n : int
        Number of sites to draw.

    Returns
    -------
    list of numpy.ndarray
        25 integer arrays ordered 1-22, MT, X, Y. Each array holds the site
        positions drawn for that chromosome in ascending order (empty when the
        chromosome was never drawn).
    """
    chrIDS = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','MT','X','Y']

    # Pair every weight with its chromosome *name* instead of relying on the row
    # order of chromosomesDF matching probabilityL. Rows whose chromosome is not
    # in chrIDS get weight 0 and are therefore never drawn.
    weightByChr = dict(zip(chrIDS, probabilityL))
    rowWeights = chromosomesDF['chromosome'].astype(str).map(weightByChr).fillna(0.0).values

    randChr = chromosomesDF.sample(n, replace=True, weights=rowWeights)  # replace=True lets the same row be drawn again
    randChr['BindingSite'] = np.random.uniform(randChr.start, randChr.end).round().astype(int)

    drawnChr = randChr['chromosome'].astype(str)
    randChrL = []
    for chrID in chrIDS:
        chrSites = randChr.loc[drawnChr == chrID, 'BindingSite'].values
        randChrL.append(np.sort(chrSites))
    return randChrL

In [6]:
def addWindowBODY(window):
    """Compute a search window around every gene body, per chromosome.

    For each frame in ``geneDFL`` three integer columns are written in place:

    * ``midBody`` - ``start + length / 2``, the middle of the gene body.
    * ``lowB``    - ``start - window``, but never reaching further than halfway
      into the gap to the previous row's ``end``; for the first row the bound
      is clamped to 1 instead.
    * ``upperB``  - ``end + window``, but never reaching further than halfway
      into the gap to the next row's ``start``; the last row is not clamped.

    Neighbours are taken by row position, so this presumably expects each frame
    to be sorted by genomic position - TODO confirm against the input file.
    Frames with a single gene (only the first-row and last-row rules apply) and
    empty frames are handled without error.

    Parameters
    ----------
    window : int or float
        Maximum distance the window extends beyond the gene body on each side.

    Returns
    -------
    list
        One ``[lowB_values, upperB_values]`` pair of integer arrays per frame in
        ``geneDFL``, in the same order (1-22, MT, X, Y).
    """
    outList = []
    for dfG in geneDFL:  # ordered by chromosome 1 - MT,X,Y; each frame is updated in place
        geneStart = dfG['start'].values.astype(float)
        geneEnd = dfG['end'].values.astype(float)
        geneLength = dfG['length'].values.astype(float)

        lowB = geneStart - window
        upperB = geneEnd + window

        if len(dfG) > 0:
            # The first gene has no lower neighbour: keep the bound on the chromosome.
            lowB[0] = max(lowB[0], 1)
        if len(dfG) > 1:
            # Half of the gap between each gene's end and the next gene's start.
            halfGap = (geneStart[1:] - geneEnd[:-1]) / 2
            lowB[1:] = np.maximum(lowB[1:], geneStart[1:] - halfGap)
            upperB[:-1] = np.minimum(upperB[:-1], geneEnd[:-1] + halfGap)

        dfG['midBody'] = geneLength / 2 + geneStart
        dfG['lowB'] = lowB
        dfG['upperB'] = upperB
        # Cast through pandas so that a non-finite value raises instead of
        # silently turning into a meaningless integer.
        for col in ('midBody', 'lowB', 'upperB'):
            dfG[col] = dfG[col].astype(int)

        outList.append([dfG['lowB'].values, dfG['upperB'].values])
    return outList

In [7]:
# One array of entrez ids per chromosome, in the same order as geneDFL.
entrezIDL = [chromosomeGenes.entrezid.values for chromosomeGenes in geneDFL]

In [39]:
def genesReadSitesBody(bsL, window):
    """Map binding sites to the genes whose body window contains them.

    The windows come from ``addWindowBODY(window)``. Within each chromosome the
    sites are visited in the order given (``nRandSites`` supplies them in
    ascending order) and each site is assigned to the first window with
    ``lowB <= site < upperB``. Once a site has matched a window, all windows
    before that one are skipped for the remaining sites of the chromosome.
    Sites that fall into no window contribute nothing.

    Parameters
    ----------
    bsL : list
        One sequence of site positions per chromosome, ordered 1-22, MT, X, Y.
    window : int or float
        Window size forwarded to ``addWindowBODY``.

    Returns
    -------
    list
        Entrez ids of the matched genes, one entry per matched site.
    """
    matchedIDs = []
    for chrIdx, bounds in enumerate(addWindowBODY(window)):  # index 0 is chromosome 1
        lowerBounds = bounds[0]
        upperBounds = bounds[1]
        chrGeneIDs = entrezIDL[chrIdx]
        chrSites = bsL[chrIdx]

        firstLive = 0  # windows before this position have been passed and are not searched again
        for site in chrSites:
            for geneIdx in range(firstLive, len(lowerBounds)):
                if lowerBounds[geneIdx] <= site and upperBounds[geneIdx] > site:
                    matchedIDs.append(chrGeneIDs[geneIdx])
                    # Keep the matched window itself: a later site may fall into it too.
                    firstLive = geneIdx
                    break  # this site is done
    return matchedIDs