In [1]:
import pandas as pd 
import numpy as np
from numpy import random, array
import time 
import os # Dont think this is used 
from multiprocessing import Pool

# Display options.
# None means "no column-width limit". The old -1 sentinel was deprecated in pandas 1.0 and is
# no longer accepted by current pandas releases, so this cell failed on a fresh environment.
pd.set_option('display.max_colwidth', None) # Values in columns won't be shortened
pd.set_option('chained_assignment',None) # Silences the chained-assignment (SettingWithCopy) warning

# Gene annotations: header=0 discards the header row of the file and names= supplies the
# column labels used throughout the notebook. Anything after a '#' on a line is ignored.
geneColID = ["chromosome","source","type","start","end","score","strand","phase","gene_symbol","gene_ensID","length","entrezid"]
geneAnnotationDF = pd.read_csv('entrez_id/geneAnnotationsDF_Selected_entrezID.csv', sep=',', comment='#', low_memory=False, header=0, names=geneColID)
# Chromosome table, read the same way (one row per chromosome - TODO confirm against the file).
chromosomeColID = ['chromosome','source','type','start','end','score','strand','phase']
chromosomesDF = pd.read_csv('chromosomesDF.csv', sep=',', comment='#', low_memory=False, header=0, names=chromosomeColID)

In [2]:
# Renumber both tables from 0, then discard the columns that are dropped here
# (score / strand / phase for chromosomes, score / phase for genes) together with
# the helper 'index' column that reset_index() adds.
chromosomesDF = chromosomesDF.reset_index().drop(columns=['score', 'strand', 'phase', 'index'])
geneAnnotationDF = geneAnnotationDF.reset_index().drop(columns=['score', 'phase', 'index'])

In [3]:
# geneDFL holds one gene DataFrame per chromosome, in the order 1-22, MT, X, Y.
# Each frame is re-indexed from 0 so that row labels equal row positions.
u = [str(n) for n in range(1, 23)] + ['MT', 'X', 'Y']
geneDFL = []
for c in u:
    onChromosome = geneAnnotationDF['chromosome'] == c
    geneDF = geneAnnotationDF.loc[onChromosome].reset_index(drop=True)
    geneDFL.append(geneDF)

In [4]:
# Sampling weights for the 25 chromosome rows; passed as p= to random.choice in nRandSites,
# where entry i pairs with dl[i] / chrD[dl[i]]. The first entry (chromosome 1) is the largest.
# NOTE(review): each weight matches that row's chrD span divided by the total span, i.e. it is
# proportional to chromosome length rather than to the number of genes.
# NOTE(review): the rows appear to follow string-sorted chromosome names
# (1, 10, 11, ..., 19, 2, 20, 21, 22, 3, ..., 9, MT, X, Y) rather than 1-22, MT, X, Y:
# entry 12 (0.0785) pairs with the 242,193,529 bp span - confirm against chromosomesDF.
probabilityL = [0.08069467567425805,
                0.04336798981767917,
                0.043785860443564265,
                0.04319875642790819,
                0.03706910744670193,
                0.0346962654651935,
                0.0330585805012371,
                0.029281524019991866,
                0.02698637835887469,
                0.02605153188596882,
                0.018999829262963137,
                0.07850260746862361,
                0.02088839931657888,
                0.015140187582448091,
                0.016471877931803908,
                0.06427388252370811,
                0.061654572719752214,
                0.058842309890180126,
                0.055363637410522915,
                0.05164908584376297,
                0.04704403712800866,
                0.044858118997812904,
                5.370538628831923e-06,
                0.050577805194942725,
                0.01753760814888535]

In [5]:
# Returns list of randSites ordered by chromosome number from 1 - MT,X,Y
# Weights make rows with a longer span more likely to receive a random site (probabilityL is proportional to the spans in chrD). Chromosome 1 has the highest probability.
# def nRandSites(n):
#     chrIDS = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','MT','X','Y']
#     randChr = chromosomesDF.sample(n, replace=True, weights=probabilityL) # Replace allows same row to be sampled again 
#     randChr['BindingSite'] = (np.random.uniform(randChr.start, randChr.end).round()).astype(int)
#     randChrL = [] 
#     for i in chrIDS: 
#         DF = randChr.loc[randChr['chromosome'] == i]
#         sortDF = DF.iloc[(DF['BindingSite']).argsort()]         
#         randChrL.append(sortDF.BindingSite.values)
#     return randChrL  
# Row keys of the chromosome table: key k is the k-th row of chrD / probabilityL.
dl = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25'] # 23, 24, 25 = MT, X, Y
# [first, last] position that may be drawn for each row key.
chrD = {'1': [1, 248956422],'2': [1, 133797422],'3': [1, 135086622],'4': [1, 133275309],'5': [1, 114364328],'6': [1, 107043718],'7': [1, 101991189],'8': [1, 90338345],'9': [1, 83257441],'10': [1, 80373285],'11': [1, 58617616],'12': [1, 242193529],'13': [1, 64444167],'14': [1, 46709983],'15': [1, 50818468],'16': [1, 198295559],'17': [1, 190214555],'18': [1, 181538259],'19': [1, 170805979],'20': [1, 159345973],'21': [1, 145138636],'22': [1, 138394717],'23': [1, 16569],'24': [1, 156040895],'25': [2781480, 56887902]}

# The rows of chrD (and the matching weights in probabilityL) follow the string-sorted
# chromosome names 1, 10, 11, ..., 19, 2, 20, 21, 22, 3, ..., 9, MT, X, Y: the spans are the
# GRCh38 chromosome lengths in that order (key '12' = 242,193,529 bp = chromosome 2).
# The per-chromosome gene tables are ordered 1-22, MT, X, Y, so each table row has to be
# translated to its output slot; using int(key) - 1 paired sites with the wrong chromosome.
_slotChrL = [str(n) for n in range(1, 23)] + ['MT', 'X', 'Y']   # order of the returned array
_rowChrL = sorted(_slotChrL)                                     # order of the dl / chrD rows
_slotOfRow = [_slotChrL.index(name) for name in _rowChrL]

def nRandSites(nSite):
    """Draw ``nSite`` random genomic positions and group them by chromosome.

    A table row is chosen for every site with the weights in ``probabilityL``; the position
    is then drawn uniformly from that row's ``chrD`` span and rounded to the nearest integer.

    Parameters
    ----------
    nSite : int
        Number of sites to draw.

    Returns
    -------
    numpy.ndarray
        1-D object array with 25 entries ordered 1-22, MT, X, Y. Each entry is an
        ascending int64 array of the positions drawn on that chromosome (possibly empty).
    """
    # Re-seed from OS entropy on every call - presumably so that forked pool workers,
    # which inherit the parent's generator state, do not all produce the same draws.
    random.seed()
    rows = random.choice(len(dl), size=nSite, p=probabilityL)
    lows = array([chrD[key][0] for key in dl])
    highs = array([chrD[key][1] for key in dl])
    positions = np.rint(random.uniform(lows[rows], highs[rows])).astype(np.int64)

    # The per-chromosome arrays have different lengths; NumPy >= 1.24 refuses to build an
    # array from ragged sequences implicitly, so the object container is created explicitly.
    sitesbyC = np.empty(len(dl), dtype=object)
    for row, slot in enumerate(_slotOfRow):
        sitesbyC[slot] = np.sort(positions[rows == row])
    return sitesbyC

def nRandSitesSim(nSite, nSim, processes=8):
    """Run ``nSim`` independent calls of ``nRandSites(nSite)`` in a process pool.

    Parameters
    ----------
    nSite : int
        Number of random sites per simulation.
    nSim : int
        Number of simulations.
    processes : int, optional
        Number of worker processes (default 8, the value that used to be hard-coded).

    Returns
    -------
    list
        One ``nRandSites`` result per simulation, in submission order.
    """
    chunks = [nSite] * nSim
    # map() blocks until every result is collected; leaving the with-block then shuts the
    # workers down. Previously the pool was never closed, so each call leaked its workers.
    with Pool(processes=processes) as pool:
        return pool.map(nRandSites, chunks)

In [6]:
# Adds the gene-body midpoint and lower/upper window bounds to every frame in geneDFL. Windows are placed around the gene TSS.
def addWindowTSS(window):
    """Compute a window around each gene's ``start`` for every frame in ``geneDFL``.

    Three integer columns are written in place on each frame (overwriting earlier values,
    including those written by a previous window computation on the same frames):

    * ``midBody`` - ``start + length / 2``.
    * ``lowB``    - ``start - window``, but not below the midpoint between this gene's start
      and the previous gene's start; for the first gene the floor is 1.
    * ``upperB``  - ``start + window``, but not above the midpoint between this gene's start
      and the next gene's start; for the last gene there is no cap.

    ``start`` is used as the TSS for every gene regardless of strand. Rows are taken in
    frame order, which is assumed to be ascending position - TODO confirm.

    A frame with a single gene gets the first-gene floor and no upper cap; an empty frame
    yields empty bounds (both cases used to raise).

    Parameters
    ----------
    window : int or float
        Maximum distance on either side of ``start``.

    Returns
    -------
    list
        One ``[lowB values, upperB values]`` pair per frame, in ``geneDFL`` order.
    """
    outList = []
    for dfG in geneDFL: # ordered by chromosome 1- MT,X,Y
        start = dfG['start'].to_numpy(dtype=float)
        midBody = dfG['length'].to_numpy(dtype=float) / 2 + start
        lowB = start - window
        upperB = start + window
        if len(start) > 0:
            lowB[0] = max(lowB[0], 1) # first gene: nothing below it, clamp to position 1
        if len(start) > 1:
            halfGap = (start[1:] - start[:-1]) / 2 # half the distance between neighbouring starts
            lowB[1:] = np.maximum(lowB[1:], start[1:] - halfGap)
            upperB[:-1] = np.minimum(upperB[:-1], start[:-1] + halfGap)
        # Values are truncated to integers when stored, as before
        for column, values in (('midBody', midBody), ('lowB', lowB), ('upperB', upperB)):
            dfG[column] = pd.Series(values, index=dfG.index).astype(int)
        outList.append([dfG['lowB'].values, dfG['upperB'].values])
    return outList

In [7]:
# Adds the gene-body midpoint and lower/upper window bounds to every frame in geneDFL. Windows are placed around the gene BODY.
def addWindowBODY(window):
    """Compute a window around each gene's body (``start``..``end``) for every frame in ``geneDFL``.

    Three integer columns are written in place on each frame (overwriting earlier values,
    including those written by a previous window computation on the same frames):

    * ``midBody`` - ``start + length / 2``.
    * ``lowB``    - ``start - window``, but not below the midpoint between the previous
      gene's end and this gene's start; for the first gene the floor is 1.
    * ``upperB``  - ``end + window``, but not above the midpoint between this gene's end
      and the next gene's start; for the last gene there is no cap.

    Rows are taken in frame order, which is assumed to be ascending position - TODO confirm.

    A frame with a single gene gets the first-gene floor and no upper cap; an empty frame
    yields empty bounds (both cases used to raise).

    Parameters
    ----------
    window : int or float
        Maximum distance below ``start`` and above ``end``.

    Returns
    -------
    list
        One ``[lowB values, upperB values]`` pair per frame, in ``geneDFL`` order.
    """
    outList = []
    for dfG in geneDFL: # ordered by chromosome 1- MT,X,Y
        start = dfG['start'].to_numpy(dtype=float)
        end = dfG['end'].to_numpy(dtype=float)
        midBody = dfG['length'].to_numpy(dtype=float) / 2 + start
        lowB = start - window
        upperB = end + window
        if len(start) > 0:
            lowB[0] = max(lowB[0], 1) # first gene: nothing below it, clamp to position 1
        if len(start) > 1:
            halfGap = (start[1:] - end[:-1]) / 2 # half the gap between a gene's end and the next start
            lowB[1:] = np.maximum(lowB[1:], start[1:] - halfGap)
            upperB[:-1] = np.minimum(upperB[:-1], end[:-1] + halfGap)
        # Values are truncated to integers when stored, as before
        for column, values in (('midBody', midBody), ('lowB', lowB), ('upperB', upperB)):
            dfG[column] = pd.Series(values, index=dfG.index).astype(int)
        outList.append([dfG['lowB'].values, dfG['upperB'].values])
    return outList

In [8]:
# One array of entrez ids per chromosome frame, in the same order as geneDFL
entrezIDL = [geneDF['entrezid'].values for geneDF in geneDFL]

In [9]:
# Maps each binding site to the gene window that contains it; a gene is reported at most once (k = 1).
def geneReadSites(bsL, geneL): # geneL = windows lists
    """Collect the entrez ids of the genes whose window contains a binding site.

    Parameters
    ----------
    bsL : sequence
        ``bsL[c]`` holds the binding-site positions for chromosome slot ``c``.
    geneL : sequence
        ``geneL[c][0]`` / ``geneL[c][1]`` hold the lower / upper window bounds for slot ``c``.

    Returns
    -------
    list
        Ids taken from the module-level ``entrezIDL`` for every matched window, in the
        order the matches were found.

    For each site the windows are scanned in order starting just after the last matched
    window. The scan stops as soon as a window starts above the site; a window matches when
    ``lower <= site < upper``. A matched window and all windows before it are never
    considered again for this chromosome.
    """
    matchedIds = []

    for slot, bounds in enumerate(geneL):
        lows, highs = bounds[0], bounds[1]
        ids = entrezIDL[slot]
        nWindows = len(lows)
        firstOpen = 0 # index of the first window that is still available

        for site in bsL[slot]:
            for j in range(firstOpen, nWindows):
                if site < lows[j]:
                    break # this window starts above the site: give up on the site
                if lows[j] <= site < highs[j]:
                    matchedIds.append(ids[j])
                    firstOpen = j + 1 # retire the matched window and everything before it
                    break
    return matchedIds

In [10]:
# k = 1
def nSimulations(nSim, nSites, window, method='TSS'):
    """Run ``nSim`` random-site simulations and map each one to genes.

    Parameters
    ----------
    nSim : int
        Number of simulations.
    nSites : int
        Number of random sites per simulation.
    window : number
        Window size handed to the window builder.
    method : str, optional
        ``'TSS'`` (default) builds windows with ``addWindowTSS``, ``'BODY'`` with
        ``addWindowBODY``.

    Returns
    -------
    list or str
        One ``geneReadSites`` result per simulation. For any other ``method`` the message
        string ``'Only TSS or BODY method allowed!'`` is returned instead.
    """
    if method not in ('TSS', 'BODY'):
        return 'Only TSS or BODY method allowed!'

    buildWindows = addWindowTSS if method == 'TSS' else addWindowBODY
    windowsByChromosome = buildWindows(window)
    simulatedSites = nRandSitesSim(nSites,nSim)

    return [geneReadSites(simulatedSites[k], windowsByChromosome) for k in range(nSim)]