In [1]:
import pandas as pd 
import re as re 
import numpy as np
import time 
import mygene

# Show full cell contents instead of truncating long strings (e.g. the GFF3 attributes column).
# pandas >= 1.0 spells "no limit" as None, and pandas 2.x rejects the old -1 sentinel outright;
# pandas < 1.0 only accepts -1, so fall back to it when None is refused.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
# Keep chained-assignment problems visible as warnings rather than silencing or raising them.
pd.set_option('mode.chained_assignment', 'warn')

# Labels for the nine tab-separated columns; the file has no header row of its own.
col_names = [
    'chromosome', 'source', 'type',
    'start', 'end', 'score',
    'strand', 'phase', 'attributes',
]

# df: every record of the gzipped Homo sapiens annotation file ('#' comment text is skipped).
df = pd.read_csv(
    'Homo_sapiens.GRCh38.85.gff3.gz',
    sep='\t',
    comment='#',
    header=None,
    names=col_names,
    compression='gzip',
    low_memory=False,
)

# Old Functions

In [2]:
# Made this function in a better way below 
# def closestGeneBody(chrNum, site, window):
#     out = ['No Result', 9**90] # This is just a random num that will be replace by func below. 
#     geneDF = geneAnnotations.loc[geneAnnotations['chromosome'] == str(chrNum)]
    
#     for index, row in geneDF.iterrows():
#         if inRange(row['start'], site, window):
#             if isNearBody(row['start'], row['end'], site, window):
#                 if distanceBody(row['start'], row['end'], site) < out[1]:
#                     out[0] = row['gene_name']
#                     out[1] = distanceBody(row['start'], row['end'], site)
#         else: 
#             return(out)
#     # Returns closest gene and its distance from body. 
#     return(out)

# def closestGenesNsitesBody(df, window):
#     count = 0 
#     geneL = []
# #     for index, row in df.iterrows():
# #         x = closestGeneBody(row['chromosome'], row['BindingSite'], window)
# #         geneL.append(x[0])
# #         count = count + 1
#     for row in zip(df['chromosome'], df['BindingSite']):
#         x = closestGeneBody(row[0], row[1], window)
#         geneL.append(x[0])
#         count = count + 1
#     return(count, geneL)

# finds unique in list | Might use later 
# def unique(list1): 
#     unique_list = [] 
#     for x in list1: 
#         if x not in unique_list: 
#             unique_list.append(x) 
#     return unique_list

# A. Estimating number of sites to be sampled randomly

# <font color=#4286f4>Setting up my DataFrame as variable: genes/chromosome</font>

In [3]:
# Split the annotation table by feature type. Each result is an independent copy,
# so columns added to it later never write back into df.
geneAnnotations = df.loc[df['type'] == 'gene'].copy()
chromosome = df.loc[df['type'] == 'chromosome'].copy()

# <font color=#4286f4>Adding Columns to geneAnnotations: gene_symbol, gene_ensID, length</font>

In [4]:
# ONLY NEEDS TO BE DONE ONCE!!! (BOTH UP AND DOWN)
# The value runs up to the next ';' or, when Name= is the last attribute, to the end of the string.
RE_GENE_NAME = re.compile(r'Name=(?P<gene_name>[^;]+)')
# |Below| Pull one field out of the attributes column so it can become its own column.
def extract_gene_name(attributes_str):
    """Return the value of the ``Name=`` field in an attributes string.

    Parameters
    ----------
    attributes_str : str
        Semicolon-separated ``key=value`` text from the ``attributes`` column.

    Returns
    -------
    str or None
        The text following ``Name=`` up to the next ``;`` (or end of string),
        or None when the string has no ``Name=`` field.
    """
    res = RE_GENE_NAME.search(attributes_str)
    if res is None:
        # Without this guard res.group(...) raises AttributeError and a single
        # record lacking a Name aborts the whole .apply() over the column.
        return None
    return res.group('gene_name')
# New column: the gene name parsed out of each row's attributes string.
geneAnnotations['gene_symbol'] = geneAnnotations['attributes'].apply(extract_gene_name)

In [5]:
# The ID runs up to the next ';' or, when gene_id= is the last attribute, to the end of the string.
RE_GENE_ID = re.compile(r'gene_id=(?P<gene_id>ENSG[^;]+)')
def extract_gene_id(attributes_str):
    """Return the ``ENSG...`` identifier stored in the ``gene_id=`` field.

    Parameters
    ----------
    attributes_str : str
        Semicolon-separated ``key=value`` text from the ``attributes`` column.

    Returns
    -------
    str or None
        The identifier following ``gene_id=`` (it must begin with ``ENSG``),
        or None when no such field is present.
    """
    res = RE_GENE_ID.search(attributes_str)
    if res is None:
        # Without this guard res.group(...) raises AttributeError and a single
        # record lacking a gene_id aborts the whole .apply() over the column.
        return None
    return res.group('gene_id')
# New column: the ENSG identifier parsed out of each row's attributes string.
geneAnnotations['gene_ensID'] = geneAnnotations['attributes'].apply(extract_gene_id)

In [6]:
# Gene length in bases; the + 1 counts both the start and the end position.
geneAnnotations['length'] = geneAnnotations['end'] - geneAnnotations['start'] + 1

In [7]:
# Working copies without the raw attributes text.
geneAnnotationsDF = geneAnnotations.drop(columns=['attributes'])
chromosomesDF = chromosome.drop(columns=['attributes'])

# <font color=#4286f4>Creating DataFrame for n Chromosomes with n sites </font>

In [8]:
# Pair a randomly chosen chromosome with a random position between 1 and its end.
def nBindingsites(n, chromosomes=None):
    """Draw ``n`` random binding sites.

    Parameters
    ----------
    n : int
        Number of sites to draw.
    chromosomes : pandas.DataFrame, optional
        Frame with one row per chromosome and an integer ``end`` column.
        Defaults to the notebook-level ``chromosomesDF``.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows sampled with replacement from ``chromosomes``, plus an integer
        ``BindingSite`` column holding a position in ``1..end`` for each row.

    Notes
    -----
    Every chromosome row is equally likely to be picked, whatever its length.
    NOTE(review): if sites are meant to be uniform over the whole genome, the
    sample would need weighting by chromosome length -- confirm intent.
    """
    if chromosomes is None:
        chromosomes = chromosomesDF
    randChr = chromosomes.sample(n, replace=True)
    # randint's upper bound is exclusive, so end + 1 makes every position 1..end
    # equally likely. Rounding a uniform float gave 1 and end only half the
    # probability of the positions in between.
    sites = np.random.randint(1, randChr['end'].values + 1)
    # assign() builds a new frame instead of writing into the sample() result.
    return randChr.assign(BindingSite=sites)

In [9]:
# NOTE: these scalar helpers may not be needed at all; a vectorized version could be faster.

# Distance from Body
def distanceBody(start, end, site):
    """Return how far ``site`` lies from the gene body ``[start, end]``.

    Parameters
    ----------
    start, end : int
        First and last position of the gene (``start <= end`` is assumed).
    site : int
        Position being tested.

    Returns
    -------
    int
        0 when the site falls inside the body (edges included), otherwise the
        distance to the nearer edge.
    """
    if start <= site <= end:
        # A site inside the gene touches the body, so its distance is zero,
        # not the distance to whichever edge happens to be closer.
        return 0
    return min(abs(start - site), abs(end - site))

# Distance from TSS
def distanceTSS(start, site):
    """Return the absolute distance between ``site`` and ``start``.

    ``start`` is used as the TSS position exactly as given.
    NOTE(review): strand is not taken into account here -- confirm callers
    pass the right coordinate for minus-strand genes.
    """
    offset = site - start
    return abs(offset)

# Returns True or False: is the binding site within `window` of the gene body?
def isNearBody(start, end, site, window):
    """Tell whether ``site`` lies within ``window`` of the gene body ``[start, end]``.

    Parameters
    ----------
    start, end : int
        First and last position of the gene (``start <= end`` is assumed).
    site : int
        Position being tested.
    window : int
        Maximum allowed distance from the body.

    Returns
    -------
    bool
        True when the site is inside the body or no more than ``window`` away
        from either edge.
    """
    # Stretch the body by `window` on both sides and test membership. Checking
    # only the two edges wrongly rejected sites deep inside a gene that is
    # longer than the window.
    return bool((start - window) <= site <= (end + window))
    
def isNearTSS(start, site, window):
    """Return True when ``site`` is at most ``window`` away from ``start``, else False."""
    gap = abs(start - site)
    return bool(gap <= window)
# -------------------------------------------- ## Checker: is it worth iterating any further?
def inRange(start, site, window):
    """Return False once ``start`` lies beyond ``site + window``, True otherwise."""
    upper_limit = site + window
    return not (start > upper_limit)

# For: k = Infinity 

In [10]:
# For one chromosome and one binding site: which genes have their body near the site?
def genesInWindowBody(chrNum, site, window):
    """Collect the genes on ``chrNum`` for which ``isNearBody`` accepts ``site``.

    Parameters
    ----------
    chrNum : str or int
        Chromosome label; compared as ``str(chrNum)`` against the ``chromosome`` column.
    site : int
        Binding-site position.
    window : int
        Window size forwarded to ``isNearBody``.

    Returns
    -------
    tuple
        ``(count, geneL)``: the number of matching genes and their ``gene_symbol``
        values in table order.
    """
    on_chrom = geneAnnotations.loc[geneAnnotations['chromosome'] == str(chrNum)]
    coords = zip(on_chrom['start'], on_chrom['end'], on_chrom['gene_symbol'])
    geneL = [
        symbol
        for gene_start, gene_end, symbol in coords
        if isNearBody(gene_start, gene_end, site, window)
    ]
    return(len(geneL), geneL)

# For one chromosome and one binding site: which genes have their TSS near the site?
def genesInWindowTSS(chrNum, site, window):
    """Collect the genes on ``chrNum`` for which ``isNearTSS`` accepts ``site``.

    Parameters
    ----------
    chrNum : str or int
        Chromosome label; compared as ``str(chrNum)`` against the ``chromosome`` column.
    site : int
        Binding-site position.
    window : int
        Window size forwarded to ``isNearTSS``.

    Returns
    -------
    tuple
        ``(count, geneL)``: the number of matching genes and their ``gene_symbol``
        values in table order.
    """
    on_chrom = geneAnnotations.loc[geneAnnotations['chromosome'] == str(chrNum)]
    geneL = [
        symbol
        for gene_start, symbol in zip(on_chrom['start'], on_chrom['gene_symbol'])
        if isNearTSS(gene_start, site, window)
    ]
    return(len(geneL), geneL)

## <font color=#4286f4>Given a window: Find all the genes inside window for the dataframe of chromosomes</font>

In [11]:
# Takes a frame of sites and a window size; totals the genes near each site's gene BODY (K = INFINITY).
def geneCountNsitesBody(df, window):
    """Run ``genesInWindowBody`` for every row of ``df`` and combine the results.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per site, with ``chromosome`` and ``BindingSite`` columns.
    window : int
        Window size forwarded to ``genesInWindowBody``.

    Returns
    -------
    tuple
        ``(count, geneL)``: the summed gene count over all sites, and a list
        holding one gene list per site.
    """
    per_site = [
        genesInWindowBody(site_row['chromosome'], site_row['BindingSite'], window)
        for _, site_row in df.iterrows()
    ]
    total = sum(hits for hits, _ in per_site)
    geneL = [names for _, names in per_site]
    return(total, geneL)

# Takes a frame of sites and a window size; totals the genes near each site's gene TSS (K = INFINITY).
def geneCountNsitesTSS(df, window):
    """Run ``genesInWindowTSS`` for every row of ``df`` and combine the results.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per site, with ``chromosome`` and ``BindingSite`` columns.
    window : int
        Window size forwarded to ``genesInWindowTSS``.

    Returns
    -------
    tuple
        ``(count, geneL)``: the summed gene count over all sites, and a list
        holding one gene list per site.
    """
    per_site = [
        genesInWindowTSS(site_row['chromosome'], site_row['BindingSite'], window)
        for _, site_row in df.iterrows()
    ]
    total = sum(hits for hits, _ in per_site)
    geneL = [names for _, names in per_site]
    return(total, geneL)

# For: k = 1

In [12]:
# For one chromosome and one binding site: the closest gene (by body) and its distance.
def closestGeneBody(chrNum, site, window):
    """Find the gene on ``chrNum`` whose body edge is closest to ``site``.

    Two candidates are considered: the gene whose ``start`` is nearest to the
    site and the gene whose ``end`` is nearest. Whichever has the smaller
    ``distanceBody`` wins (the end-nearest candidate wins ties), and it is
    reported only if ``isNearBody`` accepts it for the given window.

    Parameters
    ----------
    chrNum : str or int
        Chromosome label; compared as ``str(chrNum)`` against the ``chromosome`` column.
    site : int
        Binding-site position.
    window : int
        Window size forwarded to ``isNearBody``.

    Returns
    -------
    tuple or str
        ``(gene_symbol, distance)`` for the winning gene, or the string
        ``'No Result'`` when the winner is outside the window or the chromosome
        has no genes in the table.
    """
    geneDF = geneAnnotationsDF.loc[geneAnnotationsDF['chromosome'] == str(chrNum)]
    if geneDF.empty:
        # No gene rows matched this chromosome label; indexing the first
        # candidate would raise IndexError, so report "nothing found" instead.
        return('No Result')

    starts = geneDF['start'].values
    ends = geneDF['end'].values

    # One argmin pass per edge finds the nearest candidate; sorting every gene
    # just to take the first row is unnecessary.
    byStart = np.abs(starts - site).argmin()
    byEnd = np.abs(ends - site).argmin()

    startDist = distanceBody(starts[byStart], ends[byStart], site)
    endDist = distanceBody(starts[byEnd], ends[byEnd], site)

    if startDist < endDist:
        best, bestDist = byStart, startDist
    else:
        best, bestDist = byEnd, endDist

    if isNearBody(starts[best], ends[best], site, window):
        return(geneDF['gene_symbol'].iloc[best], bestDist)
    return('No Result')

# For one chromosome and one binding site: the closest gene (by TSS) and its distance.
def closestGeneTSS(chrNum, site, window, genes=None):
    """Find the gene on ``chrNum`` whose ``start`` is closest to ``site``.

    Parameters
    ----------
    chrNum : str or int
        Chromosome label; compared as ``str(chrNum)`` against the ``chromosome`` column.
    site : int
        Binding-site position.
    window : int
        Maximum allowed ``|start - site|`` for a gene to be reported.
    genes : pandas.DataFrame, optional
        Gene table with ``chromosome``, ``start`` and ``gene_symbol`` columns.
        Defaults to the notebook-level ``geneAnnotations``.

    Returns
    -------
    list
        ``[gene_symbol, distance]`` for the closest gene within the window (the
        first such gene in table order on ties), or ``['No Result', 9**90]``
        when no gene qualifies.
    """
    if genes is None:
        genes = geneAnnotations
    out = ['No Result', 9**90]  # placeholder name plus a sentinel distance larger than any real one
    geneDF = genes.loc[genes['chromosome'] == str(chrNum)]
    if geneDF.empty:
        return(out)

    # Compute every start-to-site distance at once instead of walking the
    # chromosome's genes row by row with iterrows().
    tssDist = np.abs(geneDF['start'].values - site)
    nearest = tssDist.argmin()
    if tssDist[nearest] <= window:
        out[0] = geneDF['gene_symbol'].iloc[nearest]
        out[1] = tssDist[nearest]
    # Returns closest gene and its distance from TSS.
    return(out)

In [13]:
# Takes a frame of sites and a window size; lists the CLOSEST gene (by BODY) for each site that has one (K = 1).
def closestGenesNsitesBody(df, window):
    """Run ``closestGeneBody`` for every site in ``df`` and keep the successful hits.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per site, with ``chromosome`` and ``BindingSite`` columns.
    window : int
        Window size forwarded to ``closestGeneBody``.

    Returns
    -------
    tuple
        ``(count, geneL)``: the number of sites for which a gene was found, and
        the names of those genes in site order. Sites for which
        ``closestGeneBody`` returns ``'No Result'`` are left out.
    """
    site_pairs = zip(df['chromosome'].astype(str), df['BindingSite'].values)
    results = (closestGeneBody(chrom, pos, window) for chrom, pos in site_pairs)
    geneL = [found[0] for found in results if found != 'No Result']
    return(len(geneL), geneL)
    
# Takes a frame of sites and a window size; lists the CLOSEST gene (by TSS) for each site that has one (K = 1).
def closestGenesNsitesTSS(df, window):
    """Run ``closestGeneTSS`` for every site in ``df`` and keep the successful hits.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per site, with ``chromosome`` and ``BindingSite`` columns.
    window : int
        Window size forwarded to ``closestGeneTSS``.

    Returns
    -------
    tuple
        ``(count, geneL)``: the number of sites for which a gene was found, and
        the names of those genes in site order.
    """
    geneL = []

    for chrom, pos in zip(df['chromosome'].astype(str), df['BindingSite'].values):
        x = closestGeneTSS(chrom, pos, window)
        # closestGeneTSS puts the placeholder 'No Result' in slot 0 when no gene
        # is within the window. Skip those, as closestGenesNsitesBody does, so
        # the count and list contain real genes only.
        if x[0] != 'No Result':
            geneL.append(x[0])
    return(len(geneL), geneL)

In [14]:
# Inspect the row labels: they are the labels kept from df when the gene rows were filtered out, not 0..n-1.
geneAnnotationsDF.index

Int64Index([     16,      28,      71,      74,      77,     108,     112,
                118,     121,     166,
            ...
            2601762, 2601780, 2601786, 2601793, 2601796, 2601818, 2601821,
            2601826, 2601841, 2601846],
           dtype='int64', length=42485)

In [15]:
# Preview the frame with a fresh 0..n-1 index; the old labels appear as an 'index' column.
# NOTE(review): the result is only displayed, not assigned, so geneAnnotationsDF itself keeps its original index.
geneAnnotationsDF.reset_index()

Unnamed: 0,index,chromosome,source,type,start,end,score,strand,phase,gene_symbol,gene_ensID,length
0,16,1,havana,gene,11869,14409,.,+,.,DDX11L1,ENSG00000223972,2541
1,28,1,havana,gene,14404,29570,.,-,.,WASH7P,ENSG00000227232,15167
2,71,1,havana,gene,52473,53312,.,+,.,OR4G4P,ENSG00000268020,840
3,74,1,havana,gene,62948,63887,.,+,.,OR4G11P,ENSG00000240361,940
4,77,1,ensembl_havana,gene,69091,70008,.,+,.,OR4F5,ENSG00000186092,918
5,108,1,havana,gene,131025,134836,.,+,.,CICP27,ENSG00000233750,3812
6,112,1,havana,gene,135141,135895,.,-,.,RP11-34P13.15,ENSG00000268903,755
7,118,1,havana,gene,137682,137965,.,-,.,RP11-34P13.16,ENSG00000269981,284
8,121,1,havana,gene,139790,140339,.,-,.,RP11-34P13.14,ENSG00000239906,550
9,166,1,ensembl,gene,182393,184158,.,+,.,FO538757.2,ENSG00000279928,1766
