In [56]:
import pandas as pd 
import matplotlib as plto
import re as re 
import random as rand
import numpy as np
import time 

# Show full cell contents (the `attributes` strings are long) instead of truncating them.
# `None` is the supported "no limit" value; the old `-1` sentinel is deprecated since
# pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('mode.chained_assignment', 'warn')

col_names = ['chromosome', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'] # For df object
# df = data frame holding every record of the Homo sapiens annotation file
# The chromosome column is pinned to str because later cells filter it with `== str(chrNum)`.
df = pd.read_csv('Homo_sapiens.GRCh38.85.gff3.gz', compression='gzip',
                         sep='\t', comment='#', low_memory=False,
                         header=None, names=col_names,
                         dtype={'chromosome': str})

# <font color=#4286f4>Setting up my DataFrame as variable: genes/chromosome</font>

In [57]:
# Independent copies of the 'gene' rows and the 'chromosome' rows, so that
# columns added to them later do not touch `df`.
geneAnnotations = df.loc[df['type'] == 'gene'].copy()
chromosome = df.loc[df['type'] == 'chromosome'].copy()

# <font color=#4286f4>Adding Columns to geneAnnotations: gene_name, gene_id, length</font>

In [58]:
# ONLY NEEDS TO BE DONE ONCE!!! (this cell and the gene_id cell below)
# Matches a `Name=<value>` attribute that starts the string or follows a ';'.
# The value runs up to the next ';' or to the end of the string.
RE_GENE_NAME = re.compile(r'(?:^|;)Name=(?P<gene_name>[^;]+)')
#|Below| Searching for data in attributes col and then creating new col for that data
def extract_gene_name(attributes_str):
    """Return the value of the ``Name=`` attribute in a ';'-separated attributes string.

    Parameters
    ----------
    attributes_str : str
        One value of the ``attributes`` column, e.g. ``'ID=...;Name=DDX11L1;...'``.

    Returns
    -------
    str or None
        The gene name, or ``None`` when the input is not a string (e.g. NaN)
        or carries no ``Name=`` attribute.
    """
    if not isinstance(attributes_str, str):
        return None
    res = RE_GENE_NAME.search(attributes_str)
    # search() returns None on no match; calling .group() on it would raise AttributeError
    return res.group('gene_name') if res else None
# New column: the gene name parsed out of each row's attributes string
geneAnnotations['gene_name'] = geneAnnotations['attributes'].apply(extract_gene_name)

In [59]:
# Matches a `gene_id=ENSG...` attribute that starts the string or follows a ';'.
# The value runs up to the next ';' or to the end of the string.
RE_GENE_ID = re.compile(r'(?:^|;)gene_id=(?P<gene_id>ENSG[^;]+)')
def extract_gene_id(attributes_str): # New COL again
    """Return the ``gene_id=ENSG...`` value from a ';'-separated attributes string.

    Parameters
    ----------
    attributes_str : str
        One value of the ``attributes`` column.

    Returns
    -------
    str or None
        The gene id (starting with ``ENSG``), or ``None`` when the input is not
        a string (e.g. NaN) or carries no matching ``gene_id=`` attribute.
    """
    if not isinstance(attributes_str, str):
        return None
    res = RE_GENE_ID.search(attributes_str)
    # search() returns None on no match; calling .group() on it would raise AttributeError
    return res.group('gene_id') if res else None
# New column: the gene id parsed out of each row's attributes string
geneAnnotations['gene_id'] = geneAnnotations['attributes'].apply(extract_gene_id)

In [60]:
# New column: number of positions from start to end, counting both endpoints (hence the + 1)
geneAnnotations['length'] = geneAnnotations['end'] - geneAnnotations['start'] + 1

# <font color=#4286f4>Creating DataFrame for n Chromosomes with n sites </font>

In [61]:
#I need random chromosome combined with random position (Somewhere between start and end)
def nBindingsites(n, seed=None):
    """Draw ``n`` random binding sites, each on a randomly sampled chromosome.

    Parameters
    ----------
    n : int
        Number of sites (rows) to draw; chromosomes are sampled with replacement.
    seed : int, optional
        Seed for a private ``np.random.RandomState`` so the draw can be repeated.
        When omitted, NumPy's global random state is used.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of the global ``chromosome`` frame plus an integer
        ``BindingSite`` column holding a position in ``[1, end]``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    # .copy() so that adding a column below never writes into (or warns about)
    # data shared with the global `chromosome` frame.
    randChr = chromosome.sample(n, replace=True,
                                random_state=None if seed is None else rng).copy()
    # Draw integers directly (upper bound is exclusive, hence end + 1). Rounding a
    # uniform float would give positions 1 and `end` only half the probability
    # of every other position.
    randChr['BindingSite'] = rng.randint(1, randChr['end'].values + 1)
    return randChr

In [62]:
# Distance from Body
def distanceBody(start, end, site):
    """Distance from ``site`` to the interval ``[start, end]``.

    Returns 0 when the site lies inside the interval (boundaries included),
    otherwise the distance to the nearer boundary.
    """
    if start <= site <= end:
        # A site inside the body is at distance 0; without this check it would
        # be reported as far away from a long gene it actually sits in.
        return 0
    return min(abs(start - site), abs(end - site))

# Distance from TSS
def distanceTSS(start, site):
    """Absolute difference between the position ``start`` and ``site``."""
    offset = start - site
    return abs(offset)

#Returns True or False to whether the binding site is within the window size to Body | Start Site below
def isNearBody(start, end, site, window):
    """Tell whether ``site`` is within ``window`` of the interval ``[start, end]``.

    Returns True when the site lies inside the interval (boundaries included)
    or within ``window`` positions of either boundary; False otherwise.
    """
    if start <= site <= end:
        # Inside the body counts as near, even when both boundaries are
        # further away than the window (long genes).
        return True
    nearest_edge = min(abs(start - site), abs(end - site))
    return bool(nearest_edge <= window)
    
def isNearTSS(start, site, window):
    """Return True when ``site`` is at most ``window`` positions away from ``start``."""
    gap = abs(start - site)
    return bool(gap <= window)

# For: k = Infinity 

In [63]:
#Given specific chromosome, rdm binding site, window: Will calculate number of nearby gene(bodies) from its site.
def genesInWindowBody(chrNum, site, window):
    """Count and list the genes on one chromosome whose body is near ``site``.

    Parameters
    ----------
    chrNum : int or str
        Chromosome label; compared as ``str(chrNum)`` against the ``chromosome``
        column of the global ``geneAnnotations`` frame.
    site : int
        Binding-site position.
    window : int
        Maximum distance passed on to ``isNearBody``.

    Returns
    -------
    tuple
        ``(count, gene_names)`` where ``gene_names`` lists the ``gene_name`` of
        every gene for which ``isNearBody`` is true and ``count`` is its length.
    """
    geneDF = geneAnnotations.loc[geneAnnotations['chromosome'] == str(chrNum)]

    # Walk the three needed columns as plain Python values. DataFrame.iterrows()
    # would build a full Series for every gene on every call.
    geneL = [
        name
        for start, end, name in zip(geneDF['start'].tolist(),
                                    geneDF['end'].tolist(),
                                    geneDF['gene_name'].tolist())
        if isNearBody(start, end, site, window)
    ]
    return (len(geneL), geneL)

#Given specific chromosome, rdm binding site, window: Will calculate number of nearby gene(TSS) from the site.
def genesInWindowTSS(chrNum, site, window):
    """Count and list the genes on one chromosome whose TSS is near ``site``.

    The TSS is taken as ``start`` for genes on the '+' strand and as ``end`` for
    genes on the '-' strand (``start`` is always the lower coordinate, so a
    reverse-strand gene begins at ``end``).

    Parameters
    ----------
    chrNum : int or str
        Chromosome label; compared as ``str(chrNum)`` against the ``chromosome``
        column of the global ``geneAnnotations`` frame.
    site : int
        Binding-site position.
    window : int
        Maximum distance passed on to ``isNearTSS``.

    Returns
    -------
    tuple
        ``(count, gene_names)`` where ``gene_names`` lists the ``gene_name`` of
        every gene for which ``isNearTSS`` is true and ``count`` is its length.
    """
    geneDF = geneAnnotations.loc[geneAnnotations['chromosome'] == str(chrNum)]

    geneL = []
    # Walk the needed columns as plain Python values. DataFrame.iterrows()
    # would build a full Series for every gene on every call.
    for start, end, strand, name in zip(geneDF['start'].tolist(),
                                        geneDF['end'].tolist(),
                                        geneDF['strand'].tolist(),
                                        geneDF['gene_name'].tolist()):
        tss = end if strand == '-' else start
        if isNearTSS(tss, site, window):
            geneL.append(name)
    return (len(geneL), geneL)

## <font color=#4286f4>Given a window: Find all the genes inside window for the dataframe of chromosomes</font>

In [64]:
# Takes DF and Window size: Returns Total Gene count and List of Genes Near Body of GENE (K=INFINITY)
def geneCountNsitesBody(df, window):
    """Run ``genesInWindowBody`` for every row of ``df`` and combine the results.

    ``df`` must provide the columns ``chromosome`` and ``BindingSite``.
    Returns ``(total, per_site)``: the sum of the per-row counts and a list
    holding, for each row in order, that row's list of gene names.
    """
    total = 0
    per_site = []

    for _, site_row in df.iterrows():
        hits, names = genesInWindowBody(site_row['chromosome'], site_row['BindingSite'], window)
        total += hits
        per_site.append(names)
    return (total, per_site)

# Takes DF and Window size: Returns Total Gene count and List of Genes Near TSS of GENE (K=INFINITY)
def geneCountNsitesTSS(df, window):
    """Run ``genesInWindowTSS`` for every row of ``df`` and combine the results.

    ``df`` must provide the columns ``chromosome`` and ``BindingSite``.
    Returns ``(total, per_site)``: the sum of the per-row counts and a list
    holding, for each row in order, that row's list of gene names.
    """
    total = 0
    per_site = []

    for _, site_row in df.iterrows():
        hits, names = genesInWindowTSS(site_row['chromosome'], site_row['BindingSite'], window)
        total += hits
        per_site.append(names)
    return (total, per_site)

# For: k = 1

In [65]:
#Given specific chromosome, rdm binding site, window: Will calculate closest gene(body) from its site. and distance
def closestGeneBody(chrNum, site, window):
    """Find the gene on one chromosome whose body is closest to ``site``.

    Only genes for which ``isNearBody(start, end, site, window)`` is true are
    considered; among those, the first gene with the smallest ``distanceBody``
    wins.

    Returns
    -------
    list
        ``[gene_name, distance]``, or ``['No Result', 9**90]`` when no gene on
        the chromosome is within the window.
    """
    # 9**90 is a placeholder larger than any real distance; it is kept as the
    # "nothing found" marker so existing callers see the same value.
    out = ['No Result', 9**90]
    geneDF = geneAnnotations.loc[geneAnnotations['chromosome'] == str(chrNum)]

    # Walk the needed columns as plain Python values. DataFrame.iterrows()
    # would build a full Series for every gene on every call.
    for start, end, name in zip(geneDF['start'].tolist(),
                                geneDF['end'].tolist(),
                                geneDF['gene_name'].tolist()):
        if not isNearBody(start, end, site, window):
            continue
        dist = distanceBody(start, end, site)  # computed once, reused below
        if dist < out[1]:
            out[0] = name
            out[1] = dist
    # Returns closest gene and its distance from body.
    return(out)

#Given specific chromosome, rdm binding site, window: Will calculate closest gene(TSS) from its site. and Distance
def closestGeneTSS(chrNum, site, window):
    """Find the gene on one chromosome whose TSS is closest to ``site``.

    The TSS is taken as ``start`` for genes on the '+' strand and as ``end`` for
    genes on the '-' strand (``start`` is always the lower coordinate, so a
    reverse-strand gene begins at ``end``). Only genes for which
    ``isNearTSS(tss, site, window)`` is true are considered; among those, the
    first gene with the smallest ``distanceTSS`` wins.

    Returns
    -------
    list
        ``[gene_name, distance]``, or ``['No Result', 9**90]`` when no gene on
        the chromosome is within the window.
    """
    # 9**90 is a placeholder larger than any real distance; it is kept as the
    # "nothing found" marker so existing callers see the same value.
    out = ['No Result', 9**90]
    geneDF = geneAnnotations.loc[geneAnnotations['chromosome'] == str(chrNum)]

    # Walk the needed columns as plain Python values. DataFrame.iterrows()
    # would build a full Series for every gene on every call.
    for start, end, strand, name in zip(geneDF['start'].tolist(),
                                        geneDF['end'].tolist(),
                                        geneDF['strand'].tolist(),
                                        geneDF['gene_name'].tolist()):
        tss = end if strand == '-' else start
        if not isNearTSS(tss, site, window):
            continue
        dist = distanceTSS(tss, site)  # computed once, reused below
        if dist < out[1]:
            out[0] = name
            out[1] = dist
    # Returns closest gene and its distance from TSS.
    return(out)

In [66]:
# Takes DF and Window size: Returns number of sites and List of CLOSEST Gene Near its BODY for every site(K = 1)
def closestGenesNsitesBody(df, window):
    """Look up the closest gene (by body) for every row of ``df``.

    ``df`` must provide the columns ``chromosome`` and ``BindingSite``.
    Returns ``(count, names)``: ``names`` holds, for each row in order, the
    first element of ``closestGeneBody``'s result (which is ``'No Result'``
    when nothing was found); ``count`` is the number of rows processed.
    """
    nearest = [
        closestGeneBody(site_row['chromosome'], site_row['BindingSite'], window)[0]
        for _, site_row in df.iterrows()
    ]
    return (len(nearest), nearest)
    
# Takes DF and Window size: Returns number of sites and List of CLOSEST Gene Near its TSS for every site (K = 1)
def closestGenesNsitesTSS(df, window):
    """Look up the closest gene (by TSS) for every row of ``df``.

    ``df`` must provide the columns ``chromosome`` and ``BindingSite``.
    Returns ``(count, names)``: ``names`` holds, for each row in order, the
    first element of ``closestGeneTSS``'s result (which is ``'No Result'``
    when nothing was found); ``count`` is the number of rows processed.
    """
    nearest = [
        closestGeneTSS(site_row['chromosome'], site_row['BindingSite'], window)[0]
        for _, site_row in df.iterrows()
    ]
    return (len(nearest), nearest)

In [67]:
# finds unique in list | Might use later 
def unique(list1):
    """Return the distinct items of ``list1`` in order of first appearance.

    Hashable items are tracked in a set, so a membership check does not rescan
    the whole result list. Unhashable items (e.g. nested lists) fall back to a
    linear scan of the result.
    """
    seen = set()
    unique_list = []
    for x in list1:
        try:
            if x in seen:
                continue
            seen.add(x)
        except TypeError:
            # x is unhashable and cannot go into the set; compare by equality instead
            if x in unique_list:
                continue
        unique_list.append(x)
    return unique_list

In [68]:
# 100 random binding sites to try the functions above on
testingDF = nBindingsites(n=100)

In [69]:
# Display-friendly copy of the gene table without the raw attributes strings
geneAnnotationsDF = geneAnnotations.drop('attributes', axis='columns')

In [70]:
# Display-friendly copy of the chromosome table without the raw attributes strings
chromosomesDF = chromosome.drop('attributes', axis='columns')

In [71]:
# Peek at one random chromosome row
chromosomesDF.sample(n=1)

Unnamed: 0,chromosome,source,type,start,end,score,strand,phase
674767,14,GRCh38,chromosome,1,107043718,.,.,.


In [72]:
# Peek at five random gene rows
geneAnnotationsDF.sample(n=5)

Unnamed: 0,chromosome,source,type,start,end,score,strand,phase,gene_name,gene_id,length
2525959,X,ensembl_havana,gene,24693919,24996986,.,+,.,POLA1,ENSG00000101868,303068
199288,1,havana,gene,203400266,203400581,.,-,.,LARP7P1,ENSG00000271588,316
2204949,7,ensembl_havana,gene,6161776,6272644,.,-,.,CYTH3,ENSG00000008256,110869
521232,12,havana,gene,31443792,31444208,.,-,.,RP11-771K4.3,ENSG00000275769,417
655808,13,ensembl_havana,gene,52132639,52159861,.,-,.,NEK3,ENSG00000136098,27223


In [73]:
# Genes on chromosome 14 counted by genesInWindowTSS for this site and window
x = genesInWindowTSS(chrNum=14, site=53521859, window=100009990)