In [75]:
import pandas as pd 
import matplotlib as plto
import re as re 
import random as rand
import numpy as np

pd.set_option('display.max_colwidth', -1)

col_names = ['chromosome', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'] # For df object
#df = data frame object of the Homo Sapians file 
df = pd.read_csv('Homo_sapiens.GRCh38.85.gff3.gz', compression='gzip',
                         sep='\t', comment='#', low_memory=False,
                         header=None, names=col_names)

# <font color=#4286f4>Setting up my DataFrame as variable: genes/chromosome</font>

In [76]:
geneAnnotations = df[df.type.isin(['gene'])]
chromosome = df[df.type.isin(['chromosome'])]

# <font color=#4286f4>Adding Columns to geneAnnotations: gene_name, gene_id, length</font>

In [77]:
# ONLY NEEDS TO BE DONE ONCE!!!
RE_GENE_NAME = re.compile(r'Name=(?P<gene_name>.+?);') 

#|Below| Searching for data in attributes col and then creating new col for that data
def extract_gene_name(attributes_str): 
    res = RE_GENE_NAME.search(attributes_str)
    return res.group('gene_name')

geneAnnotations['gene_name'] = geneAnnotations.attributes.apply(extract_gene_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [78]:
# THIS ALSO ONLY NEEDS TO BE DONE ONCE!!!
RE_GENE_ID = re.compile(r'gene_id=(?P<gene_id>ENSG.+?);')
def extract_gene_id(attributes_str): # New COL again
    res = RE_GENE_ID.search(attributes_str)
    return res.group('gene_id')

geneAnnotations['gene_id'] = geneAnnotations.attributes.apply(extract_gene_id)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [79]:
# THIS ALSO ONLY NEEDS TO BE DONE ONCE!!!
geneAnnotations['length'] = geneAnnotations.end - geneAnnotations.start + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


# <font color=#4286f4>Creating DataFrame for n Chromosomes with n sites </font>

In [80]:
#I need random chromosome combined with random position (Somewhere between start and end)
def nBindingsites(n):    
    randChr = chromosome.sample(n, replace=True)
    randChr['BindingSite'] = (np.random.uniform(1, randChr.end).round()).astype(int)
    return randChr
    #Returns a DataFrame with Random sample number with random BindingSite

In [159]:
# Distance from Body
def distanceBody(start, end, site):
    return min(abs(start - site), abs(end - site))

# Distance from TSS
def distanceTSS(start, site):
    return abs(start - site)

#Returns True or False to wheather the binding site is within the window size to Body | Start Site below
def isNearBody(start, end, site, window):
    if abs(start - site) <= window:
        return True
    elif abs(end - site) <= window:
        return True
    else:
        return False 
    
def isNearTSS(start, site, window):
    if abs(start - site) <= window:
        return True
    else: 
        return False

In [158]:
# geneAnnotations = geneAnnotations.drop(['attributes'], axis=1)

# k = Infinity 

In [83]:
#Given specific chromosome, rdm binding site, window: Will calculate number of nearby gene(bodies) from its site.
def genesInWindowBody(chrNum, site, window):
    count = 0
    geneL = []
    geneDF = geneAnnotations.loc[geneAnnotations['chromosome'] == str(chrNum)]
    
    for index, row in geneDF.iterrows():
        if isNearBody(row['start'], row['end'], site, window):
            count = count + 1
            geneL.append(row['gene_name'])
    return(count, geneL)

#Given specific chromosome, rdm binding site, window: Will calculate number of nearby gene(TSS) from the site.
def genesInWindowTSS(chrNum, site, window):
    count = 0
    geneL = []
    geneDF = geneAnnotations.loc[geneAnnotations['chromosome'] == str(chrNum)]
    
    for index, row in geneDF.iterrows():
        if isNearTSS(row['start'], site, window):
            count = count + 1
            geneL.append(row['gene_name'])    
    return(count, geneL)


## <font color=#4286f4>Given a window: Find all the genes inside window for the dataframe of chromosomes</font>

In [85]:
# Takes DF and Window size: Returnes Total Gene count and List of Genes Near Body of GENE (K=INFINITY)
def geneCountNsitesBody(df, window):
    count = 0 
    geneL = []
    
    for index, row in df.iterrows():
        c = genesInWindowBody(row['chromosome'], row['BindingSite'], window)
        count = count + c[0]
        geneL.append(c[1])
    return(count, geneL)

# Takes DF and Window size: Returnes Total Gene count and List of Genes Near TSS of GENE (K=INFINITY)
def geneCountNsitesTSS(df, window):
    count = 0 
    geneL = []
    
    for index, row in df.iterrows():
        c = genesInWindowTSS(row['chromosome'], row['BindingSite'], window)
        count = count + c[0]
        geneL.append(c[1])
    return(count, geneL)

geneCountNsitesBody(chromoDF, 5045299)

(126,
 [['CLUL1',
   'RP11-806L2.2',
   'RP11-806L2.5',
   'TYMS',
   'ENOSF1',
   'RP11-806L2.6',
   'YES1',
   'RP11-769O8.1',
   'RP11-769O8.2',
   'RP11-769O8.3',
   'BOLA2P1',
   'RP11-672L10.5',
   'RP11-672L10.3',
   'ADCYAP1',
   'RP11-78F17.3',
   'COX6CP3',
   'RP11-288C17.3',
   'RP11-288C17.1',
   'CTD-2533G20.1',
   'METTL4',
   'RP11-715F3.2',
   'RP11-715F3.1',
   'NDC80',
   'KATNBL1P3',
   'CBX3P2',
   'SMCHD1',
   'RP11-703M24.5',
   'EMILIN2',
   'LPIN2',
   'RP11-737O24.5',
   'RP11-737O24.3',
   'RP11-737O24.2',
   'RP11-737O24.1',
   'SNRPCP4',
   'RP11-193E15.4',
   'MYOM1',
   'RP13-270P17.2',
   'MYL12A',
   'RP13-270P17.1',
   'MYL12B',
   'RPL31P59',
   'IGLJCOR18',
   'RPL21P127',
   'TGIF1',
   'BOD1P1',
   'DLGAP1',
   'RP11-710M11.1',
   'DLGAP1-AS1',
   'DLGAP1-AS2',
   'RP11-874J12.4',
   'RP11-874J12.3',
   'DLGAP1-AS3',
   'DLGAP1-AS4',
   'GAPDHP66',
   'DLGAP1-AS5',
   'RP11-138C24.2',
   'RP11-183C12.1',
   'PPIAP14',
   'BOD1P2',
   'AKAIN1',
   '

# k = 1

In [133]:
#Given specific chromosome, rdm binding site, window: Will calculate closest gene(body) from its site. and distance
def closestGeneBody(chrNum, site, window):
    out = ['gene_name', 999999999999999999999999999999999999999999]
    geneDF = geneAnnotations.loc[geneAnnotations['chromosome'] == str(chrNum)]
    
    for index, row in geneDF.iterrows():
        if isNearBody(row['start'], row['end'], site, window):
            #count = count + 1
            #geneL.append(pd.DataFrame([[row['chromosome'], row['source'], row['type'], row['start'], row['end'], row['score'], row['strand'], row['phase'], row['attributes']]], columns=col_names))
            if distanceBody(row['start'], row['end'], site) < out[1]:
                out[0] = row['gene_name']
                out[1] = distanceBody(row['start'], row['end'], site)
    # Returns closest gene and its distance from body. 
    return(out)

#Given specific chromosome, rdm binding site, window: Will calculate closest gene(TSS) from its site. and Distance
def closestGeneTSS(chrNum, site, window):
    out = ['gene_name', 999999999999999999999999999999999999999999]
    geneDF = geneAnnotations.loc[geneAnnotations['chromosome'] == str(chrNum)]
    
    for index, row in geneDF.iterrows():
        if isNearTSS(row['start'], site, window):
            #count = count + 1
            #geneL.append(pd.DataFrame([[row['chromosome'], row['source'], row['type'], row['start'], row['end'], row['score'], row['strand'], row['phase'], row['attributes']]], columns=col_names))
            if distanceTSS(row['start'], site) < out[1]:
                out[0] = row['gene_name']
                out[1] = distanceTSS(row['start'], site)
    # Returns closest gene and its distance from body. 
    return(out)

In [150]:
# Takes DF and Window size: Returnes Total Gene count and List of CLOSEST Gene Near its BODY for every site(K = 1)
def closestGenesNsitesBody(df, window):
    count = 0 
    geneL = []
    
    for index, row in df.iterrows():
        x = closestGeneBody(row['chromosome'], row['BindingSite'], window)
        geneL.append(x[0])
        count = count + 1
    return(count, geneL)
    

# Takes DF and Window size: Returnes Total Gene count and List of CLOSEST Gene Near its TSS for every site (K = 1)
def closestGenesNsitesTSS(df, window):
    count = 0 
    geneL = []
    
    for index, row in df.iterrows():
        x = closestGeneTSS(row['chromosome'], row['BindingSite'], window)
        geneL.append(x[0])
        count = count + 1
    return(count, geneL)


testingDF = nBindingsites(100)


In [None]:
# finds unique in list | Might use later 
def unique(list1): 
    unique_list = [] 
      
    for x in list1: 
        if x not in unique_list: 
            unique_list.append(x) 
    for x in unique_list: 
        print(x) 