Added gene utils to formats, a convenience module for obtaining infor…

…mation about genes
jeffhsu3 · Jul 13, 2015 · 1678a52 · 1678a52
1 parent 68f8481
commit 1678a52
Show file tree

Hide file tree

Showing 5 changed files with 64 additions and 34 deletions.
diff --git a/genda/eQTL/plotting.py b/genda/eQTL/plotting.py
@@ -23,6 +23,18 @@ def __init__(self, chrom, gene, rsID = None):
         self.rsID = rsID
 
 
+def subset_meQTL(meQTL, gene_name):
+    """ 
+    """
+    try:
+        index = meQTL.gene == gene_name
+        subset = meQTL.ix[index, :]
+    except AttributeError:
+        # Case where eQTL matrix is already 1 gene
+        subset = meQTL
+    return(subset)
+
+
 def plot_dosage_by_rsID(gene_reference, dos, cov_mat, counts,
         title = None, ax = None,
         additional_covar=None, 
@@ -104,16 +116,6 @@ def plot_dosage_by_rsID(gene_reference, dos, cov_mat, counts,
         return(fig, test)
 
 
-def subset_meQTL(meQTL, gene_name):
-    """
-    """
-    try:
-        index = meQTL.gene == gene_name
-        subset = meQTL.ix[index, :]
-    except AttributeError:
-        # Case where eQTL matrix is already 1 gene
-        subset = meQTL
-    return(subset)
 
 
 

diff --git a/genda/formats/__init__.py b/genda/formats/__init__.py
@@ -0,0 +1,6 @@
+""" Formats submodule contains classes and functions
+to parse various formats into pandas dataframes as
+well as lookup utilities to various formats
+"""
+
+from .gene_utils import *
diff --git a/genda/formats/dosages.py b/genda/formats/dosages.py
@@ -4,7 +4,6 @@
 import numpy as np
 import pandas as pd
 import pysam
-import pickle as pkl
 import statsmodels.api as sm
 
 class Dosage(object):
@@ -65,20 +64,6 @@ def get_dosages_by_range(chrm, start, end, gene_name, annotation_file,
     return Dosage(out_dos, annot, gene_name)
 
 
-def grab_gene_location(hgnc, cis_buffer=0, ensid = False):
-    """ :TODO maybe query ensembl for this
-    :TODO ensid matching
-    """
-    ann_file = "/proj/genetics/Projects/shared/Subject_Sources/" +\
-            "External/Ensembl/OutputData/EnsemblAnnotationAllHumanGenes.bed.gz"
-    with gzip.open(ann_file) as annot:
-        for line in annot:
-            if hgnc in line:
-                line = line.split("\t")
-                return(line[0], int(line[1]) - cis_buffer, 
-                        int(line[2]) + cis_buffer, str(line[4]))
-
-
 
 def generate_dosage_mapping(dosage_file, mapping_file = None, interval=50):
     """
@@ -106,10 +91,6 @@ def eQTL_func(snps, cov, expression):
     model = sm.OLS(expression, cov)
     return(model.fit().pvalues['snps'])
 
-def two_eQTL_func(snps, cov, expression, snp2):
-    """
-    """
-    pass
 
 class eQTL(object):
     """ Python class for completing eQTLs.  Does lazy loading of all large

diff --git a/genda/formats/gene_utils.py b/genda/formats/gene_utils.py
@@ -0,0 +1,44 @@
+"""
+"""
+
+import json
+import requests
+import gzip
+
+
+def get_symbol_ensembl(ensembl):
+    """Get symbol from ensembl ID from 
+    ensembl REST API   
+    """
+
+    server = "http://rest.ensembl.org"  
+    ext = "/lookup/id/{0}?expand=1"  
+    r = requests.get(server + ext.format(str(ensembl)), 
+                    headers={ "Content-Type" : "application/json"})
+    if not r.ok:   
+        return('NA')
+    else:
+        decoded = r.json()
+        try:
+            return(decoded['display_name']) 
+        except KeyError:
+            return('NA')
+
+
+def grab_gene_location(id, ann_file = None, cis_buffer=0):
+    """ Grab the gene boundries as a tuple buffered by 
+    cis_buffer
+
+    returns (chr, pos0, pos1)
+    """
+
+    if ann_file:
+        with gzip.open(ann_file) as annot:
+            for line in annot:
+                if id in line:
+                    line = line.split("\t")
+                    return(line[0], int(line[1]) - cis_buffer, 
+                            int(line[2]) + cis_buffer, str(line[4]))
+    else:
+        raise NotImplementedError 
+
diff --git a/genda/plotting/plotting_utils.py b/genda/plotting/plotting_utils.py
@@ -108,7 +108,8 @@ def plot_transcript(transcript_id, paths, ax, y=0, height=2.,
 def add_gene_bounderies(ax, gene_annot, gene_name, x_scale):
     """ Add gene bounderies to an axis
     """
-    gene_bounds = grab_gene_location(gene_name, cis_buffer=0)
+    gene_bounds = grab_gene_location(gene_name,
+            ann_file = gene_annot, cis_buffer=0)
     path_a = make_rectangle(float(gene_bounds[1])/x_scale, 
             float(gene_bounds[2])/x_scale,
             0, 200)
@@ -231,7 +232,3 @@ def get_path_max_and_min(gene_dict):
                 points.append(k[0])
     return(min(points), max(points))
 
-
-class read_plots(object):
-    def __init__(self):
-        pass