transcript utils

jeffhsu3 · Dec 17, 2015 · 0d02f38 · 0d02f38
1 parent 6710143
commit 0d02f38
Show file tree

Hide file tree

Showing 5 changed files with 98 additions and 37 deletions.
diff --git a/genda/plotting/plotting_utils.py b/genda/plotting/plotting_utils.py
@@ -239,9 +239,14 @@ def plot_transcript_i(transcript, ax, y=0, height=2.,
     Arguments
     =========
     transcript - a list of exons (start, end, exon_number)
-    :TODO transcript could be an object.  
+    ax - matplotlib.axes
+    y - where to place the base of the transcript on the y-axis
+    height - height of the boxes of the transcript
     ax - matplotlib axes object
 
+    :TODO transcript could be an object.  
+    :TODO write tests for this
+
     Returns:
     =======
     matplotlib.axis
@@ -268,6 +273,11 @@ def plot_transcript_i(transcript, ax, y=0, height=2.,
     return(ax, (beg_exon[0], xmax))
 
 
+def intron_scaling(transcripts):
+    """ Scale all features shrinking introns
+    """
+    pass
+
 
 def add_size_legends(ax, size_base=200):
     sizes = [((size_base * i ) + 20) for i in np.linspace(0,0.5, num = 4)]

diff --git a/genda/transcripts/__init__.py b/genda/transcripts/__init__.py
@@ -1,2 +1,2 @@
-from .transcript_utils import *
+from .transcripts_utils import *
 
diff --git a/genda/transcripts/transcripts_utils.py b/genda/transcripts/transcripts_utils.py
@@ -1,3 +1,4 @@
+from bx.intervals.intersection import Intersecter, Interval
 
 
 class SNPInfo(object):
@@ -99,23 +100,30 @@ def unique_sets(set_list):
     return(out_sets)
 
 
+
+
+
 def compare_two_transcripts(trans1, trans2, transcript_dict):
     """
     Returns the splice differences between two transcripts.
     Note this ignores TSS and just looks for splice junctions
 
     Parameters
     ----------
-    :TODO refactor this
-    :TODO still not finished
+    trans1 - string of transcript of interest
+    trans2 - string of second transcript of interest
+    transcript_dict - a dictionary of transcript names with 
+    values being a list of exons
+
+    Returns:
+    5' upstream exons - 
+    3' downstram exons - 
     """
-
     t1 = transcript_dict[trans1]
     t2 = transcript_dict[trans2]
-
+    tree = Intersecter()
     starts1 = [i[0] for i in t1]
     starts2 = [i[0] for i in t2]
-    # Assume sorted
     reverse = False
     if min(starts1) >= min(starts2):
         s1 = t2 
@@ -124,37 +132,58 @@ def compare_two_transcripts(trans1, trans2, transcript_dict):
     else:
         s1 = t1
         s2 = t2
-    # Assume sorted order, ordering in reverse by construction
     if reverse:
         torder = (trans2, trans1)
     else:
         torder = (trans1, trans2)
-    oc = 0
-    #start_matches = False
-    exclusive_juncs = []
+
+    for i in s1:
+        tree.add_interval(Interval(int(i[0]), int(i[1]), 
+            value={'anno':i[2]}))
     matching_exons = []
+    exclusive_juncs = []
     skipped_exons = []
-    for i in s1:
-        if (i[0] < s2[0][0]) and (i[1] <= s2[0][0]): 
+    # Perform the query
+    start_of_exons = None
+    s1.sort(key=lambda x: x[2])
+    s2.sort(key=lambda x: x[2])
+    for start, end, exon_n in s2:
+        start = int(start)
+        end = int(end)
+        overlap = tree.find(int(start), int(end))
+        if len(overlap) == 0:
+            if start_of_exons:
+                skipped_exons.append((start, end, (None, exon_n), (0,end-start)))
+            else: pass
+        elif len(overlap) == 1:
+            if start_of_exons: pass
+            else: start_of_exons = overlap[0].value['anno']
+            if start == overlap[0].start and end == overlap[0].end:
+                matching_exons.append((start, end, (overlap[0].value['anno'], 
+                    exon_n), (0, 0)))
+            else:
+                sstart = min(start, overlap[0].start)
+                ssend = max(end, overlap[0].end)
+                exclusive_juncs.append((sstart , ssend,
+                                        (overlap[0].value['anno'], exon_n), 
+                                        (overlap[0].start - start, 
+                                        overlap[0].end - end) 
+                                        ))
+        else:
+            if start_of_exons:
+                pass
+            else: start_of_exons = overlap[0].value['anno']
+    # Checking for skipped exons of t1 missing in t2
+    hit_exon = [i[2][0] for i in exclusive_juncs] 
+    hit_exon.extend([i[2][0] for i in matching_exons])
+
+    for start, end, exon_n in s1:
+        if exon_n <= start_of_exons:
             pass
-        elif (i[0] < s2[oc][0]) and (i[1] > s2[oc][0]):
-            # 5' difference
-            print('5 primt diff')
-            sdiff = i[0] - s2[oc][0]
-            ediff = i[1] - s2[oc][1]
-            print(s2[oc])
-            exclusive_juncs.append((i[0], s2[oc][0], 
-                sdiff, ediff, (i[2], s2[oc][2])))
-            oc += 1
-        elif (i[0] > s2[oc][0]) and (i[0] < s2[oc][1]):
-            print('3 prime diff')
-            print(i[0] - s2[oc][0])
-            print(i)
-            print(s2[oc])
-            #:TODO FIX this
-            oc += 1
-        elif (i[0] == s2[oc][0]) and (i[1] == s2[oc][1]):
-            matching_exons.append((i[2], s2[oc][2]))
-            oc += 1
+        elif exon_n in hit_exon:
             pass
-    return(exclusive_juncs, torder)
+        else:
+            skipped_exons.append((start, end, (exon_n, None), (end-start)) ) 
+
+
+    return(exclusive_juncs, torder, matching_exons, skipped_exons)
diff --git a/requirements.txt b/requirements.txt
@@ -2,5 +2,5 @@ Cython==0.19.1
 argparse==1.2.1
 numpy==1.7.1
 pandas==0.11.0
-pysam==0.7.4
+pysam==0.8.4
 python-dateutil==2.1
diff --git a/tests/transcriptTests.py b/tests/transcriptTests.py
@@ -1,4 +1,5 @@
-from genda.transcripts import Exon, Gene, Transcript, unique_sets
+from genda.transcripts import (Exon, Gene, Transcript, unique_sets,
+        compare_two_transcripts)
 import unittest
 
 class TestGeneBreaks(unittest.TestCase):
@@ -32,11 +33,32 @@ def setUp(self):
         self.set_b = set(['a', 'b'])
         self.set_c = set(['a', 'b', 'd'])
 
+
     def find_sets(self):
         self.assertEqual(unique_sets([self.set_a, self.set_b, self.set_c]),
                                      [set(['a', 'b']), set(['c']), set(['d'])])
 
 
-if __name__ == '__main__':
-    unittests.main()
 
+class TestCompareTwoTranscripts(unittest.TestCase):
+    """
+    """
+    def setUp(self):
+        self.transcript_dict = {
+                't1' : [(10, 20, 1), 
+                    (50, 60, 2), 
+                    (70, 90, 3)],
+                't2' : [(10, 20, 1), (70, 90, 2)]
+                }
+
+    def test_compare_transcripts_skipped_exon(self):
+        exclusive_juncs, torder, matching_exons, skipped_exons =\
+                compare_two_transcripts('t1', 't2', self.transcript_dict)
+        print(torder)
+        print(exclusive_juncs)
+        self.assertEqual([(50, 60, (None, 2), (0, 10)),], skipped_exons)
+
+
+
+if __name__ == '__main__':
+    unittest.main()