Merge pull request #144 from hammerlab/fix-normalize-chromosome

fixed normalize_chromosome, made all_feature_values private
openvax · Mar 29, 2016 · 22dd18c · 22dd18c
2 parents 327c7e7 + 89c94c0
commit 22dd18c
Show file tree

Hide file tree

Showing 8 changed files with 41 additions and 28 deletions.
diff --git a/pyensembl/__init__.py b/pyensembl/__init__.py
@@ -35,7 +35,9 @@
 )
 from .transcript import Transcript
 
-__version__ = '0.8.10'
+__version__ = '0.8.11'
+
+
 _cache = {}
 
 def cached_release(version, species="human"):

diff --git a/pyensembl/genome.py b/pyensembl/genome.py
@@ -362,7 +362,7 @@ def delete_index_files(self):
         if exists(db_path):
             remove(db_path)
 
-    def all_feature_values(
+    def _all_feature_values(
             self,
             column,
             feature,
@@ -565,12 +565,14 @@ def locus_of_exon_id(self, exon_id):
     #                  Contigs
     #
     ###################################################
+
     @memoize
     def contigs(self):
-      """
-      Returns all contigs ("seqname") for all genes
-      """
-      return self.db.query_feature_values("seqname", "gene")
+        """
+        Returns all contig names for any gene in the genome
+        (field called "seqname" in Ensembl GTF files)
+        """
+        return self.db.query_feature_values("seqname", "gene")
 
     ###################################################
     #
@@ -687,7 +689,7 @@ def gene_names(self, contig=None, strand=None):
         Return all gene names in the database,
         optionally restrict to a chromosome and/or strand.
         """
-        return self.all_feature_values(
+        return self._all_feature_values(
             column="gene_name",
             feature="gene",
             contig=contig,
@@ -734,7 +736,7 @@ def gene_ids(self, contig=None, strand=None):
         What are all the gene IDs
         (optionally restrict to a given chromosome/contig and/or strand)
         """
-        return self.all_feature_values(
+        return self._all_feature_values(
             column="gene_id",
             feature="gene",
             contig=contig,
@@ -872,7 +874,7 @@ def transcript_names(self, contig=None, strand=None):
         What are all the transcript names in the database
         (optionally, restrict to a given chromosome and/or strand)
         """
-        return self.all_feature_values(
+        return self._all_feature_values(
             column="transcript_name",
             feature="transcript",
             contig=contig,
@@ -916,7 +918,7 @@ def _query_transcript_ids(
 
     @memoize
     def transcript_ids(self, contig=None, strand=None):
-        return self.all_feature_values(
+        return self._all_feature_values(
             column="transcript_id",
             feature="transcript",
             contig=contig,
@@ -1020,7 +1022,7 @@ def _query_exon_ids(self, property_name, value):
 
     @memoize
     def exon_ids(self, contig=None, strand=None):
-        return self.all_feature_values(
+        return self._all_feature_values(
             column="exon_id",
             feature="exon",
             contig=contig,
@@ -1054,7 +1056,7 @@ def protein_ids(self, contig=None, strand=None):
         What are all the protein IDs
         (optionally restrict to a given chromosome and/or strand)
         """
-        protein_ids = self.all_feature_values(
+        protein_ids = self._all_feature_values(
             column="protein_id",
             feature="CDS",
             contig=contig,

diff --git a/pyensembl/locus.py b/pyensembl/locus.py
@@ -41,17 +41,20 @@ def normalize_chromosome(c):
     if result.startswith("chr"):
         result = result[3:]
 
+    # just in case someone is being lazy, capitalize "M", "MT", "X", "Y"
+    result = result.upper()
+
     # standardize mitochondrial genome to be "MT"
     if result == "M":
         result = "MT"
-    else:
-        # just in case someone is being lazy, capitalize "X" and "Y"
-        result = result.upper()
+
     # interning strings since the chromosome names probably get constructed
     # or parsed millions of times, can save memory in tight situations
     # (such as parsing GTF files)
     result = intern(result)
+
     NORMALIZE_CHROMOSOME_CACHE[c] = result
+
     return result
 
 def normalize_strand(strand):
@@ -61,7 +64,6 @@ def normalize_strand(strand):
         return "+"
     elif strand == -1:
         return "-"
-
     raise ValueError("Invalid strand: %s" % (strand,))
 
 class Locus(object):
@@ -200,10 +202,8 @@ def can_overlap(self, contig, strand=None):
         """
         Is this locus on the same contig and (optionally) on the same strand?
         """
-        return (
-            self.on_contig(contig)
-            and
-            (strand is None or self.on_strand(strand)))
+        return (self.on_contig(contig) and
+                (strand is None or self.on_strand(strand)))
 
     def distance_to_interval(self, start, end):
         """

diff --git a/setup.py b/setup.py
@@ -38,9 +38,11 @@
     print("Failed to convert %s to reStructuredText", readme_filename)
     pass
 
-with open('pyensembl/__init__.py', 'r') as fd:
-    version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
-                        fd.read(), re.MULTILINE).group(1)
+with open('pyensembl/__init__.py', 'r') as f:
+    version = re.search(
+        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
+        f.read(),
+        re.MULTILINE).group(1)
 
 if not version:
     raise RuntimeError('Cannot find version information')

diff --git a/test/common.py b/test/common.py
@@ -15,7 +15,7 @@
     ensembl_grch38
 ]
 
-contigs = list(range(1, 23)) + ["X", "Y", "M"]
+contigs = [str(c) for c in range(1, 23)] + ["X", "Y", "M"]
 
 @nottest
 def test_ensembl_releases(*versions):

diff --git a/test/test_contigs.py b/test/test_contigs.py
@@ -0,0 +1,6 @@
+from pyensembl import ensembl_grch38
+
+def test_contig_names():
+    contig_names = set(ensembl_grch38.contigs())
+    for chrom in list(range(1, 23)) + ["X", "Y", "MT"]:
+        assert str(chrom) in contig_names, (chrom, contig_names)
diff --git a/test/test_locus.py b/test/test_locus.py
@@ -17,6 +17,9 @@ def test_normalize_chromosome():
     assert normalize_chromosome("chrMT") == "MT"
     assert normalize_chromosome("M") == "MT"
     assert normalize_chromosome("MT") == "MT"
+    assert normalize_chromosome("m") == "MT"
+    assert normalize_chromosome("chrm") == "MT"
+    assert normalize_chromosome("mt") == "MT"
 
     with assert_raises(TypeError):
         normalize_chromosome({"a": "b"})

diff --git a/test/test_transcript_objects.py b/test/test_transcript_objects.py
@@ -78,10 +78,8 @@ def test_transcript_exons():
 # below
 @test_ensembl_releases(75, 77)
 def test_sequence_parts(ensembl):
-    """
-    test_sequence_parts : Ensure that the UTRs and coding sequence can be
-    combined to make the full transcript.
-    """
+    # Ensure that the UTRs and coding sequence can be
+    # combined to make the full transcript.
     transcript = ensembl.transcript_by_id(FOXP3_001_transcript_id)
 
     # The combined lengths of the upstream untranslated region,