Skip to content

Commit

Permalink
Merge pull request #144 from hammerlab/fix-normalize-chromosome
Browse files Browse the repository at this point in the history
fixed normalize_chromosome, made all_feature_values private
  • Loading branch information
iskandr committed Mar 29, 2016
2 parents 327c7e7 + 89c94c0 commit 22dd18c
Show file tree
Hide file tree
Showing 8 changed files with 41 additions and 28 deletions.
4 changes: 3 additions & 1 deletion pyensembl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@
)
from .transcript import Transcript

__version__ = '0.8.10'
__version__ = '0.8.11'


_cache = {}

def cached_release(version, species="human"):
Expand Down
24 changes: 13 additions & 11 deletions pyensembl/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ def delete_index_files(self):
if exists(db_path):
remove(db_path)

def all_feature_values(
def _all_feature_values(
self,
column,
feature,
Expand Down Expand Up @@ -565,12 +565,14 @@ def locus_of_exon_id(self, exon_id):
# Contigs
#
###################################################

@memoize
def contigs(self):
"""
Returns all contigs ("seqname") for all genes
"""
return self.db.query_feature_values("seqname", "gene")
"""
Returns all contig names for any gene in the genome
(field called "seqname" in Ensembl GTF files)
"""
return self.db.query_feature_values("seqname", "gene")

###################################################
#
Expand Down Expand Up @@ -687,7 +689,7 @@ def gene_names(self, contig=None, strand=None):
Return all genes in the database,
optionally restrict to a chromosome and/or strand.
"""
return self.all_feature_values(
return self._all_feature_values(
column="gene_name",
feature="gene",
contig=contig,
Expand Down Expand Up @@ -734,7 +736,7 @@ def gene_ids(self, contig=None, strand=None):
What are all the gene IDs
(optionally restrict to a given chromosome/contig and/or strand)
"""
return self.all_feature_values(
return self._all_feature_values(
column="gene_id",
feature="gene",
contig=contig,
Expand Down Expand Up @@ -872,7 +874,7 @@ def transcript_names(self, contig=None, strand=None):
What are all the transcript names in the database
(optionally, restrict to a given chromosome and/or strand)
"""
return self.all_feature_values(
return self._all_feature_values(
column="transcript_name",
feature="transcript",
contig=contig,
Expand Down Expand Up @@ -916,7 +918,7 @@ def _query_transcript_ids(

@memoize
def transcript_ids(self, contig=None, strand=None):
return self.all_feature_values(
return self._all_feature_values(
column="transcript_id",
feature="transcript",
contig=contig,
Expand Down Expand Up @@ -1020,7 +1022,7 @@ def _query_exon_ids(self, property_name, value):

@memoize
def exon_ids(self, contig=None, strand=None):
return self.all_feature_values(
return self._all_feature_values(
column="exon_id",
feature="exon",
contig=contig,
Expand Down Expand Up @@ -1054,7 +1056,7 @@ def protein_ids(self, contig=None, strand=None):
What are all the protein IDs
(optionally restrict to a given chromosome and/or strand)
"""
protein_ids = self.all_feature_values(
protein_ids = self._all_feature_values(
column="protein_id",
feature="CDS",
contig=contig,
Expand Down
16 changes: 8 additions & 8 deletions pyensembl/locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,20 @@ def normalize_chromosome(c):
if result.startswith("chr"):
result = result[3:]

# just in case someone is being lazy, capitalize "M", "MT", "X", "Y"
result = result.upper()

# standardize mitochondrial genome to be "MT"
if result == "M":
result = "MT"
else:
# just in case someone is being lazy, capitalize "X" and "Y"
result = result.upper()

# interning strings since the chromosome names probably get constructed
# or parsed millions of times, can save memory in tight situations
# (such as parsing GTF files)
result = intern(result)

NORMALIZE_CHROMOSOME_CACHE[c] = result

return result

def normalize_strand(strand):
Expand All @@ -61,7 +64,6 @@ def normalize_strand(strand):
return "+"
elif strand == -1:
return "-"

raise ValueError("Invalid strand: %s" % (strand,))

class Locus(object):
Expand Down Expand Up @@ -200,10 +202,8 @@ def can_overlap(self, contig, strand=None):
"""
Is this locus on the same contig and (optionally) on the same strand?
"""
return (
self.on_contig(contig)
and
(strand is None or self.on_strand(strand)))
return (self.on_contig(contig) and
(strand is None or self.on_strand(strand)))

def distance_to_interval(self, start, end):
"""
Expand Down
8 changes: 5 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,11 @@
print("Failed to convert %s to reStructuredText", readme_filename)
pass

with open('pyensembl/__init__.py', 'r') as fd:
version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
fd.read(), re.MULTILINE).group(1)
# Extract the __version__ string from pyensembl/__init__.py by running a
# regex search over the file's text.
with open('pyensembl/__init__.py', 'r') as f:
    version_match = re.search(
        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
        f.read(),
        re.MULTILINE)

# re.search returns None when nothing matches, so check the match object
# before calling .group(); otherwise a missing __version__ line would fail
# with AttributeError instead of the RuntimeError raised below.
version = version_match.group(1) if version_match else None

if not version:
    raise RuntimeError('Cannot find version information')
Expand Down
2 changes: 1 addition & 1 deletion test/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
ensembl_grch38
]

contigs = list(range(1, 23)) + ["X", "Y", "M"]
contigs = [str(c) for c in range(1, 23)] + ["X", "Y", "M"]

@nottest
def test_ensembl_releases(*versions):
Expand Down
6 changes: 6 additions & 0 deletions test/test_contigs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from pyensembl import ensembl_grch38

def test_contig_names():
    # Every chromosome 1-22 plus X, Y and MT should appear among the
    # contig names returned by ensembl_grch38.contigs().
    known_contigs = set(ensembl_grch38.contigs())
    expected = list(range(1, 23))
    expected.extend(["X", "Y", "MT"])
    for chrom in expected:
        # contigs are compared as strings; the failure message keeps the
        # original (possibly int) chromosome value
        name = str(chrom)
        assert name in known_contigs, (chrom, known_contigs)
3 changes: 3 additions & 0 deletions test/test_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ def test_normalize_chromosome():
assert normalize_chromosome("chrMT") == "MT"
assert normalize_chromosome("M") == "MT"
assert normalize_chromosome("MT") == "MT"
assert normalize_chromosome("m") == "MT"
assert normalize_chromosome("chrm") == "MT"
assert normalize_chromosome("mt") == "MT"

with assert_raises(TypeError):
normalize_chromosome({"a": "b"})
Expand Down
6 changes: 2 additions & 4 deletions test/test_transcript_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,8 @@ def test_transcript_exons():
# below
@test_ensembl_releases(75, 77)
def test_sequence_parts(ensembl):
"""
test_sequence_parts : Ensure that the UTRs and coding sequence can be
combined to make the full transcript.
"""
# Ensure that the UTRs and coding sequence can be
# combined to make the full transcript.
transcript = ensembl.transcript_by_id(FOXP3_001_transcript_id)

# The combined lengths of the upstream untranslated region,
Expand Down

0 comments on commit 22dd18c

Please sign in to comment.