Merge pull request #137 from hammerlab/exonic-splice-site-can-be-coding

Modest change to filtering of coding mutations include ExonicSpliceSite
openvax · Feb 25, 2016 · 7d1845c · 7d1845c
2 parents 9c9b648 + cdc58f0
commit 7d1845c
Show file tree

Hide file tree

Showing 15 changed files with 134 additions and 83 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -36,12 +36,15 @@ install:
   - pip install .
   - pip install coveralls
 script:
-  # human releases
+  # older human Ensembl releases
   - pyensembl install --release 75 --species human
   - pyensembl install --release 77 --species human
   - pyensembl install --release 81 --species human
-  # mouse releases
+  # latest human Ensembl release
+  - pyensembl install --release 83 --species human
+  # mouse tests written for mouse Ensembl #81
   - pyensembl install --release 81 --species mouse
+  # now actually run the tests, generate a coverage report and run linter
   - nosetests test --with-coverage --cover-package=varcode && ./lint.sh
 after_success:
   coveralls

diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 numpy>=1.7, <2.0
 pandas>=0.15
-pyensembl>=0.8.2
+pyensembl>=0.8.8
 biopython>=1.64
 pyvcf>=0.6.7
 memoized_property>=1.0.2

diff --git a/setup.py b/setup.py
@@ -40,7 +40,7 @@
     setup(
         name='varcode',
         packages=find_packages(),
-        version="0.4.1",
+        version="0.4.2",
         description="Variant annotation in Python",
         long_description=readme,
         url="https://github.com/hammerlab/varcode",
@@ -58,7 +58,7 @@
         install_requires=[
             'numpy>=1.7, <2.0',
             'pandas>=0.15',
-            'pyensembl>=0.8.2',
+            'pyensembl>=0.8.8',
             'biopython>=1.64',
             'pyvcf>=0.6.7',
             'memoized_property>=1.0.2',

diff --git a/test/benchmark_vcf_load.py b/test/benchmark_vcf_load.py
@@ -13,13 +13,26 @@
 import varcode
 
 parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument("path", help="Path or URL to VCF")
-parser.add_argument("--profile", action="store_true", default=False,
+
+parser.add_argument(
+    "path", help="Path or URL to VCF")
+
+parser.add_argument(
+    "--profile", action="store_true",
+    default=False,
     help="Run in a profiler.")
-parser.add_argument("--no-info-field",
-    dest="info_field", action="store_false", default=True)
-parser.add_argument("--pyvcf",
-    help="use pyvcf implementation", action="store_true", default=False)
+
+parser.add_argument(
+    "--no-info-field",
+    dest="info_field",
+    action="store_false",
+    default=True)
+
+parser.add_argument(
+    "--pyvcf",
+    help="use pyvcf implementation",
+    action="store_true",
+    default=False)
 
 def run():
     args = parser.parse_args()
@@ -29,7 +42,7 @@ def run():
         extra_args["include_info"] = False
 
     start = time.time()
-    
+
     if args.pyvcf:
         result = varcode.load_vcf(
             args.path,

diff --git a/test/data.py b/test/data.py
@@ -51,4 +51,4 @@ def data_path(name):
     snp_rs4244285,
     snp_rs1537415,
     snp_rs3892097,
-])
+])
diff --git a/test/test_dbnsfp_validation.py b/test/test_dbnsfp_validation.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from pyensembl import EnsemblRelease
+from pyensembl import ensembl_grch37
 from varcode import (
     ExonicSpliceSite,
     Substitution,
@@ -23,8 +23,6 @@
 
 from . import data_path
 
-ensembl = EnsemblRelease(75)
-
 def validate_transcript_mutation(
         ensembl_transcript_id,
         chrom,
@@ -33,7 +31,7 @@ def validate_transcript_mutation(
         dna_alt,
         aa_pos,
         aa_alt):
-    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl)
+    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl_grch37)
     effects = variant.effects()
     transcript_id_dict = {
         effect.transcript.id: effect
@@ -58,14 +56,14 @@ def validate_transcript_mutation(
     assert (
         effect_aa_pos + 1 == aa_pos and
         effect_aa_alt == aa_alt), \
-            "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
-                aa_alt,
-                aa_pos,
-                chrom,
-                dna_position,
-                dna_ref,
-                dna_alt,
-                effect)
+        "Mutant amino acid %s not found at %d for chr%s:%s %s>%s : %s" % (
+            aa_alt,
+            aa_pos,
+            chrom,
+            dna_position,
+            dna_ref,
+            dna_alt,
+            effect)
 
 def test_dbnsfp_validation_set():
     # check that amino acid substitution gives

diff --git a/test/test_effect_classes.py b/test/test_effect_classes.py
@@ -24,7 +24,7 @@
     # transcript effects
     #
     IncompleteTranscript,
-    NoncodingTranscript,
+    # NoncodingTranscript, TODO: write a noncoding transcript test
     FivePrimeUTR,
     ThreePrimeUTR,
     Intronic,
@@ -41,10 +41,14 @@
     ExonicSpliceSite,
     # TODO: SpliceDonor, SpliceReceptor
 )
-from pyensembl import ensembl_grch37, ensembl_grch38
+from pyensembl import ensembl_grch37, cached_release
 
 from .common import expect_effect
 
+# tried using more recent releases but found that many of them
+# are very specific to Ensembl data between releases 77-81
+ensembl_grch38 = cached_release(81)
+
 def test_incomplete():
     # transcript EGFR-009 (ENST00000450046 in Ensembl 78)
     # has an incomplete 3' end
@@ -153,9 +157,9 @@ def test_exon_loss():
         "17",
         43082404,
         ref="".join([
-          "CTTTTTCTGATGTGCTTTGTTCTGGATTTCGCAGGTCCTCAAGGGCAGAAGAGTCACTTATGATG",
-          "GAAGGGTAGCTGTTAGAAGGCTGGCTCCCATGCTGTTCTAACACAGCTTCAGTAATTAGATTAGT",
-          "TAAAGTGATGTGGTGTTTTCTGGCAAACTTGTACACGAGCAT"
+            "CTTTTTCTGATGTGCTTTGTTCTGGATTTCGCAGGTCCTCAAGGGCAGAAGAGTCACTTATGATG",
+            "GAAGGGTAGCTGTTAGAAGGCTGGCTCCCATGCTGTTCTAACACAGCTTCAGTAATTAGATTAGT",
+            "TAAAGTGATGTGGTGTTTTCTGGCAAACTTGTACACGAGCAT"
         ]),
         alt="",
         ensembl=ensembl_grch38)

diff --git a/test/test_timings.py b/test/test_timings.py
@@ -22,15 +22,14 @@ def _time_variant_annotation(variant_collection):
     effects = variant_collection.effects()
     end_t = time.time()
     assert len(effects.groupby_variant()) == len(variant_collection)
-
     elapsed_t = end_t - start_t
     return elapsed_t
 
 
 def test_effect_timing(
         n_variants=100,
         random_seed=0,
-        n_warmup_variants=20):
+        n_warmup_variants=5):
     warmup_collection = random_variants(
         n_warmup_variants,
         random_seed=None)

diff --git a/test/test_variant.py b/test/test_variant.py
@@ -20,7 +20,9 @@
     import cPickle as pickle
 except ImportError:
     import pickle
-from pyensembl import ensembl77
+
+from pyensembl import ensembl_grch38
+
 from varcode import Variant
 from nose.tools import eq_
 
@@ -106,7 +108,7 @@ def test_deletion_no_suffix():
 def test_serialization():
     variants = [
         Variant(
-            1, start=10, ref="AA", alt="AAT", ensembl=ensembl77),
+            1, start=10, ref="AA", alt="AAT", ensembl=ensembl_grch38),
         Variant(10, start=15, ref="A", alt="G"),
         Variant(20, start=150, ref="", alt="G"),
     ]

diff --git a/test/test_variant_collection.py b/test/test_variant_collection.py
@@ -65,12 +65,13 @@ def test_gene_counts():
     # eq_(coding_gene_counts, expected_counts)
 
 def test_serialization():
-    original = VariantCollection([
+    original = VariantCollection(
+        [
             Variant(
                 1, start=10, ref="AA", alt="AAT", ensembl=77),
             Variant(10, start=15, ref="A", alt="G"),
             Variant(20, start=150, ref="", alt="G"),
-    ])
+        ])
     original.metadata[original[0]] = {"a": "b"}
     original.metadata[original[2]] = {"bar": 2}
 

diff --git a/test/test_vcf.py b/test/test_vcf.py
@@ -78,8 +78,7 @@ def test_pandas_and_pyvcf_implementations_equivalent():
         {'path': data_path("multiallelic.vcf")},
         {'path': data_path("mutect-example.vcf")},
         {'path': data_path("strelka-example.vcf")},
-        {'path': data_path("mutect-example-headerless.vcf"),
-          'genome': cached_release(75)},
+        {'path': data_path("mutect-example-headerless.vcf"), 'genome': cached_release(75)},
     ]
     if RUN_TESTS_REQUIRING_INTERNET:
         paths.append({'path': VCF_EXTERNAL_URL})

diff --git a/varcode/effect_collection.py b/varcode/effect_collection.py
@@ -19,7 +19,7 @@
 
 from .collection import Collection
 from .common import memoize
-from .effects import MutationEffect, NonsilentCodingMutation
+from .effects import MutationEffect
 from .effect_ordering import (
     effect_priority,
     effect_sort_key,
@@ -143,8 +143,7 @@ def drop_silent_and_noncoding(self):
         """
         Create a new EffectCollection containing only non-silent coding effects
         """
-        return self.filter(
-            lambda effect: isinstance(effect, NonsilentCodingMutation))
+        return self.filter(lambda effect: effect.modifies_protein_sequence)
 
     def detailed_string(self):
         """

diff --git a/varcode/effect_ordering.py b/varcode/effect_ordering.py
@@ -12,13 +12,38 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .effects import *
+from .effects import (
+    Failure,
+    IncompleteTranscript,
+    Intergenic,
+    Intragenic,
+    NoncodingTranscript,
+    Intronic,
+    ThreePrimeUTR,
+    FivePrimeUTR,
+    Silent,
+    Substitution,
+    Insertion,
+    Deletion,
+    ComplexSubstitution,
+    AlternateStartCodon,
+    IntronicSpliceSite,
+    ExonicSpliceSite,
+    StopLoss,
+    SpliceDonor,
+    SpliceAcceptor,
+    PrematureStop,
+    FrameShiftTruncation,
+    StartLoss,
+    FrameShift,
+    ExonLoss,
+)
 
 transcript_effect_priority_list = [
     Failure,
+    IncompleteTranscript,
     Intergenic,
     Intragenic,
-    IncompleteTranscript,
     NoncodingTranscript,
     Intronic,
     ThreePrimeUTR,

diff --git a/varcode/effects.py b/varcode/effects.py
@@ -261,6 +261,14 @@ def mutant_protein_sequence(self):
         """
         return self.alternate_effect.mutant_protein_sequence
 
+    @memoized_property
+    def modifies_protein_sequence(self):
+        return self.alternate_effect.modifies_protein_sequence
+
+    @memoized_property
+    def modifies_coding_sequence(self):
+        return self.alternate_effect.modifies_coding_sequence
+
 class CodingMutation(Exonic):
     """
     Base class for all mutations which result in a modified coding sequence.
@@ -410,9 +418,9 @@ def short_description(self):
                 self.aa_mutation_start_offset)
         else:
             return "p.%s%d%s" % (
-                    self.aa_ref,
-                    self.aa_mutation_start_offset + 1,
-                    self.aa_alt)
+                self.aa_ref,
+                self.aa_mutation_start_offset + 1,
+                self.aa_alt)
 
     @memoized_property
     def mutant_protein_sequence(self):