openvax · iskandr · Feb 23, 2015 · Feb 20, 2015 · Feb 20, 2015 · Feb 20, 2015
diff --git a/test/test_cosmic_mutations.py b/test/test_cosmic_mutations.py
@@ -1,6 +1,7 @@
+from pyensembl import EnsemblRelease
 from varcode import (
-    VariantAnnotator,
     Variant,
+    # transcript effects
     Substitution,
     Deletion,
     Insertion,
@@ -9,15 +10,15 @@
     FrameShiftTruncation,
 )
 
-annot = VariantAnnotator(75)
+ensembl = EnsemblRelease(75)
 
 def _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id):
-    variant = Variant(chrom, pos, dna_ref, dna_alt)
-    result = annot.effect(variant)
-    assert transcript_id in result.transcript_effects, \
+    variant = Variant(chrom, pos, dna_ref, dna_alt, ensembl=ensembl)
+    effect_collection = variant.effects()
+    assert transcript_id in effect_collection.transcript_effect_dict, \
         "Expected transcript ID %s for variant %s not found in %s" % (
-            transcript_id, variant, result)
+            transcript_id, variant, effect_collection)
-    return result.transcript_effects[transcript_id]
+    return effect_collection.transcript_effect_dict[transcript_id]
 
 def _substitution(chrom, pos, dna_ref, dna_alt, transcript_id, aa_ref, aa_alt):
     effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id)

diff --git a/test/test_dbnsfp_validation.py b/test/test_dbnsfp_validation.py
@@ -13,20 +13,22 @@
 # limitations under the License.
 
 import pandas as pd
-from varcode import VariantAnnotator, Substitution, Variant
+from pyensembl import EnsemblRelease
+from varcode import Substitution, Variant
 
-annot = VariantAnnotator(75)
+ensembl = EnsemblRelease(75)
 
 def validate_transcript_mutation(
 	ensembl_transcript,
         chrom, dna_position,
         dna_ref, dna_alt,
         aa_pos, aa_alt):
-    result = annot.effect(
-        Variant(chrom, dna_position, dna_ref, dna_alt))
-    assert ensembl_transcript in result.transcript_effects, \
+    variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl)
+    result = variant.effects()
+
+    assert ensembl_transcript in result.transcript_effect_dict, \
         "%s not found in %s" % (ensembl_transcript, result)
-    effect = result.transcript_effects[ensembl_transcript]
+    effect = result.transcript_effect_dict[ensembl_transcript]
     assert (
         isinstance(effect, Substitution) and
         effect.aa_pos + 1 == aa_pos and

diff --git a/test/test_drop_duplicates.py b/test/test_drop_duplicates.py
@@ -12,15 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from pyensembl import EnsemblRelease
 from varcode import Variant, VariantCollection
 
 def test_drop_duplicates():
-    v1 = Variant("1", 3000, "A", "G")
-    v1_copy = Variant("1", 3000, "A", "G")
-    v2 = Variant("2", 10, "G", "T")
+    ensembl = EnsemblRelease(78)
+    v1 = Variant("1", 3000, "A", "G", ensembl=ensembl)
+    v1_copy = Variant("1", 3000, "A", "G", ensembl=ensembl)
+    v2 = Variant("2", 10, "G", "T", ensembl=ensembl)
     collection_with_duplicates = VariantCollection(
-        variants=[v1, v1, v1_copy, v2],
-        reference_name="hg19")
+        variants=[v1, v1, v1_copy, v2])
     assert len(collection_with_duplicates) == 4
     collection_without_duplicates = collection_with_duplicates.drop_duplicates()
     assert len(collection_without_duplicates) == 2
diff --git a/test/test_maf.py b/test/test_maf.py
@@ -1,15 +1,19 @@
-from varcode import load_variants, VariantCollection, Variant
 from nose.tools import eq_
+from pyensembl import EnsemblRelease
+from varcode import load_maf, VariantCollection, Variant
 
 def test_maf():
-    variant_collection_from_maf = load_variants("data/tcga_ov.head.maf")
-    eq_(variant_collection_from_maf.reference_name, "GRCh37")
+    ensembl = EnsemblRelease(75)
+    variant_collection_from_maf = load_maf("data/tcga_ov.head.maf")
     expected_variants = [
-        Variant(1, 1650797, "A", "G"),
-        Variant(1, 231401797, "A", "C"),
-        Variant(1, 23836447, "C", "A"),
-        Variant(11,124617502, "C", "G"),
+        Variant(1, 1650797, "A", "G", ensembl),
+        Variant(1, 231401797, "A", "C", ensembl),
+        Variant(1, 23836447, "C", "A", ensembl),
+        Variant(11,124617502, "C", "G", ensembl),
     ]
     eq_(len(variant_collection_from_maf), len(expected_variants))
-    for v1, v2 in zip(expected_variants, variant_collection_from_maf):
-        eq_(v1, v2)
+    for v_expect, v_maf in zip(expected_variants, variant_collection_from_maf):
+        eq_(v_expect, v_maf)
+        gene_name = v_maf.info['Hugo_Symbol']
+        assert any(gene.name == gene_name for gene in v_maf.genes()), \
+            "Expected gene name %s but got %s" % (gene_name, v_maf.genes())
diff --git a/test/test_mutation_effects.py b/test/test_mutation_effects.py
@@ -1,7 +1,8 @@
 from varcode import (
-    VariantAnnotator,
     Variant,
-    infer_transcript_effect,
+    #
+    # transcript effects
+    #
     NoncodingTranscript,
     IncompleteTranscript,
     FivePrimeUTR,
@@ -17,8 +18,9 @@
     FrameShift,
     # TODO: SpliceDonor, SpliceReceptor
 )
+from pyensembl import EnsemblRelease
 
-annot = VariantAnnotator(ensembl_release=77)
+ensembl = EnsemblRelease(release=78)
 
 def test_incomplete():
     # transcript EGFR-009 (ENST00000450046 in Ensembl 77)
@@ -27,10 +29,10 @@ def test_incomplete():
     # first exon begins: ATCATTCCTTTGGGCCTAGGA
 
     # change the first nucleotide of the 5' UTR A>T
-    variant = Variant("7", 55109723, "A", "T")
+    variant = Variant("7", 55109723, "A", "T", ensembl=ensembl)
 
-    transcript = annot.ensembl.transcript_by_id("ENST00000450046")
-    effect = infer_transcript_effect(variant, transcript)
+    transcript = ensembl.transcript_by_id("ENST00000450046")
+    effect = variant.transcript_effect(transcript)
     assert isinstance(effect, IncompleteTranscript), \
         "Expected %s on %s to be IncompleteTranscript, got %s" % (
             variant, transcript, effect)
@@ -43,9 +45,9 @@ def test_start_loss():
     # which is 55,019,034 + 244 of chr7 = 55019278
     # change the first nucleotide of the 5' UTR A>T
     # making what used to be a start codon into TTG (Leucine)
-    variant = Variant("7", 55019278, "A", "T")
-    transcript = annot.ensembl.transcript_by_id("ENST00000420316")
-    effect = infer_transcript_effect(variant, transcript)
+    variant = Variant("7", 55019278, "A", "T", ensembl=ensembl)
+    transcript = ensembl.transcript_by_id("ENST00000420316")
+    effect = variant.transcript_effect(transcript)
     assert isinstance(effect, StartLoss), \
         "Expected StartLoss, got %s for %s on %s" % (
             effect, variant, transcript, )
diff --git a/test/test_trim_shared.py → test/test_string_helpers.py b/test/test_trim_shared.py → test/test_string_helpers.py
@@ -1,6 +1,6 @@
 from nose.tools import eq_
 
-from varcode.common import trim_shared_flanking_strings
+from varcode.string_helpers import trim_shared_flanking_strings
 
 def test_trim_shared_string_endings():
     # empty strings

diff --git a/test/test_vcf.py b/test/test_vcf.py
@@ -12,33 +12,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from varcode import load_variants
+from varcode import load_vcf
 
 VCF_FILENAME = "data/somatic_hg19_14muts.vcf"
 
 def test_vcf_reference_name():
-    variants = load_variants(VCF_FILENAME)
-    # the raw reference name can be a file path to the hg19 FASTA file
-    assert variants.reference_path and "hg19" in variants.reference_path, \
-        "Expected hg19 reference, got %s" % (variants.reference_path,)
+    variants = load_vcf(VCF_FILENAME)
     # after normalization, hg19 should be remapped to GRCh37
-    assert variants.reference_name == "GRCh37"
+    assert variants.reference_names() ==  { "GRCh37" }
 
 def test_vcf_number_entries():
     # there are 14 mutations listed in the VCF, make sure they are all parsed
-    variants = load_variants(VCF_FILENAME)
+    variants = load_vcf(VCF_FILENAME)
     assert len(variants) == 14, \
         "Expected 14 mutations, got %d" % (len(variants),)
 
-def _check_effect_gene_name(effect):
-    variant = effect.variant
+def _check_variant_gene_name(variant):
     expected_gene_names = variant.info['GE']
-    gene_names = [gene.name for gene in effect.genes]
-    assert expected_gene_names == gene_names, \
+    assert variant.gene_names() == expected_gene_names, \
         "Expected gene name %s for variant %s, got %s" % (
-            expected_gene_name, variant, gene_names)
+            expected_gene_names, variant, variant.gene_names())
 
 def test_vcf_gene_names():
-    variants = load_variants(VCF_FILENAME)
-    for effect in variants.variant_effects():
-        yield (_check_effect_gene_name, effect)
+    variants = load_vcf(VCF_FILENAME)
+    for variant in variants:
+        yield (_check_variant_gene_name, variant)
diff --git a/varcode/__init__.py b/varcode/__init__.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .core_logic import infer_transcript_effect
+from .effects import *
 from .effect_ordering import effect_priority, top_priority_transcript_effect
-from .load_variants import load_variants
-from .transcript_mutation_effects import *
+from .maf import load_maf, load_maf_dataframe
 from .variant import Variant
-from .variant_annotator import VariantAnnotator
 from .variant_collection import VariantCollection
+from .vcf import load_vcf