Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions test/test_cosmic_mutations.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pyensembl import EnsemblRelease
from varcode import (
VariantAnnotator,
Variant,
# transcript effects
Substitution,
Deletion,
Insertion,
Expand All @@ -9,15 +10,15 @@
FrameShiftTruncation,
)

annot = VariantAnnotator(75)
ensembl = EnsemblRelease(75)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd suggest renaming this default_ensembl

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you say more about why that's a better name?
On Feb 20, 2015 11:35 AM, "timodonnell" notifications@github.com wrote:

In test/test_cosmic_mutations.py
#20 (comment):

@@ -9,11 +10,11 @@
FrameShiftTruncation,
)

-annot = VariantAnnotator(75)
+ensembl = EnsemblRelease(75)

I'd suggest renaming this default_ensembl


Reply to this email directly or view it on GitHub
https://github.com/hammerlab/varcode/pull/20/files#r25082212.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's used only as a default parameter, right? Otherwise it looks to me like we're declaring that this module uses a particular Ensembl release.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, wait, sorry, never mind. Somehow I didn't realize this was in a test method.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍


def _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id):
    """Return the effect of a variant on a single transcript.

    Builds a Variant from the given locus and alleles using the module-level
    `ensembl` release, computes its effects, and looks up the entry for
    `transcript_id` in the resulting `transcript_effect_dict`.

    Raises AssertionError if `transcript_id` is not among the transcripts
    in the variant's effect collection.
    """
    variant = Variant(chrom, pos, dna_ref, dna_alt, ensembl=ensembl)
    effect_collection = variant.effects()
    transcript_effects = effect_collection.transcript_effect_dict
    # The failure message must format `effect_collection`: the earlier
    # `result` name no longer exists, so referencing it would raise a
    # NameError and hide the real assertion failure.
    assert transcript_id in transcript_effects, \
        "Expected transcript ID %s for variant %s not found in %s" % (
            transcript_id, variant, effect_collection)
    return transcript_effects[transcript_id]

def _substitution(chrom, pos, dna_ref, dna_alt, transcript_id, aa_ref, aa_alt):
effect = _get_effect(chrom, pos, dna_ref, dna_alt, transcript_id)
Expand Down
14 changes: 8 additions & 6 deletions test/test_dbnsfp_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,22 @@
# limitations under the License.

import pandas as pd
from varcode import VariantAnnotator, Substitution, Variant
from pyensembl import EnsemblRelease
from varcode import Substitution, Variant

annot = VariantAnnotator(75)
ensembl = EnsemblRelease(75)

def validate_transcript_mutation(
ensembl_transcript,
chrom, dna_position,
dna_ref, dna_alt,
aa_pos, aa_alt):
result = annot.effect(
Variant(chrom, dna_position, dna_ref, dna_alt))
assert ensembl_transcript in result.transcript_effects, \
variant = Variant(chrom, dna_position, dna_ref, dna_alt, ensembl)
result = variant.effects()

assert ensembl_transcript in result.transcript_effect_dict, \
"%s not found in %s" % (ensembl_transcript, result)
effect = result.transcript_effects[ensembl_transcript]
effect = result.transcript_effect_dict[ensembl_transcript]
assert (
isinstance(effect, Substitution) and
effect.aa_pos + 1 == aa_pos and
Expand Down
11 changes: 6 additions & 5 deletions test/test_drop_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from pyensembl import EnsemblRelease
from varcode import Variant, VariantCollection

def test_drop_duplicates():
v1 = Variant("1", 3000, "A", "G")
v1_copy = Variant("1", 3000, "A", "G")
v2 = Variant("2", 10, "G", "T")
ensembl = EnsemblRelease(78)
v1 = Variant("1", 3000, "A", "G", ensembl=ensembl)
v1_copy = Variant("1", 3000, "A", "G", ensembl=ensembl)
v2 = Variant("2", 10, "G", "T", ensembl=ensembl)
collection_with_duplicates = VariantCollection(
variants=[v1, v1, v1_copy, v2],
reference_name="hg19")
variants=[v1, v1, v1_copy, v2])
assert len(collection_with_duplicates) == 4
collection_without_duplicates = collection_with_duplicates.drop_duplicates()
assert len(collection_without_duplicates) == 2
22 changes: 13 additions & 9 deletions test/test_maf.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
from varcode import load_variants, VariantCollection, Variant
from nose.tools import eq_
from pyensembl import EnsemblRelease
from varcode import load_maf, VariantCollection, Variant

def test_maf():
variant_collection_from_maf = load_variants("data/tcga_ov.head.maf")
eq_(variant_collection_from_maf.reference_name, "GRCh37")
ensembl = EnsemblRelease(75)
variant_collection_from_maf = load_maf("data/tcga_ov.head.maf")
expected_variants = [
Variant(1, 1650797, "A", "G"),
Variant(1, 231401797, "A", "C"),
Variant(1, 23836447, "C", "A"),
Variant(11,124617502, "C", "G"),
Variant(1, 1650797, "A", "G", ensembl),
Variant(1, 231401797, "A", "C", ensembl),
Variant(1, 23836447, "C", "A", ensembl),
Variant(11,124617502, "C", "G", ensembl),
]
eq_(len(variant_collection_from_maf), len(expected_variants))
for v1, v2 in zip(expected_variants, variant_collection_from_maf):
eq_(v1, v2)
for v_expect, v_maf in zip(expected_variants, variant_collection_from_maf):
eq_(v_expect, v_maf)
gene_name = v_maf.info['Hugo_Symbol']
assert any(gene.name == gene_name for gene in v_maf.genes()), \
"Expected gene name %s but got %s" % (gene_name, v_maf.genes())
20 changes: 11 additions & 9 deletions test/test_mutation_effects.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from varcode import (
VariantAnnotator,
Variant,
infer_transcript_effect,
#
# transcript effects
#
NoncodingTranscript,
IncompleteTranscript,
FivePrimeUTR,
Expand All @@ -17,8 +18,9 @@
FrameShift,
# TODO: SpliceDonor, SpliceReceptor
)
from pyensembl import EnsemblRelease

annot = VariantAnnotator(ensembl_release=77)
ensembl = EnsemblRelease(release=78)

def test_incomplete():
# transcript EGFR-009 (ENST00000450046 in Ensembl 77)
Expand All @@ -27,10 +29,10 @@ def test_incomplete():
# first exon begins: ATCATTCCTTTGGGCCTAGGA

# change the first nucleotide of the 5' UTR A>T
variant = Variant("7", 55109723, "A", "T")
variant = Variant("7", 55109723, "A", "T", ensembl=ensembl)

transcript = annot.ensembl.transcript_by_id("ENST00000450046")
effect = infer_transcript_effect(variant, transcript)
transcript = ensembl.transcript_by_id("ENST00000450046")
effect = variant.transcript_effect(transcript)
assert isinstance(effect, IncompleteTranscript), \
"Expected %s on %s to be IncompleteTranscript, got %s" % (
variant, transcript, effect)
Expand All @@ -43,9 +45,9 @@ def test_start_loss():
# which is 55,019,034 + 244 of chr7 = 55019278
# change the first nucleotide of the start codon A>T
# making what used to be a start codon into TTG (Leucine)
variant = Variant("7", 55019278, "A", "T")
transcript = annot.ensembl.transcript_by_id("ENST00000420316")
effect = infer_transcript_effect(variant, transcript)
variant = Variant("7", 55019278, "A", "T", ensembl=ensembl)
transcript = ensembl.transcript_by_id("ENST00000420316")
effect = variant.transcript_effect(transcript)
assert isinstance(effect, StartLoss), \
"Expected StartLoss, got %s for %s on %s" % (
effect, variant, transcript, )
2 changes: 1 addition & 1 deletion test/test_trim_shared.py → test/test_string_helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from nose.tools import eq_

from varcode.common import trim_shared_flanking_strings
from varcode.string_helpers import trim_shared_flanking_strings

def test_trim_shared_string_endings():
# empty strings
Expand Down
25 changes: 10 additions & 15 deletions test/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,33 +12,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from varcode import load_variants
from varcode import load_vcf

VCF_FILENAME = "data/somatic_hg19_14muts.vcf"

def test_vcf_reference_name():
variants = load_variants(VCF_FILENAME)
# the raw reference name can be a file path to the hg19 FASTA file
assert variants.reference_path and "hg19" in variants.reference_path, \
"Expected hg19 reference, got %s" % (variants.reference_path,)
variants = load_vcf(VCF_FILENAME)
# after normalization, hg19 should be remapped to GRCh37
assert variants.reference_name == "GRCh37"
assert variants.reference_names() == { "GRCh37" }

def test_vcf_number_entries():
# there are 14 mutations listed in the VCF, make sure they are all parsed
variants = load_variants(VCF_FILENAME)
variants = load_vcf(VCF_FILENAME)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding the assert one line below: not a big deal, and no need to change this in this PR, but I'd suggest using something like the NumPy testing library (`numpy.testing`) to avoid having to write out assertion failure strings like this.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can use nose.tools.eq_ but sometimes I find it more informative to have a string explaining where the value came from.

assert len(variants) == 14, \
"Expected 14 mutations, got %d" % (len(variants),)

def _check_variant_gene_name(variant):
    """Assert that a variant's gene names match its 'GE' info entry.

    `variant` must provide an `info` mapping containing a 'GE' key and a
    `gene_names()` method; the two values are compared for equality.

    Raises AssertionError, with a message naming the expected and actual
    values, when they differ.
    """
    expected_gene_names = variant.info['GE']
    gene_names = variant.gene_names()
    # The message must use `expected_gene_names`; the singular
    # `expected_gene_name` is undefined and would turn a test failure
    # into a NameError.
    assert gene_names == expected_gene_names, \
        "Expected gene name %s for variant %s, got %s" % (
            expected_gene_names, variant, gene_names)

def test_vcf_gene_names():
variants = load_variants(VCF_FILENAME)
for effect in variants.variant_effects():
yield (_check_effect_gene_name, effect)
variants = load_vcf(VCF_FILENAME)
for variant in variants:
yield (_check_variant_gene_name, variant)
7 changes: 3 additions & 4 deletions varcode/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from .core_logic import infer_transcript_effect
from .effects import *
from .effect_ordering import effect_priority, top_priority_transcript_effect
from .load_variants import load_variants
from .transcript_mutation_effects import *
from .maf import load_maf, load_maf_dataframe
from .variant import Variant
from .variant_annotator import VariantAnnotator
from .variant_collection import VariantCollection
from .vcf import load_vcf
Loading