Skip to content

Commit

Permalink
Merge pull request #160 from hammerlab/fix-issue-158
Browse files (browse the repository at this point in the history)
Fix load_vcf_fast for sample names containing a space character
  • Loading branch information
iskandr committed Jul 15, 2016
2 parents 4bb441f + b965d71 commit 78f789e
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 5 deletions.
2 changes: 1 addition & 1 deletion test/data/multiallelic.vcf
Expand Up @@ -12,5 +12,5 @@
##contig=<ID=chr15,length=102531392>
##contig=<ID=chr16,length=90354753>
##contig=<ID=chr17,length=81195210>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis
chr1 1431105 rs199599542 A C,G 593.69 PASS DP=17;GE=Wuzzle GT 0/1
29 changes: 29 additions & 0 deletions test/data/somatic_hg19_14muts.space_in_sample_name.vcf
@@ -0,0 +1,29 @@
##fileformat=VCFv4.1
##reference=file:///projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta
##INFO=<ID=GE,Number=.,Type=String,Description="HGNC Gene Symbol (could be more than one)">
##INFO=<ID=EG,Number=.,Type=String,Description="Entrez Gene ID (could be more than one)">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=chrM,length=16571>
##contig=<ID=chr1,length=249250621>
##contig=<ID=chr10,length=135534747>
##contig=<ID=chr11,length=135006516>
##contig=<ID=chr12,length=133851895>
##contig=<ID=chr14,length=107349540>
##contig=<ID=chr15,length=102531392>
##contig=<ID=chr16,length=90354753>
##contig=<ID=chr17,length=81195210>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT metastasis foo
chr1 53513530 . A C . . GE=SCP2;EG=6342 GT 0/1
chr1 228295398 . G T . . GE=MRPL55;EG=128308 GT 0/1
chr10 49658590 . T C . . GE=ARHGAP22;EG=58504 GT 0/1
chr10 51585166 . G T . . GE=NCOA4;EG=8031 GT 0/1
chr10 96709040 . A C . . GE=CYP2C9;EG=1559 GT 0/1
chr10 119134281 . G T . . GE=PDZD8;EG=118987 GT 0/1
chr11 118244286 . G G . . GE=UBE4A;EG=9354 GT 0/1
chr12 14794076 . C A . . GE=GUCY2C;EG=2984 GT 0/1
chr12 25398284 . C G . . GE=KRAS;EG=3845 GT 0/1
chr12 42778752 . T A . . GE=PPHLN1;EG=51535 GT 0/1
chr14 31144202 . A C . . GE=SCFD1;EG=23256 GT 0/1
chr16 25704209 . G A . . GE=HS3ST4;EG=9951 GT 0/1
chr17 7577548 . C CA . . GE=TP53;EG=7157 GT 0/1
chr17 36731197 . C AAT . . GE=SRCIN1;EG=80725 GT 0/1
1 change: 1 addition & 0 deletions test/test_vcf.py
Expand Up @@ -73,6 +73,7 @@ def test_vcf_reference_name():
def test_pandas_and_pyvcf_implementations_equivalent():
paths = [
{'path': data_path("somatic_hg19_14muts.vcf")},
{'path': data_path("somatic_hg19_14muts.space_in_sample_name.vcf")},
{'path': "/" + data_path("somatic_hg19_14muts.vcf")},
{'path': data_path("somatic_hg19_14muts.vcf.gz")},
{'path': data_path("multiallelic.vcf")},
Expand Down
13 changes: 9 additions & 4 deletions varcode/vcf.py
Expand Up @@ -151,8 +151,8 @@ def load_vcf_fast(
genome : {pyensembl.Genome, reference name, Ensembl version int}, optional
Optionally pass in a PyEnsembl Genome object, name of reference, or
PyEnsembl release version to specify the reference associated with a VCF
(otherwise infer reference from VCF using reference_vcf_key)
PyEnsembl release version to specify the reference associated with a
VCF (otherwise infer reference from VCF using reference_vcf_key)
reference_vcf_key : str, optional
Name of metadata field which contains path to reference FASTA
Expand Down Expand Up @@ -469,7 +469,9 @@ def __init__(self, path):
self.path = path
parsed_path = parse_url_or_path(path)
if not parsed_path.scheme or parsed_path.scheme.lower() == 'file':
self.vcf_reader = pyvcf.Reader(filename=parsed_path.path)
self.vcf_reader = pyvcf.Reader(
filename=parsed_path.path,
strict_whitespace=True)
elif parsed_path.scheme.lower() in ("http", "https", "ftp"):
self._to_close = response = requests.get(path, stream=True)
response.raise_for_status() # raise error on 404, etc.
Expand All @@ -478,7 +480,10 @@ def __init__(self, path):
response.iter_content())
else:
lines = response.iter_lines(decode_unicode=True)
self.vcf_reader = pyvcf.Reader(fsock=lines, compressed=False)
self.vcf_reader = pyvcf.Reader(
fsock=lines,
compressed=False,
strict_whitespace=True)
else:
raise ValueError("Unsupported scheme: %s" % parsed_path.scheme)

Expand Down

0 comments on commit 78f789e

Please sign in to comment.