Permalink
Browse files

update version to 0.3.0, doc improvements, fixes issue #16

  • Loading branch information...
1 parent 29b3373 commit 3811a8160b38b52e0914237df50fed8bc01070b7 James Casbon committed Feb 8, 2012
Showing with 104 additions and 32 deletions.
  1. +25 −11 README.rst
  2. +13 −4 docs/HISTORY.rst
  3. +3 −2 docs/conf.py
  4. +1 −0 docs/index.rst
  5. +12 −1 setup.py
  6. +21 −0 test/issue-16.vcf
  7. +9 −2 test/test_vcf.py
  8. +20 −12 vcf.py
View
36 README.rst
@@ -8,7 +8,7 @@ specified in the meta-information lines -- specifically the ##INFO and
against the reserved types mentioned in the spec. Failing that, it will just
return strings.
-There is currently one piece of interface: ``Reader``. It takes a file-like
+The main interface is the class ``Reader``. It takes a file-like
object and acts as a reader::
>>> import vcf
@@ -18,13 +18,12 @@ object and acts as a reader::
Record(CHROM=20, POS=14370, REF=G, ALT=['A'])
Record(CHROM=20, POS=17330, REF=T, ALT=['A'])
Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
- Record(CHROM=20, POS=1230237, REF=T, ALT=['.'])
+ Record(CHROM=20, POS=1230237, REF=T, ALT=[None])
Record(CHROM=20, POS=1234567, REF=GTCT, ALT=['G', 'GTACT'])
This produces a great deal of information, but it is conveniently accessed.
-The attributes of a Record are the 8 fixed fields from the VCF spec plus two
-more. That is:
+The attributes of a Record are the 8 fixed fields from the VCF spec:
* ``Record.CHROM``
* ``Record.POS``
@@ -35,13 +34,13 @@ more. That is:
* ``Record.FILTER``
* ``Record.INFO``
-plus three more attributes to handle genotype information:
+plus attributes to handle genotype information:
* ``Record.FORMAT``
* ``Record.samples``
* ``Record.genotype``
-``samples`` and ``genotypes``, not being the title of any column, is left lowercase. The format
+``samples`` and ``genotype``, not being the title of any column, are left lowercase. The format
of the fixed fields is from the spec. Comma-separated lists in the VCF are
converted to lists. In particular, one-entry VCF lists are converted to
one-entry Python lists (see, e.g., ``Record.ALT``). Semicolon-delimited lists
@@ -57,7 +56,7 @@ a ``True`` value. Integers and floats are handled exactly as you'd expect::
>>> print record.INFO['AF']
[0.5]
-There are a number of convienience functions for each ``Record`` allowing you to
+There are a number of convenience methods and properties for each ``Record`` allowing you to
examine properties of interest::
>>> print record.num_called, record.call_rate, record.num_unknown
@@ -67,7 +66,8 @@ examine properties of interest::
>>> print record.nucl_diversity, record.aaf
0.6 0.5
>>> print record.get_hets()
- [Call(sample=NA00002, GT=1|0)]
+ [Call(sample=NA00002, GT=1|0, GQ=[48])]
+
``record.FORMAT`` will be a string specifying the format of the genotype
fields. In case the FORMAT column does not exist, ``record.FORMAT`` is
@@ -126,13 +126,27 @@ Random access is supported for files with tabix indexes. Simply call fetch for
the region you are interested in::
>>> vcf_reader = vcf.Reader(filename='test/tb.vcf.gz')
- >>> for record in vcf_reader.fetch('20', 1110696-1, 1230237):
+ >>> for record in vcf_reader.fetch('20', 1110696, 1230237):
... print record
Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
- Record(CHROM=20, POS=1230237, REF=T, ALT=['.'])
+ Record(CHROM=20, POS=1230237, REF=T, ALT=[None])
+
+Or extract a single row::
+
+ >>> print vcf_reader.fetch('20', 1110696)
+ Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
+
+
+The ``Writer`` class provides a way of writing a VCF file. Currently, you must specify a
+template ``Reader`` which provides the metadata::
+
+ >>> vcf_reader = vcf.Reader(filename='test/tb.vcf.gz')
+ >>> vcf_writer = vcf.Writer(file('/dev/null', 'w'), vcf_reader)
+ >>> for record in vcf_reader:
+ ... vcf_writer.write_record(record)
An extensible script is available to filter vcf files in vcf_filter.py. VCF filters
declared by other packages will be available for use in this script. Please
-see FILTERS.md for full description.
+see :doc:`FILTERS` for a full description.
View
17 docs/HISTORY.rst
@@ -1,14 +1,22 @@
+Development
+===========
+
+Please use the repository at GitHub: https://github.com/jamescasbon/PyVCF/
+Pull requests are gladly accepted.
+Issues should be reported at the GitHub issue tracker.
+
Changes
=======
-Pending
--------
+0.3.0 Release
+-------------
* Fix setup.py for python < 2.7
-* Add ``__eq__`` to ``_Record``
+* Add ``__eq__`` to ``_Record`` and ``_Call``
* Add ``is_het`` and ``is_variant`` to ``_Call``
* Drop aggressive parse mode: we're always aggressive.
-* Add tabix fetch for single calls
+* Add tabix fetch for single calls, fix one->zero based indexing
+* Add ``prepend_chr`` mode for ``Reader`` to add ``chr`` to CHROM attributes
0.2.2 Release
-------------
@@ -44,5 +52,6 @@ Contributions
-------------
Project started by @jdoughertyii and taken over by @jamescasbon on 12th January 2011.
+Contributions from @arq5x, @brentp, @martijnvermaat, @ian1roberts.
View
5 docs/conf.py
@@ -48,9 +48,10 @@
# built documents.
#
# The short X.Y version.
-version = '0.2.2'
+import vcf
+version = vcf.VERSION
# The full version, including alpha/beta/rc tags.
-release = '0.2.2'
+release = vcf.VERSION
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
View
1 docs/index.rst
@@ -12,6 +12,7 @@ Contents:
FILTERS
HISTORY
+
Indices and tables
==================
View
13 setup.py
@@ -9,6 +9,7 @@
except ImportError:
requires.append('argparse')
+import vcf
setup(
name='PyVCF',
@@ -17,6 +18,7 @@
author='James Casbon and @jdoughertyii',
author_email='casbon@gmail.com',
description='Variant Call Format (VCF) parser for python',
+ long_description=vcf.__doc__,
test_suite='test.test_vcf.suite',
requires=requires,
entry_points = {
@@ -26,5 +28,14 @@
]
},
url='https://github.com/jamescasbon/PyVCF',
- version='0.2.2'
+ version=vcf.VERSION,
+ classifiers = [
+ 'Development Status :: 3 - Alpha',
+ 'Intended Audience :: Developers',
+ 'Intended Audience :: Science/Research',
+ 'Operating System :: OS Independent',
+ 'Programming Language :: Python',
+ 'Topic :: Scientific/Engineering',
+ ],
+ keywords='bioinformatics',
)
View
21 test/issue-16.vcf
@@ -0,0 +1,21 @@
+##fileformat=VCFv4.0
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=1000GenomesPilot-NCBI36
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##INFO=<ID=AC,Number=A,Type=Integer,Description="Total number of alternate alleles in called genotypes">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
+20 1234568 . G . . PASS NS=3;DP=9;AA=G GT ./. ./. ./.
+
View
11 test/test_vcf.py
@@ -227,8 +227,8 @@ def testFetchSite(self):
site = self.reader.fetch('20', 14369)
assert site is None
-
-
+
+
class TestOpenMethods(unittest.TestCase):
@@ -299,6 +299,12 @@ def testApplyMultipleFilters(self):
assert 'mgq50' in reader.filters
assert 'sq30' in reader.filters
+class TestRegression(unittest.TestCase):
+
+ def test_issue_16(self):
+ reader = vcf.Reader(fh('issue-16.vcf'))
+ assert reader.next().QUAL == None
+
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGatkOutput))
@@ -310,3 +316,4 @@ def testApplyMultipleFilters(self):
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kg))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRecord))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCall))
+suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRegression))
View
32 vcf.py
@@ -19,7 +19,7 @@
Record(CHROM=20, POS=14370, REF=G, ALT=['A'])
Record(CHROM=20, POS=17330, REF=T, ALT=['A'])
Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
- Record(CHROM=20, POS=1230237, REF=T, ALT=['.'])
+ Record(CHROM=20, POS=1230237, REF=T, ALT=[None])
Record(CHROM=20, POS=1234567, REF=GTCT, ALT=['G', 'GTACT'])
@@ -67,7 +67,8 @@
>>> print record.nucl_diversity, record.aaf
0.6 0.5
>>> print record.get_hets()
- [Call(sample=NA00002, GT=1|0)]
+ [Call(sample=NA00002, GT=1|0, GQ=[48])]
+
``record.FORMAT`` will be a string specifying the format of the genotype
fields. In case the FORMAT column does not exist, ``record.FORMAT`` is
@@ -126,24 +127,29 @@
the region you are interested in::
>>> vcf_reader = vcf.Reader(filename='test/tb.vcf.gz')
- >>> for record in vcf_reader.fetch('20', 1110696-1, 1230237):
+ >>> for record in vcf_reader.fetch('20', 1110696, 1230237):
... print record
Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
- Record(CHROM=20, POS=1230237, REF=T, ALT=['.'])
+ Record(CHROM=20, POS=1230237, REF=T, ALT=[None])
+
+Or extract a single row::
+
+ >>> print vcf_reader.fetch('20', 1110696)
+ Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
The ``Writer`` class provides a way of writing a VCF file. Currently, you must specify a
template ``Reader`` which provides the metadata::
>>> vcf_reader = vcf.Reader(filename='test/tb.vcf.gz')
- >>> vcf_writer = vcf.Writer(file('/dev/null', 'w'))
+ >>> vcf_writer = vcf.Writer(file('/dev/null', 'w'), vcf_reader)
>>> for record in vcf_reader:
- ... print r
+ ... vcf_writer.write_record(record)
An extensible script is available to filter vcf files in vcf_filter.py. VCF filters
declared by other packages will be available for use in this script. Please
-see FILTERS.md for full description.
+see :doc:`FILTERS` for a full description.
'''
import collections
@@ -153,13 +159,15 @@
import sys
import itertools
-
try:
import pysam
except ImportError:
pysam = None
+VERSION = '0.3.0'
+
+
# Metadata parsers/constants
RESERVED_INFO = {
'AA': 'String', 'AC': 'Integer', 'AF': 'Float', 'AN': 'Integer',
@@ -391,7 +399,7 @@ def __eq__(self, other):
def __iter__(self):
return iter(self.samples)
-
+
def __str__(self):
return "Record(CHROM=%(CHROM)s, POS=%(POS)s, REF=%(REF)s, ALT=%(ALT)s)" % self.__dict__
@@ -692,7 +700,7 @@ def next(self):
alt = self._map(str, row[4].split(','))
if row[5] == '.':
- qual = '.'
+ qual = None
else:
qual = float(row[5]) if '.' in row[5] else int(row[5])
filt = row[6].split(';') if ';' in row[6] else row[6]
@@ -732,14 +740,14 @@ def fetch(self, chrom, start, end=None):
# not sure why tabix needs position -1
start = start - 1
-
+
if end is None:
self.reader = self._tabix.fetch(chrom, start, start+1)
try:
return self.next()
except StopIteration:
return None
-
+
self.reader = self._tabix.fetch(chrom, start, end)
return self

0 comments on commit 3811a81

Please sign in to comment.