Merge branch 'master' of https://github.com/jamescasbon/PyVCF into lenna

Conflicts: vcf/parser.py vcf/test/test_vcf.py
jamescasbon · Feb 22, 2014 · ba00d83 · ba00d83
2 parents 49f8897 + d1a9fdc
commit ba00d83
Show file tree

Hide file tree

Showing 28 changed files with 1,238 additions and 188 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,4 @@ docs/_build
 .DS_Store
 vcf/cparse.c
 vcf/cparse.so
+.coverage
diff --git a/.travis.yml b/.travis.yml
@@ -4,9 +4,10 @@ python:
   - "2.6"
   - "2.7"
   - "3.2"
+  - "3.3"
   - "pypy"
 install:
-  - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install --use-mirrors pysam argparse ordereddict; fi"
-  - "if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then pip install --use-mirrors pysam; fi"
+  - "if [[ $TRAVIS_PYTHON_VERSION == '2.6' ]]; then pip install --use-mirrors cython && pip install --use-mirrors pysam argparse counter ordereddict; fi"
+  - "if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then pip install --use-mirrors cython && pip install --use-mirrors pysam; fi"
   - python setup.py install
 script: python setup.py test
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include vcf *.pyx
diff --git a/README.rst b/README.rst
@@ -14,7 +14,7 @@ The main interface is the class: ``Reader``.  It takes a file-like
 object and acts as a reader::
 
     >>> import vcf
-    >>> vcf_reader = vcf.Reader(open('vcf/test/example-4.0.vcf', 'rb'))
+    >>> vcf_reader = vcf.Reader(open('vcf/test/example-4.0.vcf', 'r'))
     >>> for record in vcf_reader:
     ...     print record
     Record(CHROM=20, POS=14370, REF=G, ALT=[A])
@@ -49,7 +49,7 @@ one-entry Python lists (see, e.g., ``Record.ALT``).  Semicolon-delimited lists
 of key=value pairs are converted to Python dictionaries, with flags being given
 a ``True`` value. Integers and floats are handled exactly as you'd expect::
 
-    >>> vcf_reader = vcf.Reader(open('vcf/test/example-4.0.vcf', 'rb'))
+    >>> vcf_reader = vcf.Reader(open('vcf/test/example-4.0.vcf', 'r'))
     >>> record = vcf_reader.next()
     >>> print record.POS
     14370
@@ -65,10 +65,10 @@ examine properties of interest::
     3 1.0 0
     >>> print record.num_hom_ref, record.num_het, record.num_hom_alt
     1 1 1
-    >>> print record.nucl_diversity, record.aaf
-    0.6 0.5
+    >>> print record.nucl_diversity, record.aaf, record.heterozygosity
+    0.6 [0.5] 0.5
     >>> print record.get_hets()
-    [Call(sample=NA00002, GT=1|0, HQ=[51, 51], DP=8, GQ=48)]
+    [Call(sample=NA00002, CallData(GT=1|0, GQ=48, DP=8, HQ=[51, 51]))]
     >>> print record.is_snp, record.is_indel, record.is_transition, record.is_deletion
     True False True False
     >>> print record.var_type, record.var_subtype
@@ -101,7 +101,7 @@ call data in ``data``::
      >>> print call.sample
      NA00001
      >>> print call.data
-     {'GT': '0|0', 'HQ': [58, 50], 'DP': 3, 'GQ': 49}
+     CallData(GT=0|0, GQ=49, DP=3, HQ=[58, 50])
 
 Please note that as of release 0.4.0, attributes known to have single values (such as
 ``DP`` and ``GQ`` above) are returned as values.  Other attributes are returned
@@ -134,7 +134,7 @@ For example::
 
 ALT records are actually classes, so that you can interrogate them::
 
-    >>> reader = vcf.Reader(file('vcf/test/example-4.1-bnd.vcf'))
+    >>> reader = vcf.Reader(open('vcf/test/example-4.1-bnd.vcf'))
     >>> _ = reader.next(); row = reader.next()
     >>> print row
     Record(CHROM=1, POS=2, REF=T, ALT=[T[2:3[])
@@ -146,22 +146,22 @@ Random access is supported for files with tabix indexes.  Simply call fetch for
 region you are interested in::
 
     >>> vcf_reader = vcf.Reader(filename='vcf/test/tb.vcf.gz')
-    >>> for record in vcf_reader.fetch('20', 1110696, 1230237):
+    >>> for record in vcf_reader.fetch('20', 1110696, 1230237):  # doctest: +SKIP
     ...     print record
     Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T])
     Record(CHROM=20, POS=1230237, REF=T, ALT=[None])
 
 Or extract a single row::
 
-    >>> print vcf_reader.fetch('20', 1110696)
+    >>> print vcf_reader.fetch('20', 1110696)  # doctest: +SKIP
     Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T])
 
 
 The ``Writer`` class provides a way of writing a VCF file.  Currently, you must specify a
 template ``Reader`` which provides the metadata::
 
     >>> vcf_reader = vcf.Reader(filename='vcf/test/tb.vcf.gz')
-    >>> vcf_writer = vcf.Writer(file('/dev/null', 'w'), vcf_reader)
+    >>> vcf_writer = vcf.Writer(open('/dev/null', 'w'), vcf_reader)
     >>> for record in vcf_reader:
     ...     vcf_writer.write_record(record)
 

diff --git a/docs/HISTORY.rst b/docs/HISTORY.rst
@@ -17,6 +17,72 @@ New features should have test code sent with them.
 Changes
 =======
 
+0.6.7 Release
+-------------
+
+* Include missing .pyx files 
+
+0.6.6 Release
+-------------
+
+* Better walk_together record ordering (thanks @datagram, #141)
+
+0.6.5 Release
+-------------
+
+* Better contig handling (#115, #116, #119 thanks Martijn)
+* INFO lines with type character (#120, #121 thanks @AndrewUzilov, Martijn)
+* Single breakends fix (#126 thanks @pkrushe)
+* Speedup by losing ordering of INFO (#128 thanks Martijn)
+* HOMSEQ and other missing fields in INFO (#130 thanks Martijn)
+* Add aaf property (thanks @mgymrek, #131)
+* Custom equality for walk_together, thanks bow #132
+* Change default line encoding to '\n'
+* Improved __eq__ (#134, thanks bow)
+
+
+0.6.4 Release
+-------------
+
+* Handle INFO fields with multiple values, thanks
+* Support writing records without GT data #88, thanks @bow
+* Pickleable call data #112, thanks @superbobry
+* Write files without FORMAT #95 thanks Martijn
+* Strict whitespace mode, thanks Martijn, Lee Lichtenstein and Manawsi Gupta
+* Add support for contigs in header, thanks @gcnh and Martijn
+* Fix GATK header parsing, thanks @alimanfoo
+
+0.6.3 Release
+-------------
+
+* cython port of #79
+* correct writing of meta lines #84 
+
+0.6.2 Release
+-------------
+
+* issues #78, #79 (thanks Sean, Brad) 
+
+0.6.1 Release
+-------------
+
+* Add strict whitespace mode for well formed VCFs with spaces 
+  in sample names (thanks Marco)
+* Ignore blank lines in files (thanks Martijn)
+* Tweaks for handling missing data (thanks Sean)
+* bcftools tests (thanks Martijn)
+* record.FILTER is always a list
+
+0.6.0 Release
+-------------
+
+* Backwards incompatible change: _Call.data is now a 
+  namedtuple (previously it was a dict)
+* Optional cython version, much improved performance.  
+* Improvements to writer (thanks @cmclean)
+* Improvements to inheritance of classes (thanks @lennax)
+
+
 0.5.0 Release
 -------------
 

diff --git a/scripts/vcf_filter.py b/scripts/vcf_filter.py
@@ -162,7 +162,7 @@ def addfilt(filt):
         if output_record:
             # use PASS only if other filter names appear in the FILTER column
             #FIXME: is this a good idea?
-            if record.FILTER == '.' and not drop_filtered: record.FILTER = 'PASS'
+            if record.FILTER is None and not drop_filtered: record.FILTER = 'PASS'
             output.write_record(record)
 
 if __name__ == '__main__': main()
diff --git a/scripts/vcf_melt b/scripts/vcf_melt
@@ -39,7 +39,9 @@ for record in reader:
 
     for sample in record.samples:
         row = [sample.sample]
-        row += [flatten(sample.data.get(x, None)) for x in formats]
+        # Format fields not present will simply end up "blank"
+        # in the output
+        row += [flatten(getattr(sample.data, x, None)) for x in formats]
         row += [record.FILTER or '.']
         row += fixed
         row += info_row

diff --git a/setup.py b/setup.py
@@ -16,9 +16,12 @@
 except ImportError:
     requires.append('argparse')
 
-
+import collections
+try:
+    collections.Counter
+except AttributeError:
+    requires.append('counter')
 try:
-    import collections
     collections.OrderedDict
 except AttributeError:
     requires.append('ordereddict')

diff --git a/tox.ini b/tox.ini
@@ -4,7 +4,7 @@
 # and then run "tox" from this directory.
 
 [tox]
-envlist = py26, py27, py32
+envlist = py26, py27, py32, py33
 
 [testenv]
 commands =
@@ -14,7 +14,9 @@ commands =
 [testenv:py26]
 deps =
     argparse
+    counter
     ordereddict
+    cython
     pysam
 
 [testenv:py27]
@@ -23,6 +25,9 @@ deps =
     cython
 
 [testenv:py32]
-deps = 
+deps =
     cython
 
+[testenv:py33]
+deps =
+    cython
diff --git a/vcf/__init__.py b/vcf/__init__.py
@@ -66,8 +66,8 @@
     3 1.0 0
     >>> print record.num_hom_ref, record.num_het, record.num_hom_alt
     1 1 1
-    >>> print record.nucl_diversity, record.aaf
-    0.6 0.5
+    >>> print record.nucl_diversity, record.aaf, record.heterozygosity
+    0.6 [0.5] 0.5
     >>> print record.get_hets()
     [Call(sample=NA00002, CallData(GT=1|0, GQ=48, DP=8, HQ=[51, 51]))]
     >>> print record.is_snp, record.is_indel, record.is_transition, record.is_deletion
@@ -178,4 +178,4 @@
 from vcf.parser import RESERVED_INFO, RESERVED_FORMAT
 from vcf.sample_filter import SampleFilter
 
-VERSION = '0.5.0'
+VERSION = '0.6.7'
diff --git a/vcf/cparse.pyx b/vcf/cparse.pyx
@@ -48,7 +48,10 @@ def parse_samples(
             if entry_num == 1 or ',' not in vals:
 
                 if entry_type == INTEGER:
-                    sampdat[j] = int(vals)
+                    try:
+                        sampdat[j] = int(vals)
+                    except ValueError:
+                        sampdat[j] = float(vals)
                 elif entry_type == FLOAT or entry_type == NUMERIC:
                     sampdat[j] = float(vals)
                 else:
@@ -62,7 +65,10 @@ def parse_samples(
             vals = vals.split(',')
 
             if entry_type == INTEGER:
-                sampdat[j] = _map(int, vals)
+                try:
+                    sampdat[j] = _map(int, vals)
+                except ValueError:
+                    sampdat[j] = map(float, vals)
             elif entry_type == FLOAT or entry_type == NUMERIC:
                 sampdat[j] = _map(float, vals)
             else:

diff --git a/vcf/filters.py b/vcf/filters.py
@@ -138,11 +138,12 @@ def __call__(self, record):
     def bias_test(self, calls):
         calls = [x for x in calls if x.called]
         #TODO: single genotype assumption
+
         try:
             # freebayes
             ra = robjects.IntVector([x['RO'][0] for x in calls])
             aa = robjects.IntVector([x['AO'][0] for x in calls])
-        except KeyError:
+        except AttributeError:
             # GATK
             ra = robjects.IntVector([x['AD'][0] for x in calls])
             aa = robjects.IntVector([x['AD'][1] for x in calls])