Permalink
Browse files

Merge pull request #66 from lennax/lenna

Filter by sample
  • Loading branch information...
2 parents d1a9fdc + cbe8d90 commit 45513dd2c9ebacc1410f6f436aceee3f24d50f0d @jamescasbon committed Feb 23, 2014
Showing with 211 additions and 7 deletions.
  1. +39 −0 scripts/vcf_sample_filter.py
  2. +2 −1 setup.py
  3. +2 −1 vcf/__init__.py
  4. +5 −5 vcf/parser.py
  5. +115 −0 vcf/sample_filter.py
  6. +48 −0 vcf/test/test_vcf.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+
+# Author: Lenna X. Peterson
+# github.com/lennax
+# arklenna at gmail dot com
+
+import argparse
+import logging
+
+from vcf import SampleFilter
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("file", help="VCF file to filter")
+ parser.add_argument("-o", metavar="outfile",
+ help="File to write out filtered samples")
+ parser.add_argument("-f", metavar="filters",
+ help="Comma-separated list of sample indices or names \
+ to filter")
+ parser.add_argument("-i", "--invert", action="store_true",
+ help="Keep rather than discard the filtered samples")
+ parser.add_argument("-q", "--quiet", action="store_true",
+ help="Less output")
+
+ args = parser.parse_args()
+
+ if args.quiet:
+ log_level = logging.WARNING
+ else:
+ log_level = logging.INFO
+ logging.basicConfig(format='%(message)s', level=log_level)
+
+ sf = SampleFilter(infile=args.file, outfile=args.o,
+ filters=args.f, invert=args.invert)
+ if args.f is None:
+ print "Samples:"
+ for idx, val in enumerate(sf.samples):
+ print "{0}: {1}".format(idx, val)
View
@@ -47,7 +47,8 @@
setup(
name='PyVCF',
packages=['vcf', 'vcf.test'],
- scripts=['scripts/vcf_melt', 'scripts/vcf_filter.py'],
+ scripts=['scripts/vcf_melt', 'scripts/vcf_filter.py',
+ 'scripts/vcf_sample_filter.py'],
author='James Casbon and @jdoughertyii',
author_email='casbon@gmail.com',
description='Variant Call Format (VCF) parser for Python',
View
@@ -59,7 +59,7 @@
>>> print record.INFO['AF']
[0.5]
-There are a number of convienience methods and properties for each ``Record`` allowing you to
+There are a number of convenience methods and properties for each ``Record`` allowing you to
examine properties of interest::
>>> print record.num_called, record.call_rate, record.num_unknown
@@ -176,5 +176,6 @@
from vcf.parser import VCFReader, VCFWriter
from vcf.filters import Base as Filter
from vcf.parser import RESERVED_INFO, RESERVED_FORMAT
+from vcf.sample_filter import SampleFilter
VERSION = '0.6.7'
View
@@ -1,10 +1,11 @@
+import codecs
import collections
-import re
import csv
import gzip
-import sys
import itertools
-import codecs
+import os
+import re
+import sys
try:
from collections import OrderedDict
@@ -430,7 +431,6 @@ def _parse_samples(self, samples, samp_fmt, site):
# check whether we already know how to parse this format
if samp_fmt not in self._format_cache:
self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt)
-
samp_fmt = self._format_cache[samp_fmt]
if cparse:
@@ -601,7 +601,7 @@ def fetch(self, chrom, start, end=None):
class Writer(object):
- """ VCF Writer """
+ """VCF Writer. On Windows Python 2, open stream with 'wb'."""
# Reverse keys and values in header field count dictionary
counts = dict((v,k) for k,v in field_counts.iteritems())
View
@@ -0,0 +1,115 @@
+# Author: Lenna X. Peterson
+# github.com/lennax
+# arklenna at gmail dot com
+
+import logging
+import sys
+import warnings
+
+
+from parser import Reader, Writer
+
+
+class SampleFilter(object):
+ """
+ Modifies the vcf Reader to filter each row by sample as it is parsed.
+
+ """
+
+ def __init__(self, infile, outfile=None, filters=None, invert=False):
+ # Methods to add to Reader
+ def get_filter(self):
+ return self._samp_filter
+
+ def set_filter(self, filt):
+ self._samp_filter = filt
+ if filt:
+ self.samples = [val for idx, val in enumerate(self.samples)
+ if idx not in set(filt)]
+
+ def filter_samples(fn):
+ """Decorator function to filter sample parameter"""
+ def filt(self, samples, *args):
+ samples = [val for idx, val in enumerate(samples)
+ if idx not in set(self.sample_filter)]
+ return fn(self, samples, *args)
+ return filt
+
+ # Add property to Reader for filter list
+ Reader.sample_filter = property(get_filter, set_filter)
+ Reader._samp_filter = []
+ # Modify Reader._parse_samples to filter samples
+ self._orig_parse_samples = Reader._parse_samples
+ Reader._parse_samples = filter_samples(Reader._parse_samples)
+ self.parser = Reader(filename=infile)
+ # Store initial samples and indices
+ self.samples = self.parser.samples
+ self.smp_idx = dict([(v, k) for k, v in enumerate(self.samples)])
+ # Properties for filter/writer
+ self.outfile = outfile
+ self.invert = invert
+ self.filters = filters
+ if filters is not None:
+ self.set_filters()
+ self.write()
+
+ def __del__(self):
+ try:
+ self._undo_monkey_patch()
+ except AttributeError:
+ pass
+
+ def set_filters(self, filters=None, invert=False):
+ """Convert filters from string to list of indices, set on Reader"""
+ if filters is not None:
+ self.filters = filters
+ if invert:
+ self.invert = invert
+ filt_l = self.filters.split(",")
+ filt_s = set(filt_l)
+ if len(filt_s) < len(filt_l):
+ warnings.warn("Non-unique filters, ignoring", RuntimeWarning)
+
+ def filt2idx(item):
+ """Convert filter to valid sample index"""
+ try:
+ item = int(item)
+ except ValueError:
+ # not an idx, check if it's a value
+ return self.smp_idx.get(item)
+ else:
+ # is int, check if it's an idx
+ if item < len(self.samples):
+ return item
+ filters = set(filter(lambda x: x is not None, map(filt2idx, filt_s)))
+ if len(filters) < len(filt_s):
+ # TODO print the filters that were ignored
+ warnings.warn("Invalid filters, ignoring", RuntimeWarning)
+
+ if self.invert:
+ filters = set(xrange(len(self.samples))).difference(filters)
+
+ # `sample_filter` setter updates `samples`
+ self.parser.sample_filter = filters
+ if len(self.parser.samples) == 0:
+ warnings.warn("Number of samples to keep is zero", RuntimeWarning)
+ logging.info("Keeping these samples: {0}\n".format(self.parser.samples))
+ return self.parser.samples
+
+ def write(self, outfile=None):
+ if outfile is not None:
+ self.outfile = outfile
+ if self.outfile is None:
+ _out = sys.stdout
+ elif hasattr(self.outfile, 'write'):
+ _out = self.outfile
+ else:
+ _out = open(self.outfile, "wb")
+ logging.info("Writing to '{0}'\n".format(self.outfile))
+ writer = Writer(_out, self.parser)
+ for row in self.parser:
+ writer.write_record(row)
+
+ def _undo_monkey_patch(self):
+ Reader._parse_samples = self._orig_parse_samples
+ delattr(Reader, 'sample_filter')
View
@@ -5,6 +5,7 @@
import commands
import cPickle
from StringIO import StringIO
+import subprocess
import vcf
from vcf import utils
@@ -870,6 +871,52 @@ def testOpenFilenameGzipped(self):
self.assertEqual(self.samples, r.samples)
+class TestSampleFilter(unittest.TestCase):
+ def testCLIListSamples(self):
+ proc = subprocess.Popen('python scripts/vcf_sample_filter.py vcf/test/example-4.1.vcf', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ out, err = proc.communicate()
+ self.assertEqual(proc.returncode, 0)
+ self.assertFalse(err)
+ expected_out = ['Samples:', '0: NA00001', '1: NA00002', '2: NA00003']
+ self.assertEqual(out.splitlines(), expected_out)
+
+ def testCLIWithFilter(self):
+ proc = subprocess.Popen('python scripts/vcf_sample_filter.py vcf/test/example-4.1.vcf -f 1,2 --quiet', shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ out, err = proc.communicate()
+ self.assertEqual(proc.returncode, 0)
+ self.assertTrue(out)
+ self.assertFalse(err)
+ buf = StringIO()
+ buf.write(out)
+ buf.seek(0)
+ #print(buf.getvalue())
+ reader = vcf.Reader(buf)
+ self.assertEqual(reader.samples, ['NA00001'])
+ rec = reader.next()
+ self.assertEqual(len(rec.samples), 1)
+
+ def testSampleFilterModule(self):
+ # init filter with filename, get list of samples
+ filt = vcf.SampleFilter('vcf/test/example-4.1.vcf')
+ self.assertEqual(filt.samples, ['NA00001', 'NA00002', 'NA00003'])
+ # set filter, check which samples will be kept
+ filtered = filt.set_filters(filters="0", invert=True)
+ self.assertEqual(filtered, ['NA00001'])
+ # write filtered file to StringIO
+ buf = StringIO()
+ filt.write(buf)
+ buf.seek(0)
+ #print(buf.getvalue())
+ # undo monkey patch by destroying instance
+ del filt
+ self.assertTrue('sample_filter' not in dir(vcf.Reader))
+ # read output
+ reader = vcf.Reader(buf)
+ self.assertEqual(reader.samples, ['NA00001'])
+ rec = reader.next()
+ self.assertEqual(len(rec.samples), 1)
+
+
class TestFilter(unittest.TestCase):
@@ -1033,6 +1080,7 @@ def test_meta(self):
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCall))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestTabix))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOpenMethods))
+suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSampleFilter))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRegression))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUtils))

0 comments on commit 45513dd

Please sign in to comment.