Skip to content

Commit

Permalink
Starting working on vcf_filter
Browse files Browse the repository at this point in the history
  • Loading branch information
aewebb80 committed Jun 14, 2017
1 parent df7250a commit 7813ee0
Show file tree
Hide file tree
Showing 7 changed files with 320 additions and 32 deletions.
27 changes: 18 additions & 9 deletions andrew/vcftools_calc.py → andrew/vcf_calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
from vcftools import *

# Insert Jared's directory path, required for calling Jared's functions. Change when directory structure changes.
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared')))
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))

#from logging_module import initLogger

def vcf_calc_parser(passed_arguments):
'''VCF Argument Parser'''
'''VCF Argument Parser - Assigns arguments from command line'''

def parser_confirm_file ():
'''Custom action to confirm file exists'''
Expand Down Expand Up @@ -70,14 +70,14 @@ def run (passed_arguments = []):
'''
Statistic calculation using VCFTools.
Automates the calculation of specific statistics (Fst (site/windowed),
Tajima's D, Pi, allele frequency, and heterozygosity) using VCFTools. If no
statistic is specified, windowed Fst is used by default.
Automates the calculation of site/windowed fixation index (Fst), Tajima's D,
nucleotide diversity (Pi), allele frequency, and heterozygosity using
VCFTools. If no statistic is specified, windowed Fst is used by default.
Parameters
----------
VCF_Input : str
Input VCF filename
Specifies the input VCF filename
--out : str
Specifies the output filename
--pop-file : str
Expand All @@ -87,14 +87,23 @@ def run (passed_arguments = []):
Specifies the statistic to calculate. Choices: weir-fst,
windowed-weir-fst (Default), TajimaD, pi, freq, het
--statistic-window-size : int
Specifies the window size of the statistic
Specifies the window size for window-based statistics
--statistic-window-step : int
Specifies step size between windows
Specifies step size between windows for specific window-based statistics
Returns
-------
file
output : file
Statistic file output
log : file
Log file output
Raises
------
IOError
Input VCF file does not exist
IOError
Output file already exists
'''
Expand Down
179 changes: 179 additions & 0 deletions andrew/vcf_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import os
import sys
import subprocess
import argparse
import logging

from vcftools import *

# Insert Jared's directory path, required for calling Jared's functions. Change when directory structure changes.
# NOTE(review): os.path.abspath resolves os.pardir against the current working
# directory, not against this file's location, so the inserted path depends on
# where the script is launched from -- confirm this is intended.
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))

#from logging_module import initLogger

def vcf_filter_parser(passed_arguments):
    '''
    VCF Argument Parser - Assigns arguments from command line

    Parameters
    ----------
    passed_arguments : list of str
        Arguments to parse. If empty (or otherwise false), argparse reads
        the arguments from sys.argv instead.

    Returns
    -------
    argparse.Namespace
        Parsed arguments: vcfname, out, filter_chr, filter_not_chr,
        filter_from_bp, and filter_to_bp

    Raises
    ------
    IOError
        Input VCF file does not exist
    IOError
        Output file already exists. Only checked when --out is given on the
        command line, as argparse does not call actions for default values
    '''

    def parser_confirm_file ():
        '''Custom action to confirm file exists'''
        class customAction(argparse.Action):
            def __call__(self, parser, args, value, option_string=None):
                if not os.path.isfile(value):
                    # Name the offending file so the user knows what to fix
                    raise IOError('File not found: %s' % value)
                setattr(args, self.dest, value)
        return customAction

    def parser_confirm_no_file ():
        '''Custom action to confirm file does not exist'''
        class customAction(argparse.Action):
            def __call__(self, parser, args, value, option_string=None):
                if os.path.isfile(value):
                    # Refuse to overwrite an existing file
                    raise IOError('File already exists: %s' % value)
                setattr(args, self.dest, value)
        return customAction

    def parser_confirm_files ():
        '''Custom action to confirm multiple file exists'''
        class customAction(argparse.Action):
            def __call__(self, parser, args, value, option_string=None):
                if not os.path.isfile(value):
                    raise IOError('File not found: %s' % value)
                # The destination is None unless a list default was given,
                # so create the list the first time the argument is seen
                confirmed_files = getattr(args, self.dest, None)
                if confirmed_files is None:
                    confirmed_files = []
                    setattr(args, self.dest, confirmed_files)
                confirmed_files.append(value)
        return customAction

    vcf_parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)

    # Input arguments.
    vcf_parser.add_argument("vcfname", metavar='VCF_Input', help = "Input VCF filename", type = str, action = parser_confirm_file())

    # Other file arguments. Expand as needed
    vcf_parser.add_argument('--out', help = 'Specifies the output filename', type = str, default = 'out', action = parser_confirm_no_file())

    ### Site Filters

    # Chromosome Filters
    vcf_parser.add_argument('--filter-chr', help = 'Specifies the chromosome(s) to include', nargs = '+', type = str)
    vcf_parser.add_argument('--filter-not-chr', help = 'Specifies the chromosome(s) to exclude', nargs = '+', type = str)

    # Position Filters
    vcf_parser.add_argument('--filter-from-bp', help = 'Specifies the lower bound of sites to include (May only be used with a single chromosome)', type = int)
    vcf_parser.add_argument('--filter-to-bp', help = 'Specifies the upper bound of sites to include (May only be used with a single chromosome)', type = int)

    # Parse the passed arguments if any were given, otherwise use sys.argv
    if passed_arguments:
        return vcf_parser.parse_args(passed_arguments)
    else:
        return vcf_parser.parse_args()

def run (passed_arguments = []):
    '''
    Filter VCF files using VCFTools.

    Automates various filters to VCF files using VCFtools. Work in progress:
    at present the function only parses the arguments and prints them; no
    call to vcftools is made.

    Parameters
    ----------
    VCF_Input : str
        Specifies the input VCF filename
    --out : str
        Specifies the output filename
    --filter-chr : list of str
        Specifies the chromosome(s) to include
    --filter-not-chr : list of str
        Specifies the chromosome(s) to exclude
    --filter-from-bp : int
        Specifies the lower bound of sites to include
    --filter-to-bp : int
        Specifies the upper bound of sites to include

    Returns
    -------
    None
        The parsed arguments are printed. The filtered VCF file and log file
        outputs are planned but not yet produced.

    Raises
    ------
    IOError
        Input VCF file does not exist
    IOError
        Output file already exists
    '''

    # Grab VCF arguments from command line
    vcf_args = vcf_filter_parser(passed_arguments)

    # Function-call form prints identically under Python 2 and is valid Python 3
    print(vcf_args)

    # NOTE(review): the string below is never executed. It appears to be a
    # template carried over from the statistic calculator (it reads
    # calc_statistic, pop_file and window arguments that this parser does not
    # define) and still needs adapting to the filter arguments.
    '''
    # Argument container for vcftools
    vcftools_call_args = ['--out', vcf_args.out]
    if vcf_args.calc_statistic == 'windowed-weir-fst':
        # Confirms that at least two population files have been specified
        if not vcf_args.pop_file or len(vcf_args.pop_file) < 2:
            sys.exit('Two or more population files requried. Please assign using --pop-file')
        # Assigns specific vcftools arguments for calculating fst
        vcftools_pop_args = [population_args for population_file in vcf_args.pop_file for population_args in ['--weir-fst-pop', population_file]]
        vcftools_window_args = ['--fst-window-size', vcf_args.statistic_window_size, '--fst-window-step', vcf_args.statistic_window_step]
        # Assigns all the vcftools arguments for calculating windowed fst
        vcftools_call_args.extend(vcftools_pop_args + vcftools_window_args)
        # Assigns the suffix for the vcftools log file
        vcftools_log_suffix = '.windowed.weir.fst'
    elif vcf_args.calc_statistic == 'weir-fst':
        # Confirms that at least two population files have been specified
        if not vcf_args.pop_file or len(vcf_args.pop_file) < 2:
            sys.exit('Two or more population files requried. Please assign using --pop-file')
        # Assigns specific vcftools arguments for calculating site-based fst
        vcftools_call_args.extend([population_args for population_file in vcf_args.pop_file for population_args in ['--weir-fst-pop', population_file]])
        # Assigns the suffix for the vcftools log file
        vcftools_log_suffix = '.weir.fst'
    elif vcf_args.calc_statistic == 'TajimaD':
        # Assigns all the vcftools arguments for calculating TajimaD
        vcftools_call_args.extend(['--TajimaD', vcf_args.statistic_window_size])
        # Assigns the suffix for the vcftools log file
        vcftools_log_suffix = '.Tajima.D'
    elif vcf_args.calc_statistic == 'pi':
        # Assigns all the vcftools arguments for calculating pi
        vcftools_call_args.extend(['--window-pi', vcf_args.statistic_window_size, '--window-pi-step', vcf_args.statistic_window_step])
        # Assigns the suffix for the vcftools log file
        vcftools_log_suffix = '.windowed.pi'
    elif vcf_args.calc_statistic == 'freq':
        # Assigns all the vcftools arguments for the allele frequency
        vcftools_call_args.extend(['--freq'])
        # Assigns the suffix for the vcftools log file
        vcftools_log_suffix = '.frq'
    elif vcf_args.calc_statistic == 'het':
        # Assigns all the vcftools arguments for calculating heterozygosity
        vcftools_call_args.extend(['--het'])
        # Assigns the suffix for the vcftools log file
        vcftools_log_suffix = '.het'
    # Assigns the file argument for vcftools
    vcfname_arg = assign_vcftools_input_arg(vcf_args.vcfname)
    # vcftools subprocess call
    vcftools_call = subprocess.Popen(['vcftools'] + vcfname_arg + list(map(str, vcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    vcftools_out, vcftools_err = vcftools_call.communicate()
    # Check that the log file was created correctly, get the suffix for the log file, and create the file
    if check_vcftools_for_errors(vcftools_err):
        produce_vcftools_log(vcftools_err, vcf_args.out, vcftools_log_suffix)
    '''

if __name__ == "__main__":
    # Logger initialisation stays disabled while the logging_module import is commented out
    #initLogger()
    # Called without arguments, so the parser reads them from the command line
    run()
38 changes: 35 additions & 3 deletions andrew/vcf_phase.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import argparse
import glob

sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared')))
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))

import vcf_reader_func

Expand Down Expand Up @@ -41,7 +41,7 @@ def __call__(self, parser, args, value, option_string=None):

# Other basic arguments. Expand as needed
phase_parser.add_argument('--out', help = 'Defines the output filename', default = 'out', action = parser_confirm_no_file())
phase_parser.add_argument('--estimate-file', help = 'Defines the estimated genotype frequency filename. For used if using beagle', default = 'estimated_gt', action = parser_confirm_no_file())
phase_parser.add_argument('--estimate-file', help = 'Defines the estimated genotype frequency filename. Required for the beagle algorithm', default = 'estimated_gt', action = parser_confirm_no_file())

if passed_arguments:
return phase_parser.parse_args(passed_arguments)
Expand All @@ -68,7 +68,39 @@ def assign_vcf_extension (filename):
sys.exit('Unknown file format')

def run (passed_arguments = []):
''' Wrapper code for Phasing. Commands are assigned using argparse.'''
'''
Phaser for VCF files.
Automates the phasing process for a specified VCF file. The function
allows users to select between multiple phasing algorithms: beagle
(default) and shapit.
Parameters
----------
VCF_Input : str
Specifies the input VCF filename
--phase-algorithm : str
Specifies the algorithm to be used. Choices: beagle (default) and
shapit
--out : str
Specifies the output filename
--estimate-file : str
Defines the estimated genotype frequency filename. Required for the
beagle algorithm
Returns
-------
output : file
Phased VCF file
Raises
------
IOError
Input VCF file does not exist
IOError
Output file already exists
'''

# Grab VCF arguments from command line
phase_args = phase_argument_parser(passed_arguments)
Expand Down
21 changes: 14 additions & 7 deletions andrew/vcf_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,30 +109,37 @@ def run ():
Parameters
----------
VCF_Input : str
Input VCF filename, required to produce sampled VCF output
Specifies the input VCF filename
--calc-statistic : str
Specifies the statistic to calculate. Choices: windowed-weir-fst
(default) and TajimaD.
(default) and TajimaD
--statistic-window-size : int
Specifies the window size of the statistic. Required for Tajima's D
--sampling-scheme : str
Specifies the sampling scheme to use. Choices: random (default) and
uniform
--uniform-bins : int
Specifies the number of bins to use in uniform sampling
Specifies the number of bins for the uniform sampler
--sample-size : int
Specifies the total sample size. Note: If using the uniform sampling
scheme his number must be divisible by number of uniform bins.
scheme, this number must be divisible by number of uniform bins
--random-seed : int
Specifies the random seed value for the random number generator.
Specifies the random seed value for the random number generator
Returns
-------
file
output : file
Sampled statistic file
file
samples : file
Sampled VCF output
Raises
------
IOError
Input VCF file does not exist
IOError
Output file already exists
'''

# Get arguments from command line
Expand Down

0 comments on commit 7813ee0

Please sign in to comment.