Skip to content

Commit

Permalink
Updated vcftools functions
Browse files Browse the repository at this point in the history
  • Loading branch information
aewebb80 committed Aug 2, 2017
1 parent fb679cf commit d10c77b
Show file tree
Hide file tree
Showing 7 changed files with 119 additions and 43 deletions.
36 changes: 18 additions & 18 deletions andrew/vcf_calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,14 @@ def __call__(self, parser, args, value, option_string=None):
setattr(args, self.dest, value)
return customAction

def parser_confirm_files ():
'''Custom action to confirm multiple file exists'''
class customAction(argparse.Action):
def __call__(self, parser, args, value, option_string=None):
if not os.path.isfile(value):
raise IOError # File not found
getattr(args, self.dest).append(value)
return customAction
def metavar_list (var_list):
    '''
    Create a formatted metavar list for the help output

    Parameters
    ----------
    var_list : iterable
        The allowed choices of an argument. Items are converted with str,
        so non-string choices (e.g. integers) are accepted as well.

    Returns
    -------
    str
        The choices separated by ', ' and wrapped in braces, e.g. '{a, b}'.
        An empty iterable yields '{}'.
    '''
    # str.join only accepts strings, therefore convert each item first
    return '{' + ', '.join(str(var) for var in var_list) + '}'

vcf_parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)

# Input arguments.
vcf_parser.add_argument("vcfname", metavar='VCF_Input', help = "Input VCF filename", type = str, action = parser_confirm_file())
vcf_parser.add_argument("vcfname", metavar = 'VCF_Input', help = "Input VCF filename", type = str, action = parser_confirm_file())

# Other file arguments. Expand as needed
vcf_parser.add_argument('--out', help = 'Specifies the output filename', type = str, default = 'out', action = parser_confirm_no_file())
Expand All @@ -54,7 +49,7 @@ def __call__(self, parser, args, value, option_string=None):
statistic_list = ['weir-fst', 'windowed-weir-fst', 'TajimaD', 'pi', 'freq', 'het']
statistic_default = 'windowed-weir-fst'

vcf_parser.add_argument('--calc-statistic', metavar = '{' + ', '.join(statistic_list) + '}', help = 'Specifies the statistic to calculate', type = str, choices = statistic_list, default = statistic_default)
vcf_parser.add_argument('--calc-statistic', metavar = metavar_list(statistic_list), help = 'Specifies the statistic to calculate', type = str, choices = statistic_list, default = statistic_default)

# Statistic window options
vcf_parser.add_argument('--statistic-window-size', help = 'Specifies the size of window calculations', type = int, default = 10000)
Expand Down Expand Up @@ -124,7 +119,8 @@ def run (passed_arguments = []):
if vcf_args.calc_statistic == 'windowed-weir-fst':
# Confirms that at least two population files have been specified
if not vcf_args.pop_file or len(vcf_args.pop_file) < 2:
sys.exit('Two or more population files requried. Please assign using --pop-file')
logging.error('Two or more population files requried. Please assign using --pop-file')
raise IOError('Two or more population files requried. Please assign using --pop-file')

# Assigns specific vcftools arguments for calculating fst
vcftools_pop_args = [population_args for population_file in vcf_args.pop_file for population_args in ['--weir-fst-pop', population_file]]
Expand All @@ -134,53 +130,57 @@ def run (passed_arguments = []):
vcftools_call_args.extend(vcftools_pop_args + vcftools_window_args)

# Assigns the suffix for the vcftools log file
vcftools_log_suffix = '.windowed.weir.fst'
vcftools_log_suffix = 'windowed.weir.fst'

elif vcf_args.calc_statistic == 'weir-fst':
# Confirms that at least two population files have been specified
if not vcf_args.pop_file or len(vcf_args.pop_file) < 2:
sys.exit('Two or more population files requried. Please assign using --pop-file')
logging.error('Two or more population files requried. Please assign using --pop-file')
raise IOError('Two or more population files requried. Please assign using --pop-file')

# Assigns specific vcftools arguments for calculating site-based fst
vcftools_call_args.extend([population_args for population_file in vcf_args.pop_file for population_args in ['--weir-fst-pop', population_file]])

# Assigns the suffix for the vcftools log file
vcftools_log_suffix = '.weir.fst'
vcftools_log_suffix = 'weir.fst'

elif vcf_args.calc_statistic == 'TajimaD':

# Assigns all the vcftools arguments for calculating TajimaD
vcftools_call_args.extend(['--TajimaD', vcf_args.statistic_window_size])

# Assigns the suffix for the vcftools log file
vcftools_log_suffix = '.Tajima.D'
vcftools_log_suffix = 'Tajima.D'

elif vcf_args.calc_statistic == 'pi':

# Assigns all the vcftools arguments for calculating pi
vcftools_call_args.extend(['--window-pi', vcf_args.statistic_window_size, '--window-pi-step', vcf_args.statistic_window_step])

# Assigns the suffix for the vcftools log file
vcftools_log_suffix = '.windowed.pi'
vcftools_log_suffix = 'windowed.pi'

elif vcf_args.calc_statistic == 'freq':

# Assigns all the vcftools arguments for the allele frequency
vcftools_call_args.extend(['--freq'])

# Assigns the suffix for the vcftools log file
vcftools_log_suffix = '.frq'
vcftools_log_suffix = 'frq'

elif vcf_args.calc_statistic == 'het':

# Assigns all the vcftools arguments for calculating heterozygosity
vcftools_call_args.extend(['--het'])

# Assigns the suffix for the vcftools log file
vcftools_log_suffix = '.het'
vcftools_log_suffix = 'het'

logging.info('vcftools parameters assigned')

# Confirm the vcftools output and log file do not exist
check_for_vcftools_output (vcf_args.out, vcftools_log_suffix, vcftools_log_suffix)

# Assigns the file argument for vcftools
vcfname_arg = assign_vcftools_input_arg(vcf_args.vcfname)
logging.info('Input file assigned')
Expand Down
51 changes: 38 additions & 13 deletions andrew/vcf_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,27 +32,22 @@ def __call__(self, parser, args, value, option_string=None):
setattr(args, self.dest, value)
return customAction

def parser_confirm_files ():
'''Custom action to confirm multiple file exists'''
class customAction(argparse.Action):
def __call__(self, parser, args, value, option_string=None):
if not os.path.isfile(value):
raise IOError # File not found
getattr(args, self.dest).append(value)
return customAction
def metavar_list (var_list):
    '''
    Create a formatted metavar list for the help output

    Parameters
    ----------
    var_list : iterable
        The allowed choices of an argument. Items are converted with str,
        so non-string choices (e.g. integers) are accepted as well.

    Returns
    -------
    str
        The choices separated by ', ' and wrapped in braces, e.g. '{a, b}'.
        An empty iterable yields '{}'.
    '''
    # str.join only accepts strings, therefore convert each item first
    return '{' + ', '.join(str(var) for var in var_list) + '}'

vcf_parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)

# Input arguments.
vcf_parser.add_argument("vcfname", metavar='VCF_Input', help = "Input VCF filename", type = str, action = parser_confirm_file())
vcf_parser.add_argument("vcfname", metavar = 'VCF_Input', help = "Input VCF filename", type = str, action = parser_confirm_file())

# Other file arguments. Expand as needed
vcf_parser.add_argument('--out', help = 'Specifies the output filename', type = str, default = 'out', action = parser_confirm_no_file())
vcf_parser.add_argument('--out', help = 'Specifies the filtered VCF output filename', type = str, default = 'out', action = parser_confirm_no_file())

out_format_list = ['vcf', 'bcf']
out_format_default = 'bcf'

vcf_parser.add_argument('--out-format', metavar = '{' + ', '.join(out_format_list) + '}', help = 'Specifies the output format', type = str, choices = out_format_list, default = out_format_default)
vcf_parser.add_argument('--out-format', metavar = metavar_list(out_format_list), help = 'Specifies the output format.', type = str, choices = out_format_list, default = out_format_default)
### Filters

# Chromosome filters
Expand Down Expand Up @@ -109,6 +104,34 @@ def run (passed_arguments = []):
Specifies the input VCF filename
--out : str
Specifies the output filename
--out-format : str
Specifies the output format {vcf, bcf} (Default: bcf)
--filter-include-chr : list or str
Specifies the chromosome(s) to include
--filter-exclude-chr : list or str
Specifies the chromosome(s) to exclude
--filter-from-bp : int
Specifies the lower bound of sites to include. May only be used with a single chromosome
--filter-to-bp : int
Specifies the upper bound of sites to include. May only be used with a single chromosome
--filter-include-bed : str
Specifies a set of sites to include within a BED file
--filter-exclude-bed : str
Specifies a set of sites to exclude within a BED file
--filter-include-passed : bool
Specifies that only sites with the filter flag 'PASS' should be included (Default: False)
--filter-include-filtered : list or str
Specifies that all sites with the given filter flag should be included
--filter-exclude-filtered : list or str
Specifies that all sites with the given filter flag should be excluded
--filter-include-info : list or str
Specifies that all sites with the given info flag should be included
--filter-exclude-info : list or str
Specifies that all sites with the given info flag should be excluded
--filter-min-alleles : int
Specifies that only sites with a number of alleles >= the number given should be included
--filter-max-alleles : int
Specifies that only sites with a number of alleles <= the number given should be included
Returns
-------
Expand All @@ -124,7 +147,6 @@ def run (passed_arguments = []):
IOError
Output file already exists
'''

# Grab VCF arguments from command line
Expand Down Expand Up @@ -194,6 +216,9 @@ def run (passed_arguments = []):

logging.info('vcftools parameters assigned')

# Confirm the vcftools output and log file do not exist
check_for_vcftools_output (vcf_args.out, 'recode.' + vcf_args.out_format, '.filter')

# Assigns the file argument for vcftools
vcfname_arg = assign_vcftools_input_arg(vcf_args.vcfname)
logging.info('Input file assigned')
Expand All @@ -205,7 +230,7 @@ def run (passed_arguments = []):

# Check that the log file was created correctly, get the suffix for the log file, and create the file
if check_vcftools_for_errors(vcftools_err):
produce_vcftools_log(vcftools_err, vcf_args.out, '.filter')
produce_vcftools_log(vcftools_err, vcf_args.out, 'filter')


if __name__ == "__main__":
Expand Down
6 changes: 5 additions & 1 deletion andrew/vcf_phase.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,18 @@ def __call__(self, parser, args, value, option_string=None):
setattr(args, self.dest, value)
return customAction

def metavar_list (var_list):
    '''
    Create a formatted metavar list for the help output

    Parameters
    ----------
    var_list : iterable
        The allowed choices of an argument. Items are converted with str,
        so non-string choices (e.g. integers) are accepted as well.

    Returns
    -------
    str
        The choices separated by ', ' and wrapped in braces, e.g. '{a, b}'.
        An empty iterable yields '{}'.
    '''
    # str.join only accepts strings, therefore convert each item first
    return '{' + ', '.join(str(var) for var in var_list) + '}'

phase_parser = argparse.ArgumentParser()

# Input arguments.
phase_parser.add_argument("vcfname", metavar='VCF_Input', help = "Input VCF filename", type = str, action = parser_confirm_file())

phasing_list = ['beagle', 'shapeit']
phasing_default = 'beagle'
phase_parser.add_argument('--phase-algorithm', metavar = '{' + ', '.join(phasing_list) + '}', help = 'Specifies the phase algorithm to be used', type = str, choices = phasing_list, default = phasing_default)
phase_parser.add_argument('--phase-algorithm', metavar = metavar_list(phasing_list), help = 'Specifies the phase algorithm to be used', type = str, choices = phasing_list, default = phasing_default)

# Other basic arguments. Expand as needed
phase_parser.add_argument('--out', help = 'Defines the output filename', default = 'out', action = parser_confirm_no_file())
Expand Down
12 changes: 10 additions & 2 deletions andrew/vcf_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ def __call__(self, parser, args, value, option_string = None):
setattr(args, self.dest, value)
return customAction

def metavar_list (var_list):
    '''
    Create a formatted metavar list for the help output

    Parameters
    ----------
    var_list : iterable
        The allowed choices of an argument. Items are converted with str,
        so non-string choices (e.g. integers) are accepted as well.

    Returns
    -------
    str
        The choices separated by ', ' and wrapped in braces, e.g. '{a, b}'.
        An empty iterable yields '{}'.
    '''
    # str.join only accepts strings, therefore convert each item first
    return '{' + ', '.join(str(var) for var in var_list) + '}'

sampler_parser = argparse.ArgumentParser()

# Input arguments
Expand All @@ -42,14 +46,14 @@ def __call__(self, parser, args, value, option_string = None):
# Statistic based arguments.
statistic_list = ['windowed-weir-fst', 'TajimaD']
statistic_default = 'windowed-weir-fst'
sampler_parser.add_argument('--calc-statistic', metavar = '{' + ', '.join(statistic_list) + '}', help = 'Specifies the statistic calculated ', type=str, choices = statistic_list, default = statistic_default)
sampler_parser.add_argument('--calc-statistic', metavar = metavar_list(statistic_list), help = 'Specifies the statistic calculated ', type=str, choices = statistic_list, default = statistic_default)

sampler_parser.add_argument('--statistic-window-size', help = 'Specifies the size of window calculations', type = int, default = 10000)

# Sampling methods. Currently mutually exclusive to only allow a single sampling method
sampling_list = ['uniform', 'random']
sampling_default = 'random'
sampler_parser.add_argument('--sampling-scheme', metavar = '{' + ', '.join(sampling_list) + '}', help = 'Specifies the sampling scheme ', type=str, choices = sampling_list, default = sampling_default)
sampler_parser.add_argument('--sampling-scheme', metavar = metavar_list(sampling_list), help = 'Specifies the sampling scheme ', type=str, choices = sampling_list, default = sampling_default)

# Sampling options
sampler_parser.add_argument('--uniform-bins', help="Number of bins in uniform sampling", type = int, default = 10)
Expand Down Expand Up @@ -136,6 +140,10 @@ def run ():
----------
VCF_Input : str
Specifies the input VCF filename
--out : str
Specifies the VCF output filename
--sample-file : str
Specifies the sampled (statistic file) tsv output filename
--statistic-file : str
Specifies the statistic file for filtering
--calc-statistic : str
Expand Down
57 changes: 48 additions & 9 deletions andrew/vcftools.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,49 @@
import os
import sys
import logging

sys.path.insert(0, os.path.abspath(os.path.join(os.pardir,'jared')))

import vcf_reader_func

def check_for_vcftools_output (output_prefix, output_suffix, log_suffix):
    '''
    Checks for the previous vcftools output

    Confirms that neither a previous vcftools log nor output file exists.

    Parameters
    ----------
    output_prefix : str
        Specifies the prefix used by vcftools, may be the default value or
        user-defined
    output_suffix : str
        Specifies the standard output suffix used by vcftools. A leading
        '.' is ignored.
    log_suffix : str
        Specifies the log suffix (PPP-specified) without the log extension.
        A leading '.' is ignored.

    Raises
    ------
    IOError
        If the vcftools standard output exists
    IOError
        If the vcftools log file exists
    '''

    # The suffixes are joined to the prefix with a single '.'. Leading dots
    # are stripped so that a caller passing e.g. '.filter' checks the same
    # path as one passing 'filter', rather than '<prefix>..filter.log'.
    output_path = output_prefix + '.' + output_suffix.lstrip('.')
    log_path = output_prefix + '.' + log_suffix.lstrip('.') + '.log'

    # Check if output file already exists
    if os.path.isfile(output_path):
        logging.error('Output file already exists')
        raise IOError('Output file already exists')

    logging.info('Output file assigned')

    # Check if log file already exists
    if os.path.isfile(log_path):
        logging.error('Log file already exists')
        raise IOError('Log file already exists')

    logging.info('Log file assigned')

def check_vcftools_for_errors (vcftools_output):
'''
Checks the vcftools stderr for errors
Expand Down Expand Up @@ -34,11 +73,13 @@ def check_vcftools_for_errors (vcftools_output):
# Splits log into list of lines
vcftools_output_lines = vcftools_output.splitlines()
# Prints the error(s)
sys.exit('\n'.join((output_line for output_line in vcftools_output_lines if output_line.startswith('Error'))))
logging.error('\n'.join((output_line for output_line in vcftools_output_lines if output_line.startswith('Error'))))
raise Exception('\n'.join((output_line for output_line in vcftools_output_lines if output_line.startswith('Error'))))

# Print output if not completed and no error found. Unlikely to be used, but included.
else:
sys.exit(vcftools_output)
logging.error(vcftools_output)
raise Exception(vcftools_output)

def produce_vcftools_log (output, filename, function):
'''
Expand Down Expand Up @@ -68,12 +109,9 @@ def produce_vcftools_log (output, filename, function):
Log file already exists
'''

if not os.path.isfile(filename + function + '.log'):
vcftools_log_file = open(filename + function + '.log','w')
vcftools_log_file.write(str(output))
vcftools_log_file.close()
else:
sys.exit('Error: Log file already exits')
vcftools_log_file = open(filename + '.' + function + '.log','w')
vcftools_log_file.write(str(output))
vcftools_log_file.close()


def assign_vcftools_input_arg (filename):
Expand Down Expand Up @@ -118,4 +156,5 @@ def assign_vcftools_input_arg (filename):
elif vcfname_format == 'bgzip':
return ['--bcf', filename]
else:
sys.exit('Unknown file format')
logging.error('Unknown VCF file format')
raise Exception('Unknown VCF file format')
Binary file removed jared/parse_functions.pyc
Binary file not shown.
Binary file removed jared/vcf_reader_func.pyc
Binary file not shown.

0 comments on commit d10c77b

Please sign in to comment.