Skip to content

Commit

Permalink
Updating vcf functions with logging module
Browse files Browse the repository at this point in the history
  • Loading branch information
aewebb80 committed Jul 24, 2017
1 parent ff6b26f commit 9ab6d65
Show file tree
Hide file tree
Showing 8 changed files with 141 additions and 32 deletions.
3 changes: 3 additions & 0 deletions andrew/example/locus8.Tajima.D.sampled
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
CHROM BIN_START N_SNPS TajimaD
0 chr11 121610000 3 -1.24618
1 chr11 121620000 7 -1.5221
22 changes: 13 additions & 9 deletions andrew/pipeline.log
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
2017-07-20 16:23:37,154 - logArgs - INFO: Arguments for Test:
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments uniform_bins: 10
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments vcfname: example/locus8.vcf.gz
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments random_seed: 83441029
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments statistic_file: example/locus8.windowed.weir.fst
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments calc_statistic: windowed-weir-fst
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments sampling_scheme: random
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments sample_size: 200
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments statistic_window_size: 10000
2017-07-24 16:45:46,176 - logArgs - INFO: Arguments for vcf_sampler:
2017-07-24 16:45:46,176 - logArgs - INFO: Arguments uniform_bins: 10
2017-07-24 16:45:46,176 - logArgs - INFO: Arguments vcfname: example/locus8.vcf.gz
2017-07-24 16:45:46,177 - logArgs - INFO: Arguments random_seed: 354674735
2017-07-24 16:45:46,177 - logArgs - INFO: Arguments statistic_file: example/locus8.Tajima.D
2017-07-24 16:45:46,177 - logArgs - INFO: Arguments calc_statistic: TajimaD
2017-07-24 16:45:46,177 - logArgs - INFO: Arguments sampling_scheme: random
2017-07-24 16:45:46,177 - logArgs - INFO: Arguments sample_size: 2
2017-07-24 16:45:46,177 - logArgs - INFO: Arguments statistic_window_size: 10000
2017-07-24 16:45:46,179 - run - INFO: Sample (i.e. statistic) file assigned
2017-07-24 16:45:46,179 - run - WARNING: Sample size equal to number of samples within sample file
2017-07-24 16:45:46,179 - run - INFO: Random sampling complete
2017-07-24 16:45:46,181 - run - INFO: Created selected samples file
13 changes: 9 additions & 4 deletions andrew/vcf_calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def __call__(self, parser, args, value, option_string=None):
return vcf_parser.parse_args()

def logArgs(args, pipeline_function):
    '''Logs arguments from argparse system. May be replaced with a general function in logging_module'''
    logging.info('Arguments for %s:', pipeline_function)
    # Iterate name/value pairs once instead of re-calling vars(args) per key;
    # pass args lazily so logging formats only when the record is emitted.
    for arg_name, arg_value in vars(args).items():
        logging.info('Arguments %s: %s', arg_name, arg_value)
Expand Down Expand Up @@ -110,13 +110,14 @@ def run (passed_arguments = []):
Input VCF file does not exist
IOError
Output file already exists
'''

# Grab VCF arguments from command line
vcf_args = vcf_calc_parser(passed_arguments)

# Adds the arguments (i.e. parameters) to the log file
logArgs(vcf_args, 'vcf_calc')

# Argument container for vcftools
vcftools_call_args = ['--out', vcf_args.out]

Expand Down Expand Up @@ -178,17 +179,21 @@ def run (passed_arguments = []):
# Assigns the suffix for the vcftools log file
vcftools_log_suffix = '.het'

logging.info('vcftools parameters assigned')

# Assigns the file argument for vcftools
vcfname_arg = assign_vcftools_input_arg(vcf_args.vcfname)
logging.info('Input file assigned')

# vcftools subprocess call
vcftools_call = subprocess.Popen(['vcftools'] + vcfname_arg + list(map(str, vcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
vcftools_out, vcftools_err = vcftools_call.communicate()
logging.info('vcftools call complete')

# Check that the log file was created correctly, get the suffix for the log file, and create the file
if check_vcftools_for_errors(vcftools_err):
produce_vcftools_log(vcftools_err, vcf_args.out, vcftools_log_suffix)

if __name__ == "__main__":
#initLogger()
initLogger()
run()
17 changes: 15 additions & 2 deletions andrew/vcf_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# Insert Jared's directory path, required for calling Jared's functions. Change when directory structure changes.
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))

#from logging_module import initLogger
from logging_module import initLogger

def vcf_filter_parser(passed_arguments):
'''VCF Argument Parser - Assigns arguments from command line'''
Expand Down Expand Up @@ -91,6 +91,12 @@ def __call__(self, parser, args, value, option_string=None):
else:
return vcf_parser.parse_args()

def logArgs(args, pipeline_function):
    '''Logs arguments from argparse system. May be replaced with a general function in logging_module'''
    logging.info('Arguments for %s:', pipeline_function)
    # Iterate name/value pairs once instead of re-calling vars(args) per key;
    # pass args lazily so logging formats only when the record is emitted.
    for arg_name, arg_value in vars(args).items():
        logging.info('Arguments %s: %s', arg_name, arg_value)

def run (passed_arguments = []):
'''
Filter VCF files using VCFTools.
Expand Down Expand Up @@ -124,6 +130,9 @@ def run (passed_arguments = []):
# Grab VCF arguments from command line
vcf_args = vcf_filter_parser(passed_arguments)

# Adds the arguments (i.e. parameters) to the log file
logArgs(vcf_args, 'vcf_filter')

# Argument container for vcftools
vcftools_call_args = ['--out', vcf_args.out]

Expand Down Expand Up @@ -183,18 +192,22 @@ def run (passed_arguments = []):
if vcf_args.filter_distance:
vcftools_call_args.extend(['--thin', vcf_args.filter_distance])

logging.info('vcftools parameters assigned')

# Assigns the file argument for vcftools
vcfname_arg = assign_vcftools_input_arg(vcf_args.vcfname)
logging.info('Input file assigned')

# vcftools subprocess call
vcftools_call = subprocess.Popen(['vcftools'] + vcfname_arg + list(map(str, vcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
vcftools_out, vcftools_err = vcftools_call.communicate()
logging.info('vcftools call complete')

# Check that the log file was created correctly, get the suffix for the log file, and create the file
if check_vcftools_for_errors(vcftools_err):
produce_vcftools_log(vcftools_err, vcf_args.out, '.filter')


if __name__ == "__main__":
#initLogger()
initLogger()
run()
33 changes: 33 additions & 0 deletions andrew/vcf_phase.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import subprocess
import argparse
import glob
import logging

sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))

import vcf_reader_func
from logging_module import initLogger

def phase_argument_parser(passed_arguments):
'''Phase Argument Parser - Assigns arguments for vcftools from command line.
Expand Down Expand Up @@ -48,6 +50,12 @@ def __call__(self, parser, args, value, option_string=None):
else:
return phase_parser.parse_args()

def logArgs(args, pipeline_function):
    '''Logs arguments from argparse system. May be replaced with a general function in logging_module'''
    logging.info('Arguments for %s:', pipeline_function)
    # Iterate name/value pairs once instead of re-calling vars(args) per key;
    # pass args lazily so logging formats only when the record is emitted.
    for arg_name, arg_value in vars(args).items():
        logging.info('Arguments %s: %s', arg_name, arg_value)

def possible_beagle_paths ():
possible_paths = ['beagle*.jar']
if os.name == 'posix':
Expand Down Expand Up @@ -105,9 +113,14 @@ def run (passed_arguments = []):
# Grab VCF arguments from command line
phase_args = phase_argument_parser(passed_arguments)

# Adds the arguments (i.e. parameters) to the log file
logArgs(phase_args, 'vcf_phase')

# Assign file extension for VCF input file
vcfname_ext = assign_vcf_extension(phase_args.vcfname)

logging.info('Input file assigned')

# Used to confirm if the VCF input was renamed
vcfname_renamed = False

Expand All @@ -124,48 +137,68 @@ def run (passed_arguments = []):
# Assign the arguments for the algorithm
likelihood_call_args = ['gtgl=' + phase_args.vcfname, 'out=' + phase_args.estimate_file]

logging.info('beagle estimate parameters assigned')

# beagle estimated genotype frequency subprocess call
likelihood_call = subprocess.Popen(algorithm_call_args + likelihood_call_args, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
likelihood_out, likelihood_err = likelihood_call.communicate()

# Confirms that beagle finished without error
if likelihood_err:
logging.error('Error creating the estimated genotype frequency file. Please check input file.')
sys.exit('Error creating the estimated genotype frequency file. Please check input file.')

logging.info('beagle estimate file created')

# Assigns the arguments for phasing
phase_call_args = ['gt=' + phase_args.estimate_file + vcfname_ext, 'out=' + phase_args.out]

logging.info('beagle phasing parameters assigned')

# Phasing subprocess call
phase_call = subprocess.Popen(algorithm_call_args + phase_call_args, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
phase_out, phase_err = phase_call.communicate()
print phase_out, phase_err

logging.info('beagle phasing complete')

elif phase_args.phase_algorithm == 'shapeit':
# Assign the algorithm
algorithm_call_args = ['./bin/shapeit']

# Assigns the arguments for phasing
phase_call_args = ['--input-vcf', phase_args.vcfname, '-O', phase_args.out]

logging.info('shapeit phasing parameters assigned')

# Phasing subprocess call
phase_call = subprocess.Popen(algorithm_call_args + phase_call_args, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
phase_out, phase_err = phase_call.communicate()

# Confirms that shapeit finished without error
if phase_err:
logging.error('Error occured in phasing. Please check input file.')
sys.exit('Error occured in phasing. Please check input file.')

logging.info('shapeit phasing complete (non-VCF format)')

# Assigns the arguments for converting the phased output into a vcf file
convert_call = ['-convert', '--input-haps', phase_args.out, '--output-vcf', phase_args.out + vcfname_ext]

logging.info('shapeit conversion parameter assigned')

# Convert subprocess call
convert_call = subprocess.Popen(algorithm_call_args + convert_call, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
convert_out, convert_err = convert_call.communicate()
print convert_out, convert_err

logging.info('shapeit conversion complete (non-VCF to VCF)')


# Reverts the VCF input file
if vcfname_renamed:
os.rename(phase_args.vcfname, phase_args.vcfname[:-len(vcfname_ext)])

if __name__ == "__main__":
initLogger()
run()
85 changes: 68 additions & 17 deletions andrew/vcf_sampler.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import os, sys, csv, argparse, random, pysam
import os, sys, csv, argparse, random, pysam, itertools, logging
import numpy as np
import pandas as pd
from collections import defaultdict

# Insert Jared's directory path, required for calling Jared's functions. Change when directory structure changes.
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))

from logging_module import initLogger

def sampler_parser():
'''Sampler Argument Parser - Assigns arguments from command line.'''

Expand Down Expand Up @@ -42,6 +47,12 @@ def __call__(self, parser, args, value, option_string=None):

return sampler_parser.parse_args()

def logArgs(args, pipeline_function):
    '''Logs arguments from argparse system. May be replaced with a general function in logging_module'''
    logging.info('Arguments for %s:', pipeline_function)
    # Iterate name/value pairs once instead of re-calling vars(args) per key;
    # pass args lazily so logging formats only when the record is emitted.
    for arg_name, arg_value in vars(args).items():
        logging.info('Arguments %s: %s', arg_name, arg_value)

def random_vcftools_sampler (vcftools_samples, sample_size):
'''Random Sampler. Returns a list of randomly selected elements from
vcftools_samples. The length of this list is defined by sample_size.'''
def assign_statistic_column (sample_headers, statistic):
    '''Maps the user-selected statistic name to the column header expected in
    the vcftools statistic file, and confirms that column is present in
    sample_headers. Exits (after logging) if the statistic is unknown or the
    column is missing. Returns the matching column header string.'''

    # Known statistics and the column header each one produces in vcftools output
    statistic_converter = {'windowed-weir-fst': 'MEAN_FST', 'TajimaD': 'TajimaD'}

    # `has_key` was removed in Python 3; membership test works in both 2 and 3
    if statistic not in statistic_converter:
        logging.critical('Statistic not found. Statistic list needs to be updated. Please contact the PPP Team.')
        sys.exit('Statistic not found. Statistic list needs to be updated. Please contact the PPP Team.')

    if statistic_converter[statistic] not in sample_headers:
        logging.error('Statistic selected not found in file specified by --statistic-file.')
        sys.exit('Statistic selected not found in file specified by --statistic-file.')

    return statistic_converter[statistic]

Expand Down Expand Up @@ -144,58 +157,96 @@ def run ():

# Get arguments from command line
sampler_args = sampler_parser()

# Adds the arguments (i.e. parameters) to the log file
logArgs(sampler_args, 'vcf_sampler')

# Set the random seed
random.seed(sampler_args.random_seed)

# Read in the sample file
vcftools_samples = pd.read_csv(sampler_args.statistic_file, sep = '\t')

logging.info('Sample (i.e. statistic) file assigned')

# Confirm there are enough samples
if len(vcftools_samples) < sampler_args.sample_size:
sys.exit('Sample size larger than the number of samples within input')
logging.error('Sample size larger than the number of samples within sample file')
sys.exit('Sample size larger than the number of samples within sample file')

# UPDATE Planned
# Improve from equal to too few samples (i.e. samples = 100, sample_size = 95)
elif len(vcftools_samples) == sampler_args.sample_size:
# Update with warning call
print 'Sample size equal to number of samples within input'
# Warns the user of poor sampling
logging.warning('Sample size equal to number of samples within sample file')

# Run the uniform sampler
if sampler_args.sampling_scheme == 'uniform':
if sampler_args.sample_size % sampler_args.uniform_bins != 0:
logging.error('Sample size not divisible by the bin count')
sys.exit('Sample size not divisible by the bin count')

assigned_statistic = assign_statistic_column (list(vcftools_samples), sampler_args.calc_statistic)
selected_samples = sorted(uniform_vcftools_sampler(list(vcftools_samples[assigned_statistic]), sampler_args.uniform_bins, sampler_args.sample_size))

logging.info('Uniform sampling complete')

# Run the random sampler
if sampler_args.sampling_scheme == 'random':

selected_samples = sorted(random_vcftools_sampler(range(len(vcftools_samples)), sampler_args.sample_size))

logging.info('Random sampling complete')

# Reduce to only selected samples
sampled_samples = vcftools_samples[vcftools_samples.index.isin(selected_samples)]

# Create TSV file of the reduced samples
sampled_samples.to_csv(sampler_args.statistic_file + '.sampled', sep = '\t')

logging.info('Created selected samples file')

# If a VCF file is specifed, create a reduced VCF file(s) from the samples
if sampler_args.vcfname:

split_vcfname = os.path.basename(sampler_args.vcfname).split(os.extsep, 1)

# Open the VCF file
vcf_input = pysam.VariantFile(sampler_args.vcfname)

# Create the VCF output file
vcf_output = pysam.VariantFile(split_vcfname[0] + '.sampled.' + split_vcfname[1], 'w', header = vcf_input.header)

# Get the chromosome, start, and end columns
chr_col, start_col, end_col = assign_position_columns(list(vcftools_samples))
for sampled_row in vcftools_samples.values:
if not end_col:
if not sampler_args.statistic_window_size:
sys.argv("--statistic-window-size argument required for the Tajima's D statistic")
if sampler_args.calc_statistic == 'TajimaD':

print sampled_row[chr_col], sampled_row[start_col], sampled_row[start_col] + (sampler_args.statistic_window_size - 1)
else:
print sampled_row[chr_col], sampled_row[start_col], sampled_row[end_col]
if not sampler_args.statistic_window_size:
logging.error("--statistic-window-size argument required for the Tajima's D statistic")
sys.argv("--statistic-window-size argument required for the Tajima's D statistic")

'''
vcf_input = pysam.VariantFile(sampler_args.vcf_file)
for sampled_row in vcftools_samples.values:
if end_col:
vcf_output = pysam.VariantFile('Sampled_{0}_{1}_{2}.vcf.gz'.format(sampled_row[chr_col], sampled_row[start_col], sampled_row[end_col]), 'w', header = vcf_input.header)
# Create iterator of all unique bin combinations
bin_start_iter = itertools.combinations(list(vcftools_samples['BIN_START']), 2)

# Check if the bin combinations are divisible by the window size
if not all([abs(bin_1 - bin_2) % sampler_args.statistic_window_size == 0 for bin_1, bin_2 in bin_start_iter]):
logging.error("--statistic-window-size argument conflicts with values in sample file")
sys.argv("--statistic-window-size argument conflicts with values in sample file")

for vcf_record in vcf_input.fetch(sampled_row[chr_col], int(sampled_row[start_col]), int(sampled_row[start_col]) + (sampler_args.statistic_window_size - 1)):
vcf_output.write(vcf_record)
else:
for vcf_record in vcf_input.fetch(sampled_row[chr_col], int(sampled_row[start_col]), int(sampled_row[end_col])):
vcf_output.write(vcf_record)
'''

vcf_input.close()
vcf_output.close()

logging.info('Created selected VCF file')


if __name__ == "__main__":
initLogger()
run()
Binary file removed andrew/vcftools.pyc
Binary file not shown.
Binary file added jared/logging_module.pyc
Binary file not shown.

0 comments on commit 9ab6d65

Please sign in to comment.