Skip to content

Commit

Permalink
Updating vcf functions with logging module
Browse files Browse the repository at this point in the history
  • Loading branch information
aewebb80 committed Jul 24, 2017
1 parent ff6b26f commit 9ab6d65
Show file tree
Hide file tree
Showing 8 changed files with 141 additions and 32 deletions.
3 changes: 3 additions & 0 deletions andrew/example/locus8.Tajima.D.sampled
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
CHROM BIN_START N_SNPS TajimaD
0 chr11 121610000 3 -1.24618
1 chr11 121620000 7 -1.5221
22 changes: 13 additions & 9 deletions andrew/pipeline.log
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
2017-07-20 16:23:37,154 - logArgs - INFO: Arguments for Test:
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments uniform_bins: 10
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments vcfname: example/locus8.vcf.gz
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments random_seed: 83441029
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments statistic_file: example/locus8.windowed.weir.fst
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments calc_statistic: windowed-weir-fst
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments sampling_scheme: random
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments sample_size: 200
2017-07-20 16:23:37,155 - logArgs - INFO: Arguments statistic_window_size: 10000
2017-07-24 16:45:46,176 - logArgs - INFO: Arguments for vcf_sampler:
2017-07-24 16:45:46,176 - logArgs - INFO: Arguments uniform_bins: 10
2017-07-24 16:45:46,176 - logArgs - INFO: Arguments vcfname: example/locus8.vcf.gz
2017-07-24 16:45:46,177 - logArgs - INFO: Arguments random_seed: 354674735
2017-07-24 16:45:46,177 - logArgs - INFO: Arguments statistic_file: example/locus8.Tajima.D
2017-07-24 16:45:46,177 - logArgs - INFO: Arguments calc_statistic: TajimaD
2017-07-24 16:45:46,177 - logArgs - INFO: Arguments sampling_scheme: random
2017-07-24 16:45:46,177 - logArgs - INFO: Arguments sample_size: 2
2017-07-24 16:45:46,177 - logArgs - INFO: Arguments statistic_window_size: 10000
2017-07-24 16:45:46,179 - run - INFO: Sample (i.e. statistic) file assigned
2017-07-24 16:45:46,179 - run - WARNING: Sample size equal to number of samples within sample file
2017-07-24 16:45:46,179 - run - INFO: Random sampling complete
2017-07-24 16:45:46,181 - run - INFO: Created selected samples file
13 changes: 9 additions & 4 deletions andrew/vcf_calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def __call__(self, parser, args, value, option_string=None):
return vcf_parser.parse_args()

def logArgs(args, pipeline_function):
    '''Logs arguments from argparse system. May be replaced with a general function in logging_module'''
    logging.info('Arguments for %s:', pipeline_function)
    # Iterate name/value pairs once instead of re-calling vars(args) per key;
    # pass args lazily so logging formats only when the record is emitted.
    for arg_name, arg_value in vars(args).items():
        logging.info('Arguments %s: %s', arg_name, arg_value)
Expand Down Expand Up @@ -110,13 +110,14 @@ def run (passed_arguments = []):
Input VCF file does not exist
IOError
Output file already exists
'''

# Grab VCF arguments from command line
vcf_args = vcf_calc_parser(passed_arguments)

# Adds the arguments (i.e. parameters) to the log file
logArgs(vcf_args, 'vcf_calc')

# Argument container for vcftools
vcftools_call_args = ['--out', vcf_args.out]

Expand Down Expand Up @@ -178,17 +179,21 @@ def run (passed_arguments = []):
# Assigns the suffix for the vcftools log file
vcftools_log_suffix = '.het'

logging.info('vcftools parameters assigned')

# Assigns the file argument for vcftools
vcfname_arg = assign_vcftools_input_arg(vcf_args.vcfname)
logging.info('Input file assigned')

# vcftools subprocess call
vcftools_call = subprocess.Popen(['vcftools'] + vcfname_arg + list(map(str, vcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
vcftools_out, vcftools_err = vcftools_call.communicate()
logging.info('vcftools call complete')

# Check that the log file was created correctly, get the suffix for the log file, and create the file
if check_vcftools_for_errors(vcftools_err):
produce_vcftools_log(vcftools_err, vcf_args.out, vcftools_log_suffix)

if __name__ == "__main__":
#initLogger()
initLogger()
run()
17 changes: 15 additions & 2 deletions andrew/vcf_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# Insert Jared's directory path, required for calling Jared's functions. Change when directory structure changes.
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))

#from logging_module import initLogger
from logging_module import initLogger

def vcf_filter_parser(passed_arguments):
'''VCF Argument Parser - Assigns arguments from command line'''
Expand Down Expand Up @@ -91,6 +91,12 @@ def __call__(self, parser, args, value, option_string=None):
else:
return vcf_parser.parse_args()

def logArgs(args, pipeline_function):
    '''Logs arguments from argparse system. May be replaced with a general function in logging_module'''
    logging.info('Arguments for %s:', pipeline_function)
    # Iterate name/value pairs once instead of re-calling vars(args) per key;
    # pass args lazily so logging formats only when the record is emitted.
    for arg_name, arg_value in vars(args).items():
        logging.info('Arguments %s: %s', arg_name, arg_value)

def run (passed_arguments = []):
'''
Filter VCF files using VCFTools.
Expand Down Expand Up @@ -124,6 +130,9 @@ def run (passed_arguments = []):
# Grab VCF arguments from command line
vcf_args = vcf_filter_parser(passed_arguments)

# Adds the arguments (i.e. parameters) to the log file
logArgs(vcf_args, 'vcf_filter')

# Argument container for vcftools
vcftools_call_args = ['--out', vcf_args.out]

Expand Down Expand Up @@ -183,18 +192,22 @@ def run (passed_arguments = []):
if vcf_args.filter_distance:
vcftools_call_args.extend(['--thin', vcf_args.filter_distance])

logging.info('vcftools parameters assigned')

# Assigns the file argument for vcftools
vcfname_arg = assign_vcftools_input_arg(vcf_args.vcfname)
logging.info('Input file assigned')

# vcftools subprocess call
vcftools_call = subprocess.Popen(['vcftools'] + vcfname_arg + list(map(str, vcftools_call_args)), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
vcftools_out, vcftools_err = vcftools_call.communicate()
logging.info('vcftools call complete')

# Check that the log file was created correctly, get the suffix for the log file, and create the file
if check_vcftools_for_errors(vcftools_err):
produce_vcftools_log(vcftools_err, vcf_args.out, '.filter')


if __name__ == "__main__":
#initLogger()
initLogger()
run()
33 changes: 33 additions & 0 deletions andrew/vcf_phase.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import subprocess
import argparse
import glob
import logging

sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))

import vcf_reader_func
from logging_module import initLogger

def phase_argument_parser(passed_arguments):
'''Phase Argument Parser - Assigns arguments for vcftools from command line.
Expand Down Expand Up @@ -48,6 +50,12 @@ def __call__(self, parser, args, value, option_string=None):
else:
return phase_parser.parse_args()

def logArgs(args, pipeline_function):
    '''Logs arguments from argparse system. May be replaced with a general function in logging_module'''
    logging.info('Arguments for %s:', pipeline_function)
    # Iterate name/value pairs once instead of re-calling vars(args) per key;
    # pass args lazily so logging formats only when the record is emitted.
    for arg_name, arg_value in vars(args).items():
        logging.info('Arguments %s: %s', arg_name, arg_value)

def possible_beagle_paths ():
possible_paths = ['beagle*.jar']
if os.name == 'posix':
Expand Down Expand Up @@ -105,9 +113,14 @@ def run (passed_arguments = []):
# Grab VCF arguments from command line
phase_args = phase_argument_parser(passed_arguments)

# Adds the arguments (i.e. parameters) to the log file
logArgs(phase_args, 'vcf_phase')

# Assign file extension for VCF input file
vcfname_ext = assign_vcf_extension(phase_args.vcfname)

logging.info('Input file assigned')

# Used to confirm if the VCF input was renamed
vcfname_renamed = False

Expand All @@ -124,48 +137,68 @@ def run (passed_arguments = []):
# Assign the arguments for the algorithm
likelihood_call_args = ['gtgl=' + phase_args.vcfname, 'out=' + phase_args.estimate_file]

logging.info('beagle estimate parameters assigned')

# beagle estimated genotype frequency subprocess call
likelihood_call = subprocess.Popen(algorithm_call_args + likelihood_call_args, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
likelihood_out, likelihood_err = likelihood_call.communicate()

# Confirms that beagle finished without error
if likelihood_err:
logging.error('Error creating the estimated genotype frequency file. Please check input file.')
sys.exit('Error creating the estimated genotype frequency file. Please check input file.')

logging.info('beagle estimate file created')

# Assigns the arguments for phasing
phase_call_args = ['gt=' + phase_args.estimate_file + vcfname_ext, 'out=' + phase_args.out]

logging.info('beagle phasing parameters assigned')

# Phasing subprocess call
phase_call = subprocess.Popen(algorithm_call_args + phase_call_args, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
phase_out, phase_err = phase_call.communicate()
print phase_out, phase_err

logging.info('beagle phasing complete')

elif phase_args.phase_algorithm == 'shapeit':
# Assign the algorithm
algorithm_call_args = ['./bin/shapeit']

# Assigns the arguments for phasing
phase_call_args = ['--input-vcf', phase_args.vcfname, '-O', phase_args.out]

logging.info('shapeit phasing parameters assigned')

# Phasing subprocess call
phase_call = subprocess.Popen(algorithm_call_args + phase_call_args, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
phase_out, phase_err = phase_call.communicate()

# Confirms that shapeit finished without error
if phase_err:
logging.error('Error occured in phasing. Please check input file.')
sys.exit('Error occured in phasing. Please check input file.')

logging.info('shapeit phasing complete (non-VCF format)')

# Assigns the arguments for converting the phased output into a vcf file
convert_call = ['-convert', '--input-haps', phase_args.out, '--output-vcf', phase_args.out + vcfname_ext]

logging.info('shapeit conversion parameter assigned')

# Convert subprocess call
convert_call = subprocess.Popen(algorithm_call_args + convert_call, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
convert_out, convert_err = convert_call.communicate()
print convert_out, convert_err

logging.info('shapeit conversion complete (non-VCF to VCF)')


# Reverts the VCF input file
if vcfname_renamed:
os.rename(phase_args.vcfname, phase_args.vcfname[:-len(vcfname_ext)])

if __name__ == "__main__":
initLogger()
run()
85 changes: 68 additions & 17 deletions andrew/vcf_sampler.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import os, sys, csv, argparse, random, pysam
import os, sys, csv, argparse, random, pysam, itertools, logging
import numpy as np
import pandas as pd
from collections import defaultdict

# Insert Jared's directory path, required for calling Jared's functions. Change when directory structure changes.
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, 'jared')))

from logging_module import initLogger

def sampler_parser():
'''Sampler Argument Parser - Assigns arguments from command line.'''

Expand Down Expand Up @@ -42,6 +47,12 @@ def __call__(self, parser, args, value, option_string=None):

return sampler_parser.parse_args()

def logArgs(args, pipeline_function):
    '''Logs arguments from argparse system. May be replaced with a general function in logging_module'''
    logging.info('Arguments for %s:', pipeline_function)
    # Iterate name/value pairs once instead of re-calling vars(args) per key;
    # pass args lazily so logging formats only when the record is emitted.
    for arg_name, arg_value in vars(args).items():
        logging.info('Arguments %s: %s', arg_name, arg_value)

def random_vcftools_sampler (vcftools_samples, sample_size):
'''Random Sampler. Returns a list of randomly selected elements from
vcftools_samples. The length of this list is defined by sample_size.'''
def assign_statistic_column (sample_headers, statistic):
    '''Maps the user-selected statistic name to the column header expected in
    the vcftools statistic file, and confirms that column is present in
    sample_headers. Exits (after logging) if the statistic is unknown or the
    column is missing. Returns the matching column header string.'''

    # Known statistics and the column header each one produces in vcftools output
    statistic_converter = {'windowed-weir-fst': 'MEAN_FST', 'TajimaD': 'TajimaD'}

    # `has_key` was removed in Python 3; membership test works in both 2 and 3
    if statistic not in statistic_converter:
        logging.critical('Statistic not found. Statistic list needs to be updated. Please contact the PPP Team.')
        sys.exit('Statistic not found. Statistic list needs to be updated. Please contact the PPP Team.')

    if statistic_converter[statistic] not in sample_headers:
        logging.error('Statistic selected not found in file specified by --statistic-file.')
        sys.exit('Statistic selected not found in file specified by --statistic-file.')

    return statistic_converter[statistic]

Expand Down Expand Up @@ -144,58 +157,96 @@ def run ():

# Get arguments from command line
sampler_args = sampler_parser()

# Adds the arguments (i.e. parameters) to the log file
logArgs(sampler_args, 'vcf_sampler')

# Set the random seed
random.seed(sampler_args.random_seed)

# Read in the sample file
vcftools_samples = pd.read_csv(sampler_args.statistic_file, sep = '\t')

logging.info('Sample (i.e. statistic) file assigned')

# Confirm there are enough samples
if len(vcftools_samples) < sampler_args.sample_size:
sys.exit('Sample size larger than the number of samples within input')
logging.error('Sample size larger than the number of samples within sample file')
sys.exit('Sample size larger than the number of samples within sample file')

# UPDATE Planned
# Improve from equal to too few samples (i.e. samples = 100, sample_size = 95)
elif len(vcftools_samples) == sampler_args.sample_size:
# Update with warning call
print 'Sample size equal to number of samples within input'
# Warns the user of poor sampling
logging.warning('Sample size equal to number of samples within sample file')

# Run the uniform sampler
if sampler_args.sampling_scheme == 'uniform':
if sampler_args.sample_size % sampler_args.uniform_bins != 0:
logging.error('Sample size not divisible by the bin count')
sys.exit('Sample size not divisible by the bin count')

assigned_statistic = assign_statistic_column (list(vcftools_samples), sampler_args.calc_statistic)
selected_samples = sorted(uniform_vcftools_sampler(list(vcftools_samples[assigned_statistic]), sampler_args.uniform_bins, sampler_args.sample_size))

logging.info('Uniform sampling complete')

# Run the random sampler
if sampler_args.sampling_scheme == 'random':

selected_samples = sorted(random_vcftools_sampler(range(len(vcftools_samples)), sampler_args.sample_size))

logging.info('Random sampling complete')

# Reduce to only selected samples
sampled_samples = vcftools_samples[vcftools_samples.index.isin(selected_samples)]

# Create TSV file of the reduced samples
sampled_samples.to_csv(sampler_args.statistic_file + '.sampled', sep = '\t')

logging.info('Created selected samples file')

# If a VCF file is specifed, create a reduced VCF file(s) from the samples
if sampler_args.vcfname:

split_vcfname = os.path.basename(sampler_args.vcfname).split(os.extsep, 1)

# Open the VCF file
vcf_input = pysam.VariantFile(sampler_args.vcfname)

# Create the VCF output file
vcf_output = pysam.VariantFile(split_vcfname[0] + '.sampled.' + split_vcfname[1], 'w', header = vcf_input.header)

# Get the chromosome, start, and end columns
chr_col, start_col, end_col = assign_position_columns(list(vcftools_samples))
for sampled_row in vcftools_samples.values:
if not end_col:
if not sampler_args.statistic_window_size:
sys.argv("--statistic-window-size argument required for the Tajima's D statistic")
if sampler_args.calc_statistic == 'TajimaD':

print sampled_row[chr_col], sampled_row[start_col], sampled_row[start_col] + (sampler_args.statistic_window_size - 1)
else:
print sampled_row[chr_col], sampled_row[start_col], sampled_row[end_col]
if not sampler_args.statistic_window_size:
logging.error("--statistic-window-size argument required for the Tajima's D statistic")
sys.argv("--statistic-window-size argument required for the Tajima's D statistic")

'''
vcf_input = pysam.VariantFile(sampler_args.vcf_file)
for sampled_row in vcftools_samples.values:
if end_col:
vcf_output = pysam.VariantFile('Sampled_{0}_{1}_{2}.vcf.gz'.format(sampled_row[chr_col], sampled_row[start_col], sampled_row[end_col]), 'w', header = vcf_input.header)
# Create iterator of all unique bin combinations
bin_start_iter = itertools.combinations(list(vcftools_samples['BIN_START']), 2)

# Check if the bin combinations are divisible by the window size
if not all([abs(bin_1 - bin_2) % sampler_args.statistic_window_size == 0 for bin_1, bin_2 in bin_start_iter]):
logging.error("--statistic-window-size argument conflicts with values in sample file")
sys.argv("--statistic-window-size argument conflicts with values in sample file")

for vcf_record in vcf_input.fetch(sampled_row[chr_col], int(sampled_row[start_col]), int(sampled_row[start_col]) + (sampler_args.statistic_window_size - 1)):
vcf_output.write(vcf_record)
else:
for vcf_record in vcf_input.fetch(sampled_row[chr_col], int(sampled_row[start_col]), int(sampled_row[end_col])):
vcf_output.write(vcf_record)
'''

vcf_input.close()
vcf_output.close()

logging.info('Created selected VCF file')


if __name__ == "__main__":
initLogger()
run()
Binary file removed andrew/vcftools.pyc
Binary file not shown.
Binary file added jared/logging_module.pyc
Binary file not shown.

0 comments on commit 9ab6d65

Please sign in to comment.