Skip to content
Browse files

Removed unused var2vcf subprogram

  • Loading branch information...
1 parent aed4754 commit 6bb3efb6c8fa4ac3cb3809c7c9f68dfabc23b115 @jts committed Mar 12, 2013
Showing with 0 additions and 490 deletions.
  1. +0 −1 src/SGA/Makefile.am
  2. +0 −4 src/SGA/sga.cpp
  3. +0 −457 src/SGA/var2vcf.cpp
  4. +0 −28 src/SGA/var2vcf.h
View
1 src/SGA/Makefile.am
@@ -49,7 +49,6 @@ sga_SOURCES = sga.cpp \
gen-ssa.h gen-ssa.cpp \
bwt2fa.h bwt2fa.cpp \
graph-diff.h graph-diff.cpp \
- var2vcf.h var2vcf.cpp \
hapgen.h hapgen.cpp \
gapfill.h gapfill.cpp \
variant-detectability.h variant-detectability.cpp \
View
4 src/SGA/sga.cpp
@@ -31,7 +31,6 @@
#include "bwt2fa.h"
#include "graph-diff.h"
#include "hapgen.h"
-#include "var2vcf.h"
#include "gapfill.h"
#include "variant-detectability.h"
#include "rewrite-evidence-bam.h"
@@ -77,7 +76,6 @@ static const char *SGA_USAGE_MESSAGE =
" filterBAM filter out contaminating mate-pair data in a BAM file\n"
" cluster find clusters of reads belonging to the same connected component in an assembly graph\n"
//" connect resolve the complete sequence of a paired-end fragment\n"
-//" var2vcf convert aligned variant sequences found by graph-diff into a VCF file\n"
//" hapgen generate candidate haplotypes from an assembly graph\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
@@ -146,8 +144,6 @@ int main(int argc, char** argv)
bwt2faMain(argc - 1, argv + 1);
else if(command == "graph-diff")
graphDiffMain(argc - 1, argv + 1);
- else if(command == "var2vcf")
- var2vcfMain(argc - 1, argv + 1);
else if(command == "hapgen")
hapgenMain(argc - 1, argv + 1);
else if(command == "gapfill")
View
457 src/SGA/var2vcf.cpp
@@ -1,457 +0,0 @@
-//-----------------------------------------------
-// Copyright 2011 Wellcome Trust Sanger Institute
-// Written by Jared Simpson (js18@sanger.ac.uk)
-// Released under the GPL
-//-----------------------------------------------
-//
-// var2vcf - convert variants from sga graph-diff
-// or sga assembly to a vcf file of differences
-// with respect to a reference
-//
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <iterator>
-#include <map>
-#include "ReadTable.h"
-#include "Util.h"
-#include "var2vcf.h"
-#include "Timer.h"
-#include "api/BamReader.h"
-#include "api/BamWriter.h"
-#include "StdAlnTools.h"
-#include "VCFUtil.h"
-
-//
-typedef std::vector<BamTools::BamAlignment> BamRecordVector;
-typedef std::map<std::string, std::string> StrStrMap;
-
-// Small class to read alignments of a variant group from a bam file
-struct VariantGroupReader
-{
- public:
-
- VariantGroupReader(BamTools::BamReader* pReader) : m_pReader(pReader)
- {
- // prime the reader with a single record
- bool success = m_pReader->GetNextAlignment(m_cachedAlignment);
- if(!success)
- {
- std::cerr << "Error: no records in the BAM file\n";
- exit(EXIT_FAILURE);
- }
- m_doneRead = false;
- }
-
- // Read a group of records from the bam belonging
- // to the same variant. Returns true if records
- // is successfully populated with alignments
- bool readVariantGroup(BamRecordVector& records)
- {
- if(m_doneRead)
- return false;
-
- std::string cachedName = m_cachedAlignment.Name;
- size_t sufPos = cachedName.find_last_of('-');
- assert(sufPos != std::string::npos && sufPos > 0);
- std::string targetSuffix = cachedName.substr(sufPos);
-
- // Read alignments from the string as long as they have the same
- // suffix as the cached alignment
- records.push_back(m_cachedAlignment);
- while(1)
- {
- if(!m_pReader->GetNextAlignment(m_cachedAlignment))
- {
- m_doneRead = true;
- break; // no more reads but the last group should be processed
- }
-
- if(m_cachedAlignment.Name.rfind(targetSuffix) != std::string::npos)
- records.push_back(m_cachedAlignment); // part of the group
- else
- break; // the read parsed is not part of the same group
- }
- return true;
- }
-
- private:
- bool m_doneRead;
- BamTools::BamReader* m_pReader;
- BamTools::BamAlignment m_cachedAlignment;
-};
-
-// Comparator object to sort a VCF file based
-// on the order of the chromosomes/sequences
-// in the header of a BAM file
-struct VCFReferenceSorter
-{
- public:
- VCFReferenceSorter(const BamTools::BamReader* pReader) : m_pReader(pReader) {}
- bool operator()(const VCFRecord& a, const VCFRecord& b)
- {
- int a_rid = m_pReader->GetReferenceID(a.refName);
- int b_rid = m_pReader->GetReferenceID(b.refName);
- if(a_rid != b_rid)
- return a_rid < b_rid;
- else
- return a.refPosition < b.refPosition;
- }
-
- private:
- const BamTools::BamReader* m_pReader;
-};
-
-
-// Functions
-VCFReturnCode preprocessVariants(BamRecordVector& records);
-
-VCFReturnCode parseVariants(const ReadTable* pRefTable, const BamTools::BamReader* pReader,
- const BamRecordVector& records, VCFVector& outVCFRecords);
-
-//
-// Getopt
-//
-#define SUBPROGRAM "var2vcf"
-static const char *VAR2VCF_VERSION_MESSAGE =
-SUBPROGRAM " Version " PACKAGE_VERSION "\n"
-"Written by Jared Simpson.\n"
-"\n"
-"Copyright 2011 Wellcome Trust Sanger Institute\n";
-
-static const char *VAR2VCF_USAGE_MESSAGE =
-"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] ... BAMFILE\n"
-"Convert the sequence aligned to a reference in BAMFILE into a VCF file of differences\n"
-"\n"
-" --help display this help and exit\n"
-" -v, --verbose display verbose output\n"
-" -r, --reference=FILE read the reference sequences from FILE\n"
-" -o, --outfile=FILE write the results to FILE\n"
-" -q, --min-quality=Q discard variants with mapping quality less than Q (default: 1)\n"
-"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
-
-static const char* PROGRAM_IDENT =
-PACKAGE_NAME "::" SUBPROGRAM;
-
-namespace opt
-{
- static unsigned int verbose;
- static std::string outFile;
- static std::string referenceFile;
- static std::string bamFile;
- static int minQuality = 1;
- static int exactMatchRequired = 21;
- static double dustThreshold = 4.0f;
-}
-
-static const char* shortopts = "r:o:q:v";
-
-enum { OPT_HELP = 1, OPT_VERSION };
-
-static const struct option longopts[] = {
- { "verbose", no_argument, NULL, 'v' },
- { "refrence", required_argument, NULL, 'r' },
- { "outfile", required_argument, NULL, 'o' },
- { "min-quality", required_argument, NULL, 'q' },
- { "help", no_argument, NULL, OPT_HELP },
- { "version", no_argument, NULL, OPT_VERSION },
- { NULL, 0, NULL, 0 }
-};
-
-//
-// Main
-//
-int var2vcfMain(int argc, char** argv)
-{
- parseVar2VCFOptions(argc, argv);
-
- // Read the reference
- ReadTable refTable(opt::referenceFile, SRF_NO_VALIDATION);
- refTable.indexReadsByID();
-
- // Open the bam files for reading/writing
- BamTools::BamReader* pBamReader = new BamTools::BamReader;
- if(!pBamReader->Open(opt::bamFile))
- {
- std::cerr << "Failed to open BAM file: " << opt::bamFile << "\n";
- exit(EXIT_FAILURE);
- }
-
- //
- // Read the alignments from the BAM file as a group of variants
- // and convert them to VCF
- //
- VariantGroupReader groupReader(pBamReader);
- IntVector returnCodeStats(VCF_NUM_RETURN_CODES, 0);
-
- VCFVector vcfRecords;
- size_t numGroups = 0;
- while(1)
- {
- BamRecordVector records;
-
- bool groupOK = groupReader.readVariantGroup(records);
- if(!groupOK)
- break;
-
- numGroups += 1;
- VCFReturnCode code = preprocessVariants(records);
- if(code != VCF_OK)
- {
- // Failed preprocess checks
- returnCodeStats[code] += 1;
- continue;
- }
-
- code = parseVariants(&refTable, pBamReader, records, vcfRecords);
- returnCodeStats[code] += 1;
- }
-
- // Sort the VCF records by chromosome then position based
- // on the ordering in the input BAM
- VCFReferenceSorter refSorter(pBamReader);
- std::sort(vcfRecords.begin(), vcfRecords.end(), refSorter);
-
- //
- // Output VCF
- //
-
- // Build program string
- std::stringstream progSS;
- progSS << "sga";
- for(int i = 0; i < argc; ++i)
- progSS << " " << argv[i];
-
- IntVector classificationStats(VCF_NUM_CLASSIFICATIONS, 0);
- std::ostream* pWriter = createWriter(opt::outFile);
- VCFUtil::writeHeader(pWriter, PROGRAM_IDENT, stripDirectories(opt::bamFile), stripDirectories(opt::referenceFile));
- for(size_t i = 0; i < vcfRecords.size(); ++i)
- {
- VCFClassification code = vcfRecords[i].classify();
- assert(code < (int)classificationStats.size());
- classificationStats[code] += 1;
- *pWriter << vcfRecords[i] << "\n";
- }
- delete pWriter;
-
- // Print stats
- printf("Total variant sequences: %zu\n", numGroups);
- printf(" -- Successfully converted: %d\n", returnCodeStats[VCF_OK]);
- printf(" -- Failed due to flanking exact match check: %d\n", returnCodeStats[VCF_EXACT_MATCH_FAILED]);
- printf(" -- Failed due to low mapping quality: %d\n", returnCodeStats[VCF_MAP_QUALITY_FAILED]);
- printf(" -- Failed due to ambiguous base sequence mappping: %d\n", returnCodeStats[VCF_BASE_MULTIMAP_FAILED]);
- printf(" -- Failed due to partially mapped base sequence: %d\n", returnCodeStats[VCF_BASE_PARTIALMAP_FAILED]);
- printf(" -- Failed due to invalid multiple alignment: %d\n", returnCodeStats[VCF_INVALID_MULTIALIGNMENT]);
- printf("\n");
- printf("Totat variants: %zu\n", vcfRecords.size());
- printf(" -- substitutions: %d\n", classificationStats[VCF_SUB]);
- printf(" -- deletions: %d\n", classificationStats[VCF_DEL]);
- printf(" -- insertions: %d\n", classificationStats[VCF_INS]);
- printf(" -- complex: %d\n", classificationStats[VCF_COMPLEX]);
- pBamReader->Close();
- delete pBamReader;
- return 0;
-}
-
-// Perform sanity checks and filtering of the records
-VCFReturnCode preprocessVariants(BamRecordVector& records)
-{
- // Perform quality check
- for(size_t i = 0; i < records.size(); ++i)
- {
- // Only check the base sequence's map quality
- // The variant can potentially be so different from the reference
- // that it does not map
- if(records[i].Name.find("base") != std::string::npos)
- {
- if(records[i].MapQuality < opt::minQuality)
- return VCF_MAP_QUALITY_FAILED;
-
- // Check if the alignment is softclipped
- assert(!records[i].CigarData.empty());
- if(records[i].CigarData.front().Type == 'S' ||
- records[i].CigarData.back().Type == 'S')
- {
- return VCF_BASE_PARTIALMAP_FAILED;
- }
- }
- }
-
- // Perform duplicate mapping check
- int numBaseRecords = 0;
- int numVariantRecords = 0;
- for(size_t i = 0; i < records.size(); ++i)
- {
- if(records[i].Name.find("base") == 0)
- numBaseRecords += 1;
- else if(records[i].Name.find("variant") == 0)
- numVariantRecords += 1;
- else
- {
- std::cerr << "Error: Unexpected record name: " << records[i].Name << "\n";
- exit(EXIT_FAILURE);
- }
- }
-
- // Fail if the base is multimapped
- if(numBaseRecords > 1)
- return VCF_BASE_MULTIMAP_FAILED;
-
- // Map a new copy of the records with only 1 entry for the variant
- if(numVariantRecords > 1)
- {
- BamRecordVector outRecords;
- // Make a copy of the records with only 1 variant record
- for(size_t i = 0; i < records.size(); ++i)
- {
- if(records[i].Name.find("base") == 0)
- {
- outRecords.push_back(records[i]);
- }
- else if(records[i].Name.find("variant") == 0)
- {
- outRecords.push_back(records[i]);
- break;
- }
- }
- records = outRecords;
- }
-
- return VCF_OK;
-}
-
-// Perform the actual conversion of the collection of BAM records into a call
-VCFReturnCode parseVariants(const ReadTable* pRefTable, const BamTools::BamReader* pReader,
- const BamRecordVector& records, VCFVector& outVCFRecords)
-{
- // classify the sequences in the vector as base
- // or variant
- std::string baseString;
- std::string varString;
- std::string varName;
- std::string refName;
- size_t refStartPos = 0;
- size_t refEndPos = 0;
- bool bBaseIsRC = false;
-
- if(records.size() != 2)
- {
- std::cerr << "Error, expected 2 records (got: " << records.size() << ")\n";
- exit(EXIT_FAILURE);
- }
-
- for(size_t i = 0; i < records.size(); ++i)
- {
- if(records[i].Name.find("base") != std::string::npos)
- {
- if(!refName.empty())
- {
- std::cerr << "Error: base sequence " << records[i].Name << " does not have a single alignment\n";
- exit(EXIT_FAILURE);
- }
-
- assert(records[i].IsMapped());
- baseString = records[i].QueryBases;
- refStartPos = records[i].Position;
- refEndPos = records[i].GetEndPosition();
- const BamTools::RefVector& refVector = pReader->GetReferenceData();
- assert(records[i].RefID < (int)refVector.size());
- refName = refVector[records[i].RefID].RefName;
- bBaseIsRC = records[i].IsReverseStrand();
- }
-
- if(records[i].Name.find("variant") != std::string::npos)
- {
- varString = records[i].QueryBases;
- varName = records[i].Name;
-
- // We always keep the variant string in its original
- // strand, then flip it to match the base
- if(records[i].IsReverseStrand())
- varString = reverseComplement(varString);
- }
- }
-
- if(varString.empty() || baseString.empty())
- {
- std::cerr << "Error variant string or base string not found\n";
- exit(EXIT_FAILURE);
- }
-
- // Correct the orientation of the var string to match the base
- if(bBaseIsRC)
- varString = reverseComplement(varString);
-
- // Extract the portion of the reference
- const SeqItem& refItem = pRefTable->getRead(refName);
- assert(refStartPos < refItem.seq.length() && refEndPos < refItem.seq.length());
- std::string refString = refItem.seq.substr(refStartPos, refEndPos - refStartPos + 1);
-
- // Perform the actual conversion
- VCFReturnCode code = VCFUtil::generateVCFFromCancerVariant(refString,
- baseString,
- varString,
- refName,
- refStartPos,
- varName,
- opt::exactMatchRequired,
- opt::dustThreshold,
- opt::verbose,
- outVCFRecords);
- return code;
-}
-
-//
-// Handle command line arguments
-//
-void parseVar2VCFOptions(int argc, char** argv)
-{
- bool die = false;
- for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
- {
- std::istringstream arg(optarg != NULL ? optarg : "");
- switch (c)
- {
- case 'o': arg >> opt::outFile; break;
- case 'r': arg >> opt::referenceFile; break;
- case 'q': arg >> opt::minQuality; break;
- case '?': die = true; break;
- case 'v': opt::verbose++; break;
- case OPT_HELP:
- std::cout << VAR2VCF_USAGE_MESSAGE;
- exit(EXIT_SUCCESS);
- case OPT_VERSION:
- std::cout << VAR2VCF_VERSION_MESSAGE;
- exit(EXIT_SUCCESS);
- }
- }
-
- if (argc - optind < 1)
- {
- std::cerr << SUBPROGRAM ": missing arguments\n";
- die = true;
- }
- else if (argc - optind > 1)
- {
- std::cerr << SUBPROGRAM ": too many arguments\n";
- die = true;
- }
-
- if(opt::referenceFile.empty())
- {
- std::cerr << SUBPROGRAM ": a reference file must be provided\n";
- die = true;
- }
-
- if (die)
- {
- std::cout << "\n" << VAR2VCF_USAGE_MESSAGE;
- exit(EXIT_FAILURE);
- }
-
- // Parse the input filenames
- opt::bamFile = argv[optind++];
-
- if(opt::outFile.empty())
- opt::outFile = "variants.vcf";
-}
View
28 src/SGA/var2vcf.h
@@ -1,28 +0,0 @@
-//-----------------------------------------------
-// Copyright 2011 Wellcome Trust Sanger Institute
-// Written by Jared Simpson (js18@sanger.ac.uk)
-// Released under the GPL
-//-----------------------------------------------
-//
-// var2vcf - convert variants from sga graph-diff
-// or sga assembly to a vcf file of differences
-// with respect to a reference
-//
-#ifndef VAR2VCF_H
-#define VAR2VCF_H
-#include <getopt.h>
-#include "config.h"
-#include "BWT.h"
-#include "Match.h"
-#include "BWTAlgorithms.h"
-#include "OverlapAlgorithm.h"
-
-// functions
-
-//
-int var2vcfMain(int argc, char** argv);
-
-// options
-void parseVar2VCFOptions(int argc, char** argv);
-
-#endif

0 comments on commit 6bb3efb

Please sign in to comment.
Something went wrong with that request. Please try again.