From 9b8190d78bd29e3484a42ebc75fb8364df5f56b8 Mon Sep 17 00:00:00 2001 From: Jiantao Wu Date: Wed, 29 Aug 2012 15:36:26 -0400 Subject: [PATCH] Change the output format to VCF 4.1 Add license and readme files --- LICENSE | 21 +++++ README.md | 71 +++++++++++++++ src/TangramDetect/TGM_Array.h | 6 ++ src/TangramDetect/TGM_LibTable.h | 5 + src/TangramDetect/TGM_Printer.cpp | 146 +++++++++++++++++++++++++++++- src/TangramDetect/TGM_Printer.h | 12 ++- 6 files changed, 255 insertions(+), 6 deletions(-) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..2cd8d5b --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2012 Jiantao Wu, Gabor Marth + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md index e69de29..0661dc9 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,71 @@ +========================================================================================= +Tangram 0.1.0 Release Distribution Documentation 2012-08-29 +Jiantao Wu (jiantaowu.xining@gmail.com), Marth Lab [1], Boston College Biology Department +========================================================================================= + + +Introduction +========================= + +Tangram is a C/C++ command line toolbox for structural variation(SV) detection based on +MOSAIK [2] alignments. It takes advantage of both read-pair and split-read algorithms +and is extremely fast and memory-efficient. Powered by the Bamtools API [3], Tangram can +call SV events on multiple BAM files (a population) simutaneously to increase the +sensitivity on low-coverage dataset.Currently it reports mobile element insertions (MEI). +More other SV event types will be introduced soon. + + +Obtaining and Compiling +========================= + +> git clone git://github.com/jiantao/Tangram.git +> cd src +> make + + +Detection pipeline +========================= + +Currently, Tangram contains four sub-programs: + +1. tangram_scan : Scan through the bam file and calculate the fragment length distribution + for each library in that bam file. It will output the fragment length + distribution files for each input bam file. + +2. tangram_merge : If more than one bam files need to be scanned, this program will combine + all the fragment length distribution files together. It will output the + merged fragment length distribution file that enable the detection of + multiple bam files simutaneously. This step is optional if only one bam + file was used. + +3. tangram_index : Index the normal and special (MEI sequences) reference file. It will output + the indexed refrence file. This step is required for split read algorithm. + +4. tangram_detect : Detect the SV events from the MOSAIK aligned BAM files. It will output the + unfiltered VCF files. + + +The overall detection pipeline for Tangram looks like the following + +tangram_scan (input: BAM files) --> fragment length distribution files \ + \ + -----> tangram_detect (BAM files) --> VCF files + / +tangram_index (input: reference fasta files) --> indexed reference file / + +For the detailed usage of each program, please run "$PROGRAM -help" + + +Bug Report +========================= + +Please report bugs using the built-in bug reporting feature in github or by sending the author +an email. + + +References +========================= + +[1] http://bioinformatics.bc.edu/marthlab/Main_Page +[2] https://github.com/wanpinglee/MOSAIK +[3] https://github.com/pezmaster31/bamtools diff --git a/src/TangramDetect/TGM_Array.h b/src/TangramDetect/TGM_Array.h index 9500d19..f0894cd 100644 --- a/src/TangramDetect/TGM_Array.h +++ b/src/TangramDetect/TGM_Array.h @@ -34,6 +34,7 @@ namespace Tangram ~Array(); inline void Init(unsigned int capacity); + inline void MemSet(int value); inline void InitToEnd(void); void ResizeNoCopy(unsigned int newCap); @@ -113,6 +114,11 @@ namespace Tangram memset(data + size, 0, sizeof(T) * (capacity - size)); } + template inline void Array::MemSet(int value) + { + memset(data, value, sizeof(T) * (capacity)); + } + template void Array::ResizeNoCopy(unsigned int newCap) { if (capacity < newCap) diff --git a/src/TangramDetect/TGM_LibTable.h b/src/TangramDetect/TGM_LibTable.h index fded533..5acff6c 100644 --- a/src/TangramDetect/TGM_LibTable.h +++ b/src/TangramDetect/TGM_LibTable.h @@ -69,6 +69,11 @@ namespace Tangram bool GetSpecialRefID(uint32_t& specialRefID, const char* specialRefName) const; + inline unsigned int GetNumSamples(void) const + { + return sampleNames.Size(); + } + inline const Array& GetAnchorNames(void) const { return anchorNames; diff --git a/src/TangramDetect/TGM_Printer.cpp b/src/TangramDetect/TGM_Printer.cpp index f017abe..6a46e7d 100644 --- a/src/TangramDetect/TGM_Printer.cpp +++ b/src/TangramDetect/TGM_Printer.cpp @@ -16,6 +16,7 @@ * ===================================================================================== */ +#include #include "TGM_Printer.h" using namespace Tangram; @@ -50,6 +51,11 @@ void Printer::Print(void) void Printer::PrintSpecial(void) { unsigned int numSp = libTable.GetNumSpecialRef(); + unsigned int numSamples = libTable.GetNumSamples(); + + PrintSpecialHeader(); + sampleMap.Init(numSamples); + sampleMap.SetSize(numSamples); for (unsigned int i = 0; i != numSp; ++i) { @@ -69,7 +75,7 @@ void Printer::PrintSpecial(void) while (GetNextSpecial(pRpSpecials, pSplitSpecials)) { - sampleMap.clear(); + sampleMap.MemSet(0); InitFeatures(); if (printIdx.pRpSpecial != NULL) @@ -78,12 +84,15 @@ void Printer::PrintSpecial(void) if (printIdx.pSplitEvent != NULL) SetSampleInfoSplit(*(printIdx.pSplitEvent)); - SetSampleString(); + // SetSampleString(); SetSpecialFeatures(i); + PrintSpecialBody(); + /* printf("chr%s\t%d\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\n", features.anchorName, features.pos, features.pos + features.len + 1, features.strand, features.rpFrag[0], features.rpFrag[1], features.splitFrag[0], features.splitFrag[1], features.spRefName, features.pos5[0], features.pos5[1], features.pos3[0], features.pos3[1], formatted.str().c_str()); + */ } } @@ -101,23 +110,150 @@ void Printer::PrintSpecial(void) for (unsigned int j = 0; j != len; ++j) { - sampleMap.clear(); + sampleMap.MemSet(0); InitFeatures(); SetSampleInfoSplit(pSplitSpecials[j]); - SetSampleString(); + // SetSampleString(); SetSpecialFeaturesFromSplit(pSplitSpecials[j]); + PrintSpecialBody(); + + /* printf("chr%s\t%d\t%d\t%c\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%s\n", features.anchorName, features.pos, features.pos + features.len + 1, features.strand, features.rpFrag[0], features.rpFrag[1], features.splitFrag[0], features.splitFrag[1], features.spRefName, features.pos5[0], features.pos5[1], features.pos3[0], features.pos3[1], formatted.str().c_str()); + */ } } } } } +void Printer::PrintSpecialHeader(void) +{ + time_t now = time(0); + struct tm tstruct; + char buf[80]; + tstruct = *localtime(&now); + strftime(buf, sizeof(buf), "%Y%m%d", &tstruct); + + printf("##fileformat=VCFv4.1\n" + "##fileDate=%s\n" + "##source=Tangram\n" + "##ALT=\n" + "##ALT=\n" + "##ALT=\n" + "##ALT=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##FORMAT=\n" + "##FORMAT=\n", + buf); + + printf("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); + + const Array& sampleNames = libTable.GetSampleNames(); + + unsigned int numSamples = sampleNames.Size(); + for (unsigned int i = 0; i != numSamples; ++i) + { + printf("\t%s", sampleNames[i]); + } + + printf("\n"); +} + +void Printer::PrintSpecialBody(void) +{ + int insertedLen = -1; + if (printIdx.pSplitEvent != NULL) + { + if (printIdx.pSplitEvent->pSpecialData->end >= 0 && printIdx.pSplitEvent->pSpecialData->pos >=0) + insertedLen = printIdx.pSplitEvent->pSpecialData->end - printIdx.pSplitEvent->pSpecialData->pos + 1; + } + + int splitFrag = features.splitFrag[0] + features.splitFrag[1]; + char refChar = char_table[pRef->refSeq[features.pos - pRef->pos]]; + + formatted.clear(); + formatted.str(""); + + if (splitFrag == 0) + { + int ciPos1 = 0; + int ciPos2 = 0; + if (features.pos5[1] < features.pos3[0]) + { + ciPos1 = features.pos5[0] - features.pos5[1]; + ciPos2 = features.pos3[1] - features.pos5[1]; + } + else + { + ciPos1 = features.pos5[0] - features.pos3[0]; + ciPos2 = features.pos3[1] - features.pos3[0]; + } + + formatted << "CIPOS=" << ciPos1 << "," << ciPos2 << ";"; + } + + printf("chr%s\t" + "%d\t" + ".\t" + "%c\t" + "\t" + ".\t" + ".\t" + "%s%s", + features.anchorName, + features.pos, + refChar, + features.spRefName, + splitFrag > 0 ? "" : "IMPRECISE;", + formatted.str().c_str() + ); + + formatted.clear(); + formatted.str(""); + + formatted << "FRAG=" << features.rpFrag[0] << "," << features.rpFrag[1] << "," << features.splitFrag[0] << "," << features.splitFrag[1] << ";"; + + printf("%sSTRAND=%c;MEILEN=%d\t" + "GT:AD", + formatted.str().c_str(), + features.strand, + insertedLen + ); + + unsigned int numSamples = libTable.GetNumSamples(); + char buff[4]; + buff[1] = '/'; + buff[3] = '\0'; + + for (unsigned int i = 0; i != numSamples; ++i) + { + if (sampleMap[i] > 0) + { + buff[0] = '1'; + buff[2] = '.'; + } + else + { + buff[0] = '0'; + buff[2] = '0'; + } + + printf("\t%s:%d", buff, sampleMap[i]); + } + + printf("\n"); +} + bool Printer::GetNextSpecial(const SpecialEvent* pRpSpecials, const SplitEvent* pSplitSpecials) { if (printIdx.rpIdx == printIdx.rpSize) @@ -238,6 +374,7 @@ void Printer::SetSampleInfoSplit(const SplitEvent& splitEvent) } } +/* void Printer::SetSampleString(void) { const Array& sampleNames = libTable.GetSampleNames(); @@ -252,6 +389,7 @@ void Printer::SetSampleString(void) formatted << name << "_" << itor->second << "_"; } } +*/ void Printer::SetSpecialFeatures(unsigned int zaSpRefID) { diff --git a/src/TangramDetect/TGM_Printer.h b/src/TangramDetect/TGM_Printer.h index 3ebda53..829bc32 100644 --- a/src/TangramDetect/TGM_Printer.h +++ b/src/TangramDetect/TGM_Printer.h @@ -80,6 +80,10 @@ namespace Tangram void PrintSpecial(void); + void PrintSpecialHeader(void); + + void PrintSpecialBody(void); + inline void InitFeatures(void) { features.anchorName = NULL; @@ -115,7 +119,7 @@ namespace Tangram void SetSampleInfoSplit(const SplitEvent& splitEvent); - void SetSampleString(void); + // void SetSampleString(void); void SetSpecialFeatures(unsigned int zaSpRefID); @@ -123,13 +127,17 @@ namespace Tangram void SetSpecialFeaturesFromRp(const SpecialEvent& rpSpecial, unsigned int zaSpRefID); + bool SpecialPrintFilter(void); + private: PrintIdx printIdx; PrintFeatures features; - std::map sampleMap; + // std::map sampleMap; + + Array sampleMap; std::ostringstream formatted;