khmer-counting.bib

%% This BibTeX bibliography file was created using BibDesk.
%% http://bibdesk.sourceforge.net/


%% Created for Qingpeng Zhang at 2013-06-27 01:26:17 -0400 


%% Saved with string encoding Unicode (UTF-8) 

%% Chitsaz et al. 2011: de novo assembly of single-cell bacterial genomes (Nat Biotechnol).
%% Names are in "Last, Initials" form with spaced initials; the title carries no trailing
%% period because bibliography styles add their own.
@article{pubmed21926975,
	Author = {Chitsaz, H and Yee-Greenbaum, J L and Tesler, G and Lombardo, M J and Dupont, C L and Badger, J H and Novotny, M and Rusch, D B and Fraser, L J and Gormley, N A and Schulz-Trieglaff, O and Smith, G P and Evers, D J and Pevzner, P A and Lasken, R S},
	Journal = {Nat Biotechnol},
	Number = {10},
	Pages = {915--921},
	Title = {Efficient de novo assembly of single-cell bacterial genomes from short-read data sets},
	Volume = {29},
	Year = {2011}}

%% Mackelprang et al. 2011: metagenomic analysis of thawing permafrost (Nature).
%% Month uses the standard three-letter macro; the page range is written out with an en-dash.
@article{Mackelprang2011,
	Abstract = {Permafrost contains an estimated 1672 Pg carbon (C), an amount roughly equivalent to the total currently contained within land plants and the atmosphere. This reservoir of C is vulnerable to decomposition as rising global temperatures cause the permafrost to thaw. During thaw, trapped organic matter may become more accessible for microbial degradation and result in greenhouse gas emissions. Despite recent advances in the use of molecular tools to study permafrost microbial communities, their response to thaw remains unclear. Here we use deep metagenomic sequencing to determine the impact of thaw on microbial phylogenetic and functional genes, and relate these data to measurements of methane emissions. Metagenomics, the direct sequencing of DNA from the environment, allows the examination of whole biochemical pathways and associated processes, as opposed to individual pieces of the metabolic puzzle. Our metagenome analyses reveal that during transition from a frozen to a thawed state there are rapid shifts in many microbial, phylogenetic and functional gene abundances and pathways. After one week of incubation at 5 $\,^{\circ}$C, permafrost metagenomes converge to be more similar to each other than while they are frozen. We find that multiple genes involved in cycling of C and nitrogen shift rapidly during thaw. We also construct the first draft genome from a complex soil metagenome, which corresponds to a novel methanogen. Methane previously accumulated in permafrost is released during thaw and subsequently consumed by methanotrophic bacteria. Together these data point towards the importance of rapid cycling of methane and nitrogen in thawing permafrost.},
	Author = {Mackelprang, Rachel and Waldrop, Mark P and DeAngelis, Kristen M and David, Maude M and Chavarria, Krystle L and Blazewicz, Steven J and Rubin, Edward M and Jansson, Janet K},
	Date-Added = {2013-06-27 05:16:27 +0000},
	Date-Modified = {2013-06-27 05:16:27 +0000},
	Doi = {10.1038/nature10576},
	Journal = {Nature},
	Journal-Full = {Nature},
	Mesh = {Alaska; Arctic Regions; Bacteria; Carbon; Carbon Cycle; DNA; Freezing; Genes, rRNA; Metagenome; Metagenomics; Methane; Nitrogen; Nitrogen Cycle; Oxidation-Reduction; Phylogeny; RNA, Ribosomal, 16S; Soil; Soil Microbiology; Temperature; Time Factors},
	Month = dec,
	Number = {7377},
	Pages = {368--371},
	Pmid = {22056985},
	Pst = {epublish},
	Title = {Metagenomic analysis of a permafrost microbial community reveals a rapid response to thaw},
	Volume = {480},
	Year = {2011},
	Bdsk-Url-1 = {http://dx.doi.org/10.1038/nature10576}}

%% Howe et al. 2012: assembling large environmental metagenomes (arXiv preprint 1212.2832).
%% Typed as misc because there is no journal; the arXiv identifier lives in Eprint/Archiveprefix.
@misc{Howe2012,
	Abstract = {The large volumes of sequencing data required to sample complex environments deeply pose new challenges to sequence analysis approaches. De novo metagenomic assembly effectively reduces the total amount of data to be analyzed but requires significant computational resources. We apply two pre-assembly filtering approaches, digital normalization and partitioning, to make large metagenome assemblies more computationally tractable. Using a human gut mock community dataset, we demonstrate that these methods result in assemblies nearly identical to assemblies from unprocessed data. We then assemble two large soil metagenomes from matched Iowa corn and native prairie soils. The predicted functional content and phylogenetic origin of the assembled contigs indicate significant taxonomic differences despite similar function. The assembly strategies presented are generic and can be extended to any metagenome; full source code is freely available under a BSD license.},
	Archiveprefix = {arXiv},
	Author = {Adina Chuang Howe and Janet Jansson and Stephanie A. Malfatti and Susannah G. Tringe and James M. Tiedje and C. Titus Brown},
	Date-Added = {2013-06-27 05:02:46 +0000},
	Date-Modified = {2013-06-27 05:02:46 +0000},
	Eprint = {1212.2832},
	Month = dec,
	Title = {Assembling large, complex environmental metagenomes},
	Url = {http://arxiv.org/abs/1212.2832},
	Year = {2012},
	Bdsk-Url-1 = {http://arxiv.org/abs/1212.2832}}

%% Quake (Kelley, Schatz and Salzberg, 2010): quality-aware correction of sequencing read errors.
%% Bibliographic fields come first, then identifiers, then BibDesk/PubMed bookkeeping fields.
@article{Kelley2010,
  author        = {Kelley, David R and Schatz, Michael C and Salzberg, Steven L},
  title         = {Quake: quality-aware detection and correction of sequencing errors},
  journal       = {Genome Biol},
  journal-full  = {Genome biology},
  volume        = {11},
  number        = {11},
  pages         = {R116},
  year          = {2010},
  doi           = {10.1186/gb-2010-11-11-r116},
  pmid          = {21114842},
  pmc           = {PMC3156955},
  pst           = {ppublish},
  mesh          = {Algorithms; Computational Biology; DNA, Bacterial; Escherichia coli; Genome, Human; Humans; Likelihood Functions; Models, Biological; Polymorphism, Single Nucleotide; Sequence Alignment; Sequence Analysis, DNA; Software},
  abstract      = {We introduce Quake, a program to detect and correct errors in DNA sequencing reads. Using a maximum likelihood approach incorporating quality values and nucleotide specific miscall rates, Quake achieves the highest accuracy on realistically simulated reads. We further demonstrate substantial improvements in de novo assembly and SNP detection after using Quake. Quake can be used for any size project, including more than one billion human reads, and is freely available as open source software from http://www.cbcb.umd.edu/software/quake.},
  date-added    = {2013-06-27 05:00:53 +0000},
  date-modified = {2013-06-27 05:00:53 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1186/gb-2010-11-11-r116},
}

%% Medvedev et al. 2011: Hammer, error correction for non-uniform coverage (Bioinformatics).
%% Month uses the standard macro; the abbreviated page range i137-41 is written out in full.
@article{Medvedev2011,
	Abstract = {MOTIVATION: The continuing improvements to high-throughput sequencing (HTS) platforms have begun to unfold a myriad of new applications. As a result, error correction of sequencing reads remains an important problem. Though several tools do an excellent job of correcting datasets where the reads are sampled close to uniformly, the problem of correcting reads coming from drastically non-uniform datasets, such as those from single-cell sequencing, remains open.
RESULTS: In this article, we develop the method Hammer for error correction without any uniformity assumptions. Hammer is based on a combination of a Hamming graph and a simple probabilistic model for sequencing errors. It is a simple and adaptable algorithm that improves on other tools on non-uniform single-cell data, while achieving comparable results on normal multi-cell data.
AVAILABILITY: http://www.cs.toronto.edu/~pashadag.
CONTACT: pmedvedev@cs.ucsd.edu.},
	Author = {Medvedev, Paul and Scott, Eric and Kakaradov, Boyko and Pevzner, Pavel},
	Date-Added = {2013-06-27 04:55:52 +0000},
	Date-Modified = {2013-06-27 04:55:52 +0000},
	Doi = {10.1093/bioinformatics/btr208},
	Journal = {Bioinformatics},
	Journal-Full = {Bioinformatics (Oxford, England)},
	Mesh = {Algorithms; Escherichia coli; High-Throughput Nucleotide Sequencing; Models, Statistical; Single-Cell Analysis},
	Month = jul,
	Number = {13},
	Pages = {i137--i141},
	Pmc = {PMC3117386},
	Pmid = {21685062},
	Pst = {ppublish},
	Title = {Error correction of high-throughput sequencing datasets with non-uniform coverage},
	Volume = {27},
	Year = {2011},
	Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btr208}}

%% Roy, Bhattacharya and Schliep 2013: Turtle, frequent k-mer identification (arXiv preprint 1305.1861).
%% Typed as misc because there is no journal; the arXiv identifier lives in Eprint/Archiveprefix.
@misc{Roy2013,
	Abstract = {Counting the frequencies of k-mers in read libraries is often a first step in the analysis of high-throughput sequencing experiments. Infrequent k-mers are assumed to be a result of sequencing errors. The frequent k-mers constitute a reduced but error-free representation of the experiment, which can inform read error correction or serve as the input to de novo assembly methods. Ideally, the memory requirement for counting should be linear in the number of frequent k-mers and not in the, typically much larger, total number of k-mers in the read library.},
	Archiveprefix = {arXiv},
	Author = {Rajat Shuvro Roy and Debashish Bhattacharya and Alexander Schliep},
	Date-Added = {2013-06-27 04:49:42 +0000},
	Date-Modified = {2013-06-27 04:49:42 +0000},
	Eprint = {1305.1861},
	Month = may,
	Title = {Turtle: Identifying frequent k-mers with cache-efficient algorithms},
	Url = {http://arxiv.org/abs/1305.1861},
	Year = {2013},
	Bdsk-Url-1 = {http://arxiv.org/abs/1305.1861}}

%% Chikhi and Medvedev 2013: automated k-mer size selection for assembly (arXiv preprint 1304.5665).
%% Typed as misc because there is no journal; the arXiv identifier lives in Eprint/Archiveprefix.
@misc{Chikhi2013,
	Abstract = {Genome assembly tools based on the de Bruijn graph framework rely on a parameter k, which represents a trade-off between several competing effects that are difficult to quantify. There is currently a lack of tools that would automatically estimate the best k to use and/or quickly generate histograms of k-mer abundances that would allow the user to make an informed decision.},
	Archiveprefix = {arXiv},
	Author = {Rayan Chikhi and Paul Medvedev},
	Date-Added = {2013-06-27 04:48:37 +0000},
	Date-Modified = {2013-06-27 04:48:37 +0000},
	Eprint = {1304.5665},
	Month = apr,
	Title = {Informed and Automated k-Mer Size Selection for Genome Assembly},
	Url = {http://arxiv.org/abs/1304.5665},
	Year = {2013},
	Bdsk-Url-1 = {http://arxiv.org/abs/1304.5665}}

%% Brown et al. 2012: digital normalization of shotgun sequencing data (arXiv preprint 1203.4802).
%% Typed as misc because there is no journal; the arXiv identifier lives in Eprint/Archiveprefix.
@misc{Brown2012,
	Abstract = {Deep shotgun sequencing and analysis of genomes, transcriptomes, amplified single-cell genomes, and metagenomes has enabled investigation of a wide range of organisms and ecosystems. However, sampling variation in short-read data sets and high sequencing error rates of modern sequencers present many new computational challenges in data interpretation. These challenges have led to the development of new classes of mapping tools and {\em de novo} assemblers. These algorithms are challenged by the continued improvement in sequencing throughput. We here describe digital normalization, a single-pass computational algorithm that systematizes coverage in shotgun sequencing data sets, thereby decreasing sampling variation, discarding redundant data, and removing the majority of errors. Digital normalization substantially reduces the size of shotgun data sets and decreases the memory and time requirements for {\em de novo} sequence assembly, all without significantly impacting content of the generated contigs. We apply digital normalization to the assembly of microbial genomic data, amplified single-cell genomic data, and transcriptomic data. Our implementation is freely available for use and modification.},
	Archiveprefix = {arXiv},
	Author = {C. Titus Brown and Adina Howe and Qingpeng Zhang and Alexis B. Pyrkosz and Timothy H. Brom},
	Date-Added = {2013-06-26 02:58:34 +0000},
	Date-Modified = {2013-06-26 02:58:34 +0000},
	Eprint = {1203.4802},
	Month = mar,
	Title = {A Reference-Free Algorithm for Computational Normalization of Shotgun Sequencing Data},
	Url = {http://arxiv.org/abs/1203.4802},
	Year = {2012},
	Bdsk-Url-1 = {http://arxiv.org/abs/1203.4802}}

%% Howe et al.: Illumina sequencing artifacts in metagenomic datasets (journal given as PLoS ONE).
%% NOTE(review): the original entry had the placeholder Year = {-}. 2013 is taken from the
%% citation key and the Date-Added stamp -- confirm the final year, volume and pages.
@article{adina2013,
	Author = {Adina Chuang Howe and Jason Pell and Rosangela Canino-Koning and Rachel Mackelprang and Susannah Tringe and Janet Jansson and James M. Tiedje and C. Titus Brown},
	Date-Added = {2013-06-10 19:44:11 +0000},
	Date-Modified = {2013-06-10 19:44:57 +0000},
	Journal = {PLoS ONE},
	Title = {{Illumina} Sequencing Artifacts Revealed by Connectivity Analysis of Metagenomic Datasets},
	Year = {2013}}

%% Deorowicz, Debudaj-Grabysz and Grabowski 2013: KMC, disk-based k-mer counting (BMC Bioinformatics).
%% Abstract spacing repaired (words were fused at former line breaks); the acronym in the
%% title is brace-protected so sentence-casing styles keep it upper case.
@article{Deorowicz2013,
	Abstract = {BACKGROUND: The k-mer counting problem, which is to build the histogram of occurrences of every k-symbol long substring in a given text, is important for many bioinformatics applications. They include developing de Bruijn graph genome assemblers, fast multiple sequence alignment and repeat detection. RESULTS: We propose a simple, yet efficient, parallel disk-based algorithm for counting k-mers. Experiments show that it usually offers the fastest solution to the considered problem, while demanding a relatively small amount of memory. In particular, it is capable of counting the statistics for short-read human genome data, in input gzipped FASTQ file, in less than 40 minutes on a PC with 16GB of RAM and 6 CPU cores, and for long-read human genome data in less than 70 minutes. On a more powerful machine, using 32GB of RAM and 32 CPU cores, the tasks are accomplished in less than half the time. No other algorithm for most tested settings of this problem and mammalian-size data can accomplish this task in comparable time. Our solution also belongs to memory-frugal ones; most competitive algorithms cannot efficiently work on a PC with 16GB of memory for such massive data. CONCLUSIONS: By making use of cheap disk space and exploiting CPU and I/O parallelism we propose a very competitive k-mer counting procedure, called KMC. Our results suggest that judicious resource management may allow to solve at least some bioinformatics problems with massive data on a commodity personal computer. KEYWORDS: k-mer counting, de Bruijn graph genome assemblers, Multiple sequence alignment, Repeat detection AVAILABILITY: KMC is freely available at http://sun.aei.polsl.pl/kmc.},
	Author = {Deorowicz, Sebastian and Debudaj-Grabysz, Agnieszka and Grabowski, Szymon},
	Date-Added = {2013-06-10 19:40:04 +0000},
	Date-Modified = {2013-06-10 19:40:04 +0000},
	Doi = {10.1186/1471-2105-14-160},
	Journal = {BMC Bioinformatics},
	Journal-Full = {BMC bioinformatics},
	Month = may,
	Number = {1},
	Pages = {160},
	Pmid = {23679007},
	Pst = {aheadofprint},
	Title = {Disk-based k-mer counting on a {PC}},
	Volume = {14},
	Year = {2013},
	Bdsk-Url-1 = {http://dx.doi.org/10.1186/1471-2105-14-160}}

%% Minoche, Dohm and Himmelbauer 2011: evaluation of Illumina HiSeq / GA sequencing data (Genome Biol).
%% Percent signs in the abstract are escaped so the text is safe if a style typesets it;
%% product names in the title are brace-protected against sentence-casing.
@article{Minoche2011,
	Abstract = {BACKGROUND: The generation and analysis of high-throughput sequencing data are becoming a major component of many studies in molecular biology and medical research. Illumina's Genome Analyzer (GA) and HiSeq instruments are currently the most widely used sequencing devices. Here, we comprehensively evaluate properties of genomic HiSeq and GAIIx data derived from two plant genomes and one virus, with read lengths of 95 to 150 bases.
RESULTS: We provide quantifications and evidence for GC bias, error rates, error sequence context, effects of quality filtering, and the reliability of quality values. By combining different filtering criteria we reduced error rates 7-fold at the expense of discarding 12.5\% of alignable bases. While overall error rates are low in HiSeq data we observed regions of accumulated wrong base calls. Only 3\% of all error positions accounted for 24.7\% of all substitution errors. Analyzing the forward and reverse strands separately revealed error rates of up to 18.7\%. Insertions and deletions occurred at very low rates on average but increased to up to 2\% in homopolymers. A positive correlation between read coverage and GC content was found depending on the GC content range.
CONCLUSIONS: The errors and biases we report have implications for the use and the interpretation of Illumina sequencing data. GAIIx and HiSeq data sets show slightly different error profiles. Quality filtering is essential to minimize downstream analysis artifacts. Supporting previous recommendations, the strand-specificity provides a criterion to distinguish sequencing errors from low abundance polymorphisms.},
	Author = {Minoche, Andr{\'e} E and Dohm, Juliane C and Himmelbauer, Heinz},
	Date-Added = {2013-05-02 20:51:35 +0000},
	Date-Modified = {2013-05-02 20:51:35 +0000},
	Doi = {10.1186/gb-2011-12-11-r112},
	Journal = {Genome Biol},
	Journal-Full = {Genome biology},
	Mesh = {Arabidopsis; Artifacts; Automation, Laboratory; Bacteriophage phi X 174; Base Composition; Base Sequence; Beta vulgaris; Genomics; High-Throughput Nucleotide Sequencing; Molecular Sequence Data; Mutagenesis, Insertional; Polymorphism, Genetic; Reproducibility of Results; Sensitivity and Specificity; Sequence Analysis, DNA; Sequence Deletion},
	Number = {11},
	Pages = {R112},
	Pmc = {PMC3334598},
	Pmid = {22067484},
	Pst = {epublish},
	Title = {Evaluation of genomic high-throughput sequencing data generated on {Illumina} {HiSeq} and genome analyzer systems},
	Volume = {12},
	Year = {2011},
	Bdsk-Url-1 = {http://dx.doi.org/10.1186/gb-2011-12-11-r112}}

%% Rizk, Lavenier and Chikhi 2013: DSK, low-memory k-mer counting (Bioinformatics).
%% The tool name is brace-protected (sentence-casing would otherwise give "Dsk"); month uses
%% the standard macro and the page range is written out in full.
@article{Rizk2013,
	Abstract = {SUMMARY: Counting all the k-mers (substrings of length k) in DNA/RNA sequencing reads is the preliminary step of many bioinformatics applications. However, state of the art k-mer counting methods require that a large data structure resides in memory. Such structure typically grows with the number of distinct k-mers to count. We present a new streaming algorithm for k-mer counting, called DSK (disk streaming of k-mers), which only requires a fixed user-defined amount of memory and disk space. This approach realizes a memory, time and disk trade-off. The multi-set of all k-mers present in the reads is partitioned, and partitions are saved to disk. Then, each partition is separately loaded in memory in a temporary hash table. The k-mer counts are returned by traversing each hash table. Low-abundance k-mers are optionally filtered. DSK is the first approach that is able to count all the 27-mers of a human genome dataset using only 4.0 GB of memory and moderate disk space (160 GB), in 17.9 h. DSK can replace a popular k-mer counting software (Jellyfish) on small-memory servers. AVAILABILITY: http://minia.genouest.org/dsk},
	Author = {Rizk, Guillaume and Lavenier, Dominique and Chikhi, Rayan},
	Date-Added = {2013-04-23 20:58:04 +0000},
	Date-Modified = {2013-04-23 20:58:04 +0000},
	Doi = {10.1093/bioinformatics/btt020},
	Journal = {Bioinformatics},
	Journal-Full = {Bioinformatics (Oxford, England)},
	Month = mar,
	Number = {5},
	Pages = {652--653},
	Pmid = {23325618},
	Pst = {ppublish},
	Title = {{DSK}: k-mer counting with very low memory usage},
	Volume = {29},
	Year = {2013},
	Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btt020}}

%% Pell et al. 2012: probabilistic de Bruijn graphs for metagenome assembly (PNAS).
%% The proper noun in the title is brace-protected; month uses the standard macro and the
%% page range is written out in full.
@article{Pell2012,
	Abstract = {Deep sequencing has enabled the investigation of a wide range of environmental microbial ecosystems, but the high memory requirements for de novo assembly of short-read shotgun sequencing data from these complex populations are an increasingly large practical barrier. Here we introduce a memory-efficient graph representation with which we can analyze the k-mer connectivity of metagenomic samples. The graph representation is based on a probabilistic data structure, a Bloom filter, that allows us to efficiently store assembly graphs in as little as 4 bits per k-mer, albeit inexactly. We show that this data structure accurately represents DNA assembly graphs in low memory. We apply this data structure to the problem of partitioning assembly graphs into components as a prelude to assembly, and show that this reduces the overall memory requirements for de novo assembly of metagenomes. On one soil metagenome assembly, this approach achieves a nearly 40-fold decrease in the maximum memory requirements for assembly. This probabilistic graph representation is a significant theoretical advance in storing assembly graphs and also yields immediate leverage on metagenomic assembly.},
	Author = {Pell, Jason and Hintze, Arend and Canino-Koning, Rosangela and Howe, Adina and Tiedje, James M and Brown, C Titus},
	Date-Added = {2012-11-08 20:57:37 +0000},
	Date-Modified = {2012-11-08 20:57:37 +0000},
	Doi = {10.1073/pnas.1121464109},
	Journal = {Proc Natl Acad Sci U S A},
	Journal-Full = {Proceedings of the National Academy of Sciences of the United States of America},
	Mesh = {Base Pairing; Chromosomes, Bacterial; Computational Biology; DNA, Circular; Escherichia coli; Genome, Bacterial; Information Theory; Metagenome; Nonlinear Dynamics; Sequence Analysis, DNA; Soil Microbiology},
	Month = aug,
	Number = {33},
	Pages = {13272--13277},
	Pmc = {PMC3421212},
	Pmid = {22847406},
	Pst = {ppublish},
	Title = {Scaling metagenome sequence assembly with probabilistic de {Bruijn} graphs},
	Volume = {109},
	Year = {2012},
	Bdsk-Url-1 = {http://dx.doi.org/10.1073/pnas.1121464109}}

%% Cormode and Muthukrishnan 2005: summarizing and mining skewed data streams (SDM 2005).
%% NOTE(review): Booktitle was the bare acronym "SDM"; expanded here to the full proceedings
%% name -- confirm against the DBLP record and add pages/publisher if available.
@inproceedings{CormodeM05,
	Author = {Graham Cormode and S. Muthukrishnan},
	Bibsource = {DBLP, http://dblp.uni-trier.de},
	Booktitle = {Proceedings of the 2005 {SIAM} International Conference on Data Mining ({SDM})},
	Date-Added = {2012-09-20 16:02:34 +0000},
	Date-Modified = {2013-06-26 15:30:46 +0000},
	Title = {Summarizing and Mining Skewed Data Streams},
	Year = {2005}}

%% Broder and Mitzenmacher: survey of network applications of Bloom filters (Internet Mathematics).
%% The duplicated "Survey:" prefix is dropped from the title; the DOI is copied out of the
%% existing BibDesk URL into its own field; the page range uses an en-dash.
@article{BroderM03,
	Author = {Andrei Z. Broder and Michael Mitzenmacher},
	Bibsource = {DBLP, http://dblp.uni-trier.de},
	Date-Added = {2012-09-19 16:24:22 +0000},
	Date-Modified = {2013-06-26 15:09:13 +0000},
	Doi = {10.1080/15427951.2004.10129096},
	Journal = {Internet Mathematics},
	Number = {4},
	Pages = {485--509},
	Title = {Network Applications of {Bloom} Filters: A Survey},
	Volume = {1},
	Year = {2003},
	Bdsk-Url-1 = {http://dx.doi.org/10.1080/15427951.2004.10129096}}

%% Bloom 1970: the original Bloom filter paper (Commun. ACM).
%% The DOI is copied out of the existing BibDesk URL into its own field; the page range
%% uses an en-dash.
@article{Bloom70,
	Author = {Burton H. Bloom},
	Bibsource = {DBLP, http://dblp.uni-trier.de},
	Date-Added = {2012-09-19 16:19:25 +0000},
	Date-Modified = {2013-06-10 19:45:55 +0000},
	Doi = {10.1145/362686.362692},
	Journal = {Commun. ACM},
	Number = {7},
	Pages = {422--426},
	Title = {Space/Time Trade-offs in Hash Coding with Allowable Errors},
	Volume = {13},
	Year = {1970},
	Bdsk-Url-1 = {http://doi.acm.org/10.1145/362686.362692}}

%% Li et al. 2010: de novo assembly of human genomes from short reads (Genome Res).
%% Month uses the standard macro; the abbreviated page range is written out with an en-dash.
@article{Li2010,
	Abstract = {Next-generation massively parallel DNA sequencing technologies provide ultrahigh throughput at a substantially lower unit data cost; however, the data are very short read length sequences, making de novo assembly extremely challenging. Here, we describe a novel method for de novo assembly of large genomes from short read sequences. We successfully assembled both the Asian and African human genome sequences, achieving an N50 contig size of 7.4 and 5.9 kilobases (kb) and scaffold of 446.3 and 61.9 kb, respectively. The development of this de novo short read assembly method creates new opportunities for building reference sequences and carrying out accurate analyses of unexplored genomes in a cost-effective way.},
	Author = {Li, Ruiqiang and Zhu, Hongmei and Ruan, Jue and Qian, Wubin and Fang, Xiaodong and Shi, Zhongbin and Li, Yingrui and Li, Shengting and Shan, Gao and Kristiansen, Karsten and Li, Songgang and Yang, Huanming and Wang, Jian and Wang, Jun},
	Date-Added = {2012-09-17 20:35:37 +0000},
	Date-Modified = {2012-09-17 20:35:37 +0000},
	Doi = {10.1101/gr.097261.109},
	Journal = {Genome Res},
	Journal-Full = {Genome research},
	Mesh = {African Continental Ancestry Group; Asian Continental Ancestry Group; Genome, Human; Human Genome Project; Humans; Oligonucleotide Array Sequence Analysis; Sequence Alignment; Sequence Analysis, DNA},
	Month = feb,
	Number = {2},
	Pages = {265--272},
	Pmc = {PMC2813482},
	Pmid = {20019144},
	Pst = {ppublish},
	Title = {De novo assembly of human genomes with massively parallel short read sequencing},
	Volume = {20},
	Year = {2010},
	Bdsk-Url-1 = {http://dx.doi.org/10.1101/gr.097261.109}}

%% Simpson et al. 2009: ABySS, a parallel short-read assembler (Genome Res).
%% The tool name is brace-protected (sentence-casing would otherwise give "Abyss"); the
%% percent sign in the abstract is escaped; month macro and full page range are used.
@article{Simpson2009,
	Abstract = {Widespread adoption of massively parallel deoxyribonucleic acid (DNA) sequencing instruments has prompted the recent development of de novo short read assembly algorithms. A common shortcoming of the available tools is their inability to efficiently assemble vast amounts of data generated from large-scale sequencing projects, such as the sequencing of individual human genomes to catalog natural genetic variation. To address this limitation, we developed ABySS (Assembly By Short Sequences), a parallelized sequence assembler. As a demonstration of the capability of our software, we assembled 3.5 billion paired-end reads from the genome of an African male publicly released by Illumina, Inc. Approximately 2.76 million contigs > or =100 base pairs (bp) in length were created with an N50 size of 1499 bp, representing 68\% of the reference human genome. Analysis of these contigs identified polymorphic and novel sequences not present in the human reference assembly, which were validated by alignment to alternate human assemblies and to other primate genomes.},
	Author = {Simpson, Jared T and Wong, Kim and Jackman, Shaun D and Schein, Jacqueline E and Jones, Steven J M and Birol, Inan{\c c}},
	Date-Added = {2012-09-17 20:34:58 +0000},
	Date-Modified = {2012-09-17 20:34:58 +0000},
	Doi = {10.1101/gr.089532.108},
	Journal = {Genome Res},
	Journal-Full = {Genome research},
	Mesh = {Algorithms; Animals; Computational Biology; Contig Mapping; Escherichia coli K12; Genetic Variation; Genome, Human; Humans; Polymorphism, Genetic; Reproducibility of Results; Sequence Analysis, DNA; Software},
	Month = jun,
	Number = {6},
	Pages = {1117--1123},
	Pmc = {PMC2694472},
	Pmid = {19251739},
	Pst = {ppublish},
	Title = {{ABySS}: a parallel assembler for short read sequence data},
	Volume = {19},
	Year = {2009},
	Bdsk-Url-1 = {http://dx.doi.org/10.1101/gr.089532.108}}

%% Butler et al. 2008: ALLPATHS, de novo assembly of shotgun microreads (Genome Res).
%% The tool name is brace-protected (sentence-casing would otherwise give "Allpaths");
%% month macro and full page range are used.
@article{Butler2008,
	Abstract = {New DNA sequencing technologies deliver data at dramatically lower costs but demand new analytical methods to take full advantage of the very short reads that they produce. We provide an initial, theoretical solution to the challenge of de novo assembly from whole-genome shotgun "microreads." For 11 genomes of sizes up to 39 Mb, we generated high-quality assemblies from 80x coverage by paired 30-base simulated reads modeled after real Illumina-Solexa reads. The bacterial genomes of Campylobacter jejuni and Escherichia coli assemble optimally, yielding single perfect contigs, and larger genomes yield assemblies that are highly connected and accurate. Assemblies are presented in a graph form that retains intrinsic ambiguities such as those arising from polymorphism, thereby providing information that has been absent from previous genome assemblies. For both C. jejuni and E. coli, this assembly graph is a single edge encompassing the entire genome. Larger genomes produce more complicated graphs, but the vast majority of the bases in their assemblies are present in long edges that are nearly always perfect. We describe a general method for genome assembly that can be applied to all types of DNA sequence data, not only short read data, but also conventional sequence reads.},
	Author = {Butler, Jonathan and MacCallum, Iain and Kleber, Michael and Shlyakhter, Ilya A and Belmonte, Matthew K and Lander, Eric S and Nusbaum, Chad and Jaffe, David B},
	Date-Added = {2012-09-17 20:34:22 +0000},
	Date-Modified = {2012-09-17 20:34:22 +0000},
	Doi = {10.1101/gr.7337908},
	Journal = {Genome Res},
	Journal-Full = {Genome research},
	Mesh = {Algorithms; Campylobacter jejuni; Computational Biology; Computer Simulation; Escherichia coli; Genome, Bacterial; Reproducibility of Results; Sequence Analysis, DNA},
	Month = may,
	Number = {5},
	Pages = {810--820},
	Pmc = {PMC2336810},
	Pmid = {18340039},
	Pst = {ppublish},
	Title = {{ALLPATHS}: de novo assembly of whole-genome shotgun microreads},
	Volume = {18},
	Year = {2008},
	Bdsk-Url-1 = {http://dx.doi.org/10.1101/gr.7337908}}

%% Zerbino and Birney 2008: Velvet, de Bruijn graph short-read assembler (Genome Res).
%% The proper noun in the title is brace-protected; month macro and full page range are used.
@article{Zerbino2008,
	Abstract = {We have developed a new set of algorithms, collectively called "Velvet," to manipulate de Bruijn graphs for genomic sequence assembly. A de Bruijn graph is a compact representation based on short words (k-mers) that is ideal for high coverage, very short read (25-50 bp) data sets. Applying Velvet to very short reads and paired-ends information only, one can produce contigs of significant length, up to 50-kb N50 length in simulations of prokaryotic data and 3-kb N50 on simulated mammalian BACs. When applied to real Solexa data sets without read pairs, Velvet generated contigs of approximately 8 kb in a prokaryote and 2 kb in a mammalian BAC, in close agreement with our simulated results without read-pair information. Velvet represents a new approach to assembly that can leverage very short reads in combination with read pairs to produce useful assemblies.},
	Author = {Zerbino, Daniel R and Birney, Ewan},
	Date-Added = {2012-09-17 20:30:34 +0000},
	Date-Modified = {2012-09-17 20:30:34 +0000},
	Doi = {10.1101/gr.074492.107},
	Journal = {Genome Res},
	Journal-Full = {Genome research},
	Mesh = {Algorithms; Animals; Chromosomes, Artificial, Bacterial; Computational Biology; Computer Simulation; Genome, Bacterial; Genome, Human; Genomics; Humans; Mammals; Sequence Analysis, DNA; Streptococcus},
	Month = may,
	Number = {5},
	Pages = {821--829},
	Pmc = {PMC2336801},
	Pmid = {18349386},
	Pst = {ppublish},
	Title = {Velvet: algorithms for de novo short read assembly using de {Bruijn} graphs},
	Volume = {18},
	Year = {2008},
	Bdsk-Url-1 = {http://dx.doi.org/10.1101/gr.074492.107}}

%% Pevzner, Tang and Waterman 2001: Eulerian path approach to fragment assembly (PNAS).
%% The proper adjective and the acronym in the title are brace-protected; month macro and
%% full page range are used.
@article{Pevzner2001,
	Abstract = {For the last 20 years, fragment assembly in DNA sequencing followed the "overlap-layout-consensus" paradigm that is used in all currently available assembly tools. Although this approach proved useful in assembling clones, it faces difficulties in genomic shotgun assembly. We abandon the classical "overlap-layout-consensus" approach in favor of a new euler algorithm that, for the first time, resolves the 20-year-old "repeat problem" in fragment assembly. Our main result is the reduction of the fragment assembly to a variation of the classical Eulerian path problem that allows one to generate accurate solutions of large-scale sequencing problems. euler, in contrast to the celera assembler, does not mask such repeats but uses them instead as a powerful fragment assembly tool.},
	Author = {Pevzner, P A and Tang, H and Waterman, M S},
	Date-Added = {2012-09-17 20:24:10 +0000},
	Date-Modified = {2012-09-17 20:25:42 +0000},
	Doi = {10.1073/pnas.171285098},
	Journal = {Proc Natl Acad Sci U S A},
	Journal-Full = {Proceedings of the National Academy of Sciences of the United States of America},
	Mesh = {Algorithms; Campylobacter jejuni; Contig Mapping; DNA, Bacterial; Genome, Bacterial; Lactococcus lactis; Models, Theoretical; Neisseria meningitidis; Sequence Alignment; Sequence Analysis, DNA; Software},
	Month = aug,
	Number = {17},
	Pages = {9748--9753},
	Pmc = {PMC55524},
	Pmid = {11504945},
	Pst = {ppublish},
	Title = {An {Eulerian} path approach to {DNA} fragment assembly},
	Volume = {98},
	Year = {2001},
	Bdsk-Url-1 = {http://dx.doi.org/10.1073/pnas.171285098}}

%% Marcais and Kingsford 2011: lock-free parallel k-mer counting (Bioinformatics).
%% The whole-title double braces and trailing period are removed so styles control casing
%% and punctuation; the cedilla is written as a brace-wrapped BibTeX special character.
@article{Marcais2011,
	Abstract = {Counting the number of occurrences of every k-mer (substring of length k) in a long string is a central subproblem in many applications, including genome assembly, error correction of sequencing reads, fast multiple sequence alignment and repeat detection. Recently, the deep sequence coverage generated by next-generation sequencing technologies has caused the amount of sequence to be processed during a genome project to grow rapidly, and has rendered current k-mer counting tools too slow and memory intensive. At the same time, large multicore computers have become commonplace in research facilities allowing for a new parallel computational paradigm.},
	Author = {Mar{\c{c}}ais, Guillaume and Kingsford, Carl},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Institution = {Department of Computer Science, University of Maryland, College Park, MD 20742, USA. gmarcais@umd.edu},
	Journal = {Bioinformatics},
	Number = {6},
	Pages = {764--770},
	Title = {A fast, lock-free approach for efficient parallel counting of occurrences of k-mers},
	Url = {http://www.ncbi.nlm.nih.gov/pubmed/21217122},
	Volume = {27},
	Year = {2011},
	Bdsk-Url-1 = {http://www.ncbi.nlm.nih.gov/pubmed/21217122}}

@article{Qin2010,
	Abstract = {To understand the impact of gut microbes on human health and well-being it is crucial to assess their genetic potential. Here we describe the Illumina-based metagenomic sequencing, assembly and characterization of 3.3 million non-redundant microbial genes, derived from 576.7 gigabases of sequence, from faecal samples of 124 European individuals. The gene set, approximately 150 times larger than the human gene complement, contains an overwhelming majority of the prevalent (more frequent) microbial genes of the cohort and probably includes a large proportion of the prevalent human intestinal microbial genes. The genes are largely shared among individuals of the cohort. Over 99\% of the genes are bacterial, indicating that the entire cohort harbours between 1,000 and 1,150 prevalent bacterial species and each individual at least 160 such species, which are also largely shared. We define and describe the minimal gut metagenome and the minimal gut bacterial genome in terms of functions present in all individuals and most bacteria, respectively.},
	Author = {Qin, Junjie and Li, Ruiqiang and Raes, Jeroen and Arumugam, Manimozhiyan and Burgdorf, Kristoffer Solvsten and Manichanh, Chaysavanh and Nielsen, Trine and Pons, Nicolas and Levenez, Florence and Yamada, Takuji and Mende, Daniel R and Li, Junhua and Xu, Junming and Li, Songgang Shaochuan Shengting and Li, Dongfang and Cao, Jianjun and Wang, Bo and Liang, Huiqing and Zheng, Huisong and Xie, Yinlong and Tap, Julien and Lepage, Patricia and Bertalan, Marcelo and Batto, Jean-Michel and Hansen, Torben and {Le Paslier}, Denis and Linneberg, Allan and Nielsen, H Bj{\o}rn and Pelletier, Eric and Renault, Pierre and Sicheritz-Ponten, Thomas and Turner, Keith and Zhu, Hongmei and Yu, Chang and Jian, Min and Zhou, Yan and Li, Yingrui and Zhang, Xiuqing and Qin, Nan and Yang, Huanming and Wang, Jun Jian and Brunak, S{\o}ren and Dor{\'e}, Joel and Guarner, Francisco and Kristiansen, Karsten and Pedersen, Oluf and Parkhill, Julian and Weissenbach, Jean and Bork, Peer and Ehrlich, S Dusko},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Institution = {BGI-Shenzhen, Shenzhen 518083, China.},
	Journal = {Nature},
	Number = {7285},
	Pages = {59--65},
	Publisher = {Nature Publishing Group},
	Title = {A human gut microbial gene catalogue established by metagenomic sequencing},
	Url = {http://www.ncbi.nlm.nih.gov/pubmed/20203603},
	Volume = {464},
	Year = {2010},
	Bdsk-Url-1 = {http://www.ncbi.nlm.nih.gov/pubmed/20203603}}

@article{Kurtz2008,
	Abstract = {Background: The challenges of accurate gene prediction and enumeration are further aggravated in large genomes that contain highly repetitive transposable elements (TEs). Yet TEs play a substantial role in genome evolution and are themselves an important subject of study. Repeat annotation, based on counting occurrences of k-mers, has been previously used to distinguish TEs from low-copy genic regions; but currently available software solutions are impractical due to high memory requirements or specialization for specific user-tasks. Results: Here we introduce the Tallymer software, a flexible and memory-efficient collection of programs for k-mer counting and indexing of large sequence sets. Unlike previous methods, Tallymer is based on enhanced suffix arrays. This gives a much larger flexibility concerning the choice of the k-mer size. Tallymer can process large data sizes of several billion bases. We used it in a variety of applications to study the genomes of maize and other plant species. In particular, Tallymer was used to index a set of whole genome shotgun sequences from maize (B73) (total size $10^9$ bp.). We analyzed k-mer frequencies for a wide range of k. At this low genome coverage ($\approx$0.45$\times$) highly repetitive 20-mers constituted 44\% of the genome but represented only 1\% of all possible k-mers. Similar low-complexity was seen in the repeat fractions of sorghum and rice. When applying our method to other maize data sets, High-C t derived sequences showed the greatest enrichment for low-copy sequences. Among annotated TEs, the most highly repetitive were of the Ty3/gypsy class of retrotransposons, followed by the Ty1/copia class, and DNA transposons. Among expressed sequence tags (EST), a notable fraction contained high-copy k-mers, suggesting that transposons are still active in maize. Retrotransposons in Mo17 and McC cultivars were readily detected using the B73 20-mer frequency index, indicating their conservation despite extensive rearrangement across cultivars. Among one hundred annotated bacterial artificial chromosomes (BACs), k-mer frequency could be used to detect transposon-encoded genes with 92\% sensitivity, compared to 96\% using alignment-based repeat masking, while both methods showed 92\% specificity. Conclusion: The Tallymer software was effective in a variety of applications to aid genome annotation in maize, despite limitations imposed by the relatively low coverage of sequence available. For more information on the software, see http://www.zbh.uni-hamburg.de/Tallymer.},
	Author = {Kurtz, Stefan and Narechania, Apurva and Stein, Joshua C and Ware, Doreen},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Institution = {Center for Bioinformatics, University of Hamburg, Bundesstrasse 43, 20146 Hamburg, Germany. kurtz@zbh.uni-hamburg.de},
	Journal = {BMC Genomics},
	Number = {1},
	Pages = {517},
	Publisher = {BioMed Central},
	Title = {A new method to compute {K-mer} frequencies and its application to annotate large repetitive plant genomes},
	Url = {http://www.ncbi.nlm.nih.gov/pubmed/18976482},
	Volume = {9},
	Year = {2008},
	Bdsk-Url-1 = {http://www.ncbi.nlm.nih.gov/pubmed/18976482}}

@article{Shi2010,
	Abstract = {Emerging DNA sequencing technologies open up exciting new opportunities for genome sequencing by generating read data with a massive throughput. However, produced reads are significantly shorter and more error-prone compared to the traditional Sanger shotgun sequencing method. This poses challenges for de novo DNA fragment assembly algorithms in terms of both accuracy (to deal with short, error-prone reads) and scalability (to deal with very large input data sets). In this article, we present a scalable parallel algorithm for correcting sequencing errors in high-throughput short-read data so that error-free reads can be available before DNA fragment assembly, which is of high importance to many graph-based short-read assembly tools. The algorithm is based on spectral alignment and uses the Compute Unified Device Architecture (CUDA) programming model. To gain efficiency we are taking advantage of the CUDA texture memory using a space-efficient Bloom filter data structure for spectrum membership queries. We have tested the runtime and accuracy of our algorithm using real and simulated Illumina data for different read lengths, error rates, input sizes, and algorithmic parameters. Using a CUDA-enabled mass-produced GPU (available for less than US\$400 at any local computer outlet), this results in speedups of 12-84 times for the parallelized error correction, and speedups of 3-63 times for both sequential preprocessing and parallelized error correction compared to the publicly available Euler-SR program. Our implementation is freely available for download from http://cuda-ec.sourceforge.net .},
	Address = {New Rochelle, NY},
	Author = {Shi, Haixiang and Schmidt, Bertil and Liu, Weiguo and M{\"u}ller-Wittig, Wolfgang},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Institution = {School of Computer Engineering, Nanyang Technological University, Singapore.},
	Journal = {Journal of Computational Biology},
	Keywords = {algorithms,computational biology,computational biology methods,computer graphics,computers,databases,dna,dna methods,nucleic acid,sequence alignment,sequence analysis},
	Number = {4},
	Pages = {603--615},
	Publisher = {Mary Ann Liebert, Inc.},
	Title = {A parallel algorithm for error correction in high-throughput short-read data on {CUDA}-enabled graphics hardware},
	Url = {http://www.ncbi.nlm.nih.gov/pubmed/20426693},
	Volume = {17},
	Year = {2010},
	Bdsk-Url-1 = {http://www.ncbi.nlm.nih.gov/pubmed/20426693}}

@article{Davenport2010,
	Abstract = {Background: Bacteria show a bias in their genomic oligonucleotide composition far beyond that dictated by G+C content. Patterns of over- and underrepresented oligonucleotides carry a phylogenetic signal and are thus diagnostic for individual species. Patterns of short oligomers have been investigated by multiple groups in large numbers of bacteria genomes. However, global distributions of the most highly overrepresented mid-sized oligomers have not been assessed across all prokaryotes to date. We surveyed overrepresented mid-length oligomers across all prokaryotes and normalised for base composition and embedded oligomers using zero and second order Markov models. Principal Findings: Here we report a presumably ancient set of oligomers conserved and overrepresented in nearly all branches of prokaryotic life, including Archaea. These oligomers are either adenine rich homopurines with one to three guanine nucleosides, or homopyridimines with one to four cytosine nucleosides. They do not show a consistent preference for coding or non-coding regions or aggregate in any coding frame, implying a role in DNA structure and as polypeptide binding sites. Structural parameters indicate these oligonucleotides to be an extreme and rigid form of B-DNA prone to forming triple stranded helices under common physiological conditions. Moreover, the narrow minor grooves of these structures are recognised by DNA binding and nucleoid associated proteins such as HU. Conclusion: Homopurine and homopyrimidine oligomers exhibit distinct and unusual structural features and are present at high copy number in nearly all prokaryotic lineages. This fact suggests a non-neutral role of these oligonucleotides for bacterial genome organization that has been maintained throughout evolution.},
	Author = {Davenport, Colin F and T{\"u}mmler, Burkhard},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Doi = {10.1371/journal.pone.0009841},
	Editor = {Scheffler, Konrad},
	Institution = {Pediatric Pneumology and Neonatology, Hanover Medical School, Hanover, Lower Saxony, Germany. davenport.colin@mh-hannover.de},
	Journal = {PLoS ONE},
	Number = {3},
	Pages = {e9841},
	Publisher = {Public Library of Science},
	Title = {Abundant Oligonucleotides Common to Most Bacteria},
	Url = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=2843746&tool=pmcentrez&rendertype=abstract},
	Volume = {5},
	Year = {2010},
	Bdsk-Url-1 = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=2843746&tool=pmcentrez&rendertype=abstract}}

@article{Cormode2005,
	Author = {Cormode, Graham and Muthukrishnan, S},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Doi = {10.1016/j.jalgor.2003.12.001},
	File = {:Users/qingpeng/Dropbox/Work/Manuscript/Khmer/Reference/R\_0504\_Journal of Algorithms\_An Improved Data Stream Summary\_The Count-Min Sketch and its Applications.pdf:pdf;:Users/qingpeng/Dropbox/Work/Manuscript/Khmer/Reference/0504\_Journal of Algorithms\_An Improved Data Stream Summary\_The Count-Min Sketch and its Applications.pdf:pdf;:Users/qingpeng/Dropbox/Work/Manuscript/Khmer/Reference/An Improved Data Stream Summary- The Count-Min Sketch and its Applications.pdf:pdf},
	Issn = {0196-6774},
	Journal = {Journal of Algorithms},
	Month = apr,
	Number = {1},
	Pages = {58--75},
	Title = {An improved data stream summary: the count-min sketch and its applications},
	Url = {http://linkinghub.elsevier.com/retrieve/pii/S0196677403001913},
	Volume = {55},
	Year = {2005},
	Bdsk-Url-1 = {http://linkinghub.elsevier.com/retrieve/pii/S0196677403001913},
	Bdsk-Url-2 = {http://dx.doi.org/10.1016/j.jalgor.2003.12.001}}

@article{Healy2003,
	Abstract = {We have developed a tool for rapidly determining the number of exact matches of any word within large, internally repetitive genomes or sets of genomes. Thus we can readily annotate any sequence, including the entire human genome, with the counts of its constituent words. We create a Burrows-Wheeler transform of the genome, which together with auxiliary data structures facilitating counting, can reside in about one gigabyte of RAM. Our original interest was motivated by oligonucleotide probe design, and we describe a general protocol for defining unique hybridization probes. But our method also has applications for the analysis of genome structure and assembly. We demonstrate the identification of chromosome-specific repeats, and outline a general procedure for finding undiscovered repeats. We also illustrate the changing contents of the human genome assemblies by comparing the annotations built from different genome freezes.},
	Author = {Healy, John and Thomas, Elizabeth E and Schwartz, Jacob T and Wigler, Michael},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Institution = {Cold Spring Harbor Laboratory, Cold Spring Harbor, New York 11724, USA. healy@cshl.edu},
	Journal = {Genome Research},
	Number = {10},
	Pages = {2306--2315},
	Title = {Annotating large genomes with exact word matches},
	Url = {http://www.ncbi.nlm.nih.gov/pubmed/12975312},
	Volume = {13},
	Year = {2003},
	Bdsk-Url-1 = {http://www.ncbi.nlm.nih.gov/pubmed/12975312}}

@article{Miller2010,
	Abstract = {The emergence of next-generation sequencing platforms led to resurgence of research in whole-genome shotgun assembly algorithms and software. DNA sequencing data from the Roche 454, Illumina/Solexa, and ABI SOLiD platforms typically present shorter read lengths, higher coverage, and different error profiles compared with Sanger sequencing data. Since 2005, several assembly software packages have been created or revised specifically for de novo assembly of next-generation sequencing data. This review summarizes and compares the published descriptions of packages named SSAKE, SHARCGS, VCAKE, Newbler, Celera Assembler, Euler, Velvet, ABySS, AllPaths, and SOAPdenovo. More generally, it compares the two standard methods known as the de Bruijn graph approach and the overlap/layout/consensus approach to assembly.},
	Author = {Miller, Jason R and Koren, Sergey and Sutton, Granger},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Doi = {10.1016/j.ygeno.2010.03.001},
	Journal = {Genomics},
	Journal-Full = {Genomics},
	Mesh = {Algorithms; Forecasting; Sequence Analysis, DNA; Software},
	Month = jun,
	Number = {6},
	Pages = {315--327},
	Pmc = {PMC2874646},
	Pmid = {20211242},
	Pst = {ppublish},
	Title = {Assembly algorithms for next-generation sequencing data},
	Volume = {95},
	Year = {2010},
	Bdsk-Url-1 = {http://dx.doi.org/10.1016/j.ygeno.2010.03.001}}

@inproceedings{Bar-yossef,
	Author = {Bar-Yossef, Ziv and Jayram, T. S. and Kumar, Ravi and Sivakumar, D. and Trevisan, Luca},
	Booktitle = {Randomization and Approximation Techniques in Computer Science ({RANDOM} 2002)},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	File = {:Users/qingpeng/Dropbox/Work/Manuscript/Khmer/Reference/Counting distinct elements in a data stream.pdf:pdf},
	Pages = {1--10},
	Publisher = {Springer},
	Series = {Lecture Notes in Computer Science},
	Title = {Counting distinct elements in a data stream},
	Volume = {2483},
	Year = {2002}}

@article{Hampson2002,
	Abstract = {MOTIVATION: Over-represented k-mers in genomic DNA regions are often of particular biological interest. For example, over-represented k-mers in co-regulated families of genes are associated with the DNA binding sites of transcription factors. To measure over-representation, we introduce a statistical background model based on single-mismatches, and apply it to the pooled 500 bp ORF Upstream Regions (USRs) of yeast. More importantly, we investigate the context and spatial distribution of over-represented k-mers in yeast USRs. RESULTS: Single and double-stranded spatial distributions of most over-represented k-mers are highly non-random, and predominantly cluster into a small number of classes that are robust with respect to over-representation measures. Specifically, we show that the three most common distribution patterns can be related to DNA structure, function, and evolution and correspond to: (a) homologous ORF clusters associated with sharply localized distributions; (b) regulatory elements associated with a symmetric broad hill-shaped distribution in the 50-200 bp USR; and (c) runs of As, Ts, and ATs associated with a broad hill-shaped distribution also in the 50-200 bp USR, with extreme structural properties. Analysis of over-representation, homology, localization, and DNA structure are essential components of a general data-mining approach to finding biologically important k-mers in raw genomic DNA and understanding the 'lexicon' of regulatory regions.},
	Author = {Hampson, Steven and Kibler, Dennis and Baldi, Pierre},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Institution = {Department of Information and Computer Science, Institute for Genomics and Bioinformatics, University of California, Irvine, Irvine, CA 92697-3425, USA. hampson@ics.uci.edu},
	Journal = {Bioinformatics},
	Number = {4},
	Pages = {513--528},
	Title = {Distribution patterns of over-represented k-mers in non-coding yeast {DNA}},
	Url = {http://www.ncbi.nlm.nih.gov/pubmed/12016049},
	Volume = {18},
	Year = {2002},
	Bdsk-Url-1 = {http://www.ncbi.nlm.nih.gov/pubmed/12016049}}

@article{Sindi2008,
	Abstract = {We study quantitative features of complex repetitive DNA in several genomes by studying sequences that are sufficiently long that they are unlikely to have repeated by chance. For each genome we study, we determine the number of identical copies, the "duplication count," of each sequence of length 40, that is of each "40-mer." We say a 40-mer is "repeated" if its duplication count is at least 2. We focus mainly on "complex" 40-mers, those without short internal repetitions. We find that we can classify most of the complex repeated 40-mers into two categories: one category has its copies clustered closely together on one chromosome, the other has its copies distributed widely across multiple chromosomes. For each genome and each of the categories above, we compute N(c), the number of 40-mers that have duplication count c, for each integer c. In each case, we observe a power-law-like decay in N(c) as c increases from 3 to 50 or higher. In particular, we find that N(c) decays much more slowly than would be predicted by evolutionary models where each 40-mer is equally likely to be duplicated. We also analyze an evolutionary model that does reflect the slow decay of N(c).},
	Author = {Sindi, Suzanne S and Hunt, Brian R and Yorke, James A},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Institution = {Institute for Physical Sciences and Technology, University of Maryland, College Park, Maryland 20742, USA. suzanne\_sindi@brown.edu},
	Journal = {Physical Review E},
	Number = {6},
	Pages = {061912},
	Title = {Duplication count distributions in {DNA} sequences},
	Volume = {78},
	Year = {2008}}

@article{Melsted2011,
	Abstract = {Counting k-mers (substrings of length k in DNA sequence data) is an essential component of many methods in bioinformatics, including for genome and transcriptome assembly, for metagenomic sequencing, and for error correction of sequence reads. Although simple in principle, counting k-mers in large modern sequence data sets can easily overwhelm the memory capacity of standard computers. In current data sets, a large fraction-often more than 50\%-of the storage capacity may be spent on storing k-mers that contain sequencing errors and which are typically observed only a single time in the data. These singleton k-mers are uninformative for many algorithms without some kind of error correction.},
	Author = {Melsted, P{\'a}ll and Pritchard, Jonathan K},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Doi = {10.1186/1471-2105-12-333},
	File = {:Users/qingpeng/Dropbox/Work/Manuscript/Khmer/Reference/1108\_BMCBiofor\_Efficient counting of k-mers in DNA sequences using a bloom filter.pdf:pdf},
	Issn = {1471-2105},
	Journal = {BMC Bioinformatics},
	Month = aug,
	Pages = {333},
	Pmid = {21831268},
	Title = {Efficient counting of k-mers in {DNA} sequences using a {Bloom} filter},
	Url = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3166945&tool=pmcentrez&rendertype=abstract},
	Volume = {12},
	Year = {2011},
	Bdsk-Url-1 = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3166945&tool=pmcentrez&rendertype=abstract},
	Bdsk-Url-2 = {http://dx.doi.org/10.1186/1471-2105-12-333}}

@article{Hooper2010,
	Abstract = {Motivation: Shotgun sequencing generates large numbers of short DNA reads from either an isolated organism or, in the case of metagenomics projects, from the aggregate genome of a microbial community. These reads are then assembled based on overlapping sequences into larger, contiguous sequences (contigs). The feasibility of assembly and the coverage achieved (reads per nucleotide or distinct sequence of nucleotides) depend on several factors: the number of reads sequenced, the read length and the relative abundances of their source genomes in the microbial community. A low coverage suggests that most of the genomic DNA in the sample has not been sequenced, but it is often difficult to estimate either the extent of the uncaptured diversity or the amount of additional sequencing that would be most efficacious. In this work, we regard a metagenome as a population of DNA fragments (bins), each of which may be covered by one or more reads. We employ a gamma distribution to model this bin population due to its flexibility and ease of use. When a gamma approximation can be found that adequately fits the data, we may estimate the number of bins that were not sequenced and that could potentially be revealed by additional sequencing. We evaluated the performance of this model using simulated metagenomes and demonstrate its applicability on three recent metagenomic datasets. Contact: sean.d.hooper@genpat.uu.se Supplementary information: Supplementary data are available at Bioinformatics online.},
	Author = {Hooper, Sean D and Dalevi, Daniel and Pati, Amrita and Mavromatis, Konstantinos and Ivanova, Natalia N and Kyrpides, Nikos C},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Institution = {Department of Energy Joint Genome Institute (DOE-JGI), Genome Biology Program, 2800 Mitchell Drive, Walnut Creek, CA 94598, USA. sean.d.hooper@genpat.uu.se},
	Journal = {Bioinformatics},
	Number = {3},
	Pages = {295--301},
	Publisher = {Oxford University Press},
	Title = {Estimating {DNA} coverage and abundance in metagenomes using a gamma approximation},
	Url = {http://www.ncbi.nlm.nih.gov/pubmed/20008478},
	Volume = {26},
	Year = {2010},
	Bdsk-Url-1 = {http://www.ncbi.nlm.nih.gov/pubmed/20008478}}

@article{Charikar2004,
	Author = {Charikar, Moses and Chen, Kevin and Farach-Colton, Martin},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Doi = {10.1016/S0304-3975(03)00400-6},
	File = {:Users/qingpeng/Dropbox/Work/Manuscript/Khmer/Reference/Finding Frequent Items in Data Streams.pdf:pdf},
	Issn = {0304-3975},
	Journal = {Theoretical Computer Science},
	Month = jan,
	Number = {1},
	Pages = {3--15},
	Title = {Finding frequent items in data streams},
	Url = {http://linkinghub.elsevier.com/retrieve/pii/S0304397503004006},
	Volume = {312},
	Year = {2004},
	Bdsk-Url-1 = {http://linkinghub.elsevier.com/retrieve/pii/S0304397503004006},
	Bdsk-Url-2 = {http://dx.doi.org/10.1016/S0304-3975(03)00400-6}}

@article{Trifonov2010,
	Abstract = {Environmental metagenomic samples and samples obtained as an attempt to identify a pathogen associated with the emergence of a novel infectious disease are important sources of novel microorganisms. The low costs and high throughput of sequencing technologies are expected to allow for the genetic material in those samples to be sequenced and the genomes of the novel microorganisms to be identified by alignment to those in a database of known genomes. Yet for various biological and technical reasons, such alignment might not always be possible. We investigate a frequency analysis technique which on one hand allows for the identification of genetic material without relying on alignment and on the other hand makes possible the discovery of nonoverlapping contigs from the same organism. The technique is based on obtaining signatures of the genetic data and defining a distance/similarity measure between signatures. More precisely, the signatures of the genetic data are the frequencies of k-mers occurring in them, with k being a natural number. We considered an entropy-based distance between signatures, similar to the Kullback-Leibler distance in information theory, and investigated its ability to categorize negative-sense single-stranded RNA (ssRNA) viral genetic data. Our conclusion is that in this viral context, the technique provides a viable way of discovering genetic relationships without relying on alignment. We envision that our approach will be applicable to other microbial genetic context, e.g., other types of viruses, and will be an important tool in the discovery of novel microorganisms.},
	Author = {Trifonov, Vladimir and Rabadan, Raul},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Doi = {10.1128/mBio.00156-10},
	Issn = {2150-7511},
	Journal = {mBio},
	Number = {3},
	Pages = {e00156-10},
	Publisher = {American Society for Microbiology},
	Title = {Frequency Analysis Techniques for Identification of Viral Genetic Data},
	Url = {http://mbio.asm.org/content/1/3/e00156-10.full},
	Volume = {1},
	Year = {2010},
	Bdsk-Url-1 = {http://mbio.asm.org/content/1/3/e00156-10.full}}

@article{Chor2009,
	Abstract = {Tetrapods, unlike other organisms, have multimodal spectra of k-mers in their genomes},
	Author = {Chor, Benny and Horn, David and Goldman, Nick and Levy, Yaron and Massingham, Tim},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Institution = {School of Computer Science, Tel Aviv University, Klausner St, Ramat-Aviv, Tel-Aviv 39040, Israel. benny@cs.tau.ac.il},
	Journal = {Genome Biology},
	Number = {10},
	Pages = {R108},
	Publisher = {BioMed Central},
	Title = {Genomic {DNA} k-mer spectra: models and modalities},
	Url = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=2784323&tool=pmcentrez&rendertype=abstract},
	Volume = {10},
	Year = {2009},
	Bdsk-Url-1 = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=2784323&tool=pmcentrez&rendertype=abstract}}

@inproceedings{Kirsch2006,
	Author = {Kirsch, Adam and Mitzenmacher, Michael},
	Booktitle = {Algorithms -- {ESA} 2006},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	File = {:Users/qingpeng/Dropbox/Work/Manuscript/Khmer/Reference/esa06.pdf:pdf},
	Pages = {456--467},
	Publisher = {Springer},
	Series = {Lecture Notes in Computer Science},
	Title = {Less Hashing, Same Performance: Building a Better {Bloom} Filter},
	Volume = {4168},
	Year = {2006}}

@article{Hess2011,
	Abstract = {The paucity of enzymes that efficiently deconstruct plant polysaccharides represents a major bottleneck for industrial-scale conversion of cellulosic biomass into biofuels. Cow rumen microbes specialize in degradation of cellulosic plant material, but most members of this complex community resist cultivation. To characterize biomass-degrading genes and genomes, we sequenced and analyzed 268 gigabases of metagenomic DNA from microbes adherent to plant fiber incubated in cow rumen. From these data, we identified 27,755 putative carbohydrate-active genes and expressed 90 candidate proteins, of which 57\% were enzymatically active against cellulosic substrates. We also assembled 15 uncultured microbial genomes, which were validated by complementary methods including single-cell genome sequencing. These data sets provide a substantially expanded catalog of genes and genomes participating in the deconstruction of cellulosic biomass.},
	Author = {Hess, M and Sczyrba, A and Egan, R and Kim, T W and Chokhawala, H and Schroth, G and Luo, S and Clark, D S and Chen, F and Zhang, T and Mackie, R I and Pennacchio, L A and Tringe, S G and Visel, A and Woyke, T and Wang, Z and Rubin, E M},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Doi = {10.1126/science.1200387},
	Issn = {0036-8075},
	Journal = {Science},
	Number = {6016},
	Pages = {463--467},
	Title = {Metagenomic Discovery of Biomass-Degrading Genes and Genomes from Cow Rumen},
	Url = {http://www.sciencemag.org/cgi/doi/10.1126/science.1200387},
	Volume = {331},
	Year = {2011},
	Bdsk-Url-1 = {http://www.sciencemag.org/cgi/doi/10.1126/science.1200387}}

@article{Richter2008,
	Abstract = {Background: The new research field of metagenomics is providing exciting insights into various, previously unclassified ecological systems. Next-generation sequencing technologies are producing a rapid increase of environmental data in public databases. There is great need for specialized software solutions and statistical methods for dealing with complex metagenome data sets. Methodology/Principal Findings: To facilitate the development and improvement of metagenomic tools and the planning of metagenomic projects, we introduce a sequencing simulator called MetaSim. Our software can be used to generate collections of synthetic reads that reflect the diverse taxonomical composition of typical metagenome data sets. Based on a database of given genomes, the program allows the user to design a metagenome by specifying the number of genomes present at different levels of the NCBI taxonomy, and then to collect reads from the metagenome using a simulation of a number of different sequencing technologies. A population sampler optionally produces evolved sequences based on source genomes and a given evolutionary tree. Conclusions/Significance: MetaSim allows the user to simulate individual read datasets that can be used as standardized test scenarios for planning sequencing projects or for benchmarking metagenomic software.},
	Author = {Richter, Daniel C and Ott, Felix and Auch, Alexander F and Schmid, Ramona and Huson, Daniel H},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Doi = {10.1371/journal.pone.0003373},
	Editor = {Field, Dawn},
	Institution = {ZBIT- Center for Bioinformatics T\"{u}bingen, University of T\"{u}bingen, T\"{u}bingen, Germany. drichter@informatik.uni-tuebingen.de},
	Journal = {PLoS ONE},
	Number = {10},
	Pages = {e3373},
	Publisher = {Public Library of Science},
	Title = {{MetaSim}---A Sequencing Simulator for Genomics and Metagenomics},
	Url = {http://dx.doi.org/10.1371/journal.pone.0003373},
	Volume = {3},
	Year = {2008},
	Bdsk-Url-1 = {http://dx.doi.org/10.1371/journal.pone.0003373}}

@article{Chen2005,
	Abstract = {The evolutionary features based on the distributions of k-mers in the DNA sequences of various organisms are studied. The organisms are classified into three groups based on their evolutionary periods: (a) E. coli and T. pallidum (b) yeast, zebrafish, A. thaliana, and fruit fly, (c) mouse, chicken, and human. The distributions of 6-mers of these three groups are shown to be, respectively, (a) unimodal, (b) unimodal with peaks generally shifted to smaller frequencies of occurrence, (c) bimodal. To describe the bimodal feature of the k-mer distributions of group (c), a model based on the cytosine-guanine ``CG'' content of the DNA sequences is introduced and shown to provide reasonably good agreements.},
	Author = {Chen, Yaw-Hwang and Nyeo, Su-Long and Yeh, Chiung-Yuh},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Doi = {10.1103/PhysRevE.72.011908},
	Issn = {1539-3755},
	Journal = {Physical Review E},
	Number = {1},
	Pages = {011908},
	Title = {Model for the distributions of k-mers in {DNA} sequences},
	Url = {http://link.aps.org/doi/10.1103/PhysRevE.72.011908},
	Volume = {72},
	Year = {2005},
	Bdsk-Url-1 = {http://link.aps.org/doi/10.1103/PhysRevE.72.011908}}

@article{Woyke2010,
	Abstract = {While the bulk of the finished microbial genomes sequenced to date are derived from cultured bacterial and archaeal representatives, the vast majority of microorganisms elude current culturing attempts, severely limiting the ability to recover complete or even partial genomes from these environmental species. Single cell genomics is a novel culture-independent approach, which enables access to the genetic material of an individual cell. No single cell genome has to our knowledge been closed and finished to date. Here we report the completed genome from an uncultured single cell of Candidatus Sulcia muelleri DMIN. Digital PCR on single symbiont cells isolated from the bacteriome of the green sharpshooter Draeculacephala minerva bacteriome allowed us to assess that this bacteria is polyploid with genome copies ranging from approximately 200--900 per cell, making it a most suitable target for single cell finishing efforts. For single cell shotgun sequencing, an individual Sulcia cell was isolated and whole genome amplified by multiple displacement amplification (MDA). Sanger-based finishing methods allowed us to close the genome. To verify the correctness of our single cell genome and exclude MDA-derived artifacts, we independently shotgun sequenced and assembled the Sulcia genome from pooled bacteriomes using a metagenomic approach, yielding a nearly identical genome. Four variations we detected appear to be genuine biological differences between the two samples. Comparison of the single cell genome with bacteriome metagenomic sequence data detected two single nucleotide polymorphisms (SNPs), indicating extremely low genetic diversity within a Sulcia population. This study demonstrates the power of single cell genomics to generate a complete, high quality, non-composite reference genome within an environmental sample, which can be used for population genetic analyzes.},
	Author = {Woyke, Tanja and Tighe, Damon and Mavromatis, Konstantinos and Clum, Alicia and Copeland, Alex and Schackwitz, Wendy and Lapidus, Alla and Wu, Dongying and McCutcheon, John P and McDonald, Bradon R and Moran, Nancy A and Bristow, James and Cheng, Jan-Fang},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Doi = {10.1371/journal.pone.0010314},
	Editor = {Ahmed, Niyaz},
	File = {:Users/qingpeng/Dropbox/Work/Manuscript/Khmer/Reference/1004\_PlosOne\_One Bacterial Cell, One Complete Genome.pdf:pdf},
	Institution = {Department of Energy Joint Genome Institute, Walnut Creek, California, United States of America.},
	Journal = {PLoS ONE},
	Number = {4},
	Pages = {e10314},
	Publisher = {Public Library of Science},
	Title = {One Bacterial Cell, One Complete Genome},
	Url = {http://dx.plos.org/10.1371/journal.pone.0010314},
	Volume = {5},
	Year = {2010},
	Bdsk-Url-1 = {http://dx.plos.org/10.1371/journal.pone.0010314}}

@inproceedings{Kane2010,
	Author = {Kane, Daniel M and Nelson, Jelani and Woodruff, David P},
	Booktitle = {Proceedings of the Twenty-Ninth {ACM} {SIGMOD-SIGACT-SIGART} Symposium on Principles of Database Systems ({PODS} 2010)},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	File = {:Users/qingpeng/Dropbox/Work/Manuscript/Khmer/Reference/An Optimal Algorithm for the Distinct Elements Problem.pdf:pdf},
	Isbn = {978-1-4503-0033-9},
	Keywords = {distinct elements,query optimization,streaming},
	Pages = {41--52},
	Publisher = {ACM},
	Title = {An Optimal Algorithm for the Distinct Elements Problem},
	Year = {2010}}

@article{Campagna2005,
	Abstract = {MOTIVATION: DNA repeats are a common feature of most genomic sequences. Their de novo identification is still difficult despite being a crucial step in genomic analysis and oligonucleotides design. Several efficient algorithms based on word counting are available, but too short words decrease specificity while long words decrease sensitivity, particularly in degenerated repeats. RESULTS: The Repeat Analysis Program (RAP) is based on a new word-counting algorithm optimized for high resolution repeat identification using gapped words. Many different overlapping gapped words can be counted at the same genomic position, thus producing a better signal than the single ungapped word. This results in better specificity both in terms of low-frequency detection, being able to identify sequences repeated only once, and highly divergent detection, producing a generally high score in most intron sequences. AVAILABILITY: The program is freely available for non-profit organizations, upon request to the authors. CONTACT: giorgio.valle@unipd.it SUPPLEMENTARY INFORMATION: The program has been tested on the Caenorhabditis elegans genome using word lengths of 12, 14 and 16 bases. The full analysis has been implemented in the UCSC Genome Browser and is accessible at http://genome.cribi.unipd.it.},
	Author = {Campagna, Davide and Romualdi, Chiara and Vitulo, Nicola and {Del Favero}, Micky and Lexa, Matej and Cannata, Nicola and Valle, Giorgio},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Institution = {CRIBI, Universit{\`a} degli Studi di Padova via Ugo Bassi 58b, I-35121 Padova, Italy.},
	Journal = {Bioinformatics},
	Number = {5},
	Pages = {582--588},
	Publisher = {Oxford University Press},
	Title = {{RAP}: a new computer program for de novo identification of repeated sequences in whole genomes},
	Url = {http://www.ncbi.nlm.nih.gov/pubmed/15374857},
	Volume = {21},
	Year = {2005},
	Bdsk-Url-1 = {http://www.ncbi.nlm.nih.gov/pubmed/15374857}}

@article{Yang2011,
	Abstract = {Background: High-throughput short read sequencing is revolutionizing genomics and systems biology research by enabling cost-effective deep coverage sequencing of genomes and transcriptomes. Error detection and correction are crucial to many short read sequencing applications including de novo genome sequencing, genome resequencing, and digital gene expression analysis. Short read error detection is typically carried out by counting the observed frequencies of kmers in reads and validating those with frequencies exceeding a threshold. In case of genomes with high repeat content, an erroneous kmer may be frequently observed if it has few nucleotide differences with valid kmers with multiple occurrences in the genome. Error detection and correction were mostly applied to genomes with low repeat content and this remains a challenging problem for genomes with high repeat content. Results: We develop a statistical model and a computational method for error detection and correction in the presence of genomic repeats. We propose a method to infer genomic frequencies of kmers from their observed frequencies by analyzing the misread relationships among observed kmers. We also propose a method to estimate the threshold useful for validating kmers whose estimated genomic frequency exceeds the threshold. We demonstrate that superior error detection is achieved using these methods. Furthermore, we break away from the common assumption of uniformly distributed errors within a read, and provide a framework to model position-dependent error occurrence frequencies common to many short read platforms. Lastly, we achieve better error correction in genomes with high repeat content. Availability: The software is implemented in C++ and is freely available under GNU GPL3 license and Boost Software V1.0 license at http://aluru-sun.ece.iastate.edu/doku.php?id=redeem. Conclusions: We introduce a statistical framework to model sequencing errors in next-generation reads, which led to promising results in detecting and correcting errors for genomes with high repeat content.},
	Author = {Yang, Xiao and Aluru, Srinivas and Dorman, Karin S},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Journal = {BMC Bioinformatics},
	Number = {Suppl 1},
	Pages = {S52},
	Publisher = {BioMed Central},
	Title = {Repeat-aware modeling and correction of short read errors},
	Url = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3044310&tool=pmcentrez&rendertype=abstract},
	Volume = {12},
	Year = {2011},
	Bdsk-Url-1 = {http://www.pubmedcentral.nih.gov/articlerender.fcgi?artid=3044310&tool=pmcentrez&rendertype=abstract}}

@article{Rusu2008,
	Author = {Rusu, Florin and Dobra, Alin},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Doi = {10.1145/1386118.1386121},
	File = {:Users/qingpeng/Dropbox/Work/Manuscript/Khmer/Reference/Sketches for Size of Join Estimation.pdf:pdf},
	Issn = {0362-5915},
	Journal = {ACM Transactions on Database Systems},
	Month = aug,
	Number = {3},
	Pages = {1--46},
	Title = {Sketches for size of join estimation},
	Url = {http://portal.acm.org/citation.cfm?doid=1386118.1386121},
	Volume = {33},
	Year = {2008},
	Bdsk-Url-1 = {http://portal.acm.org/citation.cfm?doid=1386118.1386121},
	Bdsk-Url-2 = {http://dx.doi.org/10.1145/1386118.1386121}}

@article{Do2008,
	Abstract = {A novel approach to the detection of genomic repeats is presented in this paper. The technique, dubbed SAGRI (Spectrum Assisted Genomic Repeat Identifier), is based on the spectrum (set of sequence k-mers, for some k) of the genomic sequence. Specifically, the genome is scanned twice. The first scan (FindHit) detects candidate pairs of repeat-segments, by effectively reconstructing portions of the Euler path of the (k-1)-mer graph of the genome only in correspondence with likely repeat sites. This process produces candidate repeat pairs, for which the location of the leftmost term is unknown. Candidate pairs are then subjected to validation in a second scan, in which the genome is labelled for hits in the (much smaller) spectrum of the repeat candidates: high hit density is taken as evidence of the location of the first segment of a repeat, and the pair of segments is then certified by pairwise alignment. The design parameters of the technique are selected on the basis of a careful probabilistic analysis (based on random sequences). SAGRI is compared with three leading repeat-finding tools on both synthetic and natural DNA sequences, and found to be uniformly superior in versatility (ability to detect repeats of different lengths) and accuracy (the central goal of repeat finding), while being quite competitive in speed. An executable program can be downloaded at http://sagri.comp.nus.edu.sg.},
	Author = {Do, Huy Hoang and Choi, Kwok Pui and Preparata, Franco P and Sung, Wing Kin and Zhang, Louxin},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Institution = {Department of Computer Science, National University of Singapore, Singapore.},
	Journal = {Journal of Computational Biology},
	Number = {5},
	Pages = {469--487},
	Title = {Spectrum-based de novo repeat detection in genomic sequences},
	Url = {http://www.ncbi.nlm.nih.gov/pubmed/18549302},
	Volume = {15},
	Year = {2008},
	Bdsk-Url-1 = {http://www.ncbi.nlm.nih.gov/pubmed/18549302}}

@article{Conway2011,
	Abstract = {MOTIVATION: Second-generation sequencing technology makes it feasible for many researches to obtain enough sequence reads to attempt the de novo assembly of higher eukaryotes (including mammals). De novo assembly not only provides a tool for understanding wide scale biological variation, but within human biomedicine, it offers a direct way of observing both large-scale structural variation and fine-scale sequence variation. Unfortunately, improvements in the computational feasibility for de novo assembly have not matched the improvements in the gathering of sequence data. This is for two reasons: the inherent computational complexity of the problem and the in-practice memory requirements of tools. RESULTS: In this article, we use entropy compressed or succinct data structures to create a practical representation of the de Bruijn assembly graph, which requires at least a factor of 10 less storage than the kinds of structures used by deployed methods. Moreover, because our representation is entropy compressed, in the presence of sequencing errors it has better scaling behaviour asymptotically than conventional approaches. We present results of a proof-of-concept assembly of a human genome performed on a modest commodity server.},
	Author = {Conway, Thomas C and Bromage, Andrew J},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Doi = {10.1093/bioinformatics/btq697},
	Journal = {Bioinformatics},
	Journal-Full = {Bioinformatics (Oxford, England)},
	Mesh = {Computational Biology; Genome, Human; Genomics; Humans; Sequence Analysis, DNA; Software},
	Month = feb,
	Number = {4},
	Pages = {479--486},
	Pmid = {21245053},
	Pst = {ppublish},
	Title = {Succinct data structures for assembling large genomes},
	Volume = {27},
	Year = {2011},
	Bdsk-Url-1 = {http://dx.doi.org/10.1093/bioinformatics/btq697}}

@article{Stein2010,
	Abstract = {With DNA sequencing now getting cheaper more quickly than data storage or computation, the time may have come for genome informatics to migrate to the cloud.},
	Author = {Stein, Lincoln D},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Doi = {10.1186/gb-2010-11-5-207},
	Journal = {Genome Biology},
	Journal-Full = {Genome biology},
	Mesh = {Computational Biology; Genome, Human; Humans; Sequence Analysis, DNA},
	Number = {5},
	Pages = {207},
	Pmc = {PMC2898083},
	Pmid = {20441614},
	Pst = {ppublish},
	Title = {The case for cloud computing in genome informatics},
	Volume = {11},
	Year = {2010},
	Bdsk-Url-1 = {http://dx.doi.org/10.1186/gb-2010-11-5-207}}

@article{Sboner2011,
	Abstract = {Advances in sequencing technology have led to a sharp decrease in the cost of 'data generation'. But is this sufficient to ensure cost-effective and efficient 'knowledge generation'?},
	Author = {Sboner, Andrea and Mu, Xinmeng Jasmine and Greenbaum, Dov and Auerbach, Raymond K and Gerstein, Mark B},
	Date-Added = {2012-09-17 19:55:25 +0000},
	Date-Modified = {2012-09-17 19:55:25 +0000},
	Doi = {10.1186/gb-2011-12-8-125},
	Journal = {Genome Biology},
	Journal-Full = {Genome biology},
	Mesh = {Costs and Cost Analysis; Database Management Systems; Genome, Human; Genomics; Humans; Sequence Analysis, DNA},
	Number = {8},
	Pages = {125},
	Pmc = {PMC3245608},
	Pmid = {21867570},
	Pst = {epublish},
	Title = {The real cost of sequencing: higher than you think!},
	Volume = {12},
	Year = {2011},
	Bdsk-Url-1 = {http://dx.doi.org/10.1186/gb-2011-12-8-125}}

@article{McElroy2012,
	Abstract = {BACKGROUND: GemSIM, or General Error-Model based SIMulator, is a next-generation sequencing simulator capable of generating single or paired-end reads for any sequencing technology compatible with the generic formats SAM and FASTQ (including Illumina and Roche/454). GemSIM creates and uses empirically derived, sequence-context based error models to realistically emulate individual sequencing runs and/or technologies. Empirical fragment length and quality score distributions are also used. Reads may be drawn from one or more genomes or haplotype sets, facilitating simulation of deep sequencing, metagenomic, and resequencing projects.
RESULTS: We demonstrate GemSIM's value by deriving error models from two different Illumina sequencing runs and one Roche/454 run, and comparing and contrasting the resulting error profiles of each run. Overall error rates varied dramatically, both between individual Illumina runs, between the first and second reads in each pair, and between datasets from Illumina and Roche/454 technologies. Indels were markedly more frequent in Roche/454 than Illumina and both technologies suffered from an increase in error rates near the end of each read. The effects of these different profiles on low-frequency SNP-calling accuracy were investigated by analysing simulated sequencing data for a mixture of bacterial haplotypes. In general, SNP-calling using VarScan was only accurate for SNPs with frequency > 3\%, independent of which error model was used to simulate the data. Variation between error profiles interacted strongly with VarScan's 'minimum average quality' parameter, resulting in different optimal settings for different sequencing runs.
CONCLUSIONS: Next-generation sequencing has unprecedented potential for assessing genetic diversity, however analysis is complicated as error profiles can vary noticeably even between different runs of the same technology. Simulation with GemSIM can help overcome this problem, by providing insights into the error profiles of individual sequencing runs and allowing researchers to assess the effects of these errors on downstream data analysis.},
	Author = {McElroy, Kerensa E and Luciani, Fabio and Thomas, Torsten},
	Date-Added = {2012-09-17 19:55:05 +0000},
	Date-Modified = {2012-09-17 19:55:05 +0000},
	Doi = {10.1186/1471-2164-13-74},
	Journal = {BMC Genomics},
	Journal-Full = {BMC genomics},
	Mesh = {Models, Genetic; Polymorphism, Single Nucleotide; Sequence Analysis, DNA; Software},
	Pages = {74},
	Pmc = {PMC3305602},
	Pmid = {22336055},
	Pst = {epublish},
	Title = {{GemSIM}: general, error-model based simulator of next-generation sequencing data},
	Volume = {13},
	Year = {2012},
	Bdsk-Url-1 = {http://dx.doi.org/10.1186/1471-2164-13-74}}

%% BibDesk static-group metadata, stored as an Apple property list inside a
%% BibTeX @comment entry. It declares two groups, "Other" and "Paper"; the
%% "keys" string of each group is empty in this file.
@comment{BibDesk Static Groups{
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<array>
	<dict>
		<key>group name</key>
		<string>Other</string>
		<key>keys</key>
		<string></string>
	</dict>
	<dict>
		<key>group name</key>
		<string>Paper</string>
		<key>keys</key>
		<string></string>
	</dict>
</array>
</plist>
}}