/
merge.cpp
214 lines (188 loc) · 7.16 KB
/
merge.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
//-----------------------------------------------
// Copyright 2010 Wellcome Trust Sanger Institute
// Written by Jared Simpson (js18@sanger.ac.uk)
// Released under the GPL
//-----------------------------------------------
//
// merge - merge read files and their indices
//
#include <iostream>
#include <fstream>
#include <sys/stat.h>
#include "SGACommon.h"
#include "Util.h"
#include "merge.h"
#include "SACAInducedCopying.h"
#include "BWTDiskConstruction.h"
#include "BWT.h"
//
void removeFiles(const std::string& inFile);
//
// Getopt
//
#define SUBPROGRAM "merge"
static const char *MERGE_VERSION_MESSAGE =
SUBPROGRAM " Version " PACKAGE_VERSION "\n"
"Written by Jared Simpson.\n"
"\n"
"Copyright 2010 Wellcome Trust Sanger Institute\n";
static const char *MERGE_USAGE_MESSAGE =
"Usage: " PACKAGE_NAME " " SUBPROGRAM " [OPTION] ... READS1 READS2\n"
"Merge the sequence files READS1, READS2 into a single file/index\n"
"\n"
" -v, --verbose display verbose output\n"
" --help display this help and exit\n"
" -t, --threads=NUM use NUM threads to merge the indices (default: 1)\n"
" -p, --prefix=PREFIX write final index to files starting with PREFIX (the default is to concatenate the input filenames)\n"
" -r, --remove remove the original BWT, SAI and reads files after the merge\n"
" -g, --gap-array=N use N bits of storage for each element of the gap array. Acceptable values are 4,8,16 or 32. Lower\n"
" values can substantially reduce the amount of memory required at the cost of less predictable memory usage.\n"
" When this value is set to 32, the memory requirement is essentially deterministic and requires ~5N bytes where\n"
" N is the size of the FM-index of READS2.\n"
" The default value is 4.\n"
" --no-sequence Suppress merging of the sequence files. Use this option when merging the index(es) separate e.g. in parallel\n"
" --no-forward Suppress merging of the forward index. Use this option when merging the index(es) separate e.g. in parallel\n"
" --no-reverse Suppress merging of the reverse index. Use this option when merging the index(es) separate e.g. in parallel\n"
"\n"
"\nReport bugs to " PACKAGE_BUGREPORT "\n\n";
namespace opt
{
static unsigned int verbose;
static std::string prefix;
static int numThreads = 1;
static bool bRemove;
static int gapArrayStorage = 4;
static bool bMergeSequence = true;
static bool bMergeForward = true;
static bool bMergeReverse = true;
}
static const char* shortopts = "p:m:t:g:vr";
enum { OPT_HELP = 1, OPT_VERSION, OPT_NO_SEQUENCE, OPT_NO_FWD, OPT_NO_REV };
static const struct option longopts[] = {
{ "verbose", no_argument, NULL, 'v' },
{ "prefix", required_argument, NULL, 'p' },
{ "remove", no_argument, NULL, 'r' },
{ "threads", required_argument, NULL, 't' },
{ "gap-array", required_argument, NULL, 'g' },
{ "no-sequence", no_argument, NULL, OPT_NO_SEQUENCE },
{ "no-forward", no_argument, NULL, OPT_NO_FWD },
{ "no-reverse", no_argument, NULL, OPT_NO_REV },
{ "help", no_argument, NULL, OPT_HELP },
{ "version", no_argument, NULL, OPT_VERSION },
{ NULL, 0, NULL, 0 }
};
int mergeMain(int argc, char** argv)
{
parseMergeOptions(argc, argv);
StringVector inFiles;
while(optind < argc)
{
inFiles.push_back(argv[optind++]);
}
assert(inFiles.size() == 2);
if(inFiles[0] == inFiles[1])
return 0; // avoid self-merge
std::string prefix1 = stripFilename(inFiles[0]);
std::string prefix2 = stripFilename(inFiles[1]);
if(opt::prefix.empty())
{
opt::prefix = prefix1 + "." + prefix2;
}
// Merge the indices
if(opt::bMergeForward)
{
mergeIndependentIndices(inFiles[0], inFiles[1], opt::prefix, BWT_EXT, SAI_EXT, false, opt::numThreads, opt::gapArrayStorage);
}
// Skip merging the reverse indices if the reverse bwt file does not exist.
std::string rbwt_filename_1 = prefix1 + RBWT_EXT;
std::string rbwt_filename_2 = prefix2 + RBWT_EXT;
struct stat file_s_1;
struct stat file_s_2;
int ret1 = stat(rbwt_filename_1.c_str(), &file_s_1);
int ret2 = stat(rbwt_filename_2.c_str(), &file_s_2);
if((ret1 == 0 || ret2 == 0) && opt::bMergeReverse)
{
mergeIndependentIndices(inFiles[0], inFiles[1], opt::prefix, RBWT_EXT, RSAI_EXT, true, opt::numThreads, opt::gapArrayStorage);
}
// Merge the read files
if(opt::bMergeSequence)
{
mergeReadFiles(inFiles[0], inFiles[1], opt::prefix);
}
if(opt::bRemove)
{
// Delete the original reads, bwt and sai files
removeFiles(inFiles[0]);
removeFiles(inFiles[1]);
}
return 0;
}
//
void removeFiles(const std::string& inFile)
{
std::string prefix = stripFilename(inFile);
std::string bwt_file = prefix + BWT_EXT;
std::string rbwt_file = prefix + RBWT_EXT;
std::string sai_file = prefix + SAI_EXT;
std::string rsai_file = prefix + RSAI_EXT;
unlink(bwt_file.c_str());
unlink(rbwt_file.c_str());
unlink(sai_file.c_str());
unlink(rsai_file.c_str());
unlink(inFile.c_str());
}
//
// Handle command line arguments
//
void parseMergeOptions(int argc, char** argv)
{
bool die = false;
for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;)
{
std::istringstream arg(optarg != NULL ? optarg : "");
switch (c)
{
case 'p': arg >> opt::prefix; break;
case 'r': opt::bRemove = true; break;
case '?': die = true; break;
case 't': arg >> opt::numThreads; break;
case 'g': arg >> opt::gapArrayStorage; break;
case 'v': opt::verbose++; break;
case OPT_NO_SEQUENCE: opt::bMergeSequence = false; break;
case OPT_NO_FWD: opt::bMergeForward = false; break;
case OPT_NO_REV: opt::bMergeReverse = false; break;
case OPT_HELP:
std::cout << MERGE_USAGE_MESSAGE;
exit(EXIT_SUCCESS);
case OPT_VERSION:
std::cout << MERGE_VERSION_MESSAGE;
exit(EXIT_SUCCESS);
}
}
if(opt::gapArrayStorage != 4 && opt::gapArrayStorage != 8 &&
opt::gapArrayStorage != 16 && opt::gapArrayStorage != 32)
{
std::cerr << SUBPROGRAM ": invalid argument, --gap-array,-g must be one of 4,8,16,32 (found: " << opt::gapArrayStorage << ")\n";
die = true;
}
if(opt::numThreads <= 0)
{
std::cerr << SUBPROGRAM ": invalid number of threads: " << opt::numThreads << "\n";
die = true;
}
if (argc - optind < 1)
{
std::cerr << SUBPROGRAM ": missing arguments\n";
die = true;
}
if (argc - optind > 2)
{
std::cerr << SUBPROGRAM ": too many arguments\n";
die = true;
}
if (die)
{
std::cerr << "Try `" << SUBPROGRAM << " --help' for more information.\n";
exit(EXIT_FAILURE);
}
}