From 9706f5585161f22dcceeccb84a4296f56bbec833 Mon Sep 17 00:00:00 2001 From: "raphael.lafrance@gmail.com" Date: Thu, 18 Jan 2018 16:05:08 -0500 Subject: [PATCH] Adding --single-ends argument to the preprocessor #212 --- atram_preprocessor.py | 45 +++++++++++------- tests/test_atram_preprocessor.py | 82 ++++++++++++++++++++++++++++++-- 2 files changed, 106 insertions(+), 21 deletions(-) diff --git a/atram_preprocessor.py b/atram_preprocessor.py index 9cb9972..671114c 100644 --- a/atram_preprocessor.py +++ b/atram_preprocessor.py @@ -44,14 +44,14 @@ def preprocess(args): def load_seqs(args, db_conn): """Load sequences from a fasta/fastq files into the atram database.""" # We have to clamp the end suffix depending on the file type. - for (arg, clamp) in [('mixed_ends', None), ('end_1', '1'), + for (arg, clamp) in [('mixed_ends', ''), ('end_1', '1'), ('end_2', '2'), ('single_ends', '')]: if args.get(arg): for file_name in args[arg]: - load_one_file(db_conn, file_name, clamp) + load_one_file(db_conn, file_name, arg, clamp) -def load_one_file(db_conn, file_name, seq_end_clamp=''): +def load_one_file(db_conn, file_name, arg, seq_end_clamp=''): """Load sequences from a fasta/fastq file into the atram database.""" log.info('Loading "{}" into sqlite database'.format(file_name)) @@ -68,7 +68,10 @@ def load_one_file(db_conn, file_name, seq_end_clamp=''): match = blast.PARSE_HEADER.match(title) if match.group(2): seq_name = match.group(1) - seq_end = match.group(2) + if arg == 'mixed_ends': + seq_end = match.group(2) + else: + seq_end = seq_end_clamp else: seq_name = title seq_end = seq_end_clamp @@ -183,26 +186,33 @@ def parse_command_line(temp_dir_default): parser.add_argument('--mixed-ends', '-m', metavar='FASTA', nargs='+', help='''Sequence read archive files that have a mix of - both end 1 and end 2 sequences. The sequence names - MUST have an end suffix like "/1" or "_2". The - files are in fasta or fastq format. You may enter - more than one file or you may use wildcards.''') + both end 1 and end 2 sequences (or single ends). + The files are in fasta or fastq format. You may + enter more than one file or you may use wildcards. + ''') parser.add_argument('--end-1', '-1', metavar='FASTA', nargs='+', help='''Sequence read archive files that have only end 1 sequences. The sequence names do not need an - end suffix, we will assume the suffix is 1 if it - is missing. The files are in fasta or fastq - format. You may enter more than one file or you - may use wildcards.''') + end suffix, we will assume the suffix is always 1. + The files are in fasta or fastq format. You may + enter more than one file or you may use wildcards. + ''') parser.add_argument('--end-2', '-2', metavar='FASTA', nargs='+', help='''Sequence read archive files that have only end 2 sequences. The sequence names do not need an - end suffix, we will assume the suffix is 2 if it - is missing. The files are in fasta or fastq - format. You may enter more than one file or you - may use wildcards.''') + end suffix, we will assume the suffix is always 2. + The files are in fasta or fastq format. You may + enter more than one file or you may use wildcards. + ''') + + parser.add_argument('--single-ends', '-S', metavar='FASTA', nargs='+', + help='''Sequence read archive files that have only + unpaired sequences. Any sequence suffix will be + ignored. The files are in fasta or fastq format. + You may enter more than one file or you may use + wildcards.''') parser.add_argument('--version', action='version', version='%(prog)s {}'.format(db.ATRAM_VERSION)) @@ -221,8 +231,7 @@ def parse_command_line(temp_dir_default): cpus = min(10, os.cpu_count() - 4 if os.cpu_count() > 4 else 1) group.add_argument('--cpus', '--processes', '--max-processes', type=int, default=cpus, - help='''Number of CPU threads to use. This will also be - used for the assemblers when possible. On this + help='''Number of CPU threads to use. On this machine the default is ("{}")'''.format(cpus)) group.add_argument('-t', '--temp-dir', metavar='DIR', diff --git a/tests/test_atram_preprocessor.py b/tests/test_atram_preprocessor.py index 8ebd2a1..c7f4633 100644 --- a/tests/test_atram_preprocessor.py +++ b/tests/test_atram_preprocessor.py @@ -62,11 +62,11 @@ def test_preprocess( @patch('lib.log.info') @patch('lib.db.insert_sequences_batch') - def test_load_one_file_1(self, insert_sequences_batch, info): + def test_load_one_file_mixed(self, insert_sequences_batch, info): db.BATCH_SIZE = 5 file_1 = join('tests', 'data', 'load_seq1.txt') - atram_preprocessor.load_one_file(self.db_conn, file_1) + atram_preprocessor.load_one_file(self.db_conn, file_1, 'mixed_ends') msg = 'Loading "{}" into sqlite database'.format(file_1) info.assert_called_once_with(msg) @@ -85,13 +85,89 @@ def test_load_one_file_1(self, insert_sequences_batch, info): ('seq4', '2', 'AAAAAAAAAAGGGGGGGGGG')])] insert_sequences_batch.assert_has_calls(calls) + @patch('lib.log.info') + @patch('lib.db.insert_sequences_batch') + def test_load_one_file_end1(self, insert_sequences_batch, info): + db.BATCH_SIZE = 5 + + file_1 = join('tests', 'data', 'load_seq1.txt') + atram_preprocessor.load_one_file(self.db_conn, file_1, 'end_1', '1') + + msg = 'Loading "{}" into sqlite database'.format(file_1) + info.assert_called_once_with(msg) + + calls = [ + call(self.db_conn, [ + ('seq1', '1', 'AAAAAAAAAA'), + ('seq2', '1', 'AAAAAAAAAAGGGGGGGGGG'), + ('seq3', '1', 'AAAAAAAAAA'), + ('seq4', '1', 'AAAAAAAAAA'), + ('seq5/3', '1', 'AAAAAAAAAAGGGGGGGGGG')]), + call(self.db_conn, [ + ('seq1', '1', 'AAAAAAAAAA'), + ('seq2', '1', 'AAAAAAAAAAGGGGGGGGGG'), + ('seq3', '1', 'AAAAAAAAAA'), + ('seq4', '1', 'AAAAAAAAAAGGGGGGGGGG')])] + insert_sequences_batch.assert_has_calls(calls) + + @patch('lib.log.info') + @patch('lib.db.insert_sequences_batch') + def test_load_one_file_end2(self, insert_sequences_batch, info): + db.BATCH_SIZE = 5 + + file_1 = join('tests', 'data', 'load_seq1.txt') + atram_preprocessor.load_one_file(self.db_conn, file_1, 'end_2', '2') + + msg = 'Loading "{}" into sqlite database'.format(file_1) + info.assert_called_once_with(msg) + + calls = [ + call(self.db_conn, [ + ('seq1', '2', 'AAAAAAAAAA'), + ('seq2', '2', 'AAAAAAAAAAGGGGGGGGGG'), + ('seq3', '2', 'AAAAAAAAAA'), + ('seq4', '2', 'AAAAAAAAAA'), + ('seq5/3', '2', 'AAAAAAAAAAGGGGGGGGGG')]), + call(self.db_conn, [ + ('seq1', '2', 'AAAAAAAAAA'), + ('seq2', '2', 'AAAAAAAAAAGGGGGGGGGG'), + ('seq3', '2', 'AAAAAAAAAA'), + ('seq4', '2', 'AAAAAAAAAAGGGGGGGGGG')])] + insert_sequences_batch.assert_has_calls(calls) + + @patch('lib.log.info') + @patch('lib.db.insert_sequences_batch') + def test_load_one_file_single(self, insert_sequences_batch, info): + db.BATCH_SIZE = 5 + + file_1 = join('tests', 'data', 'load_seq1.txt') + atram_preprocessor.load_one_file( + self.db_conn, file_1, 'single_ends', '') + + msg = 'Loading "{}" into sqlite database'.format(file_1) + info.assert_called_once_with(msg) + + calls = [ + call(self.db_conn, [ + ('seq1', '', 'AAAAAAAAAA'), + ('seq2', '', 'AAAAAAAAAAGGGGGGGGGG'), + ('seq3', '', 'AAAAAAAAAA'), + ('seq4', '', 'AAAAAAAAAA'), + ('seq5/3', '', 'AAAAAAAAAAGGGGGGGGGG')]), + call(self.db_conn, [ + ('seq1', '', 'AAAAAAAAAA'), + ('seq2', '', 'AAAAAAAAAAGGGGGGGGGG'), + ('seq3', '', 'AAAAAAAAAA'), + ('seq4', '', 'AAAAAAAAAAGGGGGGGGGG')])] + insert_sequences_batch.assert_has_calls(calls) + @patch('lib.log.info') @patch('lib.db.insert_sequences_batch') def test_load_one_file_2(self, insert_sequences_batch, info): db.BATCH_SIZE = 5 file_1 = join('tests', 'data', 'load_seq2.txt') - atram_preprocessor.load_one_file(self.db_conn, file_1) + atram_preprocessor.load_one_file(self.db_conn, file_1, 'mixed_ends') msg = 'Loading "{}" into sqlite database'.format(file_1) info.assert_called_once_with(msg)