From 9706f5585161f22dcceeccb84a4296f56bbec833 Mon Sep 17 00:00:00 2001
From: "raphael.lafrance@gmail.com" <raphael.lafrance@gmail.com>
Date: Thu, 18 Jan 2018 16:05:08 -0500
Subject: [PATCH] Adding --single-ends argument to the preprocessor #212

---
 atram_preprocessor.py            | 45 +++++++++++-------
 tests/test_atram_preprocessor.py | 82 ++++++++++++++++++++++++++++++--
 2 files changed, 106 insertions(+), 21 deletions(-)

diff --git a/atram_preprocessor.py b/atram_preprocessor.py
index 9cb9972..671114c 100644
--- a/atram_preprocessor.py
+++ b/atram_preprocessor.py
@@ -44,14 +44,14 @@ def preprocess(args):
 def load_seqs(args, db_conn):
     """Load sequences from a fasta/fastq files into the atram database."""
     # We have to clamp the end suffix depending on the file type.
-    for (arg, clamp) in [('mixed_ends', None), ('end_1', '1'),
+    for (arg, clamp) in [('mixed_ends', ''), ('end_1', '1'),
                          ('end_2', '2'), ('single_ends', '')]:
         if args.get(arg):
             for file_name in args[arg]:
-                load_one_file(db_conn, file_name, clamp)
+                load_one_file(db_conn, file_name, arg, clamp)  # arg: file kind
 
 
-def load_one_file(db_conn, file_name, seq_end_clamp=''):
+def load_one_file(db_conn, file_name, arg, seq_end_clamp=''):
     """Load sequences from a fasta/fastq file into the atram database."""
     log.info('Loading "{}" into sqlite database'.format(file_name))
 
@@ -68,7 +68,10 @@ def load_one_file(db_conn, file_name, seq_end_clamp=''):
             match = blast.PARSE_HEADER.match(title)
             if match.group(2):
                 seq_name = match.group(1)
-                seq_end = match.group(2)
+                if arg == 'mixed_ends':
+                    seq_end = match.group(2)
+                else:
+                    seq_end = seq_end_clamp
             else:
                 seq_name = title
                 seq_end = seq_end_clamp
@@ -183,26 +186,33 @@ def parse_command_line(temp_dir_default):
 
     parser.add_argument('--mixed-ends', '-m', metavar='FASTA', nargs='+',
                         help='''Sequence read archive files that have a mix of
-                             both end 1 and end 2 sequences. The sequence names
-                             MUST have an end suffix like "/1" or "_2". The
-                             files are in fasta or fastq format. You may enter
-                             more than one file or you may use wildcards.''')
+                             both end 1 and end 2 sequences (or single ends).
+                             The files are in fasta or fastq format. You may
+                             enter more than one file or you may use wildcards.
+                             ''')
 
     parser.add_argument('--end-1', '-1', metavar='FASTA', nargs='+',
                         help='''Sequence read archive files that have only
                              end 1 sequences. The sequence names do not need an
-                             end suffix, we will assume the suffix is 1 if it
-                             is missing. The files are in fasta or fastq
-                             format. You may enter more than one file or you
-                             may use wildcards.''')
+                             end suffix, we will assume the suffix is always 1.
+                             The files are in fasta or fastq format. You may
+                             enter more than one file or you may use wildcards.
+                             ''')
 
     parser.add_argument('--end-2', '-2', metavar='FASTA', nargs='+',
                         help='''Sequence read archive files that have only
                              end 2 sequences. The sequence names do not need an
-                             end suffix, we will assume the suffix is 2 if it
-                             is missing. The files are in fasta or fastq
-                             format. You may enter more than one file or you
-                             may use wildcards.''')
+                             end suffix, we will assume the suffix is always 2.
+                             The files are in fasta or fastq format. You may
+                             enter more than one file or you may use wildcards.
+                             ''')
+
+    parser.add_argument('--single-ends', '-S', metavar='FASTA', nargs='+',
+                        help='''Sequence read archive files that have only
+                             unpaired sequences. Any sequence suffix will be
+                             ignored. The files are in fasta or fastq format.
+                             You may enter more than one file or you may use
+                             wildcards.''')
 
     parser.add_argument('--version', action='version',
                         version='%(prog)s {}'.format(db.ATRAM_VERSION))
@@ -221,8 +231,7 @@ def parse_command_line(temp_dir_default):
     cpus = min(10, os.cpu_count() - 4 if os.cpu_count() > 4 else 1)
     group.add_argument('--cpus', '--processes', '--max-processes',
                        type=int, default=cpus,
-                       help='''Number of CPU threads to use. This will also be
-                            used for the assemblers when possible. On this
+                       help='''Number of CPU threads to use. On this
                             machine the default is ("{}")'''.format(cpus))
 
     group.add_argument('-t', '--temp-dir', metavar='DIR',
diff --git a/tests/test_atram_preprocessor.py b/tests/test_atram_preprocessor.py
index 8ebd2a1..c7f4633 100644
--- a/tests/test_atram_preprocessor.py
+++ b/tests/test_atram_preprocessor.py
@@ -62,11 +62,11 @@ def test_preprocess(
 
     @patch('lib.log.info')
     @patch('lib.db.insert_sequences_batch')
-    def test_load_one_file_1(self, insert_sequences_batch, info):
+    def test_load_one_file_mixed(self, insert_sequences_batch, info):
         db.BATCH_SIZE = 5
 
         file_1 = join('tests', 'data', 'load_seq1.txt')
-        atram_preprocessor.load_one_file(self.db_conn, file_1)
+        atram_preprocessor.load_one_file(self.db_conn, file_1, 'mixed_ends')
 
         msg = 'Loading "{}" into sqlite database'.format(file_1)
         info.assert_called_once_with(msg)
@@ -85,13 +85,89 @@ def test_load_one_file_1(self, insert_sequences_batch, info):
                 ('seq4', '2', 'AAAAAAAAAAGGGGGGGGGG')])]
         insert_sequences_batch.assert_has_calls(calls)
 
+    @patch('lib.log.info')
+    @patch('lib.db.insert_sequences_batch')  # bound to first mock arg
+    def test_load_one_file_end1(self, insert_sequences_batch, info):
+        db.BATCH_SIZE = 5  # expected calls below: one batch of 5, then 4
+
+        file_1 = join('tests', 'data', 'load_seq1.txt')
+        atram_preprocessor.load_one_file(self.db_conn, file_1, 'end_1', '1')
+
+        msg = 'Loading "{}" into sqlite database'.format(file_1)
+        info.assert_called_once_with(msg)
+
+        calls = [  # every expected row has end '1'
+            call(self.db_conn, [
+                ('seq1', '1', 'AAAAAAAAAA'),
+                ('seq2', '1', 'AAAAAAAAAAGGGGGGGGGG'),
+                ('seq3', '1', 'AAAAAAAAAA'),
+                ('seq4', '1', 'AAAAAAAAAA'),
+                ('seq5/3', '1', 'AAAAAAAAAAGGGGGGGGGG')]),  # "/3" kept in name
+            call(self.db_conn, [
+                ('seq1', '1', 'AAAAAAAAAA'),
+                ('seq2', '1', 'AAAAAAAAAAGGGGGGGGGG'),
+                ('seq3', '1', 'AAAAAAAAAA'),
+                ('seq4', '1', 'AAAAAAAAAAGGGGGGGGGG')])]
+        insert_sequences_batch.assert_has_calls(calls)
+
+    @patch('lib.log.info')
+    @patch('lib.db.insert_sequences_batch')  # bound to first mock arg
+    def test_load_one_file_end2(self, insert_sequences_batch, info):
+        db.BATCH_SIZE = 5  # expected calls below: one batch of 5, then 4
+
+        file_1 = join('tests', 'data', 'load_seq1.txt')
+        atram_preprocessor.load_one_file(self.db_conn, file_1, 'end_2', '2')
+
+        msg = 'Loading "{}" into sqlite database'.format(file_1)
+        info.assert_called_once_with(msg)
+
+        calls = [  # every expected row has end '2'
+            call(self.db_conn, [
+                ('seq1', '2', 'AAAAAAAAAA'),
+                ('seq2', '2', 'AAAAAAAAAAGGGGGGGGGG'),
+                ('seq3', '2', 'AAAAAAAAAA'),
+                ('seq4', '2', 'AAAAAAAAAA'),
+                ('seq5/3', '2', 'AAAAAAAAAAGGGGGGGGGG')]),  # "/3" kept in name
+            call(self.db_conn, [
+                ('seq1', '2', 'AAAAAAAAAA'),
+                ('seq2', '2', 'AAAAAAAAAAGGGGGGGGGG'),
+                ('seq3', '2', 'AAAAAAAAAA'),
+                ('seq4', '2', 'AAAAAAAAAAGGGGGGGGGG')])]
+        insert_sequences_batch.assert_has_calls(calls)
+
+    @patch('lib.log.info')
+    @patch('lib.db.insert_sequences_batch')  # bound to first mock arg
+    def test_load_one_file_single(self, insert_sequences_batch, info):
+        db.BATCH_SIZE = 5  # expected calls below: one batch of 5, then 4
+
+        file_1 = join('tests', 'data', 'load_seq1.txt')
+        atram_preprocessor.load_one_file(
+            self.db_conn, file_1, 'single_ends', '')
+
+        msg = 'Loading "{}" into sqlite database'.format(file_1)
+        info.assert_called_once_with(msg)
+
+        calls = [  # every expected row has an empty end
+            call(self.db_conn, [
+                ('seq1', '', 'AAAAAAAAAA'),
+                ('seq2', '', 'AAAAAAAAAAGGGGGGGGGG'),
+                ('seq3', '', 'AAAAAAAAAA'),
+                ('seq4', '', 'AAAAAAAAAA'),
+                ('seq5/3', '', 'AAAAAAAAAAGGGGGGGGGG')]),  # "/3" kept in name
+            call(self.db_conn, [
+                ('seq1', '', 'AAAAAAAAAA'),
+                ('seq2', '', 'AAAAAAAAAAGGGGGGGGGG'),
+                ('seq3', '', 'AAAAAAAAAA'),
+                ('seq4', '', 'AAAAAAAAAAGGGGGGGGGG')])]
+        insert_sequences_batch.assert_has_calls(calls)
+
     @patch('lib.log.info')
     @patch('lib.db.insert_sequences_batch')
     def test_load_one_file_2(self, insert_sequences_batch, info):
         db.BATCH_SIZE = 5
 
         file_1 = join('tests', 'data', 'load_seq2.txt')
-        atram_preprocessor.load_one_file(self.db_conn, file_1)
+        atram_preprocessor.load_one_file(self.db_conn, file_1, 'mixed_ends')
 
         msg = 'Loading "{}" into sqlite database'.format(file_1)
         info.assert_called_once_with(msg)