- amalgkit integrate now correctly works with decompressed fastq files

Signed-off-by: Hego_CCTB <matthias_freund@outlook.com>
kfuku52 · Jan 17, 2023 · 9d3ebcf · 9d3ebcf
1 parent b97c9f2
commit 9d3ebcf
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 5 deletions.
diff --git a/amalgkit/__init__.py b/amalgkit/__init__.py
@@ -1 +1 @@
-__version__ = '0.6.8.9'
+__version__ = '0.6.8.10'
diff --git a/amalgkit/amalgkit b/amalgkit/amalgkit
@@ -326,8 +326,8 @@ pin.add_argument('--getfastq_dir', metavar='PATH', default=None, type=str, requi
                       'out_dir/getfastq/')
 pin.add_argument('--remove_tmp', metavar='yes|no', default='no', type=strtobool, required=False, action='store',
                  help='default=%(default)s: Remove temporary files.')
-pin.add_argument('--accurate_size', metavar='yes|no', default='no', type=strtobool, required=False, action='store',
-                 help='default=%(default)s: If no, runs seqkit only on the first 1000 sequences in the fastq file to get an estimate for information like average read length. '
+pin.add_argument('--accurate_size', metavar='yes|no', default='yes', type=strtobool, required=False, action='store',
+                 help='default=%(default)s: ONLY APPLIES TO .gz COMPRESSED FASTQ FILES. If no, runs seqkit only on the first 1000 sequences in the fastq file to get an estimate for information like average read length. '
                       'If yes, runs seqkit on the whole fastq file. More accurate, but comes with much higher runtime.')
 pin.set_defaults(handler=command_integrate)
 

diff --git a/amalgkit/integrate.py b/amalgkit/integrate.py
@@ -5,7 +5,7 @@
 import platform
 import re
 import subprocess
-import time
+import warnings
 
 from amalgkit.sanity import check_getfastq_outputs
 from amalgkit.util import *
@@ -52,7 +52,16 @@ def get_fastq_stats(args):
         print("Found {} file(s) for ID {}. Lib-layout: {}".format(num_fastq_files[id], id,lib_layout), flush=True)
         print("Getting sequence statistics.", flush=True)
         tmp_file = os.path.join(args.out_dir, id+'_seqkit_stats.tmp')
-        if args.accurate_size:
+        # check for file extension. seqkit is significantly slower on compressed files, but still fast on decompressed files.
+        if fastq_files[0].endswith(('.fq', '.fastq')):
+            is_decompressed = True
+        elif fastq_files[0].endswith(('.fq.gz', '.fastq.gz')):
+            is_decompressed = False
+        else:
+            warnings.warn(fastq_files[0]+"is not a fastq file. Skipping.")
+            continue
+
+        if args.accurate_size or is_decompressed:
             print('--accurate_size set to yes. Running accurate sequence scan.')
             seqkit_command = ['seqkit', 'stats', '-T', '-j', str(args.threads), fastq_files[0]]
             seqkit_stdout = open(tmp_file, 'w')