Skip to content

Commit

Permalink
- amalgkit integrate now correctly works with decompressed fastq files
Browse files Browse the repository at this point in the history
Signed-off-by: Hego_CCTB <matthias_freund@outlook.com>
  • Loading branch information
Hego-CCTB committed Jan 17, 2023
1 parent b97c9f2 commit 9d3ebcf
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 5 deletions.
2 changes: 1 addition & 1 deletion amalgkit/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.6.8.9'
__version__ = '0.6.8.10'
4 changes: 2 additions & 2 deletions amalgkit/amalgkit
Original file line number Diff line number Diff line change
Expand Up @@ -326,8 +326,8 @@ pin.add_argument('--getfastq_dir', metavar='PATH', default=None, type=str, requi
'out_dir/getfastq/')
pin.add_argument('--remove_tmp', metavar='yes|no', default='no', type=strtobool, required=False, action='store',
help='default=%(default)s: Remove temporary files.')
pin.add_argument('--accurate_size', metavar='yes|no', default='no', type=strtobool, required=False, action='store',
help='default=%(default)s: If no, runs seqkit only on the first 1000 sequences in the fastq file to get an estimate for information like average read length. '
pin.add_argument('--accurate_size', metavar='yes|no', default='yes', type=strtobool, required=False, action='store',
help='default=%(default)s: ONLY APPLIES TO .gz COMPRESSED FASTQ FILES. If no, runs seqkit only on the first 1000 sequences in the fastq file to get an estimate for information like average read length. '
'If yes, runs seqkit on the whole fastq file. More accurate, but comes with much higher runtime.')
pin.set_defaults(handler=command_integrate)

Expand Down
13 changes: 11 additions & 2 deletions amalgkit/integrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import platform
import re
import subprocess
import time
import warnings

from amalgkit.sanity import check_getfastq_outputs
from amalgkit.util import *
Expand Down Expand Up @@ -52,7 +52,16 @@ def get_fastq_stats(args):
print("Found {} file(s) for ID {}. Lib-layout: {}".format(num_fastq_files[id], id,lib_layout), flush=True)
print("Getting sequence statistics.", flush=True)
tmp_file = os.path.join(args.out_dir, id+'_seqkit_stats.tmp')
if args.accurate_size:
# check for file extension. seqkit is significantly slower on compressed files, but still fast on decompressed files.
if fastq_files[0].endswith(('.fq', '.fastq')):
is_decompressed = True
elif fastq_files[0].endswith(('.fq.gz', '.fastq.gz')):
is_decompressed = False
else:
warnings.warn(fastq_files[0]+"is not a fastq file. Skipping.")
continue

if args.accurate_size or is_decompressed:
print('--accurate_size set to yes. Running accurate sequence scan.')
seqkit_command = ['seqkit', 'stats', '-T', '-j', str(args.threads), fastq_files[0]]
seqkit_stdout = open(tmp_file, 'w')
Expand Down

0 comments on commit 9d3ebcf

Please sign in to comment.