From b43902c9bb5295d880b2c9fe8f8b29d973b82131 Mon Sep 17 00:00:00 2001 From: Brent Pedersen Date: Mon, 10 Sep 2018 15:23:49 -0600 Subject: [PATCH] change hash function to something faster. The new hash avoids use of the CRAM sequence and base-quals, which otherwise do not need to be decoded. This changes the run-time of one test-case from 20.8 seconds to 14.9 seconds. --- svtyper/classic.py | 3 ++- svtyper/parsers.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/svtyper/classic.py b/svtyper/classic.py index 8c79da4..b93eca4 100755 --- a/svtyper/classic.py +++ b/svtyper/classic.py @@ -123,7 +123,8 @@ def sv_genotype(bam_string, if b.endswith('.bam'): bam_list.append(pysam.AlignmentFile(b, mode='rb')) elif b.endswith('.cram'): - bam_list.append(pysam.AlignmentFile(b, mode='rc', reference_filename=ref_fasta)) + bam_list.append(pysam.AlignmentFile(b, + mode='rc',reference_filename=ref_fasta,format_options=["required_fields=7167"])) else: sys.stderr.write('Error: %s is not a valid alignment file (*.bam or *.cram)\n' % b) exit(1) diff --git a/svtyper/parsers.py b/svtyper/parsers.py index 208b45a..18d4fb1 100644 --- a/svtyper/parsers.py +++ b/svtyper/parsers.py @@ -717,6 +717,9 @@ def close(self): # from a single molecule # ================================================== +def rhash(r): + return hash((r.query_name, r.flag)) + class SamFragment(object): def __init__(self, read, lib): self.lib = lib @@ -738,7 +741,7 @@ def is_primary(self, read): def add_read(self, read): # ensure we don't add the same read twice - read_hash = read.__hash__() + read_hash = rhash(read) if read_hash in self.read_set: return else: