Skip to content

Commit

Permalink
add ultima options
Browse files Browse the repository at this point in the history
  • Loading branch information
lhqing committed Aug 30, 2022
1 parent 107ff1f commit 339c659
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 11 deletions.
22 changes: 22 additions & 0 deletions cemba_data/hisat3n/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import click

from .hisat3n_m3c import remove_overlap_read_parts


@click.command('remove_overlap_read_parts')
@click.argument('in_bam_path')
@click.argument('out_bam_path')
def _remove_overlap_read_parts(in_bam_path, out_bam_path):
remove_overlap_read_parts(in_bam_path, out_bam_path)
return


@click.group()
def _main():
return


def main():
_main.add_command(_remove_overlap_read_parts)
_main()
return
2 changes: 1 addition & 1 deletion cemba_data/hisat3n/config/gcp.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ mamba install -y gxx

# Create mapping env hisat3n_env.yml
wget https://raw.githubusercontent.com/lhqing/cemba_data/master/hisat3n_env.yml
mamba env update -y hisat3n_env.yml # this should install things in the base env
mamba env update -f hisat3n_env.yml # this should install things in the base env

# Install packages
mkdir -p ~/pkg
Expand Down
18 changes: 11 additions & 7 deletions cemba_data/hisat3n/hisat3n_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
from ..utilities import get_configuration


def bam_read_to_fastq_read(read):
if read.is_read1:
read_type = '1'
else:
read_type = '2'
def bam_read_to_fastq_read(read, read_type=None):
if read_type is None:
if read.is_read1:
read_type = '1'
else:
read_type = '2'

fastq_record = f"@{read.qname}_{read_type}\n" \
f"{read.query_sequence}\n" \
Expand All @@ -24,7 +25,8 @@ def separate_unique_and_multi_align_reads(in_bam_path,
out_unmappable_path=None,
unmappable_format='auto',
mapq_cutoff=10,
qlen_cutoff=30):
qlen_cutoff=30,
read_type=None):
"""
Separate unique aligned, multi-aligned, and unaligned reads from hisat-3n bam file.
Expand All @@ -45,6 +47,8 @@ def separate_unique_and_multi_align_reads(in_bam_path,
note that for hisat-3n, unique aligned reads always have MAPQ=60
qlen_cutoff
read length cutoff for any reads
read_type
read type, only None, "1" and "2" supported. If the BAM file is paired-end, use None.
Returns
-------
None
Expand Down Expand Up @@ -88,7 +92,7 @@ def separate_unique_and_multi_align_reads(in_bam_path,
if unmappable_format == 'bam':
unmappable_file.write(read)
else:
unmappable_file.write(bam_read_to_fastq_read(read))
unmappable_file.write(bam_read_to_fastq_read(read, read_type=read_type))

if unmappable_file is not None:
unmappable_file.close()
Expand Down
8 changes: 6 additions & 2 deletions cemba_data/hisat3n/hisat3n_m3c.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,8 @@ def call_chromatin_contacts(bam_path: str,
contact_prefix: str,
save_raw: bool = False,
save_hic_format: bool = True,
span=2500):
span=2500,
qname_format='illumina'):
"""
Process 3C bam file and generate contact file.
Expand Down Expand Up @@ -742,7 +743,10 @@ def call_chromatin_contacts(bam_path: str,
cur_read_pair_name = None
cur_read_parts = []
for read in bam:
read_pair_name = read.qname.split('_')[0]
if qname_format == 'illumina':
read_pair_name = read.qname.split('_')[0]
else:
read_pair_name = read.qname
if not read.has_tag('SS'):
read.set_tag('SS', 0)
read.set_tag('SE', read.qlen)
Expand Down
42 changes: 42 additions & 0 deletions cemba_data/hisat3n/stats_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,48 @@ def cell_parser_hisat_summary(stat_path):
return pd.Series(report_dict, name=cell_id)


def cell_parser_hisat_se_summary(stat_path):
"""
parse hisat3n summary file
"""
cell_id = pathlib.Path(stat_path).name.split('.')[0]
term_dict = {
'Total reads': 'ReadsMappedInSE',
'Aligned 0 time': 'UnmappableReads',
'Aligned 1 time': 'UniqueMappedReads',
'Aligned >1 times': 'MultiMappedReads',
}

with open(stat_path) as rep:
report_dict = {}
for line in rep:
try:
start, rest = line.split(':')
start = start.strip()
except ValueError:
continue # more or less than 2 after split
try:
report_dict[term_dict[start]] = rest.strip().split(
' ')[0].strip('%')
except KeyError:
pass
for v in term_dict.values():
if v not in report_dict:
report_dict[v] = 0

report_dict = pd.Series(report_dict).astype(int)
total_reads = report_dict['ReadsMappedInSE']
unique_mapped_reads = report_dict['UniqueMappedReads']
report_dict['UniqueMappingRate'] = round(unique_mapped_reads /
(total_reads + 0.00001) * 100)
multi_mapped_reads = report_dict['MultiMappedReads']
report_dict[f'MultiMappingRate'] = round(multi_mapped_reads /
(total_reads + 0.00001) * 100)
report_dict[f'OverallMappingRate'] = round(
(unique_mapped_reads + multi_mapped_reads) / (total_reads + 0.00001) * 100)
return pd.Series(report_dict, name=cell_id)


def cell_parser_picard_dedup_stat(stat_path):
stat_path = pathlib.Path(stat_path)
cell_id, *other_parts = stat_path.name.split('.')
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
'pysam'],
entry_points={
'console_scripts': ['yap=cemba_data.__main__:main',
'yap-internal=cemba_data._yap_internal_cli_:internal_main'],
'yap-internal=cemba_data._yap_internal_cli_:internal_main',
'yap-hisat3n=cemba_data.hisat3n.cli:main'],
}
)

0 comments on commit 339c659

Please sign in to comment.