add ultima options

lhqing · Aug 30, 2022 · 339c659 · 339c659
1 parent 107ff1f
commit 339c659
Show file tree

Hide file tree

Showing 6 changed files with 84 additions and 11 deletions.
diff --git a/cemba_data/hisat3n/cli.py b/cemba_data/hisat3n/cli.py
@@ -0,0 +1,22 @@
+import click
+
+from .hisat3n_m3c import remove_overlap_read_parts
+
+
+@click.command('remove_overlap_read_parts')
+@click.argument('in_bam_path')
+@click.argument('out_bam_path')
+def _remove_overlap_read_parts(in_bam_path, out_bam_path):
+    remove_overlap_read_parts(in_bam_path, out_bam_path)
+    return
+
+
+@click.group()
+def _main():
+    return
+
+
+def main():
+    _main.add_command(_remove_overlap_read_parts)
+    _main()
+    return
diff --git a/cemba_data/hisat3n/config/gcp.md b/cemba_data/hisat3n/config/gcp.md
@@ -17,7 +17,7 @@ mamba install -y gxx
 
 # Create mapping env hisat3n_env.yml
 wget https://raw.githubusercontent.com/lhqing/cemba_data/master/hisat3n_env.yml
-mamba env update -y hisat3n_env.yml  # this should install things in the base env
+mamba env update -f hisat3n_env.yml  # this should install things in the base env
 
 # Install packages
 mkdir -p ~/pkg

diff --git a/cemba_data/hisat3n/hisat3n_general.py b/cemba_data/hisat3n/hisat3n_general.py
@@ -5,11 +5,12 @@
 from ..utilities import get_configuration
 
 
-def bam_read_to_fastq_read(read):
-    if read.is_read1:
-        read_type = '1'
-    else:
-        read_type = '2'
+def bam_read_to_fastq_read(read, read_type=None):
+    if read_type is None:
+        if read.is_read1:
+            read_type = '1'
+        else:
+            read_type = '2'
 
     fastq_record = f"@{read.qname}_{read_type}\n" \
                    f"{read.query_sequence}\n" \
@@ -24,7 +25,8 @@ def separate_unique_and_multi_align_reads(in_bam_path,
                                           out_unmappable_path=None,
                                           unmappable_format='auto',
                                           mapq_cutoff=10,
-                                          qlen_cutoff=30):
+                                          qlen_cutoff=30,
+                                          read_type=None):
     """
     Separate unique aligned, multi-aligned, and unaligned reads from hisat-3n bam file.
 
@@ -45,6 +47,8 @@ def separate_unique_and_multi_align_reads(in_bam_path,
         note that for hisat-3n, unique aligned reads always have MAPQ=60
     qlen_cutoff
         read length cutoff for any reads
+    read_type
+        read type, only None, "1" and "2" supported. If the BAM file is paired-end, use None.
     Returns
     -------
     None
@@ -88,7 +92,7 @@ def separate_unique_and_multi_align_reads(in_bam_path,
                         if unmappable_format == 'bam':
                             unmappable_file.write(read)
                         else:
-                            unmappable_file.write(bam_read_to_fastq_read(read))
+                            unmappable_file.write(bam_read_to_fastq_read(read, read_type=read_type))
 
             if unmappable_file is not None:
                 unmappable_file.close()

diff --git a/cemba_data/hisat3n/hisat3n_m3c.py b/cemba_data/hisat3n/hisat3n_m3c.py
@@ -714,7 +714,8 @@ def call_chromatin_contacts(bam_path: str,
                             contact_prefix: str,
                             save_raw: bool = False,
                             save_hic_format: bool = True,
-                            span=2500):
+                            span=2500,
+                            qname_format='illumina'):
     """
     Process 3C bam file and generate contact file.
 
@@ -742,7 +743,10 @@ def call_chromatin_contacts(bam_path: str,
         cur_read_pair_name = None
         cur_read_parts = []
         for read in bam:
-            read_pair_name = read.qname.split('_')[0]
+            if qname_format == 'illumina':
+                read_pair_name = read.qname.split('_')[0]
+            else:
+                read_pair_name = read.qname
             if not read.has_tag('SS'):
                 read.set_tag('SS', 0)
                 read.set_tag('SE', read.qlen)

diff --git a/cemba_data/hisat3n/stats_parser.py b/cemba_data/hisat3n/stats_parser.py
@@ -64,6 +64,48 @@ def cell_parser_hisat_summary(stat_path):
     return pd.Series(report_dict, name=cell_id)
 
 
+def cell_parser_hisat_se_summary(stat_path):
+    """
+    parse hisat3n summary file
+    """
+    cell_id = pathlib.Path(stat_path).name.split('.')[0]
+    term_dict = {
+        'Total reads': 'ReadsMappedInSE',
+        'Aligned 0 time': 'UnmappableReads',
+        'Aligned 1 time': 'UniqueMappedReads',
+        'Aligned >1 times': 'MultiMappedReads',
+    }
+
+    with open(stat_path) as rep:
+        report_dict = {}
+        for line in rep:
+            try:
+                start, rest = line.split(':')
+                start = start.strip()
+            except ValueError:
+                continue  # more or less than 2 after split
+            try:
+                report_dict[term_dict[start]] = rest.strip().split(
+                    ' ')[0].strip('%')
+            except KeyError:
+                pass
+        for v in term_dict.values():
+            if v not in report_dict:
+                report_dict[v] = 0
+
+        report_dict = pd.Series(report_dict).astype(int)
+        total_reads = report_dict['ReadsMappedInSE']
+        unique_mapped_reads = report_dict['UniqueMappedReads']
+        report_dict['UniqueMappingRate'] = round(unique_mapped_reads /
+                                                 (total_reads + 0.00001) * 100)
+        multi_mapped_reads = report_dict['MultiMappedReads']
+        report_dict[f'MultiMappingRate'] = round(multi_mapped_reads /
+                                                 (total_reads + 0.00001) * 100)
+        report_dict[f'OverallMappingRate'] = round(
+            (unique_mapped_reads + multi_mapped_reads) / (total_reads + 0.00001) * 100)
+    return pd.Series(report_dict, name=cell_id)
+
+
 def cell_parser_picard_dedup_stat(stat_path):
     stat_path = pathlib.Path(stat_path)
     cell_id, *other_parts = stat_path.name.split('.')

diff --git a/setup.py b/setup.py
@@ -30,6 +30,7 @@
                       'pysam'],
     entry_points={
         'console_scripts': ['yap=cemba_data.__main__:main',
-                            'yap-internal=cemba_data._yap_internal_cli_:internal_main'],
+                            'yap-internal=cemba_data._yap_internal_cli_:internal_main',
+                            'yap-hisat3n=cemba_data.hisat3n.cli:main'],
     }
 )