# Profile performance on RecA example
This Python Jupyter notebook profiles the `alignparse` alignment-parsing code on the RecA example data.
The goal is to find slow spots to optimize.

Import necessary Python modules.

In [1]:
import contextlib
import cProfile
import os
import pstats
import tempfile
import warnings

import pandas as pd

import alignparse.minimap2
import alignparse.targets

Suppress warnings that clutter output:

In [2]:
warnings.simplefilter('ignore')

Initialize the `Targets` object:

In [3]:
# Inputs for the RecA example: the amplicon sequence file and the YAML file
# holding the feature parsing specifications.
recA_targetfile = '../notebooks/input_files/recA_amplicon.gb'
recA_parse_specs_file = (
    '../notebooks/input_files/recA_feature_parse_specs.yaml'
)

# `Targets` object that is used below for both alignment and parsing.
targets = alignparse.targets.Targets(
    seqsfile=recA_targetfile,
    feature_parse_specs=recA_parse_specs_file,
)

Create a `Mapper` to run `minimap2`:

In [4]:
# Mapper configured with the `OPTIONS_CODON_DMS` option set from
# `alignparse.minimap2`.
mapper = alignparse.minimap2.Mapper(
    alignparse.minimap2.OPTIONS_CODON_DMS,
)

FASTQ files to align and parse (the same file is listed twice so that the combined query is slightly bigger):

In [5]:
# The same CCS FASTQ file is deliberately listed twice; the entries are
# concatenated later to give a slightly bigger query.
fastq_files = ['../notebooks/input_files/recA_lib-1_ccs.fastq'] * 2

Now create the SAM file with the mapper, and then profile the parsing of that file:

In [6]:
with contextlib.ExitStack() as stack:
    # Concatenate the FASTQ files into one temporary query file so the
    # profiled parse runs on a slightly bigger input.
    queryfile = stack.enter_context(
        tempfile.NamedTemporaryFile(mode='wt', suffix='.fastq'))
    fastq_texts = []
    for fastq_file in fastq_files:
        with open(fastq_file) as f:
            # Strip trailing whitespace so the joined files are separated
            # by exactly one newline.
            fastq_texts.append(f.read().rstrip())
    queryfile.write('\n'.join(fastq_texts))
    # Flush buffered text to disk: `targets.align` is handed the file *name*,
    # not this handle.
    queryfile.flush()

    # Temporary SAM file for the alignments; only its name is passed on.
    samfile = stack.enter_context(
        tempfile.NamedTemporaryFile(mode='wt', suffix='.sam'))
    targets.align(queryfile=queryfile.name,
                  alignmentfile=samfile.name,
                  mapper=mapper)

    # Profile only the alignment parsing. Calling through a `Profile` object
    # avoids evaluating a code string against `globals()` / `locals()` and
    # avoids writing the stats to an extra temporary file.
    profiler = cProfile.Profile()
    profiler.runcall(targets.parse_alignment, samfile.name)

# Build the stats directly from the in-memory profiler; the temporary files
# are no longer needed at this point.
stats = pstats.Stats(profiler)

Now print the most time-intensive functions, ranked first by total (internal) time and then by cumulative time:

In [7]:
# Number of functions to list for each ranking.
topn = 10
rule = '-' * 20
for timetype in ('tottime', 'cumtime'):
    print(f"{rule} Top {topn} by {timetype} {rule}")
    # Each `Stats` method below modifies `stats` in place, so separate
    # statements are equivalent to a single method chain.
    stats.strip_dirs()
    stats.sort_stats(timetype)
    stats.print_stats(topn)

-------------------- Top 10 by tottime --------------------
Wed Aug 28 15:01:34 2019    /var/folders/fc/7sw280c13755gkvcsqlhlrfc0000gp/T/tmpki1guyjo

         196351 function calls (195089 primitive calls) in 0.277 seconds

   Ordered by: internal time
   List reduced from 808 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    38276    0.086    0.000    0.086    0.000 {method 'fullmatch' of '_regex.Pattern' objects}
     1252    0.035    0.000    0.109    0.000 cs_tag.py:239(extract_cs)
    33640    0.017    0.000    0.087    0.000 cs_tag.py:76(cs_op_type)
     2727    0.010    0.000    0.010    0.000 {method 'reduce' of 'numpy.ufunc' objects}
    13500    0.008    0.000    0.043    0.000 cs_tag.py:119(cs_op_len_target)
      200    0.008    0.000    0.191    0.001 targets.py:859(_parse_single_Alignment)
     2492    0.005    0.000    0.005    0.000 {method 'searchsorted' of 'numpy.ndarray' objects}
5835/5425    0.005    0.000    