### Filtering SAM to unmapped BAM

This job filters the SAM files output by NextGenMap. It runs `samtools view` via `pysam` to convert each file to BAM, keeping only the unmapped reads.

In [None]:
import traceback

import pysam
import os
from hops import hdfs
import utils
import sys
from pyspark import SparkContext
import subprocess

# Shared Spark context for this job; getOrCreate() returns the already-running
# context when there is one instead of instantiating a second context.
sc = SparkContext.getOrCreate()

#### Load arguments

In [None]:

# Parse the job arguments out of sys.argv.
args_full = utils.load_arguments(sys.argv)

OUTPUT_DATASET = args_full[utils.OUTPUT_DATASET]
INPUT_ROOT_PATH = args_full[utils.INPUT_ROOT_PATH]
RUN_FOLDER = args_full[utils.RUN_FOLDER]
WORK_PATH = os.path.join(OUTPUT_DATASET, RUN_FOLDER)

# Settings specific to the SAM filtering step.
args = args_full[utils.KEY_SAM]

# A truthy input/output override takes precedence; otherwise the location is
# derived from WORK_PATH and this step's own settings.
inputRoot = (
    args_full.get(utils.INPUT_OVERRIDE)
    or os.path.join(WORK_PATH, args['INPUT_ROOT'])
)
outputBam = (
    args_full.get(utils.OUTPUT_OVERRIDE)
    or os.path.join(WORK_PATH, args['OUTPUT_BAM'])
)

# Kept as a string because it is passed straight through as a samtools argument.
threads = str(args['THREADS'])


##### Map function

In [None]:

def convert_sam(file_path):
    """
    Map function: convert one SAM file to a BAM holding only unmapped reads.

    The input is copied from HDFS to the local working directory, run through
    pysam's equivalent of 'samtools view -b -f 4', and the resulting BAM is
    copied back to HDFS under ``outputBam``. Local copies of both the input
    and the BAM are always removed afterwards, whether or not the conversion
    or the upload succeeded.

    :param file_path: HDFS path of the SAM file to convert.
    :return: ``[-1]`` when ``utils.skip_file`` says the file must be skipped,
        otherwise the BAM file name (also returned when samtools failed and
        the error was logged).
    """
    sam_name = os.path.basename(file_path)
    # Output name is everything before the first '.' plus the unmapped suffix.
    bam_file = sam_name.split('.')[0] + utils.UNMAPPED_BAM
    if utils.skip_file(sam_name, bam_file, outputBam):
        return [-1]

    hdfs.copy_to_local(file_path, overwrite=True)
    print("INFO: Run unmapped sequences BAM : ", sam_name)
    try:
        # Options come before the input file and '-f'/'4' are separate tokens,
        # so parsing does not rely on getopt argument permutation.
        pysam.view(
            '-b', '-f', '4', '-@', threads, '-o', bam_file, sam_name,
            catch_stdout=False,
        )
        if os.path.exists(bam_file):
            hdfs.copy_to_hdfs(bam_file, outputBam, overwrite=True)
    except pysam.SamtoolsError:
        traceback.print_exc()
        # NOTE(review): this passes the base name, not file_path - confirm
        # that utils.hdfs_delete_file resolves it to the intended HDFS file.
        utils.hdfs_delete_file(sam_name)  # delete corrupted input file
    finally:
        # Clean up local scratch files on every path; a failed upload or a
        # partial BAM from a failed conversion must not pile up on the
        # executor's disk, and a missing file must not mask the real error.
        for local_path in (sam_name, bam_file):
            if os.path.exists(local_path):
                os.remove(local_path)

    return bam_file




    


In [None]:
# Load the input file paths found under inputRoot; each entry is later handed
# to convert_sam as its file_path argument.

inputFiles=utils.load_file_names(inputRoot)

#### Run in parallel

In [None]:
### convert to bam and filter unmapped sequences
# The number of slices is taken from the configured executor instance count.
num_slices = sc.getConf().get("spark.executor.instances")
sam_paths = sc.parallelize(inputFiles, num_slices)
unMapped = sam_paths.map(convert_sam).collect()