### Run Bracken on Kraken report files

In [1]:
import traceback

import os
from hops import hdfs
import utils
import sys
from pyspark import SparkContext
import subprocess

# Obtain the SparkContext used later to distribute the per-report work
# (getOrCreate reuses an already-running context instead of starting a second one).
sc = SparkContext.getOrCreate()

Starting Spark application


ID,Application ID,Kind,State,Spark UI,Driver log
97,application_1747302721605_0025,pyspark,idle,Link,Link


SparkSession available as 'spark'.


#### Load arguments

In [None]:
# Build the run-wide argument mapping from the command line; the cells below
# index it by the key constants defined in `utils` and by tool name.
args_full=utils.load_arguments(sys.argv)

In [None]:

# Run-wide locations taken from the shared arguments.
OUTPUT_DATASET = args_full[utils.OUTPUT_DATASET]
INPUT_ROOT_PATH = args_full[utils.INPUT_ROOT_PATH]
RUN_FOLDER = args_full[utils.RUN_FOLDER]
WORK_PATH = os.path.join(OUTPUT_DATASET, RUN_FOLDER)

# Settings that apply to the Bracken step only.
args = args_full["Bracken"]

# An explicit (non-empty) override wins; otherwise the roots are resolved
# relative to this run's working folder.
inputRoot = (
    args_full.get(utils.INPUT_OVERRIDE)
    or os.path.join(WORK_PATH, args['INPUT_ROOT'])
)
output_dir = (
    args_full.get(utils.OUTPUT_OVERRIDE)
    or os.path.join(WORK_PATH, args['OUTPUT_ROOT'])
)

# Values forwarded to the bracken command line.
threads = str(args['THREADS'])
db_path = args['KRAKEN_DB_PATH']
read_length = args['READ_LENGTH']

# Directory that contains the `bracken` executable.
TOOL = '/srv/hops/anaconda/envs/theenv/Bracken'


##### Map function

In [25]:

import subprocess
import os

def run_bracken_levels(db_path, rep_file, read_length=150, threads=10, tool_dir=None):
    """
    Run Bracken on one Kraken report for the species (S) and genus (G) levels.

    Bracken is invoked once per level. The report is passed by its base name
    and the outputs are written under relative names, so the report must be
    present in the current working directory.

    A level whose command exits with a non-zero status is logged and skipped;
    the remaining levels are still attempted. (Previously the function
    returned right after the first successful level, so the genus level was
    never run when the species level succeeded.)

    Args:
        db_path: Path of the Kraken database directory handed to ``-d``.
        rep_file: Path of the Kraken report; only its base name is used.
        read_length: Read length handed to ``-r``.
        threads: Value handed to ``-t``.
        tool_dir: Directory containing the ``bracken`` executable. Defaults
            to the module-level ``TOOL`` constant.

    Returns:
        A list of ``(output_file, report_file)`` tuples, one per level that
        ran successfully, in the order S, G. Empty if every level failed.

    Raises:
        OSError: If the ``bracken`` executable cannot be started at all.
    """
    if tool_dir is None:
        tool_dir = TOOL
    bracken_path = tool_dir + '/bracken'

    # Level-independent values are computed once, outside the loop.
    input_file_name = os.path.basename(rep_file)
    base = os.path.splitext(input_file_name)[0]

    results = []
    for level in ('S', 'G'):
        output_file = f"{base}_{level}.bracken"
        # NOTE(review): the "._report.txt" suffix looks like a typo for
        # "_report.txt"; kept unchanged because downstream steps may rely on it.
        report_file = f"{base}_{level}._report.txt"
        cmd = [
            bracken_path,
            '-d', db_path,
            '-i', input_file_name,
            '-o', output_file,
            '-r', str(read_length),
            '-l', level,
            # NOTE(review): the `bracken` wrapper appears to document -t as
            # the read THRESHOLD rather than a thread count - confirm that
            # passing `threads` here is intended.
            '-t', str(threads),
            '-w', report_file,
        ]
        print(f'INFO: Starting bracken command {cmd}')
        try:
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError as err:
            # One failing level must not prevent the other from running.
            print(f"Bracken failed for level {level} (exit code {err.returncode})")
            continue
        results.append((output_file, report_file))
    return results


def run_braken_driver(rep_file):
    """
    Process one Kraken report: fetch inputs, run Bracken, upload the results.

    Steps:
      1. Copy the Kraken database from HDFS unless a local entry with the
         same base name already exists (assumes ``hdfs.copy_to_local`` places
         it in the current working directory - TODO confirm).
      2. Copy the report file from HDFS, overwriting any local copy.
      3. Run Bracken for every level and upload each output/report file that
         exists locally to ``output_dir`` on HDFS.

    Errors raised while running Bracken or uploading are logged and
    swallowed so that one bad report does not abort the whole Spark job.

    Args:
        rep_file: HDFS path of the Kraken report to process.

    Returns:
        A list with the local names of the files that were uploaded (empty
        when nothing was produced or an error occurred).
    """
    # The database is referenced locally by the last component of its HDFS path.
    kk_db = os.path.split(db_path)[1]
    if not os.path.exists(kk_db):
        print('Starting copy of kraken db !')
        hdfs.copy_to_local(db_path)
    hdfs.copy_to_local(rep_file, overwrite=True)

    uploaded = []
    try:
        level_results = run_bracken_levels(
            kk_db, rep_file, read_length=read_length, threads=threads
        )
        for output_file, report_file in level_results:
            print(f"Bracken output file: {output_file}")
            for produced in (output_file, report_file):
                # Upload only files Bracken actually wrote.
                if os.path.exists(produced):
                    hdfs.copy_to_hdfs(produced, output_dir, overwrite=True)
                    uploaded.append(produced)
    except Exception as e:
        # Task boundary: keep the job alive and report the failure instead.
        print(f"Error occurred while running Bracken: {e}")
    return uploaded


In [14]:
# Collect the paths of the input files found under inputRoot
# (presumably one Kraken report per entry - each path is later handed to
# the map function on its own).
inputFiles=utils.load_file_names(inputRoot)

In [15]:
# Sanity check: show how many input files were found before launching the job.
print(f'Number of input files in list {len(inputFiles)}')

Number of input files in list 128

#### Run in parallel

In [None]:
# The slice count comes from the configured executor count; the value is
# handed to parallelize exactly as read from the Spark configuration.
_num_slices = sc.getConf().get("spark.executor.instances")
_reports_rdd = sc.parallelize(inputFiles, _num_slices)
# Run the Bracken driver on every report and gather the per-report results.
finalFiles = _reports_rdd.map(run_braken_driver).collect()