# BLAST

This notebook runs BLAST in three steps:

1. Create BLAST database(s) from the files in the directory called `database` (`*.fasta`)
2. Run `blastn` against the BLAST database(s) using the files in the directory called `query` (`*.fasta`)
3. Collate the results into a single file for each of the query files

## Getting Started

You will need to rerun this section each time you come back to this notebook to reset all directories and variables.

In [None]:
# Set your NetID: replace MY_NETID with your own NetID.
# It is used below to build the working-directory path and to filter the squeue listing.
netid = "MY_NETID"

In [None]:
# Go into the working directory
work_dir = "/xdisk/bhurwitz/bh_class/" + netid + "/assignments/21_blast"
%cd $work_dir

## Creating a config file
Let's create a config file with all of the variables we will need in the scripts below. Then when we want to use these variables in the script, we will "source" the config file to set the variables.

In [None]:
# Create the config file (config.sh) that the pipeline scripts source
my_code = '''# defining the log and scripts directories
# NOTE: every path below is derived from $PWD, i.e. from the directory
# in which config.sh is sourced.
export WORKING_DIR="$PWD"
export SCRIPTS_DIR="$PWD/run_scripts"
export LOG_DIR="$PWD/logs"

# step 1 create blastdb
#export DB_DIR=/groups/bhurwitz/databases/AVrC
export MAX_DB_SIZE="1GB"
export DB_DIR="$PWD/database"

# step 2 : blast query against blast db
export FASTA_DIR="$PWD/query"
export FA_SPLIT_FILE_SIZE=5000000 # approximate size of each split file, in bytes

# containers for tools
export FASPLIT=/contrib/singularity/shared/bhurwitz/ucsc-fasplit:469--h9b8f530_0.sif
export BLAST=/contrib/singularity/shared/bhurwitz/blast:2.16.0--hc155240_2.sif

# BLAST parameters
export BLAST_TYPE=blastn
export MAX_TARGET_SEQS=5
export EVAL=1e-3
export OUT_FMT=6 # tabular format with no headings

#
# Some custom functions for our scripts
#
# --------------------------------------------------
# init_dir DIR [DIR ...]
# Empty each DIR that already exists (non-hidden entries only);
# create each DIR that does not exist yet.
function init_dir {
    for dir in "$@"; do
        # ignore empty arguments (for example an unset variable)
        [[ -z "$dir" ]] && continue
        if [ -d "$dir" ]; then
            rm -rf -- "$dir"/*
        else
            mkdir -p -- "$dir"
        fi
    done
}

# --------------------------------------------------
# create_dir DIR [DIR ...]
# Create each DIR that does not exist yet; existing directories are left untouched.
function create_dir {
    for dir in "$@"; do
        # ignore empty arguments (for example an unset variable)
        [[ -z "$dir" ]] && continue
        if [[ ! -d "$dir" ]]; then
          echo "$dir does not exist. Directory created"
          mkdir -p -- "$dir"
        fi
    done
}

# --------------------------------------------------
# lc FILE
# Print the number of lines in FILE.
function lc() {
    wc -l "$1" | cut -d ' ' -f 1
}
'''

with open('config.sh', mode='w') as file:
    file.write(my_code)


In [None]:
# copy your database files from your home directory to the database directory
# be sure the files are named with the extension *fasta
!cp -r ~/database ./database

In [None]:
# copy your query files from your home directory to the query directory
# be sure the files are named with the extension *fasta
!cp -r ~/query ./query

In [None]:
# Create the run_scripts directory (if it does not exist yet) and move into it.
# The cells below write each run script into the current directory.
!mkdir -p run_scripts
%cd run_scripts

## Create the run scripts for the blast pipeline

The cells below write the four scripts of the pipeline: `01-makeblastdb.sh`, `02-launch-blast.sh`, `03-blast.sh`, and `04-merge-blast.sh`.

In [None]:
# Create the script that builds the BLAST databases (01-makeblastdb.sh).
# A raw string is used so that shell backslashes (\*, \., \n) reach the file unchanged.
my_code = r'''#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --time=10:00:00
#SBATCH --partition=standard
#SBATCH --account=bh_class
#SBATCH --output=./logs/01-makeblastdb.out
#SBATCH --error=./logs/01-makeblastdb.err
#SBATCH --cpus-per-task=10
#SBATCH --mem-per-cpu=5G

#
# This script creates a blast database for every *.fasta file in the DB_DIR
#

pwd; hostname; date

# get the configurations
source "$WORKING_DIR/config.sh"

cd "$DB_DIR" || exit 1

# db-list holds one database fasta path per line, relative to DB_DIR;
# 03-blast.sh and 04-merge-blast.sh read the same list
export DB_LIST="db-list"

find . -type f -name \*.fasta | sed "s/^\.\///" > "$DB_LIST"

# the redirect above always creates the list, so test that it is not empty
if [[ ! -s "$DB_LIST" ]]; then
    echo "Cannot find any *.fasta database files in \"$DB_DIR\""
    exit 1
fi

i=0
while read -r DB; do
    i=$((i + 1))

    # the title of the database is the file name without any leading directory
    DB_NAME=$(basename "$DB")

    printf "%5d: %s\n" $i "$DB_NAME"

    #
    # create blast database for fasta file
    # the database is written next to its fasta file (-out "$DB") because
    # 03-blast.sh looks it up as "$DB_DIR/$DB"
    #
    apptainer run "${BLAST}" makeblastdb -title "${DB_NAME}" -out "${DB}" -in "${DB}" -dbtype nucl -max_file_sz "${MAX_DB_SIZE}"
done < "$DB_LIST"

echo Finished `date`

'''

with open('01-makeblastdb.sh', mode='w') as file:
    file.write(my_code)

In [None]:
# Create the script that splits the query files and launches blast (02-launch-blast.sh).
# A raw string is used so that shell backslashes (\*, \., \n, \") reach the file unchanged.
my_code = r'''#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --time=01:00:00
#SBATCH --partition=standard
#SBATCH --account=bh_class
#SBATCH --output=./logs/02-launch-blast.out
#SBATCH --error=./logs/02-launch-blast.err
#SBATCH --cpus-per-task=5
#SBATCH --mem-per-cpu=5G

#
# This script splits each query file into chunks and then, for each query file, submits
#   - 03-blast.sh as a job array (one task per chunk) to blast the chunks, and
#   - 04-merge-blast.sh, which waits for the array and collates the results
#

pwd; hostname; date
source "$WORKING_DIR/config.sh"

# set up the results, stderr and stdout directories for this script
PROG="02-launch-blast"
RESULTS_DIR="$WORKING_DIR/results/$PROG"

# initialize directories, this will remove prior runs and create new directories
init_dir "$RESULTS_DIR"

cd "$FASTA_DIR" || exit 1

export FILES_LIST="fasta-files"

find . -type f -name \*.fasta | sed "s/^\.\///" > "$FILES_LIST"

NUM_FILES=$(lc "$FILES_LIST")

echo "Found \"$NUM_FILES\" files in \"$FASTA_DIR\""

if [ "$NUM_FILES" -gt 0 ]; then
    i=0
    while read -r FILE; do
        i=$((i + 1))

        # FILE_NAME, SPLIT_DIR and SPLIT_FILES_LIST are exported because the jobs
        # submitted below (03-blast.sh, 04-merge-blast.sh) read them from the environment
        export FILE_NAME=$(basename "$FILE")

        printf "%5d: %s\n" $i "$FILE_NAME"

        OUT_DIR="$RESULTS_DIR/$FILE_NAME"
        export SPLIT_DIR="$OUT_DIR/fa_split"
        init_dir "$OUT_DIR" "$SPLIT_DIR"

        # first we need to split up the fasta files to run quickly
        apptainer run "${FASPLIT}" faSplit about "$FILE" "$FA_SPLIT_FILE_SIZE" "$SPLIT_DIR/"

        # list the split files (paths relative to SPLIT_DIR); the subshell keeps
        # the current directory of this script unchanged
        export SPLIT_FILES_LIST="$SPLIT_DIR/split-fasta-files"
        (cd "$SPLIT_DIR" && find . -type f -name \*.fa | sed "s/^\.\///" > "$SPLIT_FILES_LIST")

        NUM_SPLIT_FILES=$(lc "$SPLIT_FILES_LIST")
        echo "Found \"$NUM_SPLIT_FILES\" files in \"$SPLIT_DIR\""

        # nothing to blast: an array of 0--1 would be rejected by sbatch
        if [ -z "$NUM_SPLIT_FILES" ] || [ "$NUM_SPLIT_FILES" -eq 0 ]; then
            echo "No split files for \"$FILE_NAME\", skipping."
            continue
        fi

        # array indices are 0-based, one task per split file
        ARRAY_NUM=$((NUM_SPLIT_FILES - 1))

        # submit from the working directory so the relative ./logs paths resolve
        cd "$WORKING_DIR" || exit 1

        # run all blast jobs for each split file
        job1=$(sbatch --array=0-${ARRAY_NUM} "${SCRIPTS_DIR}/03-blast.sh")
        jid1=$(echo $job1 | sed 's/^Submitted batch job //')
        echo $jid1 $FILE_NAME $SPLIT_DIR

        if [[ "$jid1" =~ ^[0-9]+$ ]]; then
            # collate the results for each split file once the whole array succeeded
            job2=$(sbatch --dependency=afterok:$jid1 "${SCRIPTS_DIR}/04-merge-blast.sh")
            jid2=$(echo $job2 | sed 's/^Submitted batch job //')
            echo $jid2 $FILE_NAME $SPLIT_DIR
        else
            echo "Could not submit 03-blast.sh for \"$FILE_NAME\", merge job not submitted."
        fi

        cd "$FASTA_DIR" || exit 1
    done < "$FILES_LIST"
else
    echo No input fasta files.
fi

echo "Finished `date`"
'''

with open('02-launch-blast.sh', mode='w') as file:
    file.write(my_code)

In [None]:
# Create the script that runs blast on each of the split files (03-blast.sh).
# A raw string is used so that shell backslashes (\n, \") reach the file unchanged.
my_code = r'''#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --time=48:00:00
#SBATCH --partition=standard
#SBATCH --account=bh_class
#SBATCH --output=./logs/03-blast_%A_%a.out
#SBATCH --error=./logs/03-blast_%A_%a.err
#SBATCH --cpus-per-task=24
#SBATCH --mem-per-cpu=5G

#
# This script runs blast jobs for each of the split fasta files.
# It is submitted as a job array by 02-launch-blast.sh and reads FILE_NAME, SPLIT_DIR
# and SPLIT_FILES_LIST from the environment exported there.
# %A (array job id) and %a (array index) in the log names above give every array
# task its own log file instead of all tasks writing to the same one.
#

pwd; hostname; date
source "$WORKING_DIR/config.sh"

# get the split file, that we are currently blasting (one line of the list per array index)
mapfile -t names < "$SPLIT_FILES_LIST"
SPLIT_FILE=${names[${SLURM_ARRAY_TASK_ID:-0}]}

if [[ -z "$SPLIT_FILE" ]]; then
    echo "No split file for array task \"$SLURM_ARRAY_TASK_ID\" in \"$SPLIT_FILES_LIST\""
    exit 1
fi

# use as many blast threads as cpus were allocated to this task
NUM_THREADS="${SLURM_CPUS_PER_TASK:-24}"

# set up the results, stderr and stdout directories for this script
PROG="03-blast"
RESULTS_DIR="$WORKING_DIR/results/$PROG"

# create directory if it doesn't exist.
create_dir "$RESULTS_DIR"

# we want to blast each array value for each split file against each database
i=0
while read -r DB; do
    i=$((i + 1))

    printf "%5d: %s\n" $i "$DB"
    RESULTS_BY_DB="$RESULTS_DIR/$DB/$FILE_NAME"
    create_dir "$RESULTS_BY_DB"
    BLAST_OUT="$RESULTS_BY_DB/$SPLIT_FILE"
    BLAST_DB="$DB_DIR/$DB"

    # run blast against each split file and database
    apptainer run "${BLAST}" "$BLAST_TYPE" -num_threads "$NUM_THREADS" -db "$BLAST_DB" -query "$SPLIT_DIR/$SPLIT_FILE" -out "$BLAST_OUT" -evalue "$EVAL" -outfmt "$OUT_FMT" -max_target_seqs "$MAX_TARGET_SEQS"
done < "$DB_DIR/db-list"

echo "Finished `date`"
'''

with open('03-blast.sh', mode='w') as file:
    file.write(my_code)

In [None]:
# Create the script that merges the blast results (04-merge-blast.sh).
# A raw string is used so that the backslashes (\n, \t) reach printf and awk unchanged.
my_code = r'''#!/bin/bash
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --time=01:00:00
#SBATCH --partition=standard
#SBATCH --account=bh_class
#SBATCH --output=./logs/04-merge-blast_%j.out
#SBATCH --error=./logs/04-merge-blast_%j.err
#SBATCH --cpus-per-task=5
#SBATCH --mem-per-cpu=5G

#
# This script collates the blast results for each file against each database.
# It is submitted once per query file by 02-launch-blast.sh and reads FILE_NAME
# from the environment exported there; %j (job id) in the log names above keeps
# the logs of the different merge jobs apart.
#

pwd; hostname; date
source "$WORKING_DIR/config.sh"

# set up the results, stderr and stdout directories for this script
PROG="04-merge-blast"
RESULTS_DIR="$WORKING_DIR/results/$PROG"

# create new result dir if it doesn't exist
create_dir "$RESULTS_DIR"

i=0
while read -r DB; do
    i=$((i + 1))
    printf "%5d: %s\n" $i "$DB"
    RESULTS_BY_DB="$RESULTS_DIR/$DB"
    create_dir "$RESULTS_BY_DB"
    BLAST_RESULTS="$RESULTS_BY_DB/${FILE_NAME}.txt"
    BLAST_GFF="$RESULTS_BY_DB/${FILE_NAME}.gff"

    # directory that 03-blast.sh filled with one result file per split file
    BLAST_OUT="$WORKING_DIR/results/03-blast/$DB/$FILE_NAME"

    cat "$BLAST_OUT"/* > "$BLAST_RESULTS"

    # convert to GFF format
    # columns used from the tabular output: $1 and $2 name the feature, $7 and $8 are start and end
    awk '{print $1"\tblast\tgene\t"$7"\t"$8"\t.\t.\t.\tID=Gene"$7";Name="$2}' "$BLAST_RESULTS" > "$BLAST_GFF"

done < "$DB_DIR/db-list"

echo "Finished `date`"
'''

with open('04-merge-blast.sh', mode='w') as file:
    file.write(my_code)

In [None]:
# Go back up one directory, from run_scripts to the main working directory
%cd ..

In [None]:
# Let's create the launcher script (launch_pipeline.sh) to kick off our pipeline.

my_code = '''#!/bin/bash

# get the configurations (run this script from the directory that holds config.sh)
source ./config.sh

# create the results and log directories if they do not exist from a prior run
# (config.sh does not define RESULTS_DIR, so the results path is spelled out here)
create_dir "$WORKING_DIR/results" "$LOG_DIR"

# run the jobs in order with dependencies

# 01-makeblastdb.sh - create the blast databases - no dependencies
job1=$(sbatch "$SCRIPTS_DIR/01-makeblastdb.sh")
jid1=$(echo $job1 | sed 's/^Submitted batch job //')
echo $jid1

# stop here if no job id came back, the dependency below would be invalid
if [[ ! "$jid1" =~ ^[0-9]+$ ]]; then
    echo "Could not submit 01-makeblastdb.sh, pipeline not launched."
    exit 1
fi

# 02-launch-blast.sh - jid2 depends on jid1
# This script:
# 1. splits the query files (into small chunks)
# 2. runs job 03-blast.sh to blast each chunk vs the databases
# 3. runs job 04-merge-blast.sh to collate results by input file
job2=$(sbatch --dependency=afterok:$jid1 "$SCRIPTS_DIR/02-launch-blast.sh")
jid2=$(echo $job2 | sed 's/^Submitted batch job //')
echo $jid2

'''

with open('launch_pipeline.sh', mode='w') as file:
    file.write(my_code)

In [None]:
# Make the shell scripts in the current directory executable
# (the *.sh glob only matches files in this directory, not those inside run_scripts)
!chmod +x *.sh

In [None]:
# Now let's run it: execute launch_pipeline.sh from the current directory
!./launch_pipeline.sh

In [None]:
# You can check if it is running using the squeue command.
# --user restricts the listing to the jobs of the NetID stored in the netid variable.
!squeue --user=$netid

## Final Step
Copy your notebook to the current working directory

In [None]:
# Copy this notebook from ~/be487-fall-2024/assignments/21_blast into the working directory (work_dir)
!cp ~/be487-fall-2024/assignments/21_blast/21_blast.ipynb $work_dir