PRIESSTESS_scan

#!/bin/bash

# Work out where the package lives: take the second space-separated field
# of the `whereis` report as the path of the PRIESSTESS executable, strip
# the executable name from its end and point at the sibling bin/ directory
priesstess_exe=$(whereis PRIESSTESS | cut -d ' ' -f 2)
libpath="${priesstess_exe%/PRIESSTESS}/bin"

#### ARGUMENTS ####
# -----------------------------------------------------------------------------#
# Values used when the matching flag is absent from the command line
# (the flag that overrides each one is noted on the right)

# Where the model lives and how the results are named
PRIESSTESS_dir="./PRIESSTESS_output"   # -p
test_set_name="test_data"              # -testName

# Constant flanks around each probe
flank5=""                              # -f5
flank3=""                              # -f3
flanks_included="FALSE"                # -flanksIn

# Folding temperature and intermediate-file handling
temp=37                                # -t
clean="TRUE"                           # -noCleanup

# Print the help text to stdout
print_usage() {
    echo "USAGE:"
    echo "  PRIESSTESS_scan -fg foreground_file -bg background_file [OPTIONS]"
    echo ""
    echo "    foreground_file and background_file must contain 1 probe"
    echo "    sequence per line containing only characters A, C, G, U "
    echo "    and N"
    echo "    Files must be either uncompressed or gzipped"
    echo "    To convert a fasta to the correct format use:"
    echo "        awk 'NR%2==0' fasta_file.fa > correct_format.txt"
    echo ""
    echo "OPTIONS:"
    echo "  -h, --help  Print help and exit"
    echo ""
    echo "  -p          Path to PRIESSTESS_output directory containing"
    echo "                the model to apply to the test data"
    echo "                Default: ./PRIESSTESS_output"
    echo ""
    echo "  -testName   A name to use when creating directories and "
    echo "                files with results"
    echo "                Ex. K562_RBFOX2_clip"
    echo "                Default: test_data"
    echo ""
    echo "  -f5         5' constant flanking sequence to be added to"
    echo "                5' end of all probes in fg and bg files"
    echo "                OR if -flanksIn flag is used can also be a"
    echo "                number"
    echo "                Ex: GGAUUGUAACCUAUCUGUA OR 19"
    echo "                Default: None"
    echo ""
    echo "  -f3         3' constant flanking sequence to be added to"
    echo "                3' end of all probes in fg and bg files"
    echo "                OR if -flanksIn flag is used can also be a"
    echo "                number"
    echo "                Ex: GGAUUGUAACCUAUCUGUA OR 19"
    echo "                Default: None"
    echo ""
    echo "  -flanksIn   Indcates that 5' and 3' constant flanking "
    echo "                sequences defined above are included in the"
    echo "                probe sequences (-fg and -bg files)"
    echo "                If the flag is not used, the flanks defined"
    echo "                by -f5 and -f3 will be added by PRIESSTESS"
    echo ""
    echo "  -t          Folding temperature - passed to RNAfold"
    echo "                Default: 37"
    echo ""
    echo "  -noCleanup  Do not remove intermediate files created by"
    echo "                PRIESSTESS"
    echo "                If this flag is not used intermediate files"
    echo "                will be removed after usage"
    echo ""
}

# Validate a probe file passed to -fg or -bg
# Arguments:
#   $1 - the flag being checked (-fg or -bg), used as the message prefix
#   $2 - path to the file; a name ending in .gz is read through zcat
# Exits with status 1 (after printing a message) if the file is missing,
# has a line containing anything other than A, C, G, U, N, or has no lines
validate_seq_file() {
    local flag=$1 file=$2
    local wording n_bad n_lines
    if [ ! -f "$file" ]; then
        echo "${flag}: file ${file} does not exist"
        exit 1
    fi
    # The two flags word the alphabet message slightly differently
    if [[ "$flag" == "-bg" ]]; then wording="can"; else wording="should"; fi
    # -x anchors the pattern to the whole line; an unanchored "[ACGUN]*"
    # matches every line (zero repetitions), so nothing would be rejected
    # grep -c '' counts lines, including a final line with no newline
    if [[ "$file" =~ \.gz$ ]]; then
        n_bad=$(zcat "$file" | grep -c -v -x '[ACGUN]*')
        n_lines=$(zcat "$file" | grep -c '')
    else
        n_bad=$(grep -c -v -x '[ACGUN]*' -- "$file")
        n_lines=$(grep -c '' -- "$file")
    fi
    if [[ $n_bad -gt 0 ]]; then
        echo "${flag}: file ${wording} only contain: A,C,G,U,N"
        exit 1
    fi
    if [[ $n_lines -lt 1 ]]; then
        echo "${flag}: file should contain at least 1 sequence"
        exit 1
    fi
}

# Read the command-line arguments and store them in the global settings
# (fgfile, bgfile, PRIESSTESS_dir, test_set_name, flank5, flank3,
# flanks_included, temp, clean)
# Exits with status 0 after printing help, or status 1 on an invalid argument
parse_args() {
    while test $# -gt 0; do
        case "$1" in
            -h|--help)
                print_usage
                exit 0
                ;;
            -fg)
                shift
                validate_seq_file "-fg" "$1"
                fgfile=$1
                shift
                ;;
            -bg)
                shift
                validate_seq_file "-bg" "$1"
                bgfile=$1
                shift
                ;;
            -p)
                shift
                # Quoted so that a missing value is reported instead of
                # slipping through the one-argument form of [ ]
                if [ ! -d "$1" ]; then
                    echo "-p: PRIESSTESS model directory does not exist"
                    exit 1
                fi
                if [ ! -f "${1}/PRIESSTESS_model.sav" ]; then
                    echo "-p: PRIESSTESS model directory does not contain the "
                    echo "    file PRIESSTESS_model.sav"
                    exit 1
                fi
                if [ ! -f "${1}/LR_training_set.tab" ]; then
                    echo "-p: PRIESSTESS model directory does not contain the "
                    echo "    file LR_training_set.tab"
                    exit 1
                fi
                PRIESSTESS_dir=$1
                shift
                ;;
            -testName)
                shift
                if [[ ! "$1" =~ ^[A-Za-z0-9_]*$ ]]; then
                    echo "-testName: name for testing data should only include"
                    echo "           letters, numbers and underscores"
                    # The name becomes part of directory and file names, so
                    # an invalid one must stop the run like every other check
                    exit 1
                fi
                test_set_name=$1
                shift
                ;;
            -f5)
                shift
                # Accept either a sequence or a length
                if [[ ! "$1" =~ ^[ACGU]*$ && ! "$1" =~ ^[0-9][0-9]*$ ]]; then
                    echo "-f5: 5' flank sequence can only contain: A,C,G,U"
                    echo "     or be a integer"
                    exit 1
                fi
                flank5=$1
                shift
                ;;
            -f3)
                shift
                # Accept either a sequence or a length
                if [[ ! "$1" =~ ^[ACGU]*$ && ! "$1" =~ ^[0-9][0-9]*$ ]]; then
                    echo "-f3: 3' flank sequence can only contain: A,C,G,U"
                    echo "     or be a integer"
                    exit 1
                fi
                flank3=$1
                shift
                ;;
            -flanksIn)
                flanks_included="TRUE"
                shift
                ;;
            -t)
                shift
                if [[ ! "$1" =~ ^[1-9][0-9]*$ ]]; then
                    echo "-t: temperature must be int"
                    exit 1
                fi
                temp=$1
                shift
                ;;
            -noCleanup)
                clean="FALSE"
                shift
                ;;
            *)
                echo "$1 is not a valid argument"
                exit 1
                ;;
        esac
    done
}

# Read in arguments and save
parse_args "$@"

## Further argument checks

# Ensure that fgfile and bgfile are defined
if [[ -z "$fgfile" ]]; then
    echo "-fg: No foreground file was provided"
    exit 1
fi
if [[ -z "$bgfile" ]]; then
    echo "-bg: No background file was provided"
    exit 1
fi

# Ensure that if user indicates flanks are included that they
# defined the flanks
if [[ "$flanks_included" == "TRUE" ]]; then
    if [[ "$flank5" == "" || "$flank3" == "" ]]; then
        echo "-flanksIn: If the -flanksIn flag is used -f5 and -f3 must also"
        echo "           be supplied and not be empty strings"
        echo "           Sequences or lengths (integers) can be supplied"
        echo "           If it is your intention to retain the flanks in motif"
        echo "           identification and logistic regression - use none of"
        echo "           the three flags"
        exit 1
    fi
else
    # Without -flanksIn the flanks are prepended/appended to every probe,
    # so a length is meaningless here: stop rather than add the digits
    # themselves to the sequences
    if [[ "$flank5" =~ ^[0-9][0-9]*$ || "$flank3" =~ ^[0-9][0-9]*$ ]]; then
        echo "-f5, -f3: If the -flanksIn flag is NOT used, -f5 and -f3 must"
        echo "          be sequences not integers"
        exit 1
    fi
fi

# -----------------------------------------------------------------------------#

#### DATA SET UP ####
# -----------------------------------------------------------------------------#
# Remove trailing slash from PRIESSTESS_dir if exists
PRIESSTESS_dir=${PRIESSTESS_dir%/}
# Create directory for testing data
test_dir="${PRIESSTESS_dir}/test_${test_set_name}"
mkdir -- "$test_dir"
if [ ! -d "$test_dir" ]; then
    echo "Issue creating testing directory"
    exit 1
fi

# Retrieve foreground & background files
# Remove sequences containing "N"s
# Save sequences in files called fg_seqs.txt and bg_seqs.txt
# Files are assumed to be gzipped (name ends in .gz) or uncompressed
# All paths are quoted so that directories containing spaces work
if [[ "$fgfile" =~ \.gz$ ]]; then
    zcat "$fgfile" | grep -v "N" > "${test_dir}/fg_seqs.txt"
else
    grep -v "N" "$fgfile" > "${test_dir}/fg_seqs.txt"
fi

if [[ "$bgfile" =~ \.gz$ ]]; then
    zcat "$bgfile" | grep -v "N" > "${test_dir}/bg_seqs.txt"
else
    grep -v "N" "$bgfile" > "${test_dir}/bg_seqs.txt"
fi

# Ensure ability to write to directory and that everything is there
if [ ! -f "${test_dir}/fg_seqs.txt" ]; then
    echo "Files could not be written to output directory"
    exit 1
fi
if [ ! -f "${test_dir}/bg_seqs.txt" ]; then
    echo "Files could not be written to output directory"
    exit 1
fi

# Go to the testing directory; everything below uses paths relative to it
# (the model files are reached through ../), so stop if it cannot be entered
cd "$test_dir" || { echo "Could not change into testing directory"; exit 1; }

# Print the length of a flank + 1, i.e. the first character position to
# keep when the flank is cut off with `cut -c N-`
# Arguments:
#   $1 - the flank: a sequence, or a length when $2 is "yes"
#   $2 - "yes" if an all-digit flank is to be read as a length,
#        anything else to always use the number of characters in $1
flank_offset() {
    local flank=$1 numeric_ok=$2
    if [[ "$numeric_ok" == "yes" && "$flank" =~ ^[0-9][0-9]*$ ]]; then
        # 10# forces base 10 so that a leading zero is not read as octal
        echo $(( 10#$flank + 1 ))
    else
        echo $(( ${#flank} + 1 ))
    fi
}

# If not already present (-flanksIn):
# Add 5' flank to beginning of each sequence
# Add 3' flank to end of each sequence
if [[ "$flanks_included" == "FALSE" ]]; then
    for f in fg_seqs.txt bg_seqs.txt; do
        awk -v prefix="$flank5" -v suffix="$flank3" '{print prefix $0 suffix}' "$f" > tmp.tmp
        mv -f tmp.tmp "$f"
    done
    # The flanks were added verbatim, so their length is the string length
    flank_is_length_ok="no"
else
    # With -flanksIn a flank may be given as a length instead of a sequence
    flank_is_length_ok="yes"
fi

# Get length of each flank + 1 for correct indexing
# (computed once only: recomputing from the string length afterwards would
# turn a flank given as the length 19 into a 2-character flank)
flank5len=$(flank_offset "$flank5" "$flank_is_length_ok")
flank3len=$(flank_offset "$flank3" "$flank_is_length_ok")

# Fold the full probe sequences for foreground and background
# fold_and_annotate.sh is given the sequence file, the package bin
# directory, a prefix (fg or bg), the folding temperature and the cleanup
# setting; the file ${prefix}_alphabet_annotations.tab is read below
# Expansions are quoted so an install path containing spaces still works
"${libpath}/fold_and_annotate.sh" fg_seqs.txt "$libpath" fg "$temp" "$clean"
"${libpath}/fold_and_annotate.sh" bg_seqs.txt "$libpath" bg "$temp" "$clean"

# Remove alphabets that are not used in the PRIESSTESS model
# The last line of the header file is turned into a comma-separated
# field list for cut
cols=$(tail -n 1 ../annotation_alphabets_header.tab | tr '\t' ',')
for p in fg bg; do
    cut -f "$cols" "${p}_alphabet_annotations.tab" > tmp.tmp
    mv -f tmp.tmp "${p}_alphabet_annotations.tab"
done

# Print a tab-separated file with the flanks cut off every column but the
# first
# Arguments:
#   $1 - input file (tab-separated, column 1 is left untouched)
#   $2 - length of the 5' flank + 1 (first character position to keep)
#   $3 - length of the 3' flank + 1
# Columns 2..NF lose their first ($2 - 1) and last ($3 - 1) characters; a
# column shorter than both flanks together becomes empty
strip_flank_columns() {
    awk -F '\t' -v start5="$2" -v start3="$3" '
        BEGIN { OFS = "\t" }
        {
            for (i = 2; i <= NF; i++) {
                keep = length($i) - (start5 - 1) - (start3 - 1)
                $i = (keep > 0) ? substr($i, start5, keep) : ""
            }
            print
        }' "$1"
}

# Extract the portion of the probe that is NOT in the 5' or 3'
# flank
# If 5' or 3' flanks are blank it will leave the files unchanged
for p in fg bg; do
    f="${p}_alphabet_annotations.tab"
    no_flank="${p}_alphabet_annotations_no_flanks"
    # One awk pass handles every column, so no per-column temporary files
    # are needed and a file with a single column is passed through as is
    strip_flank_columns "$f" "$flank5len" "$flank3len" > "${no_flank}.tab"
    # Swap names of the files and clean up unnecessary files
    # unless requested otherwise
    if [[ "$clean" == "FALSE" ]]; then
        mv -f "$f" "${p}_alphabet_annotations_with_flanks.tab"
    else
        rm -f fg_seqs.txt bg_seqs.txt
    fi
    mv -f "${no_flank}.tab" "$f"
done

# Create directory and fasta files for each alphabet
# The last line of the header file (from field 2 on) lists the column to
# use for each alphabet; the first line of the same column gives its name
alphabet_header="../annotation_alphabets_header.tab"
for c in $(cut -f 2- "$alphabet_header" | tail -n 1 | tr '\t' ' '); do
    a=$(cut -f "$c" "$alphabet_header" | head -n 1)
    mkdir "$a"
    for p in fg bg; do
        # Name each probe <fg|bg>_<line number> before converting to fasta
        cut -f "$c" "${p}_alphabet_annotations.tab" \
            | awk -v p="$p" '{print p "_" NR "\t" $0}' \
            | "${libpath}/utils/tab2fasta.pl" > "${a}/${p}_test.fa"
    done
done

# Get scoreN value
N_score=$(grep "^scoreN" ../PRIESSTESS_arguments.txt | cut -f 2 -d ' ')

# Scan probes with PFMs from each alphabet
for a in $(cut -f 2- "$alphabet_header" | head -1 | tr '\t' ' '); do
    # Work in a subshell: if the alphabet directory cannot be entered the
    # alphabet is skipped and the working directory of the script is never
    # moved (an unchecked cd followed by "cd .." would leave the test
    # directory)
    (
        cd "$a" || exit 1
        # If any PFMs, scan them on the *_test.fa files
        if [ -f "../../${a}/PFM-1.txt" ]; then
            for f in fg_test.fa bg_test.fa; do
                python "${libpath}/PFM_scan.py" -a "$a" -f "$f" -p "../../${a}/PFM" -n "$N_score"
            done
        fi
    )
done

# -----------------------------------------------------------------------------#

#### TEST PRIESSTESS MODEL ####
# -----------------------------------------------------------------------------#
# Get AUROC of PRIESSTESS model on test data

# Get scores for all PFMs for all alphabets from test files
# The per-alphabet files to paste are collected in arrays so that each
# name is passed to paste as exactly one argument
topaste_fg=()
topaste_bg=()
# For each alphabet if there are PFMs
for a in $(cut -f 2- ../annotation_alphabets_header.tab | head -1 | tr '\t' ' '); do
    if [ -f "${a}/fg_test_PFM_scan_sum_top_${N_score}.tab" ]; then
        # Get scores and the names of the PFMs
        fg_scores="${a}/fg_test_PFM_scan_sum_top_${N_score}.tab"
        bg_scores="${a}/bg_test_PFM_scan_sum_top_${N_score}.tab"
        # Add alphabet name to the start of each PFM (now feature) names
        # Only the foreground file keeps its header line
        cut -f 2- "$fg_scores" | head -n 1 | sed "s/^/${a}_/" | sed "s/\t/\t${a}_/g" > "${a}_fg_scores.tmp"
        cut -f 2- "$fg_scores" | tail -n +2 >> "${a}_fg_scores.tmp"
        cut -f 2- "$bg_scores" | tail -n +2 > "${a}_bg_scores.tmp"
        # Preparation to paste all alphabet scores together
        topaste_fg+=("${a}_fg_scores.tmp")
        topaste_bg+=("${a}_bg_scores.tmp")
    fi
done

# paste with no file operands would read standard input instead, so stop
# here when not a single alphabet produced scores
if (( ${#topaste_fg[@]} == 0 )); then
    echo "No PFM scan scores were found for any alphabet"
    exit 1
fi

# Combine scores for all alphabets into one file for testing
# Add class column (1 or 0) to differentiate fg and bg sets
paste "${topaste_fg[@]}" | head -n 1 | sed 's/^/class\t/' > "${test_set_name}_scores.tab"
paste "${topaste_fg[@]}" | sed 's/^/1\t/' | tail -n +2 >> "${test_set_name}_scores.tab"
paste "${topaste_bg[@]}" | sed 's/^/0\t/' >> "${test_set_name}_scores.tab"

rm -f -- *.tmp

echo "Calculating performance on test data"
# Arguments passed to the model test: training set, test scores, saved
# model, test set name
python "${libpath}/test_PRIESSTESS_model.py" ../LR_training_set.tab "${test_set_name}_scores.tab" ../PRIESSTESS_model.sav "$test_set_name"

# The AUROC is read back from this file in the current (test) directory
auroc_file="test_PRIESSTESS_model_ON_${test_set_name}_auroc.tab"

echo "--------"
echo "PRIESSTESS model scan complete"
auroc=$(cat "$auroc_file")
echo "AUROC on heldout: $auroc"
# This line reports where the value is stored, so it is labelled as a
# location rather than as the AUROC itself
echo "AUROC saved in: ${PRIESSTESS_dir}/test_${test_set_name}/${auroc_file}"
echo "--------"