Skip to content

Commit

Permalink
Adding getDropTaxa function
Browse files Browse the repository at this point in the history
Adding `getDropTaxa` function.
  • Loading branch information
justincbagley committed Dec 21, 2020
1 parent 8eb8a26 commit 73b8bc9
Showing 1 changed file with 338 additions and 0 deletions.
338 changes: 338 additions & 0 deletions bin/getDropTaxa
Original file line number Diff line number Diff line change
@@ -0,0 +1,338 @@
#!/bin/sh

##########################################################################################
# __ o __ __ __ |__ __ #
# |__) | | ' (__( | ) | ) (__( #
# | #
# #
# File: getDropTaxa.sh #
VERSION="v1.0.0" #
# Author: Justin C. Bagley #
# Date: Created by Justin Bagley on Sun, Dec 20 01:48:24 CST 2020. #
# Last update: December 20, 2020 #
# Copyright (c) 2020 Justin C. Bagley. All rights reserved. #
# Please report bugs to <jbagley@jsu.edu>. #
# #
# Description: #
# SHELL SCRIPT THAT CREATES A DROP TAXON LIST (CONTAINING NAMES OF TIP TAXA TO REMOVE, #
# ONE PER LINE), STARTING FROM TWO LISTS: A LIST OF ALL TAXA VS. A LIST OF TAXA TO KEEP. #
# DROP TAXON LISTS CAN BE SUPPLIED TO THE dropTaxa FUNCTION OF PIrANHA (-t flag). #
# #
##########################################################################################
# Usage: $ ./getDropTaxa.sh <fullTaxonList> <dropTaxaList>
# e.g. $ chmod u+x ./getDropTaxa.sh
# $ ./getDropTaxa.sh all_313_taxon_names.txt n123_subset_names.txt
##########################################################################################

# Provide a variable with the location of this script.
SCRIPT_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

# Source Scripting Utilities
# -----------------------------------
# These shared utilities provide many functions which are needed to provide
# the functionality in this boilerplate. This script will fail if they can
# not be found.
# -----------------------------------
UTILS_LOCATION="${SCRIPT_PATH}/../lib/utils.sh" # Update this path to find the utilities.

if [[ -f "${UTILS_LOCATION}" ]]; then
source "${UTILS_LOCATION}"
else
echo "Please find the file util.sh and add a reference to it in this script. Exiting..."
exit 1
fi

# Source shared functions and variables
# -----------------------------------
FUNCS_LOCATION="${SCRIPT_PATH}/../lib/sharedFunctions.sh" # Update this path to find the shared functions.
VARS_LOCATION="${SCRIPT_PATH}/../lib/sharedVariables.sh" # Update this path to find the shared variables.

if [[ -f "${FUNCS_LOCATION}" ]] && [[ -f "${VARS_LOCATION}" ]]; then
source "${FUNCS_LOCATION}" ;
source "${VARS_LOCATION}" ;
else
echo "Please find the files sharedFunctions.sh and sharedVariables.sh and add references to them in this script. Exiting... "
exit 1
fi

# trapCleanup Function
# -----------------------------------
# Any actions that should be taken if the script is prematurely
# exited. Always call this function at the top of your script.
# -----------------------------------
trapCleanup () {
echo ""
# Delete temp files, if any
if is_dir "${tmpDir}"; then
rm -r "${tmpDir}"
fi
die "Exit trapped. In function: '${FUNCNAME[*]}'"
}

# safeExit
# -----------------------------------
# Non destructive exit for when script exits naturally.
# Usage: Add this function at the end of every script.
# -----------------------------------
safeExit () {
# Delete temp files, if any
if is_dir "${tmpDir}"; then
rm -r "${tmpDir}"
fi
trap - INT TERM EXIT
exit
}

# Set Flags
# -----------------------------------
# Flags which can be overridden by user input.
# Default values are below
# -----------------------------------
quiet=false
printLog=false
verbose=false
force=false
strict=false
debug=false
args=()

# Set Temp Directory
# -----------------------------------
# Create temp directory with three random numbers and the process ID
# in the name. This directory is removed automatically at exit.
# -----------------------------------
tmpDir="/tmp/${SCRIPT_NAME}.$RANDOM.$RANDOM.$RANDOM.$$"
(umask 077 && mkdir "${tmpDir}") || {
die "Could not create temporary directory! Exiting."
}

# Logging
# -----------------------------------
# Log is only used when the '-l' flag is set.
#
# To never save a logfile change variable to '/dev/null'
# Save to Desktop use: $HOME/Desktop/${SCRIPT_BASENAME}.log
# Save to standard user log location use: $HOME/Library/Logs/${SCRIPT_BASENAME}.log
# -----------------------------------
logFile="$HOME/Library/Logs/${SCRIPT_BASENAME}.log"

# Check for Dependencies
# -----------------------------------
# Arrays containing package dependencies needed to execute this script.
# The script will fail if dependencies are not installed. For Mac users,
# most dependencies can be installed automatically using the package
# manager 'Homebrew'. Mac applications will be installed using
# Homebrew Casks. Ruby and gems via RVM.
# -----------------------------------
export homebrewDependencies=()
export caskDependencies=()
export gemDependencies=()




getDropTaxa () {

######################################## START ###########################################
##########################################################################################

echo "INFO | $(date) |----------------------------------------------------------------"
echo "INFO | $(date) | getDropTaxa, v1.0.0 December 2020 "
echo "INFO | $(date) | Copyright (c) 2020 Justin C. Bagley. All rights reserved. "
echo "INFO | $(date) |----------------------------------------------------------------"

# MY_FULL_TAXON_LIST="$1" # 313_taxon_names.txt
# MY_TAXON_LIST="$2" # alltaxa_Burmeistera_56_plus_UMS_107802_RG_67_n123.txt

# PROCESS FULL TAXON LIST, COPY TO PRELIM OUTPUT FILE; ALSO PROCESS
# KEPT TAXON LIST
# ------------------------------------------------------
MY_FULL_NUM_TAXA="$(cat "$MY_FULL_TAXON_LIST" | wc -l | sed 's/\ //g')";
cp "$MY_FULL_TAXON_LIST" ./dropTaxonList.txt ;

MY_NUM_KEPT_TAXA="$(cat "$MY_TAXON_LIST" | wc -l | sed 's/\ //g')";

echo "INFO | $(date) | Full taxon list: ${MY_FULL_TAXON_LIST}"
echo "INFO | $(date) | N=${MY_FULL_NUM_TAXA}"
echo "INFO | $(date) | Kept taxon list: ${MY_TAXON_LIST} "
echo "INFO | $(date) | N=${MY_NUM_KEPT_TAXA}"


# PROCESS TAXON LIST FILE <taxonList> AND MAKE IT CONFORM TO SED FORMAT, WHILE SAVING
# COPY OF ORIGINAL
# ------------------------------------------------------
if [[ -s "$MY_TAXON_LIST" ]]; then
cp "$MY_TAXON_LIST" "$MY_TAXON_LIST".tmp ;
#
if [[ "${machine}" = "Mac" ]]; then
sed -i.bak 's/\ /\_/g; s/\-/\\\-/g; s/\_/\\\_/g; s/\./\_/g' "$MY_TAXON_LIST".tmp ;
rm ./*.bak;
fi
if [[ "${machine}" = "Linux" ]]; then
sed -i 's/\ /\_/g; s/\-/\\\-/g; s/\_/\\\_/g; s/\./\_/g' "$MY_TAXON_LIST".tmp ;
fi
fi

# MAKE LIST OF TAXA TO DROP BY DROPPING TAXA MATCHING ACTUAL TAXON LIST FILE (LIST OF TAXA TO KEEP)
# ------------------------------------------------------
(
while read TAXON; do
sed -i '' '/'"$TAXON"'/d' ./dropTaxonList.txt ;
done < "$MY_TAXON_LIST".tmp
)
rm "$MY_TAXON_LIST".tmp ;
MY_NUM_TAXA_TO_DROP="$(cat ./dropTaxonList.txt | wc -l | sed 's/\ //g')";

echo "INFO | $(date) | Drop taxon list: ./dropTaxonList.txt"
echo "INFO | $(date) | No. taxa to drop: N=${MY_NUM_TAXA_TO_DROP}"

MY_ACTUAL_KEPT_TAXA="$(calc "$MY_FULL_NUM_TAXA" - "$MY_NUM_TAXA_TO_DROP")";
MY_DIFFERENCE="$(calc "$MY_NUM_KEPT_TAXA" - "$MY_ACTUAL_KEPT_TAXA")";

echo "INFO | $(date) | No. taxa to keep: N=${MY_ACTUAL_KEPT_TAXA}"
echo "INFO | $(date) | Exp. difference: 0"
echo "INFO | $(date) | (No. taxa expected to be kept, versus no. actually kept.)"
echo "INFO | $(date) | Actual difference: ${MY_DIFFERENCE}"
echo "INFO | $(date) | (No. taxa expected to be kept, versus no. actually kept.)"
if [[ "$MY_DIFFERENCE" = "0" ]]; then
echo "INFO | $(date) | File check PASSED. No difference between exp. vs. actual kept taxa."
elif [[ "$MY_DIFFERENCE" -gt "0" ]]; then
echo "WARNING | $(date) | File check FAILED. Non-zero difference between exp. vs. actual kept taxa if drop taxon list "
echo "WARNING | $(date) | (i.e. created dropfile) is used. Please check './dropTaxonList.txt' for errors."
fi

echo "INFO | $(date) | Done."
echo "----------------------------------------------------------------------------------------------------------"

## Get matching lines (just to facilitate checking 'by-eye'):
(
while read line; do
grep -h "$line" "$MY_FULL_TAXON_LIST" >> ./matchingLines.txt ;
done < "$MY_TAXON_LIST"
)

##########################################################################################
######################################### END ############################################

}

###### SCRIPT OPTIONS:
MY_FULL_TAXON_LIST="$1" # e.g.: 313_taxon_names.txt
MY_TAXON_LIST="$2" # e.g.: alltaxa_Burmeistera_56_plus_UMS_107802_RG_67_n123.txt
#MY_OUTPUT_NAME="$3"

############ CREATE USAGE & HELP TEXTS
USAGE="
Usage: $(basename "$0") [INPUT]...
${bold}OVERVIEW${reset}
THIS SCRIPT creates a drop taxon list containing names of tip taxa to remove, or 'drop',
from one or more DNA sequence alignments, with one taxon name per line. It expects to be
passed as input the names of two lists: 1) <fullTaxonList>, a list of all taxa in the
alignment(s), and 2) <taxonSubsetList>, the subset of taxa in the full list that the user
wishes to keep. Both filenames are mandatory parameters.
This is a useful exercise because the limiting factor in managing taxon lists for data
subsetting is often generation and curation of the correct taxon list; also, starting from
a larger dataset and needed to reduce it to a subset is a common problem in data management.
In PIrANHA (Bagley 2020), drop taxon lists can be supplied to the dropTaxa function (using
the '-t' flag; see 'piranha -f dropTaxa -h' for more info) to quickly and easily remove
unwanted tip taxa from a DNA sequence alignment(s) in PHYLIP format (Felsenstein 2002).
An option (-h) is available to print the help text, and another option (-V|--version)
prints the current version of the program and exits.
This program runs on UNIX-like and Linux systems using commonly distributed utility
software, with usage as obtained by running the script with the -h flag and version printed
with the -V|--version flag. It has been tested with Perl v5.1+ on macOS (v10.13+, High Sierra
to Catalina) and Centos 5/6/7 Linux, but should work on many other versions of macOS or
Linux. There are no other dependencies.
${bold}Usage examples:${reset}
Call the program using PIrANHA, as follows:
piranha -f getDropTaxa.sh <fullTaxonList> <taxonSubsetList> Generic usage, makes drop taxon list
piranha -f dropTaxa -h Show this help text and exit
${bold}CITATION${reset}
Bagley, J.C. 2020. PIrANHA v0.4a4. GitHub repository, Available at:
<https://github.com/justincbagley/piranha>.
${bold}REFERENCES${reset}
Bagley, J.C. 2020. PIrANHA v0.4a4. GitHub repository, Available at:
<https://github.com/justincbagley/piranha>.
Felsenstein, J. 2002. PHYLIP (Phylogeny Inference Package) Version 3.6 a3.
Available at: <http://evolution.genetics.washington.edu/phylip.html>.
Created by Justin Bagley on Sun, Apr 5 23:48:03 CDT 2020.
Copyright (c) 2020 Justin C. Bagley. All rights reserved.
"

if [[ -z "$*" ]]; then
echo "$USAGE"
exit
fi

if [[ "$1" == "-h" ]] || [[ "$1" == "-help" ]]; then
echo "$USAGE"
exit
fi

if [[ "$1" == "-V" ]] || [[ "$1" == "--version" ]]; then
echo "$(basename "$0") $VERSION";
exit
fi

############ CHECK ARGUMENTS
# echo "$@"; echo "$#"; echo "$1"
# for i in "$@"; do
# echo "$i";
# done
# MY_ARGS="$(echo "$@" | perl -pe $'s/\ /\n/')"
# echo "$MY_ARGS"


############ CAPTURE ARGUMENTS, SEND TO FILE FOR PARSING
if [[ -s ./args.tmp ]]; then rm ./args.tmp ; fi ;
if [[ -s ./args.txt ]]; then rm ./args.txt ; fi ;
if [[ -s ./*.tmp ]]; then rm ./*.tmp ; fi ;

ALL_MY_ARGUMENTS="$(echo "$@")"
echo "$ALL_MY_ARGUMENTS" > ./args.txt
perl -p -i -e $'s/\-/\n\-/g' ./args.txt
#perl -p -i -e $'s/\ /\n/g' ./args.txt
#wc -l ./args.txt | perl -pe 's/\.\/args\.txt.*//g' | perl -pe 's/\ //g'


# ############# ############# #############
# ## TIME TO RUN THE SCRIPT ##
# ## ##
# ## You shouldn't need to edit anything ##
# ## beneath this line ##
# ## ##
# ############# ############# #############

# Trap bad exits with your cleanup function
trap trapCleanup EXIT INT TERM

# Set IFS to preferred implementation
IFS=$'\n\t'

# Exit on error. Append '||true' when you run the script if you expect an error.
set -o errexit

# Run in debug mode, if set
if ${debug}; then set -x ; fi

# Exit on empty variable
if ${strict}; then set -o nounset ; fi

# Bash will remember & return the highest exitcode in a chain of pipes.
# This way you can catch the error in case mysqldump fails in `mysqldump |gzip`, for example.
set -o pipefail

# Invoke the checkDependenices function to test for Bash packages. Uncomment if needed.
# checkDependencies

# Run the script
getDropTaxa

# Exit cleanly
safeExit

0 comments on commit 73b8bc9

Please sign in to comment.