# Annotation of transcript isoforms

- Transcript isoforms with coding regions were annotated using several software tools:
    - Bakta (prokaryotic annotation)
    - KEGG for eukaryotes and viruses (BlastKOALA; https://www.kegg.jp/blastkoala/)
    - dbCAN3 (CAZy annotation)
    - Anvio (KEGG module completeness)
    - Top alignment row from BLASTn alignments (all assembled 10M transcript isoforms)

### 1) Bakta

In [None]:
# conda env: activate the environment that provides bakta and bakta_db
conda activate bakta

# one-time setup: download the full Bakta database into ./db
bakta_db download --output db --type full
# annotate diff-expressed-genes.fasta with 16 threads; output is written to results/
# NOTE(review): the database is downloaded into ./db above but read from
# ../../../Bakta/db here -- confirm both paths point at the same database
bakta --db ../../../Bakta/db diff-expressed-genes.fasta --verbose --output results/ --threads 16

### 2) dbCAN3

In [None]:
# conda env: activate the environment that provides run_dbcan
conda activate dbCAN3

# automatically download and prepare databases (written to ./db)
run_dbcan database --db_dir db

# run dbCAN3 for CAZy annotation of diff-expressed-genes.fasta;
# results are written to dbCAN3_annotation/
# NOTE(review): the databases are prepared in ./db above but read from ../db
# here -- presumably this command was run from a subdirectory; confirm the path
run_dbcan CAZyme_annotation --input_raw_data diff-expressed-genes.fasta --out_dir dbCAN3_annotation --db_dir ../db --mode meta

### 3) Anvi'o with dev version

In [None]:
# install anvio-dev version: https://anvio.org/install/linux/dev/
# create a dedicated conda environment pinned to Python 3.10 and switch to it
conda create -y --name anvio-dev python=3.10
conda activate anvio-dev

# install dependencies from conda-forge and bioconda
# (several packages are version-pinned; keep the pins as listed)
conda install -y -c conda-forge -c bioconda python=3.10 \
        sqlite=3.46 prodigal idba mcl muscle=3.8.1551 famsa hmmer diamond \
        blast megahit spades bowtie2 bwa graphviz "samtools>=1.9" \
        trimal iqtree trnascan-se fasttree vmatch r-base r-tidyverse \
        r-optparse r-stringi r-magrittr bioconductor-qvalue meme ghostscript \
        nodejs=20.12.2

# optional dependency: try this, if it doesn't install, don't worry (it is sad, but OK):
conda install -y -c bioconda fastani

# setting up the local copy of the anvi'o codebase under ~/github
# (the activation script created later in this cell expects it at $HOME/github/anvio)
mkdir -p ~/github && cd ~/github/
git clone --recursive https://github.com/merenlab/anvio.git

# installing the Python dependencies
# build any compiled Python packages with clang
# NOTE(review): assumes clang/clang++ are installed at /usr/bin -- confirm on your system
export CC=/usr/bin/clang
export CXX=/usr/bin/clang++

# and: install the requirements listed by the cloned repository
cd ~/github/anvio/
pip install -r requirements.txt

# linking conda environment and the codebase
# The activation script is written into the environment's activate.d directory.
# A freshly created environment may not have that directory yet, in which case
# the redirection below would fail with "No such file or directory", so create
# it first.
mkdir -p "${CONDA_PREFIX}/etc/conda/activate.d"

# The heredoc delimiter is unquoted: ${CONDA_PREFIX} in the target path is
# expanded now, while the \$-escaped variables are written literally so they
# are evaluated each time the environment is activated.
cat <<EOF >"${CONDA_PREFIX}/etc/conda/activate.d/anvio.sh"
# creating an activation script for the the conda environment for anvi'o
# development branch so (1) Python knows where to find anvi'o libraries,
# (2) the shell knows where to find anvi'o programs, and (3) every time
# the environment is activated it synchronizes with the latest code from
# active GitHub repository:
export PYTHONPATH=\$PYTHONPATH:\$HOME/github/anvio/
export PATH=\$PATH:\$HOME/github/anvio/bin:\$HOME/github/anvio/sandbox
echo -e "\033[1;34mUpdating from anvi'o GitHub \033[0;31m(press CTRL+C to cancel)\033[0m ..."
cd \$HOME/github/anvio && git pull && cd -
EOF

# check if it worked: show where anvi-self-test resolves from, then call it
# (re-activate the environment first if the program is not found, so that the
# activation script has been sourced)
command -v anvi-self-test
anvi-self-test -v

# final installation check
anvi-self-test --suite mini

# now to start using anvio for pathway analysis, do the following
# set up KEGG database on the cluster - this code will download the db: https://anvio.org/tutorials/fmt-mag-metabolism/
anvi-setup-kegg-data

# then check if it is being called correctly
# learn where the MODULES.db is: the path is built relative to the installed
# anvio Python package. Assignment and export are separate statements so that
# a failing python call is not masked by the exit status of `export`.
ANVIO_MODULES_DB=$(python -c "import anvio; import os; print(os.path.join(os.path.dirname(anvio.__file__), 'data/misc/KEGG/MODULES.db'))")
export ANVIO_MODULES_DB

# print the path so you can see where it is located
# (quoted so a path containing spaces stays a single argument)
echo "$ANVIO_MODULES_DB"

# check the hash of the MODULES.db contents
anvi-db-info "$ANVIO_MODULES_DB"

# The same four-stage workflow is applied to two transcript sets, upregulated
# and downregulated. Each stage is run for both sets before the next stage
# starts.

# stage 1: reformat the fasta files so anvio knows how to read them
for direction in upregulated downregulated; do
    anvi-script-reformat-fasta "${direction}.fasta" --simplify-names \
        --output-file "${direction}.fa" --report-file "${direction}_report.txt"
done

# stage 2: build one contigs database per transcript set
for direction in upregulated downregulated; do
    anvi-gen-contigs-database --contigs-fasta "${direction}.fa" \
        --project-name "FE_${direction}" --output-db-path "FE_${direction}.db" \
        --num-threads 4
done

# stage 3: annotate the transcripts in each database with KOfam hits
for direction in upregulated downregulated; do
    anvi-run-kegg-kofams -c "FE_${direction}.db" -T 4
done

# stage 4: estimate metabolism completeness metrics;
# here we can get a table showing module completeness
for direction in upregulated downregulated; do
    anvi-estimate-metabolism -c "FE_${direction}.db" -O "FE_${direction}_metabolism"
done

### 4) Top hit from BLASTn alignments

In [None]:
# conda env: activate the environment that provides the BLAST+ tools
conda activate blast

# download most up to date databases from NCBI
# NOTE(review): only nt is queried by the blastn command below; nr does not
# appear to be used in this section -- confirm it is needed
update_blastdb.pl --decompress nt
update_blastdb.pl --decompress nr

# export BLASTDB so the BLAST+ tools can locate the nt and nr databases by name
# NOTE(review): hard-coded, user-specific paths -- adjust for your system
export BLASTDB=/home/hfm/db/blast/nt:/home/hfm/db/blast/nr

# run BLASTn with the goal of retaining only the top row from alignments as a proxy of transcript host
# NOTE(review): -outfmt 6 is BLAST's tabular format, so the output is presumably
# tab-separated despite the .csv extension -- confirm before parsing it as CSV
# NOTE(review): -evalue 10 is a very permissive threshold -- confirm this is intended
blastn -db nt -query ../../trinity_2.15/Trinity.fasta -out alignments_BLASTn.csv \
-evalue 10 -outfmt 6 -max_target_seqs 1 -num_threads 16

# Extracting TRINITY_ID and Accession (columns 1 and 2 of the BLASTn table),
# keeping each pair once
awk '{print $1, $2}' alignments_BLASTn.csv | sort -u > trinity_taxonomy_ids.txt

# List of unique accessions
awk '{print $2}' trinity_taxonomy_ids.txt | sort -u > trinity_accessions.txt

# Recovering NCBI taxonomy IDs using accession
# IFS= and -r make read take each line verbatim (no backslash processing);
# blank lines are skipped so blastdbcmd is never called with an empty entry.
while IFS= read -r accession; do
    [[ -n "$accession" ]] || continue
    taxid=$(blastdbcmd -db nt -entry "$accession" -outfmt "%T")
    printf '%s %s\n' "$accession" "$taxid"
done < trinity_accessions.txt > trinity_accessions_with_taxids.txt

# Dropping accessions for which no taxid was recovered
# (only lines with exactly two fields, accession and taxid, are kept)
awk 'NF == 2' trinity_accessions_with_taxids.txt > cleaned_trinity_accessions_with_taxids.txt

# Merging previous file with Trinity_ID
# first file: build an accession -> taxid lookup; second file: append the taxid
# to each "TRINITY_ID accession" row (the appended field is empty when the
# accession has no entry in the lookup)
awk 'NR==FNR {taxid[$1] = $2; next} {print $0, taxid[$2]}' cleaned_trinity_accessions_with_taxids.txt \
trinity_taxonomy_ids.txt > trinity_BLASTn_with_taxids.csv

- 5) Efetch to recover taxonomic information.
    - Created an executable fetcher.sh file with the following:

In [None]:
#!/bin/bash

#######################################
# Look up the taxonomic lineage for every row of a whitespace-separated
# "trinity_id accession taxid" table using efetch + xtract.
# Arguments:
#   $1 - input file with rows: trinity_id accession taxid
#   $2 - output file; resolved rows are appended as
#        "trinity_id accession taxid lineage"
#   $3 - error log; rows that could not be resolved are appended here
#   $4 - optional: maximum fetch attempts per row (default 3)
#   $5 - optional: seconds to wait between attempts (default 3)
# Outputs:
#   Appends to $2 and $3; existing content of both files is kept.
#######################################
process_file() {
    local input_file=$1
    local output_file=$2
    local error_log=$3
    local max_retries=${4:-3}
    local retry_delay=${5:-3}  # fine-tune this to avoid rate limits
    local trinity_id accession taxid taxonomy retry_count success

    # -r keeps backslashes literal; the "|| [[ -n ... ]]" part also processes a
    # final row that is not terminated by a newline
    while read -r trinity_id accession taxid || [[ -n "$trinity_id" ]]; do
        # skip blank lines
        if [[ -z "$trinity_id" ]]; then
            continue
        fi

        # rows without a taxid cannot be looked up: log them right away instead
        # of spending every retry (and its delay) on a request that cannot succeed
        if [[ -z "$taxid" ]]; then
            echo "$trinity_id $accession $taxid ERROR: Missing taxid, taxonomy fetch skipped" >> "$error_log"
            continue
        fi

        retry_count=0
        success=0

        while (( retry_count < max_retries )); do
            # efetch reads from /dev/null so it cannot consume the remaining
            # rows of the input file, which is this loop's stdin
            taxonomy=$(efetch -db taxonomy -id "$taxid" -format xml </dev/null | xtract -pattern LineageEx -block Taxon -sep ":" -element ScientificName,Rank 2>/dev/null | tr '\n' ' ')

            if [[ -n "$taxonomy" ]]; then
                success=1
                break
            fi

            retry_count=$((retry_count + 1))
            # wait only when another attempt will follow
            if (( retry_count < max_retries )); then
                sleep "$retry_delay"
            fi
        done

        if (( success == 1 )); then
            echo "$trinity_id $accession $taxid $taxonomy" >> "$output_file"
        else
            echo "$trinity_id $accession $taxid ERROR: Taxonomy fetch failed after $max_retries attempts" >> "$error_log"
        fi

    done < "$input_file"
}

# run: read trinity_BLASTn_with_taxids.csv, append resolved rows to
# trinity_taxonomy_results.csv and failed rows to trinity_BLASTn_error_log.txt
process_file "trinity_BLASTn_with_taxids.csv" "trinity_taxonomy_results.csv" "trinity_BLASTn_error_log.txt"

In [None]:
# making fetcher.sh executable and running it from the current directory
chmod +x fetcher.sh
./fetcher.sh