# De novo transcript co-assembly and abundance estimation

- De novo transcript assembly was performed in Trinity at the isoform level due to the abundance of eukaryotes
- All paired-end reads were used for co-assembly to build a transcript isoform database
- Transcript isoform abundance was estimated per sample using Salmon
- Identification of transcript isoforms containing coding regions was performed in TransDecoder

### 1) First step in the de novo co-assembly of transcript isoforms in Trinity

In [None]:
# Concatenate all forward (_1) and all reverse (_2) read files into a single pair for co-assembly.
# The compressed files are joined as they are (no decompression step).
# NOTE(review): mates only stay in sync if both globs expand to the same samples in the
# same order, i.e. every sample has both a _1 and a _2 file — confirm before assembling.
cat ../../nonhost_reads/*_1.fixed.gz > reads.ALL.left.fastq.gz
cat ../../nonhost_reads/*_2.fixed.gz > reads.ALL.right.fastq.gz

In [None]:
### this is heavy, so make sure to reserve enough memory!

import os
import glob
import sys

# Directory holding the concatenated read files; the Trinity output directory sits inside it.
inpath = "/home/hfm/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/functionalAnalysis/trinity_2.15/"
outpath = "/home/hfm/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/functionalAnalysis/trinity_2.15/trinity_merged/"

# Sample IDs, taken from the forward-read file names (everything before "_non_host_reads_1").
# The "_non_host_reads_" spelling matches the per-sample abundance workflow, which reads
# the same directory. A set removes duplicates and sorting makes the order reproducible,
# because glob returns files in arbitrary order.
# The co-assembly rules in this workflow do not reference SAMPLES.
SAMPLES = sorted({
    os.path.basename(name).split("_non_host_reads_1")[0]
    for name in glob.glob(inpath + "../../nonhost_reads/*_non_host_reads_1.fixed.gz")
})


# Default target: requesting the gene-to-transcript map forces the co-assembly rule to run.
rule all:
    input:
        trinity = os.path.join(outpath, "Trinity.fasta.gene_trans_map")

# Co-assemble the pooled left/right reads with Trinity.
#
# input:  the two concatenated read files (all forward reads, all reverse reads).
# output: the gene-to-transcript map expected inside the Trinity output directory.
# params: the output directory plus the memory/CPU budget passed to Trinity. These
#         are plain values: `directory()` is a flag for *output* declarations and
#         has no meaning on a param.
#
# NOTE(review): recent Trinity releases are documented to write their final files as
# `<output dir>.Trinity.fasta*` next to the output directory rather than inside it.
# Confirm that the declared output path is where this Trinity version puts the map.
rule trinity:
    input:
        read1 = inpath + "reads.ALL.left.fastq.gz",
        read2 = inpath + "reads.ALL.right.fastq.gz"
    output:
        fa = outpath + "Trinity.fasta.gene_trans_map"
    params:
        out_dir = outpath,
        # Resource budget for the assembly; adjust to the machine the job runs on.
        max_memory = "2000G",
        cpu = 256
    conda:
        "/home/hfm/yaml/trinity.yaml"
    shell:
        """
        Trinity --seqType fq --max_memory {params.max_memory} --CPU {params.cpu} --left {input.read1} --right {input.read2} --output {params.out_dir} --verbose
        """

In [None]:
import os
import glob
import sys

# Base directory of the Trinity analysis and the directory receiving the per-sample
# Salmon results and the merged count matrices.
inpath = "/home/hfm/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/functionalAnalysis/trinity_2.15/"
outpath = "/home/hfm/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/functionalAnalysis/trinity_2.15/step1/"

# Sample IDs, taken from the forward-read file names (everything before "_non_host_reads_1").
# glob returns files in arbitrary order; sorting keeps the expand() order reproducible
# between runs, and with it the order of the files handed to the matrix rules.
# The set removes duplicate IDs.
SAMPLES = sorted({
    os.path.basename(name).split("_non_host_reads_1")[0]
    for name in glob.glob(inpath + "../../nonhost_reads/*_non_host_reads_1.fixed.gz")
})

# Default target: per-sample Salmon quantifications (isoform and gene level) plus the
# two merged isoform count matrices (with and without the gene-to-transcript map).
rule all:
    input:
        genes = expand(os.path.join(outpath, "{sample}", "quant.sf.genes"), sample=SAMPLES),
        iso = expand(os.path.join(outpath, "{sample}", "quant.sf"), sample=SAMPLES),
        counts = os.path.join(outpath, "Trinity_salmon.isoform.counts.matrix"),
        gene = os.path.join(outpath, "Trinity_salmon_gene_transmap.isoform.counts.matrix")


# Estimate transcript abundance for one sample with Salmon through Trinity's
# align_and_estimate_abundance.pl wrapper.
#
# input:  the sample's paired read files and the assembled transcripts. Trinity.fasta
#         is declared as an input (not a param) so Snakemake tracks it and reports a
#         missing assembly before any job starts.
# output: isoform-level (quant.sf) and gene-level (quant.sf.genes) tables in the
#         sample's own directory.
# params: the per-sample output directory as a plain string; `directory()` is a flag
#         for *output* declarations and has no meaning on a param.
#
# NOTE(review): --prep_reference is passed in every per-sample job, so jobs running in
# parallel may prepare the same reference next to Trinity.fasta at the same time.
# Consider preparing the reference once in a separate step — confirm against the
# Trinity documentation.
# NOTE(review): --aln_method is presumably not needed for --est_method salmon — confirm.
rule abundance_est:
    input:
        read1 = inpath + "../../nonhost_reads/{sample}_non_host_reads_1.fixed.gz",
        read2 = inpath + "../../nonhost_reads/{sample}_non_host_reads_2.fixed.gz",
        fa = outpath + "Trinity.fasta"
    output:
        genes = outpath + "{sample}/quant.sf.genes",
        iso = outpath + "{sample}/quant.sf"
    params:
        out_dir = outpath + "{sample}"
    conda:
        "/home/hfm/yaml/trinity.yaml"
    shell:
        """
        align_and_estimate_abundance.pl --seqType fq --left {input.read1} --right {input.read2} --transcripts {input.fa}  --est_method salmon  --aln_method bowtie2 --trinity_mode --prep_reference --output_dir {params.out_dir}
        """

# Merge the per-sample Salmon isoform tables into one count matrix, without a
# gene-to-transcript map (--gene_trans_map none).
#
# input:  quant.sf of every sample.
# output: <prefix>.isoform.counts.matrix in the output directory.
# params: the file-name prefix and the directory to run in, both plain strings;
#         `directory()` is a flag for *output* declarations and has no meaning on a param.
rule count_matrix:
    input:
        iso = expand(outpath + "{sample}/quant.sf", sample=SAMPLES)
    output:
        counts = outpath + "Trinity_salmon.isoform.counts.matrix"
    params:
        pre = "Trinity_salmon",
        out_dir = outpath
    conda:
        "/home/hfm/yaml/trinity.yaml"
    shell:
        # The prefix is relative, so the command runs from inside the output directory;
        # each sample is named after the directory that holds its quant.sf.
        """
        cd {params.out_dir}
        abundance_estimates_to_matrix.pl --est_method salmon --gene_trans_map none --out_prefix {params.pre} --name_sample_by_basedir {input.iso}
        """

# Merge the per-sample Salmon isoform tables into a count matrix, this time supplying
# the Trinity gene-to-transcript map.
#
# input:  quant.sf of every sample.
# output: <prefix>.isoform.counts.matrix in the output directory.
# params: the file-name prefix, the gene-to-transcript map and the directory to run in.
#         `directory()` is a flag for *output* declarations and has no meaning on a
#         param, so the directory is a plain string.
rule count_matrix_gene:
    input:
        iso = expand(outpath + "{sample}/quant.sf", sample=SAMPLES)
    output:
        gene = outpath + "Trinity_salmon_gene_transmap.isoform.counts.matrix"
    params:
        pre = "Trinity_salmon_gene_transmap",
        # Kept as a param on purpose: no rule in this workflow declares the map as an
        # output, so listing it as an input would fail whenever it is not already present.
        # NOTE(review): presumably the map is created next to Trinity.fasta by the
        # abundance step (--trinity_mode) — confirm.
        genemap = outpath + "Trinity.fasta.gene_trans_map",
        out_dir = outpath
    conda:
        "/home/hfm/yaml/trinity.yaml"
    shell:
        # The prefix is relative, so the command runs from inside the output directory.
        """
        cd {params.out_dir}
        abundance_estimates_to_matrix.pl --est_method salmon --gene_trans_map {params.genemap} --out_prefix {params.pre} --name_sample_by_basedir {input.iso}
        """

### 2) Then TransDecoder was used to identify which transcript isoforms contained higher-quality coding regions
- More info for TransDecoder can be found here: https://github.com/TransDecoder/TransDecoder

In [None]:
# TransDecoder
# Switch to the conda environment named "transdecoder".
conda activate transdecoder

# Identify transcript isoforms with a coding region.
# Both commands take the same assembly file; LongOrfs is run first, then Predict.
TransDecoder.LongOrfs -t Trinity.fasta
TransDecoder.Predict -t Trinity.fasta

# Afterwards, filter out of Trinity.fasta (sequences) and the abundance table (csv) the transcripts that
# do not contain a coding region or that did not pass the quality assessment from TransDecoder.