# Dual RNA-Sequencing Analysis

- 24 rumen samples from a unique cohort of cows
- Prokaryotic and Eukaryotic ribodepletion for Dual RNA-Seq
- Sequenced to a depth of 150M paired-end reads per sample using Illumina short-read sequencing
- Quality checked using FastQC and MultiQC
- Low quality reads and adaptors removed using Trimmomatic
- Host and non-host reads separated using kraken2 (--classified-out --unclassified-out) against a custom database containing the bovine reference genome (plus PhiX)

In [None]:
# One-off setup: build the custom Kraken2 database (cow + PhiX) that the
# workflow later uses to split host from non-host reads. These commands were
# run before the analysis below and are kept here for the record.
cd ~/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/kraken2-2.1.2/

# Download the NCBI taxonomy into the new database directory; this has to be
# in place before any sequence library is added.
kraken2-build --download-taxonomy --db Cow_and_PhiX_db
# Fetch the PhiX genome (sequencing spike-in control) and add it to the library
# unmasked (--no-masking skips low-complexity masking).
curl -L https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/819/615/GCF_000819615.1_ViralProj14015/GCF_000819615.1_ViralProj14015_genomic.fna.gz | gunzip > phiX.fa
kraken2-build --add-to-library phiX.fa --db Cow_and_PhiX_db --no-masking

# Add the cow reference genome to the library (masking left at its default).
# NOTE(review): bosTaurus.fa is expected to already exist in this directory and
# its sequence headers must be resolvable to a taxon by kraken2 - confirm.
kraken2-build --add-to-library bosTaurus.fa --db Cow_and_PhiX_db

# Build the database from everything added above, then remove the
# intermediate files that are no longer needed after the build.
kraken2-build --build --threads 8 --db Cow_and_PhiX_db
kraken2-build --clean --db Cow_and_PhiX_db

In [None]:
import os
import glob
import sys

# Raw paired-end inputs live in `inpath`; every result directory is created
# underneath `outpath`.
inpath = "/home/hfm/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/SAMPLES/"
outpath = "/home/hfm/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/"

# One sample ID per R1 file: everything in the file name before "_R1_001".
# dict.fromkeys drops duplicate IDs while keeping first-seen (glob) order.
SAMPLES = list(
    dict.fromkeys(
        os.path.basename(r1_path).split("_R1_001")[0]
        for r1_path in glob.glob(inpath + "*_R1_001.fixed.gz")
    )
)

# Report on stderr so the list shows up in the Snakemake log without
# polluting stdout.
print('samples are:', SAMPLES, file=sys.stderr)

# Default target: request every final product of the workflow for every
# sample - FastQC reports, paired/unpaired trimmed reads, and the host /
# non-host read pairs. The extra wildcards only enumerate the mate number
# (1/2) and the trimmed-file flavour (paired "trim" vs. unpaired "se").
rule all:
    input:
        fastqc = expand(
            outpath + "FASTQC/{sample}_R{mate}_001.fixed_fastqc.html",
            sample=SAMPLES, mate=["1", "2"],
        ),
        trimmed = expand(
            outpath + "TRIMMED/{sample}_R{mate}_001_{kind}.fixed.gz",
            sample=SAMPLES, mate=["1", "2"], kind=["trim", "se"],
        ),
        host = expand(
            outpath + "host_reads/{sample}_host_reads_{mate}.fixed.gz",
            sample=SAMPLES, mate=["1", "2"],
        ),
        nonhost = expand(
            outpath + "nonhost_reads/{sample}_non_host_reads_{mate}.fixed.gz",
            sample=SAMPLES, mate=["1", "2"],
        ),


# Run FastQC on both raw mates of a sample and collect the HTML reports
# under <outpath>/FASTQC/.
rule make_fastqc:
    input:
        read1 = inpath+"{sample}_R1_001.fixed.gz",
        read2 = inpath+"{sample}_R2_001.fixed.gz"
    output:
        fastqc1 = outpath+"FASTQC/{sample}_R1_001.fixed_fastqc.html",
        fastqc2 = outpath+"FASTQC/{sample}_R2_001.fixed_fastqc.html"
    params:
        # FastQC writes its reports next to the input files unless --outdir is
        # given, which would leave the outputs declared above missing.
        # Snakemake creates this directory before the job starts because it is
        # the parent of the declared outputs.
        outdir = outpath+"FASTQC"
    shell:
        "fastqc --outdir {params.outdir} {input}"

# Adapter- and quality-trim one sample's raw read pair with Trimmomatic (PE
# mode). Pairs that survive intact go to the *_trim files; reads whose mate
# was discarded go to the matching *_se file.
rule trimmomatic:
    conda:
        "trimmomatic.yaml"
    input:
        read1 = inpath + "{sample}_R1_001.fixed.gz",
        read2 = inpath + "{sample}_R2_001.fixed.gz"
    output:
        trimmed1 = outpath + "TRIMMED/{sample}_R1_001_trim.fixed.gz",
        se1 = outpath + "TRIMMED/{sample}_R1_001_se.fixed.gz",
        trimmed2 = outpath + "TRIMMED/{sample}_R2_001_trim.fixed.gz",
        se2 = outpath + "TRIMMED/{sample}_R2_001_se.fixed.gz"
    params:
        # Adapter FASTA handed to ILLUMINACLIP; stored directly under outpath.
        adaptor = outpath + "all_illumina_trimmomatic_2022Aug01.fa"
    # Trimmomatic PE expects its outputs in the order
    # paired-R1, unpaired-R1, paired-R2, unpaired-R2.
    shell:"""
        trimmomatic PE -quiet {input.read1} {input.read2} {output.trimmed1} {output.se1} {output.trimmed2} {output.se2}\
        ILLUMINACLIP:{params.adaptor}:2:40:15 LEADING:2 TRAILING:2 SLIDINGWINDOW:4:15 MINLEN:50
        """

# Split each sample's trimmed read pairs into host and non-host sets with
# Kraken2: pairs classified against the cow + PhiX database are written as
# "host", unclassified pairs as "non-host". Kraken2 writes uncompressed files,
# which are then gzip-compressed in place to give the declared outputs.
rule kraken_sort:
    input:
        trimmed1 = outpath+"TRIMMED/{sample}_R1_001_trim.fixed.gz",
        trimmed2 = outpath+"TRIMMED/{sample}_R2_001_trim.fixed.gz"
    output:
        host1 = outpath+"host_reads/{sample}_host_reads_1.fixed.gz",
        host2 = outpath+"host_reads/{sample}_host_reads_2.fixed.gz",
        nonhost1 = outpath+"nonhost_reads/{sample}_non_host_reads_1.fixed.gz",
        nonhost2 = outpath+"nonhost_reads/{sample}_non_host_reads_2.fixed.gz"
    params:
        db = "/home/hfm/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/kraken2-2.1.2/Cow_and_PhiX_db",
        # With --paired, kraken2 replaces '#' in these patterns with _1 / _2,
        # which yields the four uncompressed paths listed below.
        host = outpath+"host_reads/{sample}_host_reads#.fixed",
        nonhost = outpath+"nonhost_reads/{sample}_non_host_reads#.fixed",
        # Uncompressed files kraken2 produces; "<path>.gz" is the matching output.
        host1 = outpath+"host_reads/{sample}_host_reads_1.fixed",
        host2 = outpath+"host_reads/{sample}_host_reads_2.fixed",
        nonhost1 = outpath+"nonhost_reads/{sample}_non_host_reads_1.fixed",
        nonhost2 = outpath+"nonhost_reads/{sample}_non_host_reads_2.fixed"
    # Declare the cores this job uses so the scheduler accounts for them (and
    # scales them down when fewer are available) instead of treating a
    # 16-thread job as single-threaded.
    threads: 16
    # pigz compresses each file in place to "<file>.gz", which is exactly the
    # declared output path, so no stdout redirection is needed. -f overwrites
    # any stale .gz left behind by an earlier interrupted run.
    shell:"""
        kraken2 --threads {threads} --db {params.db} --gzip-compressed --classified-out {params.host} --unclassified-out {params.nonhost} --paired {input.trimmed1} {input.trimmed2}
        pigz -f -p {threads} {params.host1} {params.host2} {params.nonhost1} {params.nonhost2}
        """