# Containment Analysis on RNA reads

- 24 rumen samples from a unique cohort of cows
- Prokaryotic and Eukaryotic ribodepletion for Dual RNA-Seq
- Sequenced to a depth of 150M paired-end reads per sample using Illumina short-read sequencing
- QC and host-read removal performed in the upstream_qc file
- Taxonomy assignment performed using sourmash (--threshold-bp 3000 --scaled=1000 -k 21) against the GenBank database

In [None]:
import os
import glob
import sys

# Directory holding the raw paired-end read files; sample IDs are discovered here.
inpath = "/home/hfm/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/SAMPLES/"
# Project root used as the prefix for the paths built in the rules below.
outpath = "/home/hfm/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/"

# A sample ID is the part of a forward-read file name that precedes "_R1_001".
# glob.glob() yields files in a filesystem-dependent order, so the IDs are
# de-duplicated through a set and then sorted: this keeps the sample order (and
# with it the row/column order of the comparison matrix) identical across runs.
SAMPLES = sorted(
    {
        os.path.basename(name).split("_R1_001")[0]
        for name in glob.glob(inpath + "*_R1_001.fixed.gz")
    }
)

# Report on stderr so the list does not mix with anything written to stdout.
print('samples are:', SAMPLES, file=sys.stderr)

def _per_sample(pattern):
    """Expand *pattern* (relative to outpath) once for every ID in SAMPLES."""
    return expand(outpath + pattern, sample=SAMPLES)


# Default target: lists every file the workflow is expected to deliver.
rule all:
    input:
        # One signature file per sample.
        sign=_per_sample("signatures/{sample}-reads.sig"),
        # Cohort-wide comparison matrix and its PDF plot.
        matrix=outpath + "compare/Noc_comp",
        plot=outpath + "compare/Noc_comp.matrix.pdf",
        # Per-sample fastgather, gather and lineage-annotated tables.
        fastgather=_per_sample("fastgather/all/{sample}_fastgather.csv"),
        gather=_per_sample("gather/all/{sample}_gather.csv"),
        tax=_per_sample("tax_annotated/all/{sample}_gather.with-lineages.csv"),


# Sketch the two non-host read files of one sample into a single signature file.
# The sketch parameters request k=21, 31 and 51 at scaled=1000 with abundances.
rule sketch_signatures:
    input:
        read1=outpath + "nonhost_reads/{sample}_non_host_reads_1.fixed.gz",
        read2=outpath + "nonhost_reads/{sample}_non_host_reads_2.fixed.gz",
    output:
        sign=outpath + "signatures/{sample}-reads.sig",
    conda:
        "sourmash.yaml"
    params:
        # Passed to --merge, so both read files end up in one sketch that is
        # named after the sample wildcard.
        merge="{sample}",
    shell:"""
         sourmash sketch dna -p k=21,k=31,k=51,scaled=1000,abund --merge {params.merge} -o {output.sign} {input.read1} {input.read2}
         """

# Compare the signatures of all samples against each other at k=21.
# Every file the command writes is declared as an output so that Snakemake
# tracks it, regenerates it when missing and removes it after a failed job.
rule sourmash_compare:
    input:
        sign=expand(outpath + "signatures/{sample}-reads.sig", sample=SAMPLES),
    output:
        # Binary matrix written by --output.
        matrix=outpath + "compare/Noc_comp",
        # Written next to the matrix as "<matrix>.labels.txt"; `sourmash plot`
        # reads it to label the rows and columns.
        labels=outpath + "compare/Noc_comp.labels.txt",
        # Same matrix in CSV form (previously an untracked `params` entry).
        csv=outpath + "compare/Noc_comp.csv",
    conda:
        "sourmash.yaml"
    shell:"""
         sourmash compare {input.sign} --output {output.matrix} -k 21 --csv {output.csv}
         """

# Render the comparison matrix as PDF figures.
rule sourmash_plot:
    input:
        matrix=outpath + "compare/Noc_comp",
    output:
        plot=outpath + "compare/Noc_comp.matrix.pdf",
    params:
        # `sourmash plot` writes its figures into the current working directory
        # unless --output-dir is given; point it at the directory of the
        # declared output so the expected PDF is actually created there.
        outdir=outpath + "compare",
    conda:
        "sourmash.yaml"
    shell:"""
         sourmash plot --labels --pdf --output-dir {params.outdir} {input.matrix}
         """

# Fast first-pass search of one sample signature against the database list.
# The CSV is written under fastgather/all/ -- the location that `rule all`
# requests and that `sourmash_gather` reads its picklist from -- so the
# {sample} wildcard resolves to the bare sample ID.
rule sourmash_fastgather:
    input:
        wort=outpath + "signatures/{sample}-reads.sig",
        against=outpath + "Sourmash_database/list-genbank-2022.03.txt",
    output:
        csv=outpath + "fastgather/all/{sample}_fastgather.csv",
    log:
        outpath + "logs/{sample}_fastgather.log"
    resources:
        # Custom resource; presumably capped on the command line
        # (--resources rayon_exclude=1) so that only one of these
        # multi-threaded jobs runs at a time -- confirm with the invocation.
        rayon_exclude=1,
    threads: 128
    conda:
        "sourmash.yaml"
    # Redirect stdout to the log first and then point stderr at it: with the
    # reverse order stderr (including the `/usr/bin/time -v` report) stays on
    # the terminal and never reaches the log file.
    shell: """
          /usr/bin/time -v sourmash scripts fastgather {input.wort} {input.against} \
             -o {output.csv} --threshold-bp 3000 -k 21 --scaled 1000 -c {threads} > {log} 2>&1
           """

# Full gather of one sample against the k=21 database, restricted through a
# picklist to the matches reported by fastgather (column match_md5).
# The result goes to gather/all/, which is where `rule all` and `sourmash_tax`
# look for it.
rule sourmash_gather:
    input:
        sign=outpath + "signatures/{sample}-reads.sig",
        db="/home/hfm/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/Sourmash_database/genbank-2022.03.k21.zip",
        picklist=outpath + "fastgather/all/{sample}_fastgather.csv",
    output:
        csv=outpath + "gather/all/{sample}_gather.csv",
    conda:
        "sourmash.yaml"
    shell:"""
         sourmash gather {input.sign} {input.db} -k 21 --no-fail-on-empty-database --threshold-bp 3000 --scaled=1000 \
            --picklist {input.picklist}:match_md5:md5 -o {output.csv}
         """

# Add lineage columns to the gather result of one sample.
rule sourmash_tax:
    input:
        gather=outpath + "gather/all/{sample}_gather.csv",
        # Lineage tables are files, so they are declared as inputs (like the
        # database in `sourmash_gather`): Snakemake then verifies that they
        # exist before the job starts.
        db=expand(
            "/home/hfm/Rumen_Microbiome_Genomics/1_Sequences_Guanhui/Sourmash_database/genbank-2022.03/genbank-2022.03-{db}.lineages.csv.gz",
            db=["archaea", "bacteria", "fungi", "viral", "protozoal"],
        ),
    output:
        tax=outpath + "tax_annotated/all/{sample}_gather.with-lineages.csv",
    params:
        # Plain path string: directory() is a flag for `output` entries and has
        # no meaning inside `params`. The command receives only this directory
        # and writes the declared "<gather name>.with-lineages.csv" file into it.
        outdir=outpath + "tax_annotated/all/",
    conda:
        "sourmash.yaml"
    shell:"""
         sourmash tax annotate -g {input.gather} -t {input.db} -o {params.outdir}
         """