In [13]:
import jpy_tools.parseSnake as jps

In [14]:
snakeFile = jps.SnakeMakeFile()

In [15]:
snakeHeader = jps.SnakeHeader(
    snakeFile,
    "/public/home/liuzj/scripts/pipeline/analyzeNGSData/snakemake/config_20210330.yaml",
)
snakeHeader.addFeature("rawDataPath")
snakeHeader.addCode(
    """
import glob
import re

allR1Ls = glob.glob(f"{rawDataPath}*_1.fastq.gz")
sampleLs = [re.search(f"(?<={rawDataPath})[\w\W]+?(?=_1.fastq.gz)", x)[0] for x in allR1Ls]"""
)
snakeHeader.addLsToPool('sample')
snakeHeader.generateContent()

config contents:

     resultDir      :/public/home/liuzj/scripts/pipeline/analyzeNGSData/results/
       useQC        :True
    pipelineDir     :/public/home/liuzj/scripts/pipeline/analyzeNGSData/scripts/
    rawDataPath     :/public/home/liuzj/scripts/pipeline/analyzeNGSData/rawData/
       genome       :/public/home/liuzj/data/Araport11/HISAT2INDEX
      annoGtf       :/public/home/liuzj/data/Araport11/gene.gtf
      annoGff       :/public/home/liuzj/data/Araport11/gene.gff
     repreTrans     :/public/home/liuzj/data/Araport11/represent.proteinIdPos.txt
-----------------
 configfile: "/public/home/liuzj/scripts/pipeline/analyzeNGSData/snakemake/config_20210330.yaml"
pipelineDir = config['pipelineDir']
rawDataPath = config['rawDataPath']

import glob
import re

allR1Ls = glob.glob(f"{rawDataPath}*_1.fastq.gz")
sampleLs = [re.search(f"(?<={rawDataPath})[\w\W]+?(?=_1.fastq.gz)", x)[0] for x in allR1Ls]


In [16]:
fastpQC = jps.SnakeRule(snakeFile, 'fastpQC',16)
fastpQC.setInput(
    c = dict(
        readOne = 'f\'{rawDataPath}{{sample}}_1.fastq.gz\'', \
        readTwo = 'f\'{rawDataPath}{{sample}}_2.fastq.gz\'')
)
fastpQC.setOutput(
    a = ['{{sample}}_AfterQC_1.fastq.gz', '{{sample}}_AfterQC_2.fastq.gz']
)
fastpQC.setParams(
    b = ['useQC'] , a = ['{{sample}}_qcReport.html']
)
fastpQC.setShell("""
if [ {params.useQC} == True ]
then
    fastp -i {input.readOne} -o {output.AfterQC_1} -I {input.readTwo} -O {output.AfterQC_2} -w {threads} -h {params.qcReport}
else
    cp {input.readOne} {output.AfterQC_1} && cp {input.readTwo} {output.AfterQC_2}
fi
""")
fastpQC.generateContent()

2021-03-30 14:26:04.223 | INFO     | jpy_tools.parseSnake:__init__:111 - Current step: 1


rule fastpQC:
    input:
        readOne = f'{rawDataPath}{{sample}}_1.fastq.gz',
        readTwo = f'{rawDataPath}{{sample}}_2.fastq.gz'
    output:
        AfterQC_1 = f"{config['resultDir']}step1_fastpQC/{{sample}}_AfterQC_1.fastq.gz",
        AfterQC_2 = f"{config['resultDir']}step1_fastpQC/{{sample}}_AfterQC_2.fastq.gz"
    params:
        useQC = config['useQC'],
        qcReport = f"{config['resultDir']}step1_fastpQC/{{sample}}_qcReport.html",
        gpu = "0"

    threads:16
    shell:
        """
cd {pipelineDir}
if [ {params.useQC} == True ]
then
    fastp -i {input.readOne} -o {output.AfterQC_1} -I {input.readTwo} -O {output.AfterQC_2} -w {threads} -h {params.qcReport}
else
    cp {input.readOne} {output.AfterQC_1} && cp {input.readTwo} {output.AfterQC_2}
fi
        """




In [17]:
mappingUsingHisat = jps.SnakeRule(snakeFile, 'mappingUsingHisat', 16)
mappingUsingHisat.setInput(
    a = ['{{sample}}_AfterQC_1.fastq.gz', '{{sample}}_AfterQC_2.fastq.gz']
)
mappingUsingHisat.setOutput(a = ['{{sample}}_sortedBam.bam'])
mappingUsingHisat.setParams(a = ['{{sample}}_unsortedSam.sam'], b=['genome'])
mappingUsingHisat.setShell("""
hisat2 -x {params.genome} -p {threads} --min-intronlen 20 --max-intronlen 10000 --dta --time -1 {input.AfterQC_1} -2 {input.AfterQC_2} -S {params.unsortedSam} &&\
     samtools sort {params.unsortedSam} -@ {threads} -O bam -o {output.sortedBam} &&\
         samtools index {output.sortedBam} &&\
             rm {params.unsortedSam}
""")
mappingUsingHisat.generateContent()

deDuplicates = jps.SnakeRule(snakeFile, 'deDuplicates', 2, 8)
deDuplicates.setInput(a = ['{{sample}}_sortedBam.bam'])
deDuplicates.setOutput(a = ['{{sample}}_sortedDedupBam.bam'])
deDuplicates.setParams(a = ['{{sample}}_sortedDedupMat.txt'])
deDuplicates.setShell("""
java -jar /public/apps/picard_2.20.2/picard.jar MarkDuplicates VALIDATION_STRINGENCY=SILENT REMOVE_DUPLICATES=true SORTING_COLLECTION_SIZE_RATIO=0.01 I={input.sortedBam} O={output.sortedDedupBam} M={params.sortedDedupMat} &&\
    samtools index {output.sortedDedupBam}
""")
deDuplicates.generateContent()

2021-03-30 14:26:04.423 | INFO     | jpy_tools.parseSnake:__init__:111 - Current step: 2
2021-03-30 14:26:04.425 | INFO     | jpy_tools.parseSnake:__init__:111 - Current step: 3


rule mappingUsingHisat:
    input:
        AfterQC_1 = f"{config['resultDir']}step1_fastpQC/{{sample}}_AfterQC_1.fastq.gz",
        AfterQC_2 = f"{config['resultDir']}step1_fastpQC/{{sample}}_AfterQC_2.fastq.gz"
    output:
        sortedBam = f"{config['resultDir']}step2_mappingUsingHisat/{{sample}}_sortedBam.bam"
    params:
        unsortedSam = f"{config['resultDir']}step2_mappingUsingHisat/{{sample}}_unsortedSam.sam",
        genome = config['genome'],
        gpu = "0"

    threads:16
    shell:
        """
cd {pipelineDir}
hisat2 -x {params.genome} -p {threads} --min-intronlen 20 --max-intronlen 10000 --dta --time -1 {input.AfterQC_1} -2 {input.AfterQC_2} -S {params.unsortedSam} &&     samtools sort {params.unsortedSam} -@ {threads} -O bam -o {output.sortedBam} &&         samtools index {output.sortedBam} &&             rm {params.unsortedSam}
        """


rule deDuplicates:
    input:
        sortedBam = f"{config['resultDir']}step2_mappingUsingHisat/{{sample}}_sortedBam.bam"


In [18]:
runStringTie = jps.SnakeRule(snakeFile, 'runStringTie', 16)
runStringTie.setInput(a = ['{{sample}}_sortedDedupBam.bam'], b = ['annoGff'])
runStringTie.setOutput(a = ['{{sample}}/geneStringtieMerge.gtf', '{{sample}}/geneStringtieAbund.tsv'])
runStringTie.setShell("""
stringtie -A {output.geneStringtieAbund} -e --rf -B -p {threads} -G {input.annoGff} -o {output.geneStringtieMerge} {input.sortedDedupBam}
""")
runStringTie.generateContent()

2021-03-30 14:26:04.572 | INFO     | jpy_tools.parseSnake:__init__:111 - Current step: 4


rule runStringTie:
    input:
        sortedDedupBam = f"{config['resultDir']}step3_deDuplicates/{{sample}}_sortedDedupBam.bam",
        annoGff = config['annoGff']
    output:
        geneStringtieMerge = f"{config['resultDir']}step4_runStringTie/{{sample}}/geneStringtieMerge.gtf",
        geneStringtieAbund = f"{config['resultDir']}step4_runStringTie/{{sample}}/geneStringtieAbund.tsv"
    params:
        gpu = "0"

    threads:16
    shell:
        """
cd {pipelineDir}
stringtie -A {output.geneStringtieAbund} -e --rf -B -p {threads} -G {input.annoGff} -o {output.geneStringtieMerge} {input.sortedDedupBam}
        """




In [19]:
extractRnaRpkm = jps.SnakeRule(snakeFile, "extractRnaRpkm", 4)
extractRnaRpkm.setInput(a=["geneStringtieMerge"])
extractRnaRpkm.setOutput(a=["{{sample}}_rpkmRna.tsv", "{{sample}}_rpkmGene.tsv"])
extractRnaRpkm.setParams(
    d=dict(step4=["/"]),a=['sample']
    )
extractRnaRpkm.setShell(
    """
Rscript extract_rpkm_from_ballgown.R {params.sample}  {params.step4ResultDir}{params.sample} {output.rpkmRna} {output.rpkmGene} 
"""
)
extractRnaRpkm.generateContent()

2021-03-30 14:26:04.702 | INFO     | jpy_tools.parseSnake:__init__:111 - Current step: 5


rule extractRnaRpkm:
    input:
        geneStringtieMerge = f"{config['resultDir']}step4_runStringTie/{{sample}}/geneStringtieMerge.gtf"
    output:
        rpkmRna = f"{config['resultDir']}step5_extractRnaRpkm/{{sample}}_rpkmRna.tsv",
        rpkmGene = f"{config['resultDir']}step5_extractRnaRpkm/{{sample}}_rpkmGene.tsv"
    params:
        step4ResultDir = f"{config['resultDir']}step4_runStringTie/",
        sample = '{sample}',
        gpu = "0"

    threads:4
    shell:
        """
cd {pipelineDir}
Rscript extract_rpkm_from_ballgown.R {params.sample}  {params.step4ResultDir}{params.sample} {output.rpkmRna} {output.rpkmGene}
        """




In [20]:
extractGeneCounts = jps.SnakeRule(snakeFile, 'extractGeneCounts', 1)
extractGeneCounts.setInput(e = ['rpkmRna'])
extractGeneCounts.setOutput(a = ['allSampleGeneCounts.csv', 'allSampleTransCounts.csv'])
extractGeneCounts.setParams(d = dict(step4=['/']))
extractGeneCounts.setShell("""
prepDE.py -g {output.allSampleGeneCounts} -t {output.allSampleTransCounts} -i {params.step4ResultDir}
""")
extractGeneCounts.generateContent()

2021-03-30 14:26:04.791 | INFO     | jpy_tools.parseSnake:__init__:111 - Current step: 6


rule extractGeneCounts:
    input:
        rpkmRna = [f"{config['resultDir']}step5_extractRnaRpkm/{sample}_rpkmRna.tsv" for sample in sampleLs]
    output:
        allSampleGeneCounts = f"{config['resultDir']}step6_extractGeneCounts/allSampleGeneCounts.csv",
        allSampleTransCounts = f"{config['resultDir']}step6_extractGeneCounts/allSampleTransCounts.csv"
    params:
        step4ResultDir = f"{config['resultDir']}step4_runStringTie/",
        gpu = "0"

    threads:1
    shell:
        """
cd {pipelineDir}
prepDE.py -g {output.allSampleGeneCounts} -t {output.allSampleTransCounts} -i {params.step4ResultDir}
        """




In [21]:
calIrRatio = jps.SnakeRule(snakeFile, 'calIrRatio', 1)
calIrRatio.setInput(a = ['{{sample}}_sortedDedupBam.bam'], b= ['repreTrans'])
calIrRatio.setOutput(a = ['{{sample}}_irRatio.tsv'])
calIrRatio.setShell("""
python cal_ir_ratio.from_bam.by_splicing_Read_type.version2.py {input.sortedDedupBam} {input.repreTrans} {output.irRatio} 1 4 1
""")
calIrRatio.generateContent()

2021-03-30 14:26:04.961 | INFO     | jpy_tools.parseSnake:__init__:111 - Current step: 7


rule calIrRatio:
    input:
        sortedDedupBam = f"{config['resultDir']}step3_deDuplicates/{{sample}}_sortedDedupBam.bam",
        repreTrans = config['repreTrans']
    output:
        irRatio = f"{config['resultDir']}step7_calIrRatio/{{sample}}_irRatio.tsv"
    params:
        gpu = "0"

    threads:1
    shell:
        """
cd {pipelineDir}
python cal_ir_ratio.from_bam.by_splicing_Read_type.version2.py {input.sortedDedupBam} {input.repreTrans} {output.irRatio} 1 4 1
        """




In [22]:
gatherIrRpkm = jps.SnakeRule(snakeFile, 'gatherIrRpkm', 1)
gatherIrRpkm.setInput(a = ['{{sample}}_irRatio.tsv', '{{sample}}_rpkmGene.tsv'])
gatherIrRpkm.setOutput(a = ['{{sample}}_gatherIrRpkm.tsv'])
gatherIrRpkm.setShell("""
python gatherIrRpkm.py -i {input.irRatio} -R {input.rpkmGene} -o {output.gatherIrRpkm}
""")
gatherIrRpkm.generateContent()

2021-03-30 14:26:05.082 | INFO     | jpy_tools.parseSnake:__init__:111 - Current step: 8


rule gatherIrRpkm:
    input:
        irRatio = f"{config['resultDir']}step7_calIrRatio/{{sample}}_irRatio.tsv",
        rpkmGene = f"{config['resultDir']}step5_extractRnaRpkm/{{sample}}_rpkmGene.tsv"
    output:
        gatherIrRpkm = f"{config['resultDir']}step8_gatherIrRpkm/{{sample}}_gatherIrRpkm.tsv"
    params:
        gpu = "0"

    threads:1
    shell:
        """
cd {pipelineDir}
python gatherIrRpkm.py -i {input.irRatio} -R {input.rpkmGene} -o {output.gatherIrRpkm}
        """




In [23]:
ruleAll = jps.SnakeAll(snakeFile)
ruleAll.addOutput('gatherIrRpkm', 'allSampleGeneCounts')
ruleAll.generateContent()

rule all:
    input:
        [f"{config['resultDir']}step8_gatherIrRpkm/{sample}_gatherIrRpkm.tsv" for sample in sampleLs],
        f"{config['resultDir']}step6_extractGeneCounts/allSampleGeneCounts.csv"


In [24]:
snakeFile.generateContent("/public/home/liuzj/scripts/pipeline/analyzeNGSData/snakemake/snakefile_20210330")

configfile: "/public/home/liuzj/scripts/pipeline/analyzeNGSData/snakemake/config_20210330.yaml"
pipelineDir = config['pipelineDir']
rawDataPath = config['rawDataPath']

import glob
import re

allR1Ls = glob.glob(f"{rawDataPath}*_1.fastq.gz")
sampleLs = [re.search(f"(?<={rawDataPath})[\w\W]+?(?=_1.fastq.gz)", x)[0] for x in allR1Ls]

rule all:
    input:
        [f"{config['resultDir']}step8_gatherIrRpkm/{sample}_gatherIrRpkm.tsv" for sample in sampleLs],
        f"{config['resultDir']}step6_extractGeneCounts/allSampleGeneCounts.csv"

rule fastpQC:
    input:
        readOne = f'{rawDataPath}{{sample}}_1.fastq.gz',
        readTwo = f'{rawDataPath}{{sample}}_2.fastq.gz'
    output:
        AfterQC_1 = f"{config['resultDir']}step1_fastpQC/{{sample}}_AfterQC_1.fastq.gz",
        AfterQC_2 = f"{config['resultDir']}step1_fastpQC/{{sample}}_AfterQC_2.fastq.gz"
    params:
        useQC = config['useQC'],
        qcReport = f"{config['resultDir']}step1_fastpQC/{{sample}}_qcReport.html",
     