#genomeAnnotation.Snakemake

#run snakemake
#
#snakemake -d `pwd` -s `pwd`/genomeAnnotation.Snakemake --stats snakemake.stats -j 100 --cluster 'qsub {params.cluster}'

#################################
#                               #
#     Import modules            #
#                               #
#################################

import fnmatch
import math
import os

#################################
#                               #
#      Own function definition  #
#                               #
#################################


def getFileNumber(file_name, entryNumber):
    """Derive a file count from the number of ``>`` lines in a text file.

    Every line of ``file_name`` that contains a ``>`` character is counted
    and the result is ``floor(2 * count / entryNumber + 1)``.
    NOTE(review): presumably used to decide into how many pieces a FASTA
    file is split -- confirm against the callers.

    Args:
        file_name: Path of the text file to scan.
        entryNumber: Divisor applied to twice the number of matching lines.

    Returns:
        The value of ``math.floor(2 * count / entryNumber + 1)``.

    Raises:
        OSError: If ``file_name`` cannot be opened.
        ZeroDivisionError: If ``entryNumber`` is 0.
    """
    # The context manager closes the handle even when reading fails part-way.
    with open(file_name) as handle:
        total = sum(1 for line in handle if ">" in line)
    return math.floor(2 * total / entryNumber + 1)


#get bam file
def getBamFile(dir, species):
    """List the BAM files found in ``<dir>/<species>.reads``.

    Args:
        dir: Directory that contains the ``<species>.reads`` sub-directory.
        species: Identifier used as the prefix of the reads sub-directory.

    Returns:
        The file names (not full paths) matching ``*.*.bam``, in the order
        reported by ``os.listdir``.

    Raises:
        OSError: If the reads directory is missing or cannot be read.
    """
    reads_dir = dir + '/' + species + '.reads/.'
    # Local names no longer shadow the ``list`` and ``file`` builtins.
    return [name for name in os.listdir(reads_dir)
            if fnmatch.fnmatch(name, '*.*.bam')]

#uniquify ids
def f10(seq, idfun=None):
    """Return the distinct values of ``seq`` in first-seen order.

    When ``idfun`` is given, every item is first mapped through it; the
    mapped values are what gets compared and what ends up in the result.
    Values must be hashable.
    """
    transform = idfun if idfun is not None else (lambda item: item)
    already_seen = set()
    unique = []
    for item in seq:
        key = transform(item)
        if key not in already_seen:
            already_seen.add(key)
            unique.append(key)
    return unique


#################################
#                               #
#      Variables setup          #
#                               #
#################################

    #################################                
    #Environment                    #
    #################################

#WORKDIR="/home/lv70539/htafer/genomeAnnotation"
#WORKDIR="/media/vsc2/genomeAnnotation"
#WORKDIR="/home/htafer/annotation/"
# Directory holding the genome FASTA and the reads/ folder; the rules prefix
# their inputs and outputs with it.
WORKDIR = "/media/work/genomeAnnotation"
# Scratch location in which the rules create their temporary run directories.
COMPUTEDIR = "/tmp"
#COMPUTEDIR="/global/lv70539/htafer"
#COMPUTEDIR="/scratch"

    #################################
    #THREADS                        #
    #################################

# Thread count requested by the multi-threaded rules.
THREADS = 8

    ##################################
    #FILES                           #
    ##################################

#GENOME: identifier of the assembly and the FASTA file derived from it
ID = "cImmunda"
REF = ID + ".fasta"
#UNIREF: protein database streamed into scipio
UNIREF = os.environ['HOME'] + "/share/database/UniProt90pSaccharomyceta.fasta"
#BAMFILES: pattern of the read files; BAMS receives every value matched by {samples}
BAMFILES = WORKDIR + "/reads/{samples}.bam"
(BAMS,) = glob_wildcards(BAMFILES)
GFFS = ["cegma", "scipio"]

#BIOLOGY: maximum intron length handed to the tools
INTRON = 2000


#Which rules are run locally # depends on which computer the pipeline is running
# Rules named here are executed on the submitting host instead of being
# handed to the cluster submission command.
# NOTE(review): `cuffmerge` matches no rule visible in this file (the
# cuffmerge step is implemented by `mergeAssemblies`) -- confirm it can go.
localrules: all, clean, geneMarkEs, cegma, scipio, cufflinks, cuffmerge, segemehl, segemehlIdx, bamToFastq, composeMerge, mergeAssemblies, trinityAlignment, trinityDeNovo,PASA

# Top-level target: requests one reads/<sample>.PASA result for every sample
# name collected in BAMS.
rule all:
     input: expand("reads/{samples}.PASA", samples=BAMS)


##################################
##                               #
##      tRNAscan-SE              #
##                               #
##################################
#
#
#
#
#

##################################
##                               #
##      Augustus                 #
##                               #
##################################


##################################
##                               #
##      SNAP                     #
##                               #
##################################


#################################################################################
#                                                                               #
#                             EVIDENCE MAPPING                                  #
#                                                                               #
#################################################################################


##################################
##                               #
##      PASA  FIRST PASS         #
##                               #
##################################

# First-pass PASA run for one sample.
#   input  dn: directory with the de novo Trinity assembly (Trinity.fasta)
#          gg: directory with the genome-guided Trinity assembly (Trinity-GG.fasta)
#          cf: Cufflinks transcripts passed as additional evidence
#   output   : the scratch run directory, moved to reads/<sample>.PASA
# Both Trinity assemblies are concatenated, their accessions extracted, the
# alignAssembly template is instantiated with the sample name as database
# name, and the PASA pipeline plus the comprehensive-transcriptome build are
# run inside {COMPUTEDIR}. $PASAHOME must be set in the job environment.
# `${{...}}` is the escaped form of a shell `${...}` expansion.
rule PASA:
     input: dn="reads/{samples}.trinityDN",gg="reads/{samples}.trinityGG",cf="reads/{samples}.cufflinks/transcripts.gtf"
     output: "reads/{samples}.PASA"
     params: cluster="-cwd -V"
     threads: THREADS
     shell:"""
     prefix=`date --rfc-3339=ns  | md5sum | head -c 16`
     mkdir -p {COMPUTEDIR}/PASA${{prefix}}
     cd {COMPUTEDIR}/PASA${{prefix}}
     #Prepare files
     cat {WORKDIR}/{input.dn}/Trinity.fasta {WORKDIR}/{input.gg}/Trinity-GG.fasta > {COMPUTEDIR}/PASA${{prefix}}/transcripts.fasta
     $PASAHOME/misc_utilities/accession_extractor.pl < {COMPUTEDIR}/PASA${{prefix}}/transcripts.fasta > {COMPUTEDIR}/PASA${{prefix}}/tdn.accs
     #Prepare alignAssembly.config (the database is named after the sample wildcard)
     DBNAME="{wildcards.samples}"
     sed -r "s/DBNAME/${{DBNAME}}/" $PASAHOME/pasa_conf/pasa.alignAssembly.Template.txt > ./alignAssembly.config
     #Align against the configured reference genome rather than a hard-coded file name
     $PASAHOME/scripts/Launch_PASA_pipeline.pl -c ./alignAssembly.config -C -R -g {WORKDIR}/{REF} --ALIGNERS blat,gmap -t ./transcripts.fasta --transcribed_is_aligned_orient --TDN ./tdn.accs --cufflinks_gtf {WORKDIR}/{input.cf} -I {INTRON} --stringent_alignment_overlap 30.0 --CPU {threads}
     $PASAHOME/scripts/build_comprehensive_transcriptome.dbi -c alignAssembly.config -t transcripts.fasta --min_per_ID 95 --min_per_aligned 30
     mv {COMPUTEDIR}/PASA${{prefix}} {WORKDIR}/{output}
     """


##################################
##                               #
##      TRINITY                  #
##                               #
##################################
#Can we really rely on Trinity to make the assemblies?
#Apparently it works at least as well as Newbler and MIRA for 454 data, which is similar to Ion Torrent data
#http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0051188


# De novo Trinity assembly of one sample.
#   input : reads/<sample>.fastq, given to Trinity as single-end reads with
#           --SS_lib_type F
#   output: the Trinity output directory, moved to reads/<sample>.trinityDN
# Trinity writes into a scratch directory under {COMPUTEDIR} whose name is
# derived from an md5 of the current timestamp; `${{prefix}}` is the escaped
# form of the shell expansion `${prefix}`.
rule trinityDeNovo:
     input: "reads/{samples}.fastq"
     output: "reads/{samples}.trinityDN"
     params: cluster="-cwd -V"
     threads: THREADS
     shell:"""
     prefix=`date --rfc-3339=ns  | md5sum | head -c 16`
     mkdir -p {COMPUTEDIR}/trinity${{prefix}}
     Trinity --seqType fq --single {input} --SS_lib_type F --CPU {threads} --output {COMPUTEDIR}/trinity${{prefix}}  --max_memory 24G
     mv {COMPUTEDIR}/trinity${{prefix}} {WORKDIR}/{output}
     """


# Genome-guided Trinity assembly of one sample.
#   input : reads/<sample>.sam.cf, the Cufflinks-compatible alignment written
#           by the segemehl rule
#   output: the Trinity output directory, moved to reads/<sample>.trinityGG
# The maximum intron length comes from INTRON; the run happens in a scratch
# directory under {COMPUTEDIR} that is moved into place at the end.
# NOTE(review): the input is written as SAM text by the segemehl rule but is
# passed to --genome_guided_bam -- confirm Trinity accepts it.
rule trinityAlignment:
     input: "reads/{samples}.sam.cf"
     output: dire="reads/{samples}.trinityGG"
     params: cluster="-cwd -V"     
     threads: THREADS
     shell:"""
     prefix=`date --rfc-3339=ns  | md5sum | head -c 16`
     mkdir -p {COMPUTEDIR}/trinity${{prefix}}
     Trinity --genome_guided_bam {input} --genome_guided_max_intron {INTRON} --max_memory 10G --CPU {threads} --output {COMPUTEDIR}/trinity${{prefix}}
     mv {COMPUTEDIR}/trinity${{prefix}} {WORKDIR}/{output}
     """


##################################
##                               #
##      CUFFLINKS                #
##                               #
##################################


# Merge the per-sample Cufflinks assemblies with cuffmerge.
#   input : reads/assemblies.txt, the list of GTF files to merge
#   output: gff/cufflinks.gff (cuffmerge's merged.gtf, renamed) and the gff
#           directory that cuffmerge writes into
# All options are placed before the assembly list, matching the documented
# `cuffmerge [options]* <assembly_GTF_list.txt>` usage, so that the thread
# count is parsed as an option and not as a trailing argument.
rule mergeAssemblies:
     input: 'reads/assemblies.txt'
     output: 'gff/cufflinks.gff',dir='gff'
     params: cluster="-cwd -V"
     threads: THREADS
     shell:"""
        cuffmerge -o {output.dir} -s {REF} -p {threads} {input}
        mv {output.dir}/merged.gtf {output[0]}
     """
        

# Write reads/assemblies.txt: the paths of all per-sample Cufflinks
# transcripts.gtf files, one per line.
rule composeMerge:
     input: expand('reads/{sample}.cufflinks/transcripts.gtf', sample=BAMS)
     output: txt='reads/assemblies.txt'
     run:
         # One path per line, with a terminating newline after the last entry.
         listing = "\n".join(str(gtf) for gtf in input) + "\n"
         with open(output.txt, 'w') as handle:
             handle.write(listing)


# Reference-guided transcript assembly of one sample with Cufflinks.
#   input : reads/<sample>.sam.cf, the Cufflinks-compatible alignment
#   output: reads/<sample>.cufflinks/ and the transcripts.gtf inside it
# Cufflinks runs in a scratch directory under {COMPUTEDIR}; its files are
# moved into the output directory and the emptied scratch directory is then
# removed. The maximum intron length comes from INTRON, as in the other
# rules. `${{prefix}}` is the escaped form of the shell expansion `${prefix}`.
rule cufflinks:
     input: "reads/{samples}.sam.cf"
     output: dire="reads/{samples}.cufflinks", file="reads/{samples}.cufflinks/transcripts.gtf"
     params: cluster="-cwd -V"
     threads: THREADS
     shell:"""
     prefix=`date --rfc-3339=ns  | md5sum | head -c 16`
     mkdir -p {COMPUTEDIR}/${{prefix}}
     cufflinks -o {COMPUTEDIR}/${{prefix}} -p {threads} -u -I {INTRON} --max-bundle-length 10000  --min-intron-length 30 {WORKDIR}/{input}
     mv {COMPUTEDIR}/${{prefix}}/* {WORKDIR}/{output.dire}
     #clean up
     rm -rf {COMPUTEDIR}/${{prefix}}
     """

#
#
##################################
##                               #
##     Spliced RNAseq mapping    #
##                               #
##################################
#


# Spliced read mapping of one sample with segemehl.
#   input  reads : reads/<sample>.fastq
#          genome: the genome FASTA (ID.fasta)
#          idx   : the segemehl index built by segemehlIdx
#   output mapped  : reads/<sample>.merged.sorted.sam
#          mappedCf: reads/<sample>.sam.cf, the Cufflinks-compatible SAM
# Steps, all executed in a scratch directory under {COMPUTEDIR}:
#   1. map the reads with segemehl.x, writing unmapped reads to a separate file
#   2. sort the mapped reads
#   3. remap the unmapped reads with lack.x
#   4. merge remapped and mapped reads and sort the result
#   5. convert with s2c.py, drop lines starting with "SQ", and sort again
#   6. move both result files into {WORKDIR} and delete the scratch directory
# `${{prefix}}` is the escaped form of the shell expansion `${prefix}`.
rule segemehl:
     input: reads="reads/{samples}.fastq",genome=ID+".fasta", idx=ID+".idx"
     output: mapped="reads/{samples}.merged.sorted.sam", mappedCf="reads/{samples}.sam.cf"
     params: cluster="-cwd -V"     
     threads: THREADS
     shell:"""
     prefix=`date --rfc-3339=ns  | md5sum | head -c 16`
     mkdir -p {COMPUTEDIR}/${{prefix}}
     #map
     segemehl.x -s -S -d {WORKDIR}/{input.genome} -i {WORKDIR}/{input.idx} -t {threads} -q {WORKDIR}/{input.reads} -u {COMPUTEDIR}/${{prefix}}/unmapped.sam > {COMPUTEDIR}/${{prefix}}/mapped.sam
     #sort mapped reads
     samtools view -bS {COMPUTEDIR}/${{prefix}}/mapped.sam | samtools sort - {COMPUTEDIR}/${{prefix}}/mapped.sorted
     samtools view -h  {COMPUTEDIR}/${{prefix}}/mapped.sorted.bam > {COMPUTEDIR}/${{prefix}}/mapped.sorted.sam
     #remap unmapped reads
     lack.x -s -d {input.genome} -q {COMPUTEDIR}/${{prefix}}/mapped.sorted.sam -r {COMPUTEDIR}/${{prefix}}/unmapped.sam -o {COMPUTEDIR}/${{prefix}}/remapped.sam -t {threads}
     #merge reads
     samtools view -Sh {COMPUTEDIR}/${{prefix}}/remapped.sam > {COMPUTEDIR}/${{prefix}}/merged.sam
     samtools view -S  {COMPUTEDIR}/${{prefix}}/mapped.sorted.sam >> {COMPUTEDIR}/${{prefix}}/merged.sam
     #sort merged reads
     samtools view -bS {COMPUTEDIR}/${{prefix}}/merged.sam | samtools sort -  {COMPUTEDIR}/${{prefix}}/merged.sorted
     samtools view -h {COMPUTEDIR}/${{prefix}}/merged.sorted.bam > {COMPUTEDIR}/${{prefix}}/merged.sorted.sam
     #produce cufflinks compatible sam file
     s2c.py -s {COMPUTEDIR}/${{prefix}}/merged.sorted.sam -d {INTRON} | grep -vP "^SQ\s+" >  {COMPUTEDIR}/${{prefix}}/merged.cf.sam
     samtools view -bS {COMPUTEDIR}/${{prefix}}/merged.cf.sam | samtools sort -  {COMPUTEDIR}/${{prefix}}/merged.sorted.cf
     samtools view -h {COMPUTEDIR}/${{prefix}}/merged.sorted.cf.bam > {COMPUTEDIR}/${{prefix}}/merged.sorted.cf.sam
     #mv results to final directory
     mv {COMPUTEDIR}/${{prefix}}/merged.sorted.cf.sam {WORKDIR}/{output.mappedCf}
     mv {COMPUTEDIR}/${{prefix}}/merged.sorted.sam {WORKDIR}/{output.mapped}
     #clean up
     rm -rf {COMPUTEDIR}/${{prefix}}
     """


# Build the segemehl index (ID.idx) for the genome FASTA (ID.fasta).
# Runs single-threaded; the index is consumed by the segemehl rule.
rule segemehlIdx:
     input: genome=ID+".fasta"
     output: idx=ID+".idx"
     threads: 1
     shell:"""
     segemehl.x -s -d {input.genome} -x {output.idx}
     """

# Convert every reads/<sample>.bam into reads/<sample>.fastq in a single job.
# All BAM files are handed to GNU parallel, which runs up to {threads}
# bamToFastq processes at once. The doubled braces are escapes: parallel
# receives `{}` (the input path) and `{.}` (the input path without its
# extension).
rule bamToFastq:
     input: expand("reads/{samples}.bam", samples=BAMS)
     output: expand("reads/{samples}.fastq", samples=BAMS)
     params: cluster="-cwd -V"
     threads: THREADS
     shell:"""
     parallel --no-notice -j {threads}  'bamToFastq -i {{}} -fq {{.}}.fastq' ::: {input}
     """


#################################
#                               #
#     Splice protein mapping    #
#                               #
#################################

# Spliced protein-to-genome mapping with scipio.
#   input : the genome FASTA (ID.fasta)
#   output: ID.scipio.gff, the concatenated GFF of all parallel jobs
# The UNIREF protein FASTA is streamed into GNU parallel, which distributes
# the records (split at ">") over {threads} jobs. Each job stores its share
# in the scratch file <job number>, runs scipio on it, converts the YAML
# result to GFF and prints ONLY its own <job number>.gff; parallel's standard
# output is redirected into the rule output. Printing every *.gff from each
# job would duplicate records and could read files that other jobs are still
# writing.
# Doubled braces are escapes: `{{#}}` reaches parallel as `{#}` (the job
# number) and `${{prefix}}` reaches the shell as `${prefix}`.
rule scipio:
     input: ID+".fasta"
     output: ID+".scipio.gff"
     params: cluster="-cwd -V"
     threads: THREADS
     shell:"""
     prefix=`date --rfc-3339=ns  | md5sum | head -c 16`
     mkdir -p {COMPUTEDIR}/${{prefix}}
     cat {UNIREF} | parallel -j {threads} -N20 --round-robin --pipe --recstart ">" "cat /dev/stdin > {COMPUTEDIR}/${{prefix}}/{{#}}; scipio.1.4.1.pl --min_score=0.3 --min_identity=60 --min_coverage=60 --max_mismatch=100 --multiple_results --blat_score=15 --blat_tilesize=7 --max_assemble_size={INTRON} --blat_params=-oneOff=1 --exhaust_align_size={INTRON} --exhaust_gap_size=30  --accepted_intron_penalty=1.0  --blat_output={COMPUTEDIR}/${{prefix}}/{{#}}.psl {WORKDIR}/{input} {COMPUTEDIR}/${{prefix}}/{{#}} --verbose  | yaml2gff.1.4.pl > {COMPUTEDIR}/${{prefix}}/{{#}}.yamlgff &&  scipiogff2gff.pl --in={COMPUTEDIR}/${{prefix}}/{{#}}.yamlgff --out={COMPUTEDIR}/${{prefix}}/{{#}}.gff && cat {COMPUTEDIR}/${{prefix}}/{{#}}.gff" > {WORKDIR}/{output}
     #clean up
     rm -rf {COMPUTEDIR}/${{prefix}}
     """

##################################
##                               #
##     CEGMA CORE PROTEIN MAPPING#
##                               #
##################################
#
#

# Core eukaryotic gene mapping with CEGMA.
#   input : the genome FASTA (ID.fasta)
#   output: the CEGMA run directory, moved to ID.cegma.gff
# CEGMA is started from inside a scratch directory under {COMPUTEDIR}, so its
# result files land there; the whole directory is then moved to the declared
# output. The maximum intron length comes from INTRON, as in the other rules.
# The final rm only matters if the scratch directory is still present.
# `${{prefix}}` is the escaped form of the shell expansion `${prefix}`.
# NOTE(review): the output is named *.gff but receives a directory -- confirm
# that downstream consumers expect this.
rule cegma:
     input: ID+".fasta"
     output: ID+".cegma.gff"
     threads: THREADS
     params: cluster="-cwd -V"
     shell:"""
     prefix=`date --rfc-3339=ns  | md5sum | head -c 16`
     mkdir -p {COMPUTEDIR}/${{prefix}}
     cd {COMPUTEDIR}/${{prefix}}
     cegma -g {WORKDIR}/{input} --ext -v -T {threads}  --max_intron {INTRON}
     mv {COMPUTEDIR}/${{prefix}} {WORKDIR}/{output}
     rm -rf {COMPUTEDIR}/${{prefix}}
     """
#
#
#
##################################
##                               #
##     Ab-initio                 #
##                               #
##################################
#
# Ab-initio gene prediction with GeneMark-ES (fungal mode).
#   input : the genome FASTA (ID.fasta)
#   output: the GeneMark-ES run directory, moved to ID.gM
# gmes_petap.pl takes no output argument here, so it is started from inside a
# scratch directory under {COMPUTEDIR} and that directory is moved to the
# declared output afterwards, the same pattern the cegma rule uses. Without
# this step nothing ever creates ID.gM.
# `${{prefix}}` is the escaped form of the shell expansion `${prefix}`.
rule geneMarkEs:
     input: ID+".fasta"
     output: ID+".gM"
     threads: THREADS
     params: cluster="-cwd -V"
     shell:"""
     prefix=`date --rfc-3339=ns  | md5sum | head -c 16`
     mkdir -p {COMPUTEDIR}/${{prefix}}
     cd {COMPUTEDIR}/${{prefix}}
     gmes_petap.pl --fungus --ES --cores {threads} --sequence {WORKDIR}/{input}
     mv {COMPUTEDIR}/${{prefix}} {WORKDIR}/{output}
     """


# Housekeeping target: recursively deletes every entry in the current
# directory that matches one of the listed glob patterns. It declares no
# inputs or outputs, so it only runs when requested explicitly.
rule clean:
     shell: "rm -rf *.sizes *.masked *.split *.lastz *.psl *.chain *.preChain *.net *.maf *.axt *.sh.* *.out *.tbl *.cat *.scipio"

##Masking
# Mask repeats in the genome with RepeatMasker (species "fungi", -qq mode).
#   input : the genome FASTA (ID.fasta), addressed through {WORKDIR}
#   output: ID.fasta.masked
# No output location is passed on the command line.
# NOTE(review): presumably RepeatMasker writes the masked file next to its
# input, which matches the declared output only when the workflow runs from
# {WORKDIR} -- confirm.
rule repeatMasker:
     input: ID+".fasta"
     output: ID+".fasta.masked"
     threads: 16
     params: cluster="-cwd -V"     
     shell:"""
            RepeatMasker -qq -pa {threads} -species fungi {WORKDIR}/{input}
           """