This is a test notebook from the NIGMS ME-INBRE project that runs an RNA-seq workflow.

In [None]:
# Import necessary packages
import subprocess as sp
import os

In [None]:
# Set up the directory structure and download the input files with scripts/setup.sh.
# You can peek at what the script does by typing: cat scripts/setup.sh
# Note that since this is a Python notebook we have to wrap our bash commands in subprocess calls.
shell_call = "sh scripts/setup.sh"
proc_shell = sp.call(shell_call, shell=True)
# sp.call returns the command's exit status instead of raising an exception,
# so report a failure here rather than letting it pass silently into later cells.
if proc_shell != 0:
    print(f"WARNING: '{shell_call}' exited with status {proc_shell}")


In [None]:
# Everything should now be downloaded.
# Let's trim our raw reads.
# First make a list of all samples we want to trim (here we only have two).
# scripts/samples is read one sample name per line; the with-block closes the
# file when done, and blank lines are skipped so they do not become empty
# sample names (which would produce paths like data/raw_fastq/.fastq).
with open("scripts/samples") as samples_file:
    files = [line.strip() for line in samples_file if line.strip()]
# Then loop over the samples, using an f-string to insert the {f} variable for each one.
# NOTE(review): the command runs trimmomatic in SE mode but passes the TruSeq3-PE.fa
# adapter file and the keepBothReads option, which look like paired-end settings - confirm.
for f in files:
    trim_call = f"trimmomatic SE -threads 4 data/raw_fastq/{f}.fastq data/trimmed/{f}_trimmed.fastq ILLUMINACLIP:TruSeq3-PE.fa:2:30:10:2:keepBothReads LEADING:3 TRAILING:3 MINLEN:36"
    proc_shell = sp.call(trim_call, shell=True)
    # sp.call returns the exit status instead of raising, so report failures explicitly.
    if proc_shell != 0:
        print(f"WARNING: trimmomatic exited with status {proc_shell} for sample {f}")
    


In [None]:
# Let's align the trimmed reads to the reference genome with bwa mem.
# Assign the reference path to REF; a later cell reuses it for variant calling.
REF='data/reference/M_chelonae_NZ_CP007220.fasta'
for f in files:
    # The doubled backslashes keep a literal \t in the read-group string passed to bwa.
    bwa_call = f"bwa mem -t 4 -R '@RG\\tID:{f}\\tSM:{f}' {REF} data/trimmed/{f}_trimmed.fastq > data/aligned/{f}.sam"
    proc_bwa = sp.call(bwa_call, shell=True)
    # The shell redirect creates the .sam file even when bwa fails, so a bad run
    # cannot be spotted by a missing file; check the exit status instead.
    if proc_bwa != 0:
        print(f"WARNING: bwa mem exited with status {proc_bwa} for sample {f}")

In [None]:
# Convert each SAM file to its binary form (BAM)
for f in files:
    samtools_call = f"samtools view -S -b data/aligned/{f}.sam > data/aligned/{f}.bam"
    proc_samtools = sp.call(samtools_call, shell=True)
    # The redirect leaves an empty or partial .bam behind on failure, so report
    # a non-zero exit status instead of silently continuing.
    if proc_samtools != 0:
        print(f"WARNING: samtools view exited with status {proc_samtools} for sample {f}")

In [None]:
# Let's see how well our fastq files mapped to the reference genome
for f in files:
    depth_call = f"samtools flagstat data/aligned/{f}.bam > data/stats/{f}.flagstat.out"
    proc_depth = sp.call(depth_call, shell=True)
    # sp.call returns the exit status instead of raising, so report failures explicitly.
    if proc_depth != 0:
        print(f"WARNING: samtools flagstat exited with status {proc_depth} for sample {f}")

# Let's view the output.
# The stats files are read in Python rather than through a `cat` subprocess so the
# text goes through the same print() stream as the header line above it.
for f in files:
    print(f'mapping stats for {f}')
    stats_path = f"data/stats/{f}.flagstat.out"
    if os.path.exists(stats_path):
        with open(stats_path) as stats_file:
            print(stats_file.read(), end='')
    else:
        print(f"WARNING: {stats_path} not found")
    print ('\n'*2)


In [None]:
# Sort our BAM files by coordinate so they are in the same order as the reference genome
for f in files:
    sort_call = f"gatk SortSam -I data/aligned/{f}.bam -O data/sorted/{f}.bam -SO coordinate"
    proc_sort = sp.call(sort_call, shell=True)
    # sp.call returns the exit status instead of raising, so report failures explicitly.
    if proc_sort != 0:
        print(f"WARNING: gatk SortSam exited with status {proc_sort} for sample {f}")

In [None]:
# Mark duplicates from sequencing (optical) and the wetlab (PCR duplicates).
# Arguments use the -I/-O/-M form, matching the SortSam call in this notebook.
# NOTE(review): the legacy Picard I=/O=/M= form is reported to be rejected by the
# gatk launcher's argument parser - confirm against the installed GATK version.
for f in files:
    dups_call = f"gatk MarkDuplicates -I data/sorted/{f}.bam -O data/mkdups/{f}.bam -M data/mkdups/{f}.metrics.txt"
    proc_dups = sp.call(dups_call, shell=True)
    # sp.call returns the exit status instead of raising, so report failures explicitly.
    if proc_dups != 0:
        print(f"WARNING: gatk MarkDuplicates exited with status {proc_dups} for sample {f}")

In [None]:
# Index the duplicate-marked BAMs
for f in files:
    index_call = f"samtools index -@ 4 data/mkdups/{f}.bam"
    proc_index = sp.call(index_call, shell=True)
    # sp.call returns the exit status instead of raising, so report failures explicitly.
    if proc_index != 0:
        print(f"WARNING: samtools index exited with status {proc_index} for sample {f}")

In [None]:
# Call variants using GATK HaplotypeCaller.
# REF is the reference fasta path assigned in the alignment cell.
for f in files:
    var_call = f"gatk HaplotypeCaller -I data/mkdups/{f}.bam -O data/calls/{f}.vcf -R {REF} --output-mode EMIT_VARIANTS_ONLY"
    proc_var = sp.call(var_call, shell=True)
    # sp.call returns the exit status instead of raising, so report failures explicitly.
    if proc_var != 0:
        print(f"WARNING: gatk HaplotypeCaller exited with status {proc_var} for sample {f}")

In [None]:
# Let's see how many variants we have!
# vcftools is run on each sample's VCF with no other options.
for f in files:
    vcf_call = f"vcftools --vcf data/calls/{f}.vcf"
    proc_vcf = sp.call(vcf_call, shell=True)
    # sp.call returns the exit status instead of raising, so report failures explicitly.
    if proc_vcf != 0:
        print(f"WARNING: vcftools exited with status {proc_vcf} for sample {f}")