In [0]:
import os, sys
from ipyparallel import Client
import matplotlib.pyplot as plt
%matplotlib inline
from subprocess import Popen, PIPE
from Bio import SeqIO
import pandas as pd

In [0]:
# Gather the full path of every file in qdir whose name ends in 'd.fastq'.

qdir = '/home/lindb/wbp/concatenated/'
files = []
for fname in os.listdir(qdir):
    if fname.endswith('d.fastq'):
        files.append(os.path.join(qdir, fname))
len(files)

In [0]:
# Connect to the ipyparallel cluster started under the "xmn" profile.
rc = Client(profile="xmn")
# View addressing every engine in the client.
dview = rc[:]
# Load-balanced view; the @lview.remote() functions below are submitted through it.
lview = rc.load_balanced_view()
# Number of engines the client can see.
print(len(rc))

In [0]:
assembly = '/home/lindb/wbp/concatenated/velvet/auto_data_45/contigs.fa'

In [0]:
cd ~/wbp/concatenated/raw_mapping/

In [0]:
!/home/lindb/g/src/bowtie2-2.2.6/bowtie2-build -f $assembly $assembly

In [0]:
# --very-fast-local
# Same as: -D 5 -R 1 -N 0 -L 25 -i S,1,2.00

# --fast-local
# Same as: -D 10 -R 2 -N 0 -L 22 -i S,1,1.75

# --sensitive-local
# Same as: -D 15 -R 2 -N 0 -L 20 -i S,1,0.75 (default in --local mode)

# --very-sensitive-local
# Same as: -D 20 -R 3 -N 0 -L 20 -i S,1,0.50

@lview.remote()
def run_bowtie2(args):
    import os, stopwatch, multiprocessing
    timer = stopwatch.Timer()
    cpus = multiprocessing.cpu_count()
    assembly, reads = args
    parent = os.path.dirname(reads)
    outdir = os.path.join(parent, "raw_mapping_samfiles")
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    sam = os.path.join(outdir, "%s.sam" % (os.path.basename(reads)))
    cmd = "/home/lindb/g/src/bowtie2-2.2.6/bowtie2 --local -D 20 -R 3 -N 1 -L 20 -i S,1,0.50 -p %d -x %s -U %s -S %s" % (cpus,
                                                               assembly,
                                                               reads,
                                                               sam)
    res = None
    res = !$cmd
#     if not os.path.exists(sam):
#         res = !$cmd
    timer.stop()
    return assembly, sam, cmd, timer.elapsed, res

In [0]:
#make sure they're the right files
for f in files:
    print f
    break

In [0]:
len(files)

In [0]:
#map with bowtie2
# One remote job per FASTQ file; each call returns an async result handle.
bowtie_jobs = [run_bowtie2((assembly, f)) for f in files]

In [0]:
#check on progress
count = 0
for j in bowtie_jobs:
    if j.ready():
        count += 1
print count,'/',len(bowtie_jobs)

In [0]:
bowtie_jobs

In [0]:
from io import StringIO

In [0]:
#make sure there are no errors!
count = 0
ecount = 0
for j in bowtie_jobs:
    if j.ready():
        try:
            res = [x.strip() for x in j.r[-1]]
        except Exception as E:
            print "%s engine died" % os.path.basename(files[count]).split(".")[0]
            ecount += 1
            #print E,'\n'
        #print files[count].split("/")[-1]
        #print res
        res = ""
    count += 1
ecount

In [0]:
#look at overall mapping percentages across samples
# The last line of the captured bowtie2 output starts with the overall
# alignment rate as "<number>%"; strip the trailing '%' and keep the number.
overall = []
unparsed = []   # (job index, exception) for finished jobs that yielded no rate
for idx, j in enumerate(bowtie_jobs):
    if not j.ready():
        continue
    try:
        res = [x.strip() for x in j.r[-1]]
        percent = res[-1].split()[0][:-1]
        overall.append(float(percent))
    except Exception as E:
        # Keep going, but remember the failure so the summary below is not
        # silently computed on fewer samples than expected.
        unparsed.append((idx, E))
if unparsed:
    print("%d finished job(s) gave no parsable alignment rate; job indices: %s"
          % (len(unparsed), [i for i, _err in unparsed]))
pd.Series(overall).describe()

In [0]:
#add in 11 that failed above because engine died; I had to run in another window
# Rates entered by hand from those separate runs.
sec_overall = [
    85.83, 92.78, 80.02, 76.73, 94.65, 91.83,
    94.56, 78.43, 88.95, 99.24, 93.76,
]
len(sec_overall)

In [0]:
test = sec_overall + overall
len(test)

In [0]:
len(overall)

In [0]:
overall

In [0]:
for x in overall:
    if x not in test:
        print "crap"
for y in sec_overall:
    if y not in test:
        print 'crap'

In [0]:
pd.Series(test).describe()

In [0]:
#output from pd.Series(test).describe()
#count    244.000000
#mean      84.548607
#std       10.013239
#min       38.010000
#25%       78.427500
#50%       84.770000
#75%       92.625000
#max       99.410000
#dtype: float64

# convert sam to bam

In [0]:
#get all of the *.sam files
DIR = '/home/lindb/wbp/concatenated/raw_mapping_samfiles/'
sam_files = []
for fname in os.listdir(DIR):
    if fname.endswith('sam'):
        sam_files.append(os.path.join(DIR, fname))
len(sam_files)

In [0]:
#example
sam_files[0]

In [0]:
sorted(sam_files)

In [0]:
#to make sure function will name like I want it to
sam = sam_files[0]
bam = sam.replace(".sam", ".bam")
bam_sorted = "%s_sorted.bam" % bam.replace(".bam", "")
bam_index = bam_sorted.replace(".bam", ".bai")
print sam
print bam
print bam_sorted
print bam_index
del sam,bam,bam_sorted,bam_index

In [0]:
@lview.remote()
def convert_sam_to_bam(sam):
    import stopwatch, multiprocessing, os
    timer = stopwatch.Timer()
    cpus = multiprocessing.cpu_count()
    bam = sam.replace(".sam", ".bam")
    bam_sorted = "%s_sorted.bam" % bam.replace(".bam", "")
    bam_index = bam_sorted.replace(".bam", ".bai")
    if not os.path.exists(bam):
        !samtools view -bS $sam > $bam
#    if os.path.exists(bam):
#        if not os.path.exists(bam_sorted):
#            !samtools sort -@ $cpus $bam -o $bam_sorted
#    if os.path.exists(bam_sorted):
#        if not os.path.exists(bam_index):
#            !samtools index $bam_sorted $bam_index
    timer.stop()
    return bam, bam_sorted, bam_index, timer.elapsed

In [0]:
#convert *.sam to *.bam
# One remote job per SAM file, submitted with its absolute path.
sam_bam_jobs = [convert_sam_to_bam(os.path.abspath(f)) for f in sam_files]

In [0]:
# (finished jobs, total jobs)
count = sum(1 for j in sam_bam_jobs if j.ready())
count,len(sam_bam_jobs)

In [0]:
#make sure there are no errors!
count = 0
ecount = 0
rcount = 0
for j in sam_bam_jobs:
    if j.ready():
        rcount += 1
        try:
            hey = j.r
            #print j.r
        except Exception as E:
            print "%s engine died" % os.path.basename(sam_files[count]).split(".")[0]
            print E, '\n'
            ecount += 1
            #print E,'\n'
        #print files[count].split("/")[-1]
        #print res
        res = ""
    count += 1
ecount, rcount

In [0]:
#get bam files
# Names ending in 'q.bam' are kept.
bam_files = []
for fname in os.listdir(DIR):
    if fname.endswith('q.bam'):
        bam_files.append(os.path.join(DIR, fname))
len(bam_files)

In [0]:
# Write one job script per BAM that has no sorted counterpart yet. Each script
# is named after the first three characters of the BAM file name.
sort_template = '''#!/bin/bash
#$ -N run%s
#$ -V
#$ -j y
#$ -cwd

cd /home/lindb/wbp/concatenated/raw_mapping_samfiles/
samtools sort -@ 24 %s -o %s
'''
count = 0
for bam in bam_files:
    bam_sorted = bam.replace(".bam", "") + "_sorted.bam"
    if os.path.exists(bam_sorted):
        continue
    num = os.path.basename(bam)[:3]
    text = sort_template % (num, bam, bam_sorted)
    f = "sort%s.sh" % str(num)
    filE = '/home/lindb/wbp/concatenated/raw_mapping_samfiles/sortrunfiles/%s' % f
    with open(filE, 'w') as o:
        o.write(text)

In [0]:
@lview.remote()
def index_sorted_bams(bam_sorted):
    import stopwatch, multiprocessing, os
    timer = stopwatch.Timer()
    cpus = multiprocessing.cpu_count() 
    bam_index = bam_sorted.replace(".bam", ".bai")
    if not os.path.exists(bam_index):
        !samtools index $bam_sorted $bam_index    
    timer.stop()
    return bam_sorted, bam_index, timer.elapsed            

In [0]:
# Every entry in DIR whose name contains 'sorted.bam', as a full path.
sorted_bams = []
for fname in os.listdir(DIR):
    if 'sorted.bam' in fname:
        sorted_bams.append(os.path.join(DIR, fname))
len(sorted_bams)

In [0]:
for bam_sorted in sorted_bams:
    print bam_sorted
    break

In [0]:
# Write one job script per sorted BAM that has no index yet. Each script is
# named after the first three characters of the BAM file name.
index_template = '''#!/bin/bash
#$ -N run%s
#$ -V
#$ -j y
#$ -cwd

cd /home/lindb/wbp/concatenated/raw_mapping_samfiles/
samtools index %s %s
'''
count = 0
for bam_sorted in sorted_bams:
    bam_index = bam_sorted.replace(".bam", ".bai")
    if os.path.exists(bam_index):
        continue
    num = os.path.basename(bam_sorted)[:3]
    text = index_template % (num, bam_sorted, bam_index)
    f = "index%s.sh" % str(num)
    filE = '/home/lindb/wbp/concatenated/raw_mapping_samfiles/sortrunfiles/%s' % f
    with open(filE, 'w') as o:
        o.write(text)

# add rg info

In [0]:
from collections import OrderedDict

In [0]:
#get files containing library and barcode IDs for each sample
# Keep the first file seen for each library, where the library is the part of
# the file name before the first underscore.
bdir = '/home/lindb/wbp/raw_reads/barcodefiles/'
bfiles = [os.path.join(bdir, name) for name in os.listdir(bdir)]
barfiles = []
libs = []
for f in bfiles:
    lib = os.path.basename(f).split("_")[0]
    if lib in libs:
        continue
    libs.append(lib)
    barfiles.append(f)
barfiles

In [0]:
#grab libID and barcodeID for each sample from the files
# rgDict maps the zero-padded, three-character sample ID to an OrderedDict with
# 'lib' (library label taken from the file name) and 'bc' (the 'barcode2' value
# with the fixed leading sequence removed and the last character dropped).
badids = ['NA','nan','5_b11','8_a3','8_a4','8_a6','8_a7','2_a7','2_a9','5_b11','5_b9']
bad = set(badids)
rgDict = OrderedDict()
for f in barfiles:
    df = pd.read_csv(f,header=0,sep="\t")
    lib = "lib%s" % str(os.path.basename(f).split("_")[0])
    for row in df.index:
        raw = str(df.loc[row,'sample.ID'])
        samp = raw.rjust(3, '0')   # left-pad with zeros to three characters
        # Test the ID as written as well as the padded form: a two-character
        # entry such as 'NA' would otherwise be padded to '0NA' and slip
        # through the filter.
        if raw in bad or samp in bad:
            continue
        bc = df.loc[row,'barcode2'].replace("CTCTTTCCCTACACGACGCTCTTCCGATCT", "")[:-1]
        rgDict[samp] = OrderedDict()
        rgDict[samp]['lib'] = lib
        rgDict[samp]['bc'] = bc
len(rgDict)

In [0]:
# Distinct barcodes in first-seen order.
bcs = []
for info in rgDict.values():
    if info['bc'] in bcs:
        continue
    bcs.append(info['bc'])
len(bcs)    #should be 96!!!!

In [0]:
#take a look to see how function will work
bam = '/home/lindb/wbp/concatenated/raw_mapping_samfiles/218compiled.fastq_sorted.bam'
base = os.path.basename(bam).split(".")
samp = base[0][:3]
bam_rg = bam.replace(".bam", "_rg.bam")
rglb = rgDict[samp]['lib'] #lib ID
rgpu = rgDict[samp]['bc']  #barcode ID
rgsm = base[0]
rgid = "_".join([rglb,rgsm,rgpu])
print "base=",base
print "bam_rg=",bam_rg
print "rglb=",rglb #population ID
print 'rgpu=',rgpu
print 'rgsm =',rgsm
print 'rgid=',rgid
del bam, base,samp,bam_rg,rglb,rgpu,rgsm,rgid

In [0]:
@lview.remote()
def add_rg_info_to_bam(args):
    rgDict , bam = args
    import os
    cmd = "java -jar /home/lindb/g/src/picard-tools-1.130/picard.jar AddOrReplaceReadGroups"
    base = os.path.basename(bam).split(".")
    samp = base[0][:3]
    bam_rg = bam.replace(".bam", "_rg.bam")
    rglb = rgDict[samp]['lib']    #libID
    rgpu = rgDict[samp]['bc']     #barcodeID
    rgsm = base[0]                #sampID
    rgid = "_".join([rglb,rgsm,rgpu])  #sampID_barcodeID
    rg_string = "RGID=%s RGLB=%s RGPL=illumina RGPU=%s RGSM=%s" % (rgid,
                                                                   rglb,
                                                                   rgpu,
                                                                   rgsm)
    cmd = "%s INPUT=%s OUTPUT=%s %s CREATE_INDEX=true" % (cmd,bam,bam_rg,rg_string)
    print cmd
    if not os.path.exists(bam_rg):
        !$cmd
    return bam_rg, rg_string, cmd

In [0]:
#get sorted.bam files
DIR = '/home/lindb/wbp/concatenated/raw_mapping_samfiles/'
sorted_bams = []
for fname in os.listdir(DIR):
    if 'sorted.bam' in fname:
        sorted_bams.append(os.path.join(DIR, fname))
len(sorted_bams)

In [0]:
# One remote read-group job per sorted BAM; each job receives [rgDict, path].
rg_lst = [add_rg_info_to_bam([rgDict, f]) for f in sorted_bams]

In [0]:
# (finished jobs, total jobs)
rcount = sum(1 for j in rg_lst if j.ready())
rcount,len(rg_lst)

In [0]:
for f in sorted_bams:
    rg_bai = f.replace(".bam","_rg.bai")
    rg_bam = rg_bai.replace(".bai",".bam")
    if not os.path.exists(rg_bai):
        print 'bai',f
    if not os.path.exists(rg_bam):
        print 'bam',f