In [0]:
import os,sys
from ipyparallel import Client
from collections import defaultdict
import tempfile
from Bio.SeqIO.QualityIO import FastqGeneralIterator
import socket
import stopwatch
from multiprocessing import Pool, Manager
import shutil
import tempfile
from collections import OrderedDict
import pandas as pd
import numpy as np


In [0]:
import shutil

In [0]:
# Walk the raw-read directory tree and keep every file whose name starts with 'WBP'.
file_dir = "/gpfs_fs/home/lindb/wbp/raw_reads"
proc_files = [
    os.path.join(root, f)
    for root, dirs, files in os.walk(file_dir)
    for f in files
    if f.startswith('WBP')
]
len(proc_files)

In [0]:
proc_files[199]

In [0]:
sorted(proc_files)[0]

In [0]:
proc_files

In [0]:
# Connect to the ipyparallel cluster that was started under the "xmn" profile.
rc = Client(profile="xmn")
# dview addresses every engine of the client (rc[:]); lview is a load-balanced
# view created from the same client.
dview = rc[:]
lview = rc.load_balanced_view()
# Show the size of the load-balanced view (presumably the number of engines).
len(lview)

In [0]:
def format_fastq_tuple(title, seq, qual):
    """Render one FASTQ record as a four-line string.

    The sequence and quality strings must have equal length; otherwise an
    AssertionError is raised. The returned text ends with a newline.
    """
    assert len(seq) == len(qual)
    record = (title, seq, qual)
    return "@%s\n%s\n+\n%s\n" % record

def get_writers(barcodes, f):
    """Open one output FASTQ file per barcode.

    Parameters
    ----------
    barcodes : dict
        Maps barcode sequence -> label; both are embedded in the output name.
    f : str
        Path of the file being demultiplexed. It is used as the prefix of
        every output path, so outputs land next to the input.

    Returns
    -------
    (writers, file_list)
        ``writers`` maps barcode -> file handle opened for writing (the
        caller is responsible for closing them); ``file_list`` holds the
        output paths in the same order the barcodes were iterated.

    Raises
    ------
    Whatever ``open`` raises. Handles opened before the failure are closed
    first so they are not leaked.
    """
    writers = {}
    file_list = []
    try:
        for b, name in barcodes.items():
            file_name = "%s_%s_%s.fastq" % (f, name, b)
            file_list.append(file_name)
            writers[b] = open(file_name, "w")
    except Exception:
        # Do not leave half of the outputs open if one of them cannot be created.
        for handle in writers.values():
            handle.close()
        raise
    return writers, file_list

def get_barcodes(f):
    """Read the barcode table that sits next to a FASTQ file.

    The table is ``barcodes.txt`` in the same directory as ``f``. It is
    tab-separated and its first line is skipped as a header. For each
    remaining line with more than one field:

    * field 1 has spaces removed and commas replaced by ``-`` when it
      contains a comma;
    * field 5 is upper-cased, the fixed sequence
      ``CTCTTTCCCTACACGACGCTCTTCCGATCT`` (presumably the adapter -- confirm)
      is removed, and the final character is dropped; the remainder is the
      barcode;
    * the barcode is mapped to ``"<field 2>_<field 1>"``.

    Parameters
    ----------
    f : str
        Path to a FASTQ file; only its directory is used.

    Returns
    -------
    (bcs, bc_lens)
        ``bcs`` maps barcode -> label, ``bc_lens`` is the set of distinct
        barcode lengths.

    Raises
    ------
    IOError / OSError if ``barcodes.txt`` cannot be opened; IndexError if a
    non-blank line has fewer than five fields.
    """
    import os
    bcs = {}
    bc_lens = set()
    barcode_path = os.path.join(os.path.dirname(f), "barcodes.txt")
    # The context manager closes the table even if a malformed line raises.
    with open(barcode_path) as h:
        h.readline()  # skip the header line
        for line in h:
            data = line.strip().split("\t")
            if len(data) > 1: #skip blank lines
                if "," in data[0]:
                    data[0] = data[0].replace(" ", "").replace(",", "-")
                bc = data[4].upper().replace("CTCTTTCCCTACACGACGCTCTTCCGATCT", "")[:-1]
                bc_lens.add(len(bc))
                bcs[bc] = data[1] + "_" + data[0]
    return bcs, bc_lens

def check_barcodes(barcodes):
    """Assert that no barcode is a prefix of a different barcode.

    Every ordered pair of distinct keys is compared; the first violation
    raises AssertionError. Returns None when all pairs pass.
    """
    codes = list(barcodes.keys())
    for prefix in codes:
        for other in codes:
            if other == prefix:
                continue
            assert not other.startswith(prefix)
                
def copy_file(src, dst):
    """Copy the file at ``src`` to ``dst`` using shutil.copy; returns None."""
    from shutil import copy as _copy
    _copy(src, dst)
                
def demult(f):
    """Demultiplex one FASTQ file into per-barcode FASTQ files.

    The barcode table is read with ``get_barcodes(f)``, validated with
    ``check_barcodes`` and one output file per barcode is opened with
    ``get_writers``. For every read:

    * a single leading ``N`` is removed from the sequence and quality;
    * each known barcode length is tried as a prefix of the sequence; on a
      match the read is written, with the barcode trimmed from both sequence
      and quality, to that barcode's file;
    * reads matching no barcode are counted and otherwise discarded.

    Parameters
    ----------
    f : str
        Path to the FASTQ file to split.

    Returns
    -------
    tuple
        ``(f, count, bad_barcode, found, writer_list)``: the input path, the
        number of reads seen, the number without a recognised barcode, a
        ``defaultdict(int)`` of reads per barcode, and the output paths.

    Notes
    -----
    Meant to run on an ipyparallel engine: ``get_barcodes``,
    ``check_barcodes``, ``get_writers`` and ``format_fastq_tuple`` must
    already exist in the namespace where this function executes.
    """
    print(f)
    import socket
    from collections import defaultdict
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    hostname = socket.gethostname()
    barcodes, barcode_lens = get_barcodes(f)
    check_barcodes(barcodes)
    writers, writer_list = get_writers(barcodes, f)
    count = 0
    found = defaultdict(int)
    bad_barcode = 0
    # Reads that started with 'N'; tracked but not part of the returned tuple,
    # whose shape is kept unchanged for downstream cells.
    n_start = 0
    try:
        # Close the input deterministically instead of relying on garbage collection.
        with open(f) as reads:
            for title, seq, qual in FastqGeneralIterator(reads):

                if seq.startswith("N"):
                    seq = seq[1:]
                    qual = qual[1:]
                    n_start += 1

                has_barcode = False

                # check_barcodes guarantees no barcode is a prefix of another,
                # so at most one length can match and the order does not matter.
                for l in barcode_lens:
                    bc = seq[0:l]
                    if bc in barcodes:
                        found[bc] += 1
                        has_barcode = True
                        writers[bc].write(format_fastq_tuple(title, seq[l:], qual[l:]))
                        break

                count += 1

                if count % 100000 == 0:
                    print(hostname, f, count)

                if not has_barcode:
                    bad_barcode += 1
    finally:
        # Flush and close every output even when parsing fails part-way.
        for handle in writers.values():
            handle.close()

    return f, count, bad_barcode, found, writer_list

# Push the helper functions into every engine's namespace under their own names.
for remote_name, func in [
    ('format_fastq_tuple', format_fastq_tuple),
    ('demult', demult),
    ('check_barcodes', check_barcodes),
    ('get_writers', get_writers),
    ('get_barcodes', get_barcodes),
    ('copy_file', copy_file),
]:
    dview[remote_name] = func


In [0]:
# Submit one demultiplexing job per raw file to the load-balanced view.
demult_jobs = [lview.apply_async(demult, f) for f in proc_files]

In [0]:
np.sum([j.ready() for j in demult_jobs])

In [0]:
len(demult_jobs)

In [0]:
proc_files[0]

In [0]:
import pickle

In [0]:
# Collect results only from jobs that report ready(); jobs that are still
# running are silently left out, so compare len(demult_results) with
# len(demult_jobs) before saving.
demult_results = [j.r for j in demult_jobs if j.ready()]

In [0]:
# Pickle data is bytes: open the file in binary mode (text mode "w" fails on
# Python 3) and let the context manager flush and close it.
with open("/home/lindb/wbp/raw_reads/demult_results.pkl", "wb") as pkl_out:
    pickle.dump(demult_results, pkl_out)

In [0]:
# Read the saved results back in binary mode and close the file afterwards.
# NOTE: only unpickle files you wrote yourself; pickle.load can execute
# arbitrary code from an untrusted file.
with open("/home/lindb/wbp/raw_reads/demult_results.pkl", "rb") as pkl_in:
    demult_results = pickle.load(pkl_in)

In [0]:
# Group the demultiplexed output paths by sample: the sample label is the
# second-to-last "_"-separated token of each path.
samp_dict = {}
for res in demult_results:
    for path in res[-1]:
        sample = path.split("_")[-2]
        samp_dict.setdefault(sample, []).append(path)

In [0]:
len(samp_dict.keys())

In [0]:
sorted(samp_dict.keys())

# Concatenate demultiplexed reads into one file per sample

In [0]:
file_dir = "/gpfs_fs/home/lindb/wbp/raw_reads"
file_dir

In [0]:
# Sample labels that are dropped before concatenation. The original if/elif
# chain tested 'b11' twice (the second test was unreachable); if a different
# label was meant there, add it to this set.
excluded_samples = {'NA', 'nan', 'b11', 'a3', 'a4', 'a6', 'a7', 'a9', 'b9'}
good_dict = {s: samp_dict[s] for s in samp_dict if s not in excluded_samples}

In [0]:
len(good_dict.keys())

In [0]:
# Print every demultiplexed file that good_dict lists but that is missing on
# disk; no output means all files are present.
for samp in good_dict:
    for f in good_dict[samp]:
        if not os.path.exists(f):
            print(f)  # call form works on both Python 2 and Python 3

In [0]:
# Run these imports inside dview.sync_imports() so that the names used by the
# remote functions below (e.g. FastqGeneralIterator in join_fastq) are meant
# to be available on the engines as well as in this kernel.
with dview.sync_imports():
    import os
    from collections import defaultdict
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    import socket
    from collections import OrderedDict

In [0]:
# Call socket.gethostname through dview; the values are read back with
# hosts.get() in the next cell.
hosts = dview.apply(socket.gethostname)

In [0]:
# Map each reported hostname to the list of positions at which it appears in
# the hosts result.
host_dict = {}
for engine_index, hostname in enumerate(hosts.get()):
    host_dict.setdefault(hostname, []).append(engine_index)
host_dict

In [0]:
# Load-balanced view restricted to the first entry recorded for each host.
first_engine_per_host = [engine_ids[0] for engine_ids in host_dict.values()]
single_host = rc.load_balanced_view(targets=first_engine_per_host)

In [0]:
len(single_host)

In [0]:
#old don't use
# The decorator and def line were commented out while the indented body was
# left live, which made this cell a syntax error; the whole definition is
# disabled here so the notebook can run top to bottom.
#@lview.remote()
#def old_join_fastq(args):
#    joined, file_list = args
#    !touch {joined}
#    for f in file_list:
#        !cat {f} >> {joined}
#    return joined

In [0]:
@lview.remote()
def join_fastq(args):
    """Concatenate several FASTQ files into one, record by record.

    Parameters
    ----------
    args : tuple
        ``(joined, file_list)``: the output path and the input FASTQ paths,
        packed into one argument.

    Returns
    -------
    str
        ``joined``, the output path.

    Notes
    -----
    Runs remotely through ``lview``; ``FastqGeneralIterator`` and
    ``format_fastq_tuple`` must exist in the engine namespace.

    The output is opened once in append mode. Append mode is kept from the
    original, so remove a stale output file before re-running or its reads
    will be duplicated. Unlike the per-record open it replaces, this creates
    an empty output file when the inputs contain no records.
    """
    joined, file_list = args
    # One open for the whole job instead of one open/close per read.
    with open(joined, 'a') as o:
        for f in file_list:
            with open(f) as reads:
                for title, seq, qual in FastqGeneralIterator(reads):
                    o.write(format_fastq_tuple(title, seq, qual))
    return joined

In [0]:
#old_jobs = []
#for sample_id, file_list in good_dict.items():
#    if sample_id == '12':
#        new_base_name = sample_id + "TESTcompiled.fastq"
#        dir_name = "/gpfs_fs/home/lindb/wbp/concatenated"
#        joined = os.path.join(dir_name,new_base_name)
#        old_jobs.append(old_join_fastq((joined,file_list)))

In [0]:
len(good_dict.keys())

In [0]:
good_dict.keys()

In [0]:
#send off files to be concatenated
join_jobs = []
for sample_id, file_list in good_dict.items():
    new_base_name = sample_id + "compiled.fastq"
    dir_name = "/gpfs_fs/home/lindb/wbp/concatenated"
    joined = os.path.join(dir_name, new_base_name)
    join_jobs.append(join_fastq((joined, file_list)))

In [0]:
len(join_jobs)

In [0]:
#check progress
np.sum([j.ready() for j in join_jobs])

In [0]:
#get a list of concatenated files
compdir = '/home/lindb/wbp/concatenated/'
files = os.listdir(compdir)
files = [os.path.join(compdir,f) for f in files if 'compiled' in f]
len(files)

In [0]:
#check to make sure nothing is wonky
#if everything is good, no errors should print
for f in files:
    for t,s,q in FastqGeneralIterator(open(f)):
        pass

In [0]:
@lview.remote()
def do_it(f):
    """Parse every record of the FASTQ file ``f`` and return ``f``.

    Nothing is done with the records; the point is that the parser raises if
    the file is malformed, so a successful return means it parsed end to end.
    Runs remotely through ``lview``.
    """
    from Bio.SeqIO.QualityIO import FastqGeneralIterator
    # The context manager releases the handle even when parsing raises.
    with open(f) as handle:
        for t,s,q in FastqGeneralIterator(handle):
            pass
    return f

In [0]:
# Submit one remote parse check per concatenated file.
lst = [do_it(f) for f in files]

In [0]:
#check progress
np.sum([j.ready() for j in lst])

In [0]:
# For each finished check, print the result if it is not a path under /home/;
# no output means every finished job returned its input path.
for j in lst:
    if j.ready():
        checked_path = j.r
        if not checked_path.startswith('/home/'):
            print(checked_path)  # call form works on both Python 2 and Python 3

In [0]:
#having sample IDs of the same length is handy!
for f in files:
    bname = os.path.basename(f)
    if len(bname) == 17:
        new_name = bname
    if len(bname) == 16:
        new_name = '0' + bname
    if len(bname) == 15:
        new_name = '00' + bname
    if not len(new_name) == 17:
        print new_name
    dst = os.path.join(compdir,new_name)
    shutil.move(f,dst)

In [0]:
#file with greatest reads = 012compiled.fastq
#split into 50bp or 150bp for VelvetOptimiser.pl
f = '/home/lindb/wbp/concatenated/012compiled.fastq'
w50 = open("/%s_50.fastq" % f,  "w")
w150 = open("/%s_150.fastq" % f, "w")
for t, s, q in FastqGeneralIterator(open(f)):
    w = w50
    if len(s) > 50:
        w = w150
    w.write(format_fastq_tuple(t, s, q))
w50.close()
w150.close()

In [0]:
#check one last time
# Parse both split files end to end; finishing without an exception means
# they are well-formed FASTQ.
fz = ['/home/lindb/wbp/concatenated/012compiled.fastq_150.fastq','/home/lindb/wbp/concatenated/012compiled.fastq_50.fastq']
for f in fz:
    # Close each file once it has been parsed.
    with open(f) as handle:
        for t,s,q in FastqGeneralIterator(handle):
            pass