deplex supports fastq

jgurtowski · Jun 3, 2014 · eb03d7f · eb03d7f
1 parent f8f9365
commit eb03d7f
Show file tree

Hide file tree

Showing 4 changed files with 88 additions and 19 deletions.
diff --git a/CA_Makefile b/CA_Makefile
@@ -1,19 +1,19 @@
 
-BASENAME = simsga
-GENOME_SIZE = 120000000
+BASENAME = p2
+GENOME_SIZE = 4600000
 
 SHELL = /bin/bash
 
 CORRECTION_ROOT = /seq/schatz/a.thaliana/xiwang/james_workspace/nucmer-round-sim
 
-TIGLEN_BIN = /bluearc/data/schatz/mschatz/devel/bin/tig_length.pl 
-PBTOOLS_BIN = /bluearc/home/schatz/gurtowsk/workspace/pbtools
+TIGLEN_BIN = /sonas-hs/schatz/hpc/home/mschatz/build/bin/tig_length.pl 
+PBTOOLS_BIN = /sonas-hs/schatz/hpc/home/gurtowsk/workspace/pbtools
 
 JOIN_BIN = $(PBTOOLS_BIN)/join.py
 
-CA_BIN = /bluearc/home/schatz/gurtowsk/sources/wgs.svn/Linux-amd64/bin
-AMOS_BIN = /bluearc/home/schatz/gurtowsk/sources/amos/bin
-MUMMER_BIN = /bluearc/home/schatz/gurtowsk/sources/MUMmer
+CA_BIN = /sonas-hs/schatz/hpc/home/gurtowsk/sources/wgs.svn/Linux-amd64/bin
+AMOS_BIN = /sonas-hs/schatz/hpc/home/gurtowsk/sources/amos/bin
+MUMMER_BIN = /sonas-hs/schatz/hpc/home/gurtowsk/sources/MUMmer
 
 
 STATS_BIN = $(AMOS_BIN)/stats

diff --git a/deplex_pb.py b/deplex_pb.py
@@ -6,24 +6,33 @@
 
 from itertools import starmap,izip,chain
 
-from seqio import iteratorFromExtension, fastaRecordToString
+from seqio import iteratorFromExtension, recordToString, FastqRecord
 from nucio import fileIterator
+from args import parseArgs, getHelpStr, CLArgument
+
+description = ("Usage: deplex_pb.py [options] file1.{fa,fq} [file2.{fa,fq} ..]\n\n"
+               "Deplexes a file based on some delimiter")
+
+argument_list = [["delim", "delim", str, "/", "Delimiter to split the input"]]
+
+arguments = map(CLArgument._make, argument_list)
 
 if not len(sys.argv) >= 2:
-    sys.exit("deplex_pb.py in.{fa,fq} [in2.{fa,fq} ..]\n")
+    sys.exit(getHelpStr(description, arguments) + "\n")
 
-files = sys.argv[1:]
-its = map(iteratorFromExtension, files)
+(p_arg_map, args_remaining) = parseArgs(sys.argv[1:], arguments)
 
-file_its = starmap(fileIterator, izip(files,its))
+its = map(iteratorFromExtension, args_remaining)
 
+file_its = starmap(fileIterator, izip(args_remaining,its))
 
 fh_h = {}
 
 for entry in chain.from_iterable(file_its):
-    h = entry.name.split("/")[0]
+    h = entry.name.split(p_arg_map["delim"])[0]
+    ext = ".fastq" if isinstance(entry, FastqRecord) else ".fasta"
     if not h in fh_h:
-        fh_h[h] = open(h+".fa","w")
-    fh_h[h].write(fastaRecordToString(entry))
+        fh_h[h] = open(h+ext,"w")
+    fh_h[h].write(recordToString(entry))
     fh_h[h].write("\n")
 
diff --git a/m4io.py b/m4io.py
@@ -2,6 +2,7 @@
 from operator import itemgetter,attrgetter
 from collections import namedtuple
 from itertools import groupby, imap
+from functools import partial
 from nucio import fileIterator, lineRecordIterator
 from nucio import M4Record, M4RecordTypes
 from misc import identityFunc
@@ -35,11 +36,24 @@ def filterBestScore(alignments):
     return min(alignments,key=score_getter)
 
 def longestNonOverlapping(alignments):
-    '''Find the longest non overlapping alignments'''
+    '''
+    Gets the best non-overlapping alignment sections 
+    with respect to the query sequence
+    
+    Just greedily sorts the
+    alignments by length and adds them sequentially as long
+    as they don't overlap another alignment (with respect to the query)
+    '''
+
     align_len = lambda r : r.qend - r.qstart
     alignments.sort(key=align_len, reverse=True)
-    longest = alignments[0]
-    not_overlaps = lambda y : y.qend < longest.qstart or y.qstart > longest.qend
-    newset = [longest] + filter(not_overlaps, alignments)
+
+    newset = []
+    not_overlaps = lambda y,other : y.qend < other.qstart or y.qstart > other.qend    
+
+    for a in alignments:
+        if all(map(partial(not_overlaps, a), newset)):
+            newset.append(a)
+
     return newset
 
diff --git a/reference_segments.py b/reference_segments.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+
+#Outputs a genome's chromosome segments
+# i.e. splits the genome according to the formula
+# in the assembly complexity paper
+#   no stretch of more than 1 million N's
+import sys
+
+from seqio import fastaIterator
+from nucio import fileIterator
+from itertools import repeat
+
+if not len(sys.argv) == 2:
+    sys.exit("reference_segments.py in.fa")
+
+
+
+store_table = {'A':[],
+         'C':[],
+         'G':[],
+         'T':[],
+         'N':[]}
+
+previous = 0
+curr_count = 1
+table = 2
+accumulator = ["N",-1,store_table]    
+
+def runs(acc, nxt):
+    prev_letter = acc[previous]
+    acc[curr_count] += 1
+    if not prev_letter == nxt:
+        if acc[curr_count] > 0:
+            acc[table][prev_letter].append(acc[curr_count])
+        acc[previous] = nxt
+        acc[curr_count] = 0
+
+    return accumulator
+
+for entry in fileIterator(sys.argv[1],fastaIterator):
+    reduce(runs, entry.seq, accumulator)
+    runs(accumulator, 'X') #get last sequence
+    accumulator[previous] = 'N'
+    accumulator[curr_count] = -1
+
+print max(store_table['N'])