Skip to content

Commit

Permalink
deplex supports fastq
Browse files Browse the repository at this point in the history
  • Loading branch information
james committed Jun 3, 2014
1 parent f8f9365 commit eb03d7f
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 19 deletions.
14 changes: 7 additions & 7 deletions CA_Makefile
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@

BASENAME = simsga
GENOME_SIZE = 120000000
BASENAME = p2
GENOME_SIZE = 4600000

SHELL = /bin/bash

CORRECTION_ROOT = /seq/schatz/a.thaliana/xiwang/james_workspace/nucmer-round-sim

TIGLEN_BIN = /bluearc/data/schatz/mschatz/devel/bin/tig_length.pl
PBTOOLS_BIN = /bluearc/home/schatz/gurtowsk/workspace/pbtools
TIGLEN_BIN = /sonas-hs/schatz/hpc/home/mschatz/build/bin/tig_length.pl
PBTOOLS_BIN = /sonas-hs/schatz/hpc/home/gurtowsk/workspace/pbtools

JOIN_BIN = $(PBTOOLS_BIN)/join.py

CA_BIN = /bluearc/home/schatz/gurtowsk/sources/wgs.svn/Linux-amd64/bin
AMOS_BIN = /bluearc/home/schatz/gurtowsk/sources/amos/bin
MUMMER_BIN = /bluearc/home/schatz/gurtowsk/sources/MUMmer
CA_BIN = /sonas-hs/schatz/hpc/home/gurtowsk/sources/wgs.svn/Linux-amd64/bin
AMOS_BIN = /sonas-hs/schatz/hpc/home/gurtowsk/sources/amos/bin
MUMMER_BIN = /sonas-hs/schatz/hpc/home/gurtowsk/sources/MUMmer


STATS_BIN = $(AMOS_BIN)/stats
Expand Down
25 changes: 17 additions & 8 deletions deplex_pb.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,33 @@

from itertools import starmap,izip,chain

from seqio import iteratorFromExtension, fastaRecordToString
from seqio import iteratorFromExtension, recordToString, FastqRecord
from nucio import fileIterator
from args import parseArgs, getHelpStr, CLArgument

description = ("Usage: deplex_pb.py [options] file1.{fa,fq} [file2.{fa,fq} ..]\n\n"
"Deplexes a file based on some delimiter")

argument_list = [["delim", "delim", str, "/", "Delimiter to split the input"]]

arguments = map(CLArgument._make, argument_list)

if not len(sys.argv) >= 2:
sys.exit("deplex_pb.py in.{fa,fq} [in2.{fa,fq} ..]\n")
sys.exit(getHelpStr(description, arguments) + "\n")

files = sys.argv[1:]
its = map(iteratorFromExtension, files)
(p_arg_map, args_remaining) = parseArgs(sys.argv[1:], arguments)

file_its = starmap(fileIterator, izip(files,its))
its = map(iteratorFromExtension, args_remaining)

file_its = starmap(fileIterator, izip(args_remaining,its))

fh_h = {}

for entry in chain.from_iterable(file_its):
h = entry.name.split("/")[0]
h = entry.name.split(p_arg_map["delim"])[0]
ext = ".fastq" if isinstance(entry, FastqRecord) else ".fasta"
if not h in fh_h:
fh_h[h] = open(h+".fa","w")
fh_h[h].write(fastaRecordToString(entry))
fh_h[h] = open(h+ext,"w")
fh_h[h].write(recordToString(entry))
fh_h[h].write("\n")

22 changes: 18 additions & 4 deletions m4io.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from operator import itemgetter,attrgetter
from collections import namedtuple
from itertools import groupby, imap
from functools import partial
from nucio import fileIterator, lineRecordIterator
from nucio import M4Record, M4RecordTypes
from misc import identityFunc
Expand Down Expand Up @@ -35,11 +36,24 @@ def filterBestScore(alignments):
return min(alignments,key=score_getter)

def longestNonOverlapping(alignments):
'''Find the longest non overlapping alignments'''
'''
Gets the best non-overlapping alignment sections
with respect to the query sequence
Just greedily sorts the
alignments by length and adds them sequentially as long
as they don't overlap another alignment (with respect to the query)
'''

align_len = lambda r : r.qend - r.qstart
alignments.sort(key=align_len, reverse=True)
longest = alignments[0]
not_overlaps = lambda y : y.qend < longest.qstart or y.qstart > longest.qend
newset = [longest] + filter(not_overlaps, alignments)

newset = []
not_overlaps = lambda y,other : y.qend < other.qstart or y.qstart > other.qend

for a in alignments:
if all(map(partial(not_overlaps, a), newset)):
newset.append(a)

return newset

46 changes: 46 additions & 0 deletions reference_segments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env python

#Outputs a genome's chromosome segments
# i.e. splits the genome according to the formula
# in the assembly complexity paper
# no stretch of more than 1 million N's
import sys

from seqio import fastaIterator
from nucio import fileIterator
from itertools import repeat

if not len(sys.argv) == 2:
sys.exit("reference_segments.py in.fa")



store_table = {'A':[],
'C':[],
'G':[],
'T':[],
'N':[]}

previous = 0
curr_count = 1
table = 2
accumulator = ["N",-1,store_table]

def runs(acc, nxt):
prev_letter = acc[previous]
acc[curr_count] += 1
if not prev_letter == nxt:
if acc[curr_count] > 0:
acc[table][prev_letter].append(acc[curr_count])
acc[previous] = nxt
acc[curr_count] = 0

return accumulator

for entry in fileIterator(sys.argv[1],fastaIterator):
reduce(runs, entry.seq, accumulator)
runs(accumulator, 'X') #get last sequence
accumulator[previous] = 'N'
accumulator[curr_count] = -1

print max(store_table['N'])

0 comments on commit eb03d7f

Please sign in to comment.