Skip to content

Commit

Permalink
removed biopython dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
james committed Dec 7, 2013
1 parent 6eb01e6 commit b3d3e32
Show file tree
Hide file tree
Showing 17 changed files with 119 additions and 27 deletions.
14 changes: 12 additions & 2 deletions CA_Makefile
@@ -1,10 +1,10 @@

BASENAME = r2
BASENAME = simcor
GENOME_SIZE = 120000000

SHELL = /bin/bash

CORRECTION_ROOT = /seq/schatz/a.thaliana/xiwang/james_workspace/nucmer-round2
CORRECTION_ROOT = /seq/schatz/a.thaliana/xiwang/james_workspace/nucmer-round-sim

TIGLEN_BIN = /bluearc/data/schatz/mschatz/devel/bin/tig_length.pl
PBTOOLS_BIN = /bluearc/home/schatz/gurtowsk/workspace/pbtools
Expand All @@ -13,6 +13,8 @@ JOIN_BIN = $(PBTOOLS_BIN)/join.py

CA_BIN = /bluearc/home/schatz/gurtowsk/sources/wgs.svn/Linux-amd64/bin
AMOS_BIN = /bluearc/home/schatz/gurtowsk/sources/amos/bin
MUMMER_BIN = /bluearc/home/schatz/gurtowsk/sources/MUMmer


STATS_BIN = $(AMOS_BIN)/stats

Expand Down Expand Up @@ -76,6 +78,14 @@ getsc:
idx=$$(grep "$$rn" $(CORRECTION_ROOT)/ReadIndex.txt | awk '{print $$2}'); \
grep "$$rn" $(CORRECTION_ROOT)/$${idx}.delta.pre.1.sc


.PHONY : getrawsc

getrawsc:
rn=$$(echo $(read) | awk '{split($$1,a,"_corrected"); print a[1]}') ; \
idx=$$(grep "$$rn" $(CORRECTION_ROOT)/ReadIndex.txt | awk '{print $$2}'); \
$(MUMMER_BIN)/show-coords -rclo $(CORRECTION_ROOT)/$${idx}.delta | grep "$$rn"

clean:
rm -f unitigs.layout.clean.names unitigs.layout.clean \
unitigs.layout.stats ReadIndex.txt unitigs.layout
2 changes: 0 additions & 2 deletions README
@@ -1,7 +1,5 @@
PB Correction and other PB tools

Requires: BioPython

PBTOOLS_HOME=/path/to/this/directory

Running Correction.
Expand Down
24 changes: 12 additions & 12 deletions correct.sh
Expand Up @@ -9,10 +9,10 @@ source ~/.bashrc
##SET THE FOLLOWING PARAMETERS

#path to correct script in pbtools repo
CORRECT_SCRIPT=/path/to/ectools/pb_correct.py
CORRECT_SCRIPT=/bluearc/home/schatz/gurtowsk/workspace/ectools/pb_correct.py

#pre filter delta file
PRE_DELTA_FILTER_SCRIPT=/path/to/ectools/pre_delta_filter.py
PRE_DELTA_FILTER_SCRIPT=/bluearc/home/schatz/gurtowsk/workspace/ectools/pre_delta_filter.py

#smallest alignment allowed, filter out alignments smaller than this
MIN_ALIGNMENT_LEN=200
Expand All @@ -25,10 +25,10 @@ WIGGLE_PCT=0.05
CONTAINED_PCT_ID=0.80

#path to high identity unitigs
UNITIG_FILE=/path/to/short-read.utg.fa
UNITIG_FILE=/seq/schatz/a.thaliana/xiwang/james_workspace/nucmer-round-sim/misim.utg.g10frg.fa

#Trim out regions with lower identity than
CLR_PCT_ID=0.99
CLR_PCT_ID=0.96

#Minimum read length to output after splitting/trimming
MIN_READ_LEN=3000
Expand All @@ -40,15 +40,15 @@ FILE=p${suffix}

ORIGINAL_DIR=`pwd`

#Move to sge temp storage
Move to sge temp storage
if [[ $TMPDIR ]]
then
cd $TMPDIR
fi

cp ${ORIGINAL_DIR}/${FILE} .

nucmer --maxmatch -l 11 -b 10000 -g 1000 -p ${FILE} ${FILE} ${UNITIG_FILE}
nucmer --maxmatch -l 11 -b 2000 -g 1000 -p ${FILE} ${FILE} ${UNITIG_FILE}

cp ${FILE}.delta ${ORIGINAL_DIR}

Expand All @@ -57,19 +57,19 @@ python ${PRE_DELTA_FILTER_SCRIPT} ${FILE}.delta ${WIGGLE_PCT} ${CONTAINED_PCT_ID
cp ${FILE}.delta.pre ${ORIGINAL_DIR}
cp ${FILE}.delta.pre.log ${ORIGINAL_DIR}

delta-filter -l $MIN_ALIGNMENT_LEN -i 70.0 -1 ${FILE}.delta.pre > ${FILE}.delta.pre.1
delta-filter -l $MIN_ALIGNMENT_LEN -i 70.0 -r ${FILE}.delta.pre > ${FILE}.delta.pre.r

cp ${FILE}.delta.pre.1 ${ORIGINAL_DIR}
cp ${FILE}.delta.pre.r ${ORIGINAL_DIR}

show-coords -l -H -r ${FILE}.delta.pre.1 > ${FILE}.delta.pre.1.sc
show-coords -l -H -r ${FILE}.delta.pre.r > ${FILE}.delta.pre.r.sc

cp ${FILE}.delta.pre.1.sc ${ORIGINAL_DIR}
cp ${FILE}.delta.pre.r.sc ${ORIGINAL_DIR}

show-snps -H -l -r ${FILE}.delta.pre.1 > ${FILE}.snps
show-snps -H -l -r ${FILE}.delta.pre.r > ${FILE}.snps

cp ${FILE}.snps ${ORIGINAL_DIR}

python ${CORRECT_SCRIPT} ${FILE} ${FILE}.snps ${FILE}.delta.pre.1.sc ${CLR_PCT_ID} ${MIN_READ_LEN} ${FILE}
python ${CORRECT_SCRIPT} ${FILE} ${FILE}.snps ${FILE}.delta.pre.r.sc ${CLR_PCT_ID} ${MIN_READ_LEN} ${FILE}

cp ${FILE}.cor.fa ${ORIGINAL_DIR}
cp ${FILE}.cor.pileup ${ORIGINAL_DIR}
Expand Down
32 changes: 32 additions & 0 deletions fix_delta_header.py
@@ -0,0 +1,32 @@
#!/usr/bin/env python

import sys
import os


###
#Assume ref is now in the cwd
###


if not len(sys.argv) == 2:
print "fix_delta_header.py file.delta"
sys.exit(1)

cwd = os.getcwd()


fname = sys.argv[1]
ftmp = sys.argv[1]+".copy_tmp_new"

print "%s" % fname

with open(fname) as dfile:
with open(ftmp,"w") as tmp:
ref,query = dfile.readline().split()
new_ref = os.path.join(cwd,os.path.basename(ref))
tmp.write(" ".join([new_ref, query]) +"\n")
for line in dfile:
tmp.write(line)

os.rename(ftmp, fname)
4 changes: 2 additions & 2 deletions gc_count.py
Expand Up @@ -3,7 +3,7 @@
import sys
import operator

from Bio import SeqIO
from seqio import fastaIterator
from itertools import groupby,izip,imap

from io import NucRecord, NucRecordTypes, getNucmerAlignmentIterator
Expand All @@ -25,7 +25,7 @@

reads = {}

for entry in SeqIO.parse(rfh, "fasta"):
for entry in fastaIterator(rfh):
reads[str(entry.name)] = str(entry.seq)
sys.stderr.write("Loaded reads\n")

Expand Down
4 changes: 2 additions & 2 deletions partition.py
Expand Up @@ -2,7 +2,7 @@

import sys
import os
from Bio import SeqIO
from seqio import fastaIterator

if not len(sys.argv) == 4:
print "partition.py <reads_per_file (int)> <files_per_dir (int)> <input.fa>"
Expand All @@ -20,7 +20,7 @@ def pstr(num):
fnum = 0
fh = None
readidx_fh = open("ReadIndex.txt", "w")
for record in SeqIO.parse(fa_fh,"fasta"):
for record in fastaIterator(fa_fh):
if total_reads % rpf == 0:
if total_reads % (rpf * fpd) == 0:
dnum += 1
Expand Down
4 changes: 2 additions & 2 deletions pb_correct.py
Expand Up @@ -4,7 +4,7 @@

from io import *

from Bio import SeqIO
from seqio import fastaIterator

from operator import itemgetter
from itertools import groupby, repeat, izip_longest, imap, count, chain
Expand Down Expand Up @@ -106,7 +106,7 @@ def get_contiguous_ranges(ranges):
snp_it = lineRecordIterator(sfh, NucSNPRecord, NucSNPRecordTypes)


reads = dict(map(lambda r : (str(r.name), str(r.seq)), SeqIO.parse(rfh, "fasta")))
reads = dict(map(lambda r : (str(r.name), str(r.seq)), fastaIterator(rfh)))
alignments = dict(map(lambda (n,a): (n,list(a)),
groupby(alignment_it, lambda x: x.sname)))

Expand Down
4 changes: 2 additions & 2 deletions pb_sim.py
Expand Up @@ -5,7 +5,7 @@
import sys

from collections import namedtuple
from Bio import SeqIO
from seqio import fastaIterator
from itertools import dropwhile,count,repeat
import misc
import random
Expand Down Expand Up @@ -34,7 +34,7 @@ def mismatch(pos,seq):


#read genome into mem
chromosomes = map(lambda r: Chromosome._make((str(r.name),str(r.seq))), SeqIO.parse(gfh, "fasta"))
chromosomes = map(lambda r: Chromosome._make((str(r.name),str(r.seq))), fastaIterator(gfh))
chrom_lengths = map(lambda c: len(c.seq), chromosomes)
genome_length = sum(chrom_lengths)
chrom_lengths_ivtf = map(misc.accumulator(0), map(lambda x: float(x)/genome_length , chrom_lengths))
Expand Down
2 changes: 1 addition & 1 deletion pre_delta_filter.py
Expand Up @@ -65,12 +65,12 @@ def alignment_is_good(record, a):
efh.write(deltaAlignmentHeaderToString(alnmt)+ "\n")

#0 or 1 alignments, good to go! Otherwise idk
#just log them and output them anyway
if len(pfalignments) > 1:
efh.write("Multiple Alignments\n")
efh.write(deltaRecordHeaderToString(record) + "\n")
for a in pfalignments:
efh.write(deltaAlignmentHeaderToString(a)+ "\n")
pfalignments = []

#substitute old alignments with filtered ones
record.alignments[:] = []
Expand Down
4 changes: 2 additions & 2 deletions qualgen.py
Expand Up @@ -2,7 +2,7 @@

import sys

from Bio import SeqIO
from seqio import fastaIterator

if not len(sys.argv) == 2:
print "qualgen.py read.fa"
Expand All @@ -11,6 +11,6 @@
reads = sys.argv[1]

with open(reads) as rfh:
for record in SeqIO.parse(rfh,"fasta"):
for record in fastaIterator(rfh):
print ">"+str(record.name)
print " ".join(["60"]*len(record.seq))
2 changes: 2 additions & 0 deletions test/bad.fa
@@ -0,0 +1,2 @@
fl
cd
3 changes: 3 additions & 0 deletions test/bad.fq
@@ -0,0 +1,3 @@
ca
asdf
asdf
5 changes: 5 additions & 0 deletions test/corrupt.fa
@@ -0,0 +1,5 @@
>ff
ddadd
>c
>d
>f
6 changes: 6 additions & 0 deletions test/corrupt.fq
@@ -0,0 +1,6 @@
@acg
ffffffffff
+
FFFFFFFFFF
@
fff
4 changes: 4 additions & 0 deletions test/good.fa
@@ -0,0 +1,4 @@
>good
CACACACA
>dddd
CCCCCCCCC
4 changes: 4 additions & 0 deletions test/good.fq
@@ -0,0 +1,4 @@
@good
ccccccccc
+
ddddddddd
28 changes: 28 additions & 0 deletions test/seqio_test.py
@@ -0,0 +1,28 @@
import sys

sys.path.append('..')

from seqio import fastaIterator, fastqIterator, seqIterator

for i in ["bad.fa","corrupt.fa","good.fa"]:
with open(i) as fh:
try:
for record in fastaIterator(fh):
print record.name
print record.seq
except Exception as e:
print e

for i in ["bad.fq","corrupt.fq","good.fq"]:
with open(i) as fh:
try:
for record in fastqIterator(fh):
print record.name
print record.seq
print record.desc
print record.qual
except Exception as e:
print e



0 comments on commit b3d3e32

Please sign in to comment.