/
nanopolish_makerange.py
executable file
·37 lines (29 loc) · 1.16 KB
/
nanopolish_makerange.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#! /usr/bin/env python
import sys
import argparse
from Bio.SeqIO.FastaIO import SimpleFastaParser
def make_ranges(name, length, segment_length, overlap_length):
    """Return the list of "name:start-end" strings covering one sequence.

    Segments start at 0 and advance by ``segment_length``. Every segment
    except the last is printed with its end pushed out by ``overlap_length``
    so that it overlaps the segment that follows. When the remainder after a
    segment would be shorter than ``5 * overlap_length``, that segment is
    extended to ``length - 1`` and becomes the last one.

    Args:
        name: Sequence identifier placed before the colon.
        length: Number of bases in the sequence.
        segment_length: Step between consecutive segment starts; must be > 0.
        overlap_length: Amount added to the end of each non-final segment.

    Returns:
        A list of region strings, empty when ``length`` is 0.

    Raises:
        ValueError: If ``segment_length`` is not positive (the scan could
            otherwise never advance).
    """
    if segment_length <= 0:
        raise ValueError("segment_length must be a positive integer")

    min_segment_length = 5 * overlap_length
    regions = []
    start = 0
    while start < length:
        end = start + segment_length
        # If this segment will end near the end of the contig, extend it to end
        if length - end < min_segment_length:
            regions.append("%s:%d-%d" % (name, start, length - 1))
            start = length
        else:
            regions.append("%s:%d-%d" % (name, start, end + overlap_length))
            start = end
    return regions


def main(argv=None):
    """Parse the command line, read the genome FASTA and print its segments.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Exits with a non-zero status when the positional genome file is missing
    or duplicated, or when ``--segment-length`` is not positive.
    """
    parser = argparse.ArgumentParser(description='Partition a genome into a set of overlapping segments')
    parser.add_argument('--segment-length', type=int, default=50000)
    parser.add_argument('--overlap-length', type=int, default=200)
    args, extra = parser.parse_known_args(argv)

    if len(extra) != 1:
        sys.stderr.write("Error: a genome file is expected\n")
        # Stop here: continuing would crash on extra[0] or pick an arbitrary file.
        sys.exit(1)
    if args.segment_length <= 0:
        parser.error("--segment-length must be a positive integer")

    filename = extra[0]

    # Only the first whitespace-delimited token of each title and the
    # sequence length are kept.
    with open(filename) as handle:
        recs = [(title.split(None, 1)[0], len(seq))
                for title, seq in SimpleFastaParser(handle)]

    for name, length in recs:
        for region in make_ranges(name, length,
                                  args.segment_length, args.overlap_length):
            print(region)


if __name__ == "__main__":
    main()