-
Notifications
You must be signed in to change notification settings - Fork 9
/
pyadapter_trim.py
executable file
·164 lines (142 loc) · 4.78 KB
/
pyadapter_trim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python
# Author: Jason Buenrostro, Stanford University
# The following program will compress daisy chain seq data into singe molecules
##### IMPORT MODULES #####
# import necessary for python
import os
import re
import sys
import gzip
import string
import Levenshtein
from Bio import SeqIO
from Bio import AlignIO
from optparse import OptionParser
##### DEFINE FUNCTIONS #####
# Reverse complement
complement = string.maketrans('ATCGN', 'TAGCN')
def reverse_complement(sequence):
return sequence.upper().translate(complement)[::-1]
# Align with mismatch, find first and move on, assumes only one
def fuzz_align(s_seq,l_seq,mismatch):
for i, base in enumerate(l_seq): # loop through equal size windows
l_subset = l_seq[i:i+len(s_seq)]
dist = Levenshtein.distance(l_subset, s_seq)
if dist <= mismatch: # find first then break
return i, dist
break
#### OPTIONS ####
# define options
opts = OptionParser()
usage = "usage: %prog [options] [inputs] This will trim adapters"
opts = OptionParser(usage=usage)
opts.add_option("-a", help="<Read1> Accepts fastq or fastq.gz")
opts.add_option("-b", help="<Read2> Accepts fastq or fastq.gz")
options, arguments = opts.parse_args()
# return usage information if no argvs given
if len(sys.argv)==1:
os.system(sys.argv[0]+" --help")
sys.exit()
##### INPUTS AND OUTPUTS #####
# name input and outputs
p1_in = options.a
p2_in = options.b
# name outputs and print to working dir
p1_file = p1_in.split('/')[-1]
p2_file = p2_in.split('/')[-1]
p1_out = re.sub(".fastq", ".trim.fastq", p1_file)
p2_out = re.sub(".fastq", ".trim.fastq", p2_file)
#check for file type and open input file
append = p1_in.split('.')[-1]
if append == "fastq":
p1_rds = open(p1_in,'r')
p2_rds = open(p2_in,'r')
p1_out = re.sub(".fastq", ".trim.fastq", p1_file)
p2_out = re.sub(".fastq", ".trim.fastq", p2_file)
elif append == "fq":
p1_rds = open(p1_in,'r')
p2_rds = open(p2_in,'r')
p1_out = re.sub(".fq", ".trim.fastq", p1_file)
p2_out = re.sub(".fq", ".trim.fastq", p2_file)
elif append == "gz":
p1_rds = gzip.open(p1_in,'r')
p2_rds = gzip.open(p2_in,'r')
p1_out = re.sub(".fastq.gz", ".trim.fastq", p1_file)
p2_out = re.sub(".fastq.gz", ".trim.fastq", p2_file)
else:
sys.exit("ERROR! The input file2 must be a .fastq or .fastq.gz")
##### SCRIPT #####
# initialize variables
i=0;j=0;k=0;tot_b=0;count=1
n=20 # match seq
mismatch=1 # only allow 0-1 mismatches for now, if allow two then gets mis indexed, to fix this need to change fuzz_align to save L as a vector and reiterate to find 2nd
# initilize write files
r1_write = open(p1_out, 'w')
r2_write = open(p2_out, 'w')
while 1:
# read lines
p1_line = p1_rds.readline()
p2_line = p2_rds.readline()
# break if at end of file
if not p1_line:
break
# load fastq into memory
if count ==1:
seqhead1 = p1_line
seqhead2 = p2_line
elif count ==2:
seq1 = p1_line.rstrip()
seq2 = p2_line.rstrip()
elif count ==3:
qualhead1 = p1_line
qualhead2 = p2_line
elif count ==4:
qual1 = p1_line.rstrip()
qual2 = p2_line.rstrip()
# align reads to themselves
i = i+1 # total reads
rc_seq2 = reverse_complement(seq2[0:n])
idx = seq1.rfind(rc_seq2) # look for perfect match
if idx > 0:
j = j+1 # 0 mismatchs
elif mismatch>0:
hold = fuzz_align(rc_seq2,seq1,mismatch) # else allow for mismatch
if hold:
idx,mis=hold
if mis == 1:
k=k+1 # 1 mismatch
# trim reads if idx exist
if idx > 0:
# keep track on how much trimming
tot_b = tot_b+len(seq2[idx+n:-1]) #track total bases trimmed
# trim data
seq1 = seq1[0:idx+n-1] # modified to sub1 because some aligners (bowtie) dont like perfectly overlapping reads
seq2 = seq2[0:idx+n-1]
qual1 = qual1[0:idx+n-1]
qual2 = qual2[0:idx+n-1]
# print read1
r1_write.write(seqhead1)
r1_write.write(seq1+"\n")
r1_write.write(qualhead1)
r1_write.write(qual1+"\n")
# print read2
r2_write.write(seqhead2)
r2_write.write(seq2+"\n")
r2_write.write(qualhead2)
r2_write.write(qual2+"\n")
# increment count
count = count + 1
if count == 5:
count = 1
else:
count = count
# close files to write the file
r1_write.close()
r2_write.close()
p1_rds.close()
p2_rds.close()
# give summary
print str(i)+" sequences total"
print str(j)+" sequences trimmed with 0 mismatches"
print str(k)+" sequences trimmed with 1 mismatch"
print str(tot_b/(j+k))+" mean number of bases trimmed for reads requiring trimming"