forked from biopython/biopython
/
Prodoc.py
331 lines (264 loc) · 10.4 KB
/
Prodoc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# Copyright 2000 by Jeffrey Chang. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""
This module is OBSOLETE.
Most of the functionality in this module has moved to Bio.ExPASy.Prodoc;
please see
Bio.ExPASy.Prodoc.read To read a Prodoc file containing one entry.
Bio.ExPASy.Prodoc.parse Iterates over entries in a Prodoc file.
Bio.ExPASy.Prodoc.Record Holds Prodoc data.
Bio.ExPASy.Prodoc.Reference Holds data from a Prodoc reference.
The other functions and classes in Bio.Prosite.Prodoc (including
Bio.Prosite.Prodoc.index_file and Bio.Prosite.Prodoc.Dictionary) are
considered deprecated, and were not moved to Bio.ExPASy.Prodoc. If you use
this functionality, please contact the Biopython developers at
biopython-dev@biopython.org to avoid permanent removal of this module from
Biopython.
This module provides code to work with the prosite.doc file from
Prosite, available at http://www.expasy.ch/prosite/.
Tested with:
Release 15.0, July 1998
Release 16.0, July 1999
Release 20.22, 13 November 2007
Functions:
read             Reads a Prodoc file containing exactly one entry.
parse            Iterates over entries in a Prodoc file.
index_file       Index a Prodoc file for a Dictionary.
_extract_record  Extract Prodoc data from a web page.
Classes:
Record Holds Prodoc data.
Reference Holds data from a Prodoc reference.
Dictionary Accesses a Prodoc file using a dictionary interface.
RecordParser Parses a Prodoc record into a Record object.
_Scanner Scans Prodoc-formatted data.
_RecordConsumer Consumes Prodoc data to a Record object.
"""
from types import *
import os
import sgmllib
from Bio import File
from Bio import Index
from Bio.ParserSupport import *
def parse(handle):
    """Iterate over the entries of a Prodoc file, yielding one record each.

    handle is consumed line by line.  Lines are buffered until one that
    starts with '{END}' is seen; the buffered text is then handed to a
    RecordParser (through an in-memory handle) and the parsed result is
    yielded.  Lines that follow the last '{END}' line are never parsed.
    """
    import cStringIO
    record_parser = RecordParser()
    pending = []
    for line in handle:
        pending.append(line)
        if line[:5] != '{END}':
            # Still inside the current record; keep buffering.
            continue
        record_text = "".join(pending)
        # Start a fresh buffer before yielding so that the generator is
        # in a clean state when it is resumed.
        pending = []
        yield record_parser.parse(cStringIO.StringIO(record_text))
def read(handle):
    """Parse a Prodoc file that holds exactly one entry and return it.

    handle is passed to a RecordParser.  After parsing, handle.read() is
    called once; if it still returns data a ValueError is raised, because
    the whole input should have been consumed by then.
    """
    record = RecordParser().parse(handle)
    # Anything still unread means the input held more than one record.
    if handle.read():
        raise ValueError("More than one Prodoc record found")
    return record
# It may be a good idea to rewrite read(), parse() at some point to avoid
# using the old-style "parser = RecordParser(); parser.parse(handle)" approach.
class Record:
    """Container for the data of one Prodoc record.

    Attributes:
    accession      Accession number of the record (string).
    prosite_refs   List of (prosite accession, prosite name) tuples.
    text           Free format text (string).
    references     List of Reference objects.
    """

    def __init__(self):
        # A new record starts out empty.  The two lists are created per
        # instance, so records never share them.
        self.accession = self.text = ''
        self.prosite_refs = []
        self.references = []
class Reference:
    """Container for one citation of a Prodoc record.

    Attributes:
    number      Number of the reference, kept as a string.
    authors     Names of the authors.
    citation    Describes the citation.
    """

    def __init__(self):
        # All three fields are strings and start out empty.
        self.number = self.authors = self.citation = ''
class Dictionary:
    """Accesses a Prodoc file using a dictionary interface.

    Keys are looked up in an index (see index_file) that maps each key to
    a (start offset, length) pair inside the Prodoc data file.  Attributes
    not defined here are forwarded to the underlying index object.

    The data file is opened in the constructor and stays open for the
    lifetime of the object; this class does not close it.
    """
    # Reserved index key under which index_file stores the data file name.
    # Because of name mangling it is reachable from outside the class as
    # Dictionary._Dictionary__filename_key.
    __filename_key = '__filename'

    def __init__(self, indexname, parser=None):
        """__init__(self, indexname, parser=None)

        Open a Prodoc Dictionary.  indexname is the name of the
        index for the dictionary.  The index should have been created
        using the index_file function.  parser is an optional Parser
        object to change the results into another form.  If set to None,
        then the raw contents of the file will be returned.

        """
        self._index = Index.Index(indexname)
        self._handle = open(self._index[Dictionary.__filename_key])
        self._parser = parser

    def __len__(self):
        """Return the number of entries reported by the index."""
        return len(self._index)

    def __getitem__(self, key):
        """Return the entry stored under key.

        The raw text of the entry is read from the data file.  It is
        returned unchanged when no parser was given, otherwise the result
        of parser.parse() on an in-memory handle is returned.
        """
        # 'length' rather than 'len', so the builtin is not shadowed.
        start, length = self._index[key]
        self._handle.seek(start)
        data = self._handle.read(length)
        if self._parser is None:
            return data
        return self._parser.parse(File.StringHandle(data))

    def __getattr__(self, name):
        """Forward unknown attribute lookups to the underlying index.

        Raises AttributeError when the index itself has not been set
        (for example because the constructor failed part-way through).
        """
        # Read _index straight from the instance dictionary: going through
        # normal attribute access would re-enter __getattr__ and recurse
        # without end whenever _index is missing.
        try:
            index = self.__dict__['_index']
        except KeyError:
            raise AttributeError(name)
        return getattr(index, name)
class RecordParser(AbstractParser):
    """Turns Prodoc data into a Record object.

    A _Scanner reads the input and reports what it finds to a
    _RecordConsumer, which builds the Record.
    """

    def __init__(self):
        self._consumer = _RecordConsumer()
        self._scanner = _Scanner()

    def parse(self, handle):
        """Scan handle and return the data collected by the consumer."""
        consumer = self._consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
class _Scanner:
    """Scans Prodoc-formatted data.

    The scanner walks through the input line by line and reports what it
    finds to a consumer object by calling the consumer's event methods
    (start_record, accession, prosite_reference, text, reference,
    noevent, end_record).

    Tested with:
    Release 15.0, July 1998

    """

    def feed(self, handle, consumer):
        """feed(self, handle, consumer)

        Feed in Prodoc data for scanning.  handle is a file-like
        object that contains prosite data.  consumer is a
        Consumer object that will receive events as the report is scanned.

        """
        # The scanning code needs to peek at lines and push them back, so
        # make sure we are working with an UndoHandle.
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)

        while True:
            line = uhandle.peekline()
            if not line:
                # Nothing left to read.
                break
            if is_blank_line(line):
                # Skip blank lines between records
                uhandle.readline()
                continue
            self._scan_record(uhandle, consumer)

    def _scan_record(self, uhandle, consumer):
        """Scan one record and bracket it with start/end_record events.

        The parts are scanned in this order: accession line, prosite
        reference lines, the '{BEGIN}' line, text, references, optional
        copyright notice, the '{END}' line.
        """
        consumer.start_record()

        self._scan_accession(uhandle, consumer)
        self._scan_prosite_refs(uhandle, consumer)
        read_and_call(uhandle, consumer.noevent, start='{BEGIN}')
        self._scan_text(uhandle, consumer)
        self._scan_refs(uhandle, consumer)
        self._scan_copyright(uhandle, consumer)
        read_and_call(uhandle, consumer.noevent, start='{END}')

        consumer.end_record()

    def _scan_accession(self, uhandle, consumer):
        """Report the line starting with '{PDOC' as an accession event."""
        read_and_call(uhandle, consumer.accession, start='{PDOC')

    def _scan_prosite_refs(self, uhandle, consumer):
        """Report every consecutive line starting with '{PS'."""
        while attempt_read_and_call(uhandle, consumer.prosite_reference,
                                    start='{PS'):
            pass

    def _scan_text(self, uhandle, consumer):
        """Report free text lines until the references or '{END}' begin."""
        while True:
            line = safe_readline(uhandle)
            # A reference line has '[' in column 0 and '] ' in columns 3-4.
            # Slices are compared instead of single characters so that a
            # short text line that merely starts with '[' is treated as
            # text rather than raising IndexError.
            starts_reference = line[:1] == '[' and line[3:5] == '] '
            if starts_reference or line[:5] == '{END}':
                # Not ours: push the line back for the next stage.
                uhandle.saveline(line)
                break
            consumer.text(line)

    def _scan_refs(self, uhandle, consumer):
        """Report reference lines until a blank line or '{END}' is met."""
        while True:
            line = safe_readline(uhandle)
            if line[:5] == '{END}' or is_blank_line(line):
                # Not ours: push the line back for the next stage.
                uhandle.saveline(line)
                break
            consumer.reference(line)

    def _scan_copyright(self, uhandle, consumer):
        """Skip an optional copyright notice framed by '+----' lines."""
        # Cayte Lindner found some PRODOC records with the copyrights
        # appended at the end.  We'll try and recognize these.
        read_and_call_while(uhandle, consumer.noevent, blank=1)
        if attempt_read_and_call(uhandle, consumer.noevent, start='+----'):
            read_and_call_until(uhandle, consumer.noevent, start='+----')
            read_and_call(uhandle, consumer.noevent, start='+----')
        read_and_call_while(uhandle, consumer.noevent, blank=1)
class _RecordConsumer(AbstractConsumer):
    """Consumer that converts a Prodoc record to a Record object.

    Members:
    data    Record with Prodoc data.

    """

    def __init__(self):
        self.data = None
        # Reference that continuation lines are appended to; None until
        # the first numbered reference line of a record has been seen.
        self._ref = None

    def start_record(self):
        """Begin a new, empty Record."""
        self.data = Record()
        # Forget the previous record's last reference so that a stray
        # continuation line cannot be appended to it.
        self._ref = None

    def end_record(self):
        """Finish the current Record by tidying up its references."""
        self._clean_data()

    def accession(self, line):
        """Store the accession from a line of the form '{PDOCnnnnn}'.

        Raises ValueError if the line is not wrapped in braces or the
        accession does not start with 'PDOC'.
        """
        line = line.rstrip()
        # Slices rather than indexes, so an empty line yields the
        # ValueError below instead of an IndexError.
        if line[:1] != '{' or line[-1:] != '}':
            raise ValueError("I don't understand accession line\n%s" % line)
        acc = line[1:-1]
        if acc[:4] != 'PDOC':
            raise ValueError("Invalid accession in line\n%s" % line)
        self.data.accession = acc

    def prosite_reference(self, line):
        """Store a (accession, name) pair from a '{acc; name}' line.

        Raises ValueError if the line is not wrapped in braces or does not
        split into exactly two parts on '; '.
        """
        line = line.rstrip()
        if line[:1] != '{' or line[-1:] != '}':
            raise ValueError(
                "I don't understand prosite reference line\n%s" % line)
        acc, name = line[1:-1].split('; ')
        self.data.prosite_refs.append((acc, name))

    def text(self, line):
        """Append one line of free text to the record."""
        self.data.text += line

    def reference(self, line):
        """Process one line of the reference section.

        A line with '[' in column 0 and ']' in column 3 starts a new
        Reference; a line indented by four spaces continues the citation
        of the current one.  Raises ValueError for a continuation line
        that precedes any numbered reference and for any other line.
        """
        if line[:1] == '[' and line[3:4] == ']':  # new reference
            self._ref = Reference()
            self._ref.number = line[1:3].strip()
            if line[1] == 'E':
                # If it's an electronic reference, then the URL is on the
                # line, instead of the author.
                self._ref.citation = line[4:].strip()
            else:
                self._ref.authors = line[4:].strip()
            self.data.references.append(self._ref)
        elif line[:4] == '    ':
            if self._ref is None:
                raise ValueError("Unnumbered reference lines\n%s" % line)
            # The continuation text starts after a five-column margin.
            self._ref.citation = self._ref.citation + line[5:]
        else:
            raise ValueError(
                "I don't understand the reference line\n%s" % line)

    def _clean_data(self):
        """Strip trailing whitespace from every reference's fields."""
        # get rid of trailing newlines
        for ref in self.data.references:
            ref.citation = ref.citation.rstrip()
            ref.authors = ref.authors.rstrip()
def index_file(filename, indexname, rec2key=None):
    """index_file(filename, indexname, rec2key=None)

    Index a Prodoc file.  filename is the name of the file.
    indexname is the name of the dictionary.  rec2key is an
    optional callback that takes a Record and generates a unique key
    (e.g. the accession number) for the record.  If not specified,
    the accession of the record will be used.

    Each key is mapped to a (start offset, length) pair; the span of a
    record runs from the end of the previous record up to and including
    its '{END}' line.  Data after the last '{END}' line is not indexed.

    Raises ValueError if filename does not exist, and KeyError if a
    record produces an empty key or a key that is already in the index.

    """
    import os
    if not os.path.exists(filename):
        raise ValueError("%s does not exist" % filename)

    index = Index.Index(indexname, truncate=1)
    index[Dictionary._Dictionary__filename_key] = filename

    handle = open(filename)
    try:
        parser = RecordParser()
        # The file is read with readline() rather than by iterating over
        # it: iteration reads ahead, so tell() would not report the
        # position of the line just consumed and the offsets stored in
        # the index would be wrong.
        start = handle.tell()
        lines = []
        while True:
            line = handle.readline()
            if not line:
                break
            lines.append(line)
            if line[:5] != '{END}':
                continue

            # End of a record: work out its span and parse its text.
            end = handle.tell()
            record = parser.parse(File.StringHandle("".join(lines)))
            lines = []

            if rec2key is not None:
                key = rec2key(record)
            else:
                key = record.accession

            if not key:
                raise KeyError("empty key was produced")
            elif key in index:
                raise KeyError("duplicate key %s found" % key)

            index[key] = start, end - start
            start = end
    finally:
        # Close the data file even when a record is malformed or a key
        # is rejected.
        handle.close()