import re
from collections import defaultdict

from more_itertools import peekable

from ..identifier import Identifier

# A DOI starts with "10.", a registrant code of 4+ digits, and a slash.
DOI_START_RE = re.compile(r'10\.[0-9]{4,}/')

HTML_TAGS = ['ref', 'span', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
             'b', 'u', 'i', 's', 'ins', 'del', 'code', 'tt', 'blockquote',
             'pre']

TAGS_RE = re.compile(r'<(/\s*)?(' + '|'.join(HTML_TAGS) + r')(\s[^>\n\r]+)?>',
                     re.I)
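# A rough sketch of what TAGS_RE matches (illustrative examples, not from
# the project's tests): opening, closing, and attribute-bearing tags alike.
# findall() returns the three capture groups (closing slash, tag name,
# attribute string):
#
#   >>> TAGS_RE.findall('<span class="x">10.1000/182</span>')
#   [('', 'span', ' class="x"'), ('/', 'span', '')]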
# Two earlier implementations, disabled (wrapped in a string literal) but
# apparently kept here for comparison with the token-based strategies below:
'''
DOI_RE = re.compile(r'\b(10\.\d+/[^\s\|\]\}\?\,]+)')

def extract_regex(text):
    for match in DOI_RE.finditer(text):
        id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
        yield Identifier("doi", id)

import mwparserfromhell as mwp

def extract_mwp(text):
    no_tags = mwp.parse(text).strip_code()
    for match in DOI_RE.finditer(no_tags):
        id = re.sub(TAGS_RE, "", match.group(1)).rstrip(".")
        yield Identifier("doi", id)
'''
LEXICON = [
    (DOI_START_RE.pattern, 'doi_start'),
    (r'\(', 'open_paren'),
    (r'\)', 'close_paren'),
    (r'\[', 'open_bracket'),
    (r'\]', 'close_bracket'),
    (r'<!--', 'comment_start'),
    (r'-->', 'comment_end'),
    (TAGS_RE.pattern, 'tag'),
    (r'<', 'open_angle'),
    (r'>', 'close_angle'),
    (r'\{', 'open_curly'),
    (r'\}', 'close_curly'),
    (r'\|', 'pipe'),
    (r'[,\.;!]', 'punct'),
    (r'[\?#]', 'url_end'),
    (r'[\n\r]+', 'break'),
    (r'\s+', 'whitespace'),
    (r'\w+', 'word'),
    (r'.', 'etc')
]
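# Note: order matters in LEXICON. The tokenizers below join these patterns
# into one big alternation, and re picks the leftmost alternative that
# matches, so 'doi_start' has to come before 'word' or a prefix like "10"
# would be consumed as a plain word.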
def extract_island(text):
    tokens = tokenize_finditer(text, LEXICON)
    tokens = peekable(tokens)

    while tokens.peek(None) is not None:
        if tokens.peek()[0] == 'doi_start':
            yield ('doi', read_doi(tokens))
        # read_doi() may have exhausted the stream, so pass a default
        # rather than letting StopIteration escape the generator.
        next(tokens, None)
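# A quick sketch of the expected behavior (hypothetical input, worked by
# hand rather than taken from the test suite):
#
#   >>> list(extract_island("See [[doi:10.1000/182]] for details."))
#   [('doi', '10.1000/182')]
#
# Note that, unlike extract_search() below, this variant yields plain
# (type, id) tuples rather than Identifier objects.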
def tokenize_finditer(text, lexicon=LEXICON):
    pattern = '|'.join("(?P<{0}>{1})".format(name, pattern)
                       for pattern, name in lexicon)
    group_regex = re.compile(pattern, re.I | re.U | re.M)

    for match in group_regex.finditer(text):
        yield match.lastgroup, match.group(0)
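# For example (hand-worked, hypothetical input), a short snippet tokenizes
# into (name, text) pairs like so:
#
#   >>> list(tokenize_finditer("doi: 10.1000/182."))
#   [('word', 'doi'), ('etc', ':'), ('whitespace', ' '),
#    ('doi_start', '10.1000/'), ('word', '182'), ('punct', '.')]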
"""
def tokenize_scanner(text, lexicon=LEXICON):
scanner = re.Scanner(lexicon)
tokens, remainder = scanner.scan(text)
return tokens
"""
#from mwcites.extractors.doi import tokenize_scan
#list(tokenize_scan("foo bar baz.{}"))
def read_doi(tokens):
    assert tokens.peek()[0] == 'doi_start'

    depth = defaultdict(lambda: 0)
    doi_buffer = [next(tokens)[1]]

    while tokens.peek(None) is not None:
        name, match = tokens.peek()

        if name in ('url_end', 'break', 'whitespace', 'tag', 'pipe',
                    'comment_start', 'comment_end'):
            break
        elif name == 'open_bracket':
            depth['bracket'] += 1
            doi_buffer.append(next(tokens)[1])
        elif name == 'open_curly':
            depth['curly'] += 1
            doi_buffer.append(next(tokens)[1])
        elif name == 'close_bracket':
            if depth['bracket'] > 0:
                depth['bracket'] -= 1
                doi_buffer.append(next(tokens)[1])
            else:
                break
        elif name == 'close_curly':
            if depth['curly'] > 0:
                depth['curly'] -= 1
                doi_buffer.append(next(tokens)[1])
            else:
                break
        else:
            doi_buffer.append(next(tokens)[1])

    # Do not return a doi with punctuation at the end
    return re.sub(r'[\.,!]+$', '', ''.join(doi_buffer))
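# Hand-worked example of the bracket balancing (hypothetical input): a
# bracket pair that opens inside the DOI is kept, while an unmatched
# closer ends it.
#
#   >>> tokens = peekable(tokenize_finditer("10.1000/abc[1]def] tail"))
#   >>> read_doi(tokens)
#   '10.1000/abc[1]def'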
def tokenize_search(text, start, lexicon=LEXICON):
    pattern = '|'.join("(?P<{0}>{1})".format(name, pattern)
                       for pattern, name in lexicon)
    group_regex = re.compile(pattern, re.I | re.U)

    match = group_regex.search(text, start)
    while match is not None:
        yield match.lastgroup, match.group(0)
        match = group_regex.search(text, match.span()[1])
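# Unlike tokenize_finditer(), this variant starts tokenizing at an
# arbitrary offset, so extract_search() below only pays for tokenization
# around each candidate DOI rather than over the whole text -- presumably
# why it was chosen as the default strategy at the bottom of this module.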
def extract_search(text, lexicon=LEXICON):
    last_end = 0
    for match in DOI_START_RE.finditer(text):
        # ">=" rather than ">", so a DOI at offset 0 isn't skipped.
        if match.span()[0] >= last_end:
            tokens = tokenize_search(text, match.span()[0], lexicon=lexicon)
            tokens = peekable(tokens)
            doi = read_doi(tokens)
            last_end = match.span()[0] + len(doi)
            yield Identifier('doi', doi)
        else:
            last_end = max(match.span()[1], last_end)

extract = extract_search  # Setting the default to the best method
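# Usage sketch (hypothetical wikitext; Identifier is assumed to be the
# simple (type, id) record defined in ..identifier):
#
#   >>> list(extract("Cite {{cite|doi=10.1000/182}} and doi:10.1101/xyz."))
#   [Identifier(type='doi', id='10.1000/182'),
#    Identifier(type='doi', id='10.1101/xyz')]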