/
api.py
165 lines (137 loc) · 5.92 KB
/
api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# -*- coding: utf-8 -*-
#
# This file is part of Invenio.
# Copyright (C) 2015, 2016 CERN.
#
# Invenio is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Invenio is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Invenio; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Invenio-OAIHarvester API to harvest items from OAI-PMH servers.

If you need to schedule or run harvests from inside of Python, you can use our
API:

.. code-block:: python

    from invenio_oaiharvester.api import get_records

    request, records = get_records(identifiers=["oai:arXiv.org:1207.7214"],
                                   url="http://export.arxiv.org/oai2")
    for record in records:
        print(record.raw)

"""
from __future__ import absolute_import, print_function
import datetime
from invenio_db import db
from sickle import Sickle
from sickle.oaiexceptions import NoRecordsMatch
from .errors import NameOrUrlMissing, WrongDateCombination
from .utils import get_oaiharvest_object
def list_records(metadata_prefix=None, from_date=None, until_date=None,
                 url=None, name=None, setspecs=None, encoding=None):
    """Harvest multiple records from an OAI repo.

    Either ``name`` (a stored OAIHarvestConfig) or ``url`` must be given.
    When ``name`` is used, explicitly passed ``metadata_prefix`` and
    ``setspecs`` take precedence over the stored values, and the stored
    last-run date is used as lower bound unless ``from_date`` is given.

    :param metadata_prefix: The prefix for the metadata return
                            (defaults to 'oai_dc').
    :param from_date: The lower bound date for the harvesting (optional).
    :param until_date: The upper bound date for the harvesting (optional).
    :param url: The url to be used to create the endpoint.
    :param name: The name of the OAIHarvestConfig to use instead of passing
                 specific parameters.
    :param setspecs: The 'set' criteria for the harvesting (optional), given
                     as a whitespace-separated string of set names.
    :param encoding: Override the encoding returned by the server. ISO-8859-1
                     if it is not provided by the server.
    :raises NameOrUrlMissing: if neither ``name`` nor ``url`` is provided.
    :raises WrongDateCombination: if the 'from' date is after the 'until'
                                  date.
    :return: request object, list of harvested records
    """
    lastrun = None
    if name:
        url, _metadata_prefix, lastrun, _setspecs = get_info_by_oai_name(name)

        # In case we provide a prefix, we don't want it to be
        # overwritten by the one we get from the name variable.
        if metadata_prefix is None:
            metadata_prefix = _metadata_prefix
        if setspecs is None:
            setspecs = _setspecs
    elif not url:
        raise NameOrUrlMissing(
            "Retry using the parameters -n <name> or -u <url>."
        )

    request = Sickle(url, encoding=encoding)

    # By convention, when we have a url we have no lastrun, and when we use
    # the name we can either have from_date (if provided) or lastrun.
    dates = {
        'from': from_date or lastrun,
        'until': until_date
    }

    # Sanity check. Only compare when both bounds are known: ordering None
    # against a date raises TypeError on Python 3.
    if (dates['from'] is not None and dates['until'] is not None and
            dates['from'] > dates['until']):
        raise WrongDateCombination("'From' date larger than 'until' date.")

    # Taken before the harvest starts, so this is the value stored as the
    # new lastrun once harvesting has finished.
    lastrun_date = datetime.datetime.now()

    # Use a dict to only return the same record once
    # (e.g. if it is part of several sets)
    records = {}

    # setspecs may be None (e.g. url given without sets) or an empty string;
    # in both cases issue a single request without a 'set' argument.
    specs = (setspecs or "").split() or [None]
    for spec in specs:
        params = {
            'metadataPrefix': metadata_prefix or "oai_dc"
        }
        params.update(dates)
        if spec:
            params['set'] = spec
        try:
            for record in request.ListRecords(**params):
                records[record.header.identifier] = record
        except NoRecordsMatch:
            # An empty result for one set must not abort the other sets.
            continue

    # Only a plain "harvest everything since last time" run of a named
    # source moves lastrun forward; explicit date ranges leave it untouched.
    if name and from_date is None and until_date is None:
        oai_source = get_oaiharvest_object(name)
        oai_source.update_lastrun(lastrun_date)
        oai_source.save()
        db.session.commit()

    return request, list(records.values())
def get_records(identifiers, metadata_prefix=None, url=None, name=None,
                encoding=None):
    """Fetch individual records from an OAI repo by OAI-PMH identifier.

    Either ``name`` (a stored OAIHarvestConfig) or ``url`` must be given.
    One GetRecord request is issued per identifier, in the given order.

    :param identifiers: list of unique identifiers for records to be
                        harvested.
    :param metadata_prefix: The prefix for the metadata return
                            (defaults to 'oai_dc').
    :param url: The url to be used to create the endpoint.
    :param name: The name of the OAIHarvestConfig to use instead of passing
                 specific parameters.
    :param encoding: Override the encoding returned by the server. ISO-8859-1
                     if it is not provided by the server.
    :raises NameOrUrlMissing: if neither ``name`` nor ``url`` is provided.
    :return: request object, list of harvested records
    """
    if name:
        url, stored_prefix, _lastrun, _setspecs = get_info_by_oai_name(name)

        # An explicitly passed prefix wins over the one stored for the name.
        if metadata_prefix is None:
            metadata_prefix = stored_prefix
    elif not url:
        raise NameOrUrlMissing(
            "Retry using the parameters -n <name> or -u <url>."
        )

    request = Sickle(url, encoding=encoding)
    prefix = metadata_prefix or "oai_dc"

    harvested = [
        request.GetRecord(identifier=oai_id, metadataPrefix=prefix)
        for oai_id in identifiers
    ]
    return request, harvested
def get_info_by_oai_name(name):
    """Look up the basic OAI request data stored for a harvest source.

    :param name: name of the source (OAIHarvestConfig.name)
    :return: (url, metadataprefix, lastrun as YYYY-MM-DD, setspecs)
    """
    source = get_oaiharvest_object(name)

    # The last-run timestamp is reduced to day granularity (YYYY-MM-DD).
    last_harvest_day = source.lastrun.strftime("%Y-%m-%d")

    return (
        source.baseurl,
        source.metadataprefix,
        last_harvest_day,
        source.setspecs,
    )