-
Notifications
You must be signed in to change notification settings - Fork 38
/
api.py
288 lines (237 loc) · 10.4 KB
/
api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
# -*- coding: utf-8 -*-
#
# Copyright (C) 2020-2023 CERN.
# Copyright (C) 2021 Northwestern University.
#
# Invenio-Drafts-Resources is free software; you can redistribute it and/or
# modify it under the terms of the MIT License; see LICENSE file for more
# details.
"""Record, Draft and Parent Record API classes.
These classes belong to the data access layer and MUST ONLY be accessed from
within the service layer. It's wrong to use these classes in the presentation
layer.
A record and a draft share a single parent record. The parent record is used
to store properties common to all versions of a record (e.g. access control).
The draft and record share the same UUID, and thus both also share a single
persistent identifier. The parent record has its own UUID and own persistent
identifier.
"""
import uuid
from datetime import datetime, timedelta
from invenio_db import db
from invenio_pidstore.models import PIDStatus
from invenio_pidstore.providers.recordid_v2 import RecordIdProviderV2
from invenio_records.systemfields import ModelField
from invenio_records_resources.records import Record as RecordBase
from invenio_records_resources.records.systemfields import PIDField, PIDStatusCheckField
from sqlalchemy.orm.exc import NoResultFound
from .systemfields import ParentField, VersionsField
#
# Persistent identifier providers
#
class DraftRecordIdProviderV2(RecordIdProviderV2):
    """PID provider used for drafts, records and parent records.

    Identical to ``RecordIdProviderV2`` except that ``default_status_with_obj``
    is overridden to ``PIDStatus.NEW``.
    """

    #: Overrides the provider's ``default_status_with_obj`` with NEW.
    default_status_with_obj = PIDStatus.NEW
#
# Record API classes
#
class ParentRecord(RecordBase):
    """API class for the parent record.

    Declares its own persistent identifier field; the SQLAlchemy model is
    left unset here (``None``).
    """

    # Configuration
    #: SQLAlchemy model class; ``None`` at this level.
    model_cls = None

    #: Persistent identifier of the parent record.
    pid = PIDField("id", provider=DraftRecordIdProviderV2, delete=True)
class Record(RecordBase):
    """API class for a published record.

    The model and parent classes are left unset here and must be provided
    through the configuration attributes below.
    """

    #: Flag that lets callers tell a record apart from a draft.
    is_draft = False

    #
    # Configuration to be set by a subclass
    #
    #: SQLAlchemy model class of the record. Must be set by the subclass.
    model_cls = None

    #: SQLAlchemy model class of the parent state. Must be set by the subclass.
    versions_model_cls = None

    #: API class of the parent record. Must be set by the subclass.
    parent_record_cls = None

    #
    # System fields
    #
    #: Internal persistent identifier. A record and its draft share the UUID.
    pid = PIDField("id", provider=DraftRecordIdProviderV2, delete=True)

    #: System field telling whether the record has been published.
    is_published = PIDStatusCheckField(status=PIDStatus.REGISTERED)

    #: Parent record. It is the draft, not the record, that creates the parent.
    parent = ParentField(
        ParentRecord, create=False, soft_delete=False, hard_delete=False
    )

    #: Versioning relationship.
    versions = VersionsField(create=True, set_latest=True)

    @classmethod
    def get_records_by_parent(cls, parent, with_deleted=True, ids_only=False):
        """Return every record that belongs to the given parent record.

        :param parent: parent record whose ``id`` is used to filter on
            ``parent_id``.
        :param with_deleted: when false, rows with ``is_deleted`` set are
            filtered out.
        :param ids_only: when true, yield only the model ids instead of
            record API objects.
        :returns: a generator over ids or over ``cls`` instances.
        """
        with db.session.no_autoflush:
            siblings = cls.model_cls.query.filter_by(parent_id=parent.id)
            if not with_deleted:
                siblings = siblings.filter_by(is_deleted=False)

            # The generator is built inside the ``no_autoflush`` block on
            # purpose: its source query is iterated starting from here.
            if ids_only:
                return (row.id for row in siblings)
            return (cls(row.data, model=row, parent=parent) for row in siblings)

    @classmethod
    def get_latest_by_parent(cls, parent, id_only=False):
        """Return the latest record of the given parent record.

        :param parent: parent record whose ``id`` is used to look up the
            versions row.
        :param id_only: when true, return the model id instead of a record
            API object.
        :returns: the latest record (or its id), or ``None`` when there is
            no versions row for the parent or no latest id is set on it.
        """
        with db.session.no_autoflush:
            state = cls.versions_model_cls.query.filter_by(
                parent_id=parent.id
            ).one_or_none()
            # Nothing has been marked as latest yet.
            if not state or not state.latest_id:
                return None

            latest = cls.model_cls.query.filter_by(id=state.latest_id).one()
            if id_only:
                return latest.id
            return cls(latest.data, model=latest, parent=parent)

    @classmethod
    def publish(cls, draft):
        """Turn a draft into a record.

        For a draft that is already published the existing record is
        fetched; otherwise a new, empty record is created from the draft's
        id, PID, parent and versions.

        Registering the internal persistent identifiers is left to the
        caller (see ``register()``).

        :param draft: the draft to publish.
        :returns: the existing or newly created record.
        """
        if draft.is_published:
            return cls.get_record(draft.id)

        published = cls.create(
            {},
            # Record and draft share the UUID, hence the draft's id/pid is
            # reused.
            id_=draft.id,
            pid=draft.pid,
            # Attach the record to the parent record and carry over the
            # versioning relationship.
            parent=draft.parent,
            versions=draft.versions,
        )
        # Bring the PIDs into the current db session in case they are not
        # part of it yet.
        cls.pid.session_merge(published)
        cls.parent_record_cls.pid.session_merge(published.parent)
        return published

    def register(self):
        """Register the internal persistent identifiers.

        The parent's PID is registered (and the parent committed) only if it
        is not registered yet; the record's own PID is always registered.
        """
        parent = self.parent
        if not parent.pid.is_registered():
            parent.pid.register()
            parent.commit()
        self.pid.register()
class Draft(Record):
    """Base draft API used to create and modify metadata."""

    #: Flag that lets callers tell a draft apart from a record.
    is_draft = True

    #
    # Configuration to be set by a subclass
    #
    #: SQLAlchemy model class of the draft. Must be set by the subclass.
    model_cls = None

    #: SQLAlchemy model class of the parent state. Must be set by the subclass.
    versions_model_cls = None

    #: API class of the parent record. Must be set by the subclass.
    parent_record_cls = None

    #
    # System fields
    #
    #: Internal persistent identifier. A record and its draft share the UUID.
    pid = PIDField("id", provider=DraftRecordIdProviderV2, delete=False)

    #: Parent record. The draft is the one that creates the parent.
    parent = ParentField(ParentRecord, create=True, soft_delete=False, hard_delete=True)

    #: Versioning relationship.
    versions = VersionsField(create=True, set_next=True)

    #: Expiry date of the draft.
    expires_at = ModelField()

    #: Revision id of the record this draft was created from.
    fork_version_id = ModelField()

    @classmethod
    def new_version(cls, record):
        """Create an empty draft that starts a new version of a record.

        It is up to the caller to:

        1) check whether a draft for a new version already exists, and
        2) move the record data into the draft data.

        :param record: the record whose parent and versions are reused.
        :returns: the newly created draft.
        """
        return cls.create(
            {},
            # A fresh id is generated since this is a new version.
            # NOTE(review): the other ``create`` calls in this module pass
            # ``id_=``; confirm that ``id=`` is the intended keyword here.
            id=uuid.uuid4(),
            # Attach the draft to the same parent (i.e. a new version).
            parent=record.parent,
            versions=record.versions,
            # A draft that has no record behind it (i.e. an unpublished
            # draft) must have its fork version id set to None.
            fork_version_id=None,
        )

    @classmethod
    def edit(cls, record):
        """Return a draft for editing an already existing record version.

        An existing draft with the record's id is reused (and restored if it
        was soft-deleted); if none is found, a new one is created.

        :param record: the record to be edited.
        :returns: the reused or newly created draft.
        """
        try:
            # A draft is soft-deleted once published so that its version_id
            # counter survives for optimistic concurrency control (used by
            # search indexing as well as by REST API clients).
            draft = cls.get_record(record.id, with_deleted=True)
            if draft.is_deleted:
                draft.undelete()
                # Required so the PID gets dumped back into the draft.
                draft.pid = record.pid
                # Make sure the draft is linked with the parent.
                draft.parent = record.parent
                draft.versions = record.versions
                # Remember which revision id the draft was forked from.
                draft.fork_version_id = record.revision_id
                # Other values such as bucket_id were kept in the
                # soft-deleted draft, so they are not set again here.
        except NoResultFound:
            # The draft was force deleted at some point, so it is created
            # again. Typical case: a published record is edited long enough
            # after publication for its original draft to have been cleaned
            # up.
            return cls.create(
                {},
                # A draft that edits a record must share its id and uuid.
                id_=record.id,
                pid=record.pid,
                # Attach it to the same parent record.
                parent=record.parent,
                versions=record.versions,
                # Remember which record version the draft was forked from.
                fork_version_id=record.revision_id,
            )
        return draft

    @classmethod
    def cleanup_drafts(cls, td, search_gc_deletes=60):
        """Hard delete the soft-deleted drafts that are old enough.

        Drafts soft-deleted within the last ``td`` span of time, extended by
        `search_gc_deletes` seconds, are kept. This makes sure a draft is
        only hard-deleted once it is fully gone from the search cluster
        (e.g. when `td` is very short), which avoids search conflicts.

        :param td: span of time subtracted from the current UTC time
            (presumably a ``timedelta``, given how it is used).
        :param int search_gc_deletes: time in seconds, corresponding to the search cluster
            setting `index.gc_deletes` (see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-delete.html#delete-versioning),
            default to 60 seconds. Search cluster caches deleted documents for `index.gc_deletes` seconds.
        """
        cutoff = datetime.utcnow() - td - timedelta(seconds=search_gc_deletes)
        model_cls = cls.model_cls
        stale = model_cls.query.filter(
            model_cls.is_deleted == True,  # noqa
            model_cls.updated < cutoff,
        ).all()

        # The version info holds foreign keys to the drafts; clear them first.
        for row in stale:
            cls(row.data, model=row).versions.clear_next()

        # With the foreign keys gone the drafts themselves can be removed.
        stale_ids = [row.id for row in stale]
        model_cls.query.filter(model_cls.id.in_(stale_ids)).delete(
            synchronize_session=False
        )