# fair.py
"""
Calculates the FAIR score of a resource.
usage: fair.py [-h] [-o OUTPUT] [-v VALIDATE] input
positional arguments:
input The path of an RDF file or URL of RDF data online.
options:
-h, --help show this help message and exit
-o OUTPUT, --output OUTPUT
A path for an output file or an output format. If set to a file
path, the output will be written to the file, rather than returned
to standard out. If a format is given, that format will be returned
to standard out. For a given output file path, the file extension
determines the format and must be one of .ttl, .rdf, .json-ld, .nt
for Turtle, RDF/XML, JSON-LD or N-Triples. If a format is given, it
must be one of text/turtle, application/rdf+xml,
application/ld+json, text/nt
-v VALIDATE, --validate VALIDATE
Validate the input with the IDN CP's validator before trying to
score it
"""
from pathlib import Path
from typing import Optional, Union
import httpx
from pyshacl import validate as val
from rdflib import Graph, URIRef, Namespace, Literal
from rdflib.namespace import DCAT, DCTERMS, PROV, RDF
from rdflib.term import Node
from calculators._SCORES import SCORES
from calculators.functions import (
machine_readability_score,
shared_vocabs_ontologies,
licensing_score,
provenance_score,
data_source_score,
)
from calculators.parser import (
_create_parser,
_load_input_graph,
_bind_extra_prefixes,
_create_observation_group,
_create_observation,
_forward_chain_dcat,
_get_valid_output_dir,
_get_valid_output_file_and_type,
)
# RDF Data Cube namespace; used below for the "qb" prefix and for qb:ObservationGroup typing.
QB = Namespace("http://purl.org/linked-data/cube#")
# Maps each supported output file suffix to the media-type string used to select
# the output format. main() treats the values as the accepted "format" arguments
# and the keys as the accepted output file endings.
# NOTE(review): the registered media type for N-Triples is application/n-triples;
# confirm that "text/nt" is accepted by the serializer in use.
RDF_FILE_SUFFIXES = {
    ".ttl": "text/turtle",
    ".rdf": "application/rdf+xml",
    ".json-ld": "application/ld+json",
    ".nt": "text/nt",
}
# Prefix -> namespace pairs handed to _bind_extra_prefixes() for every score graph.
EXTRA_PREFIXES = {
    "scores": SCORES,
    "qb": QB,
}
def calculate_f(metadata: Graph, resource: URIRef, score_container: Node) -> Graph:
    """
    Calculates the Findability ("F") score of a resource.

    F1. (meta)data are assigned a globally unique and eternally persistent identifier.
    F2. data are described with rich metadata.
    F3. (meta)data are registered or indexed in a searchable resource.
    F4. metadata specify the data identifier.

    The scoring follows https://ardc.edu.au/resource/fair-data-self-assessment-tool/
    and sums four parts: identifier (3 or 8), identifier in metadata (1),
    metadata richness (1 to 4) and repository/registry (0, 2 or 4), so the raw
    value lies between 5 and 17.

    metadata: the graph describing the resource
    resource: the IRI of the resource being scored
    score_container: the node handed to _create_observation() to attach the score to

    Returns whatever _create_observation() builds for SCORES.fairFScore and the
    computed value.
    """
    f_value = 0

    # Does the dataset have any identifiers assigned?
    # 0 No identifier
    # 1 Local identifier
    # 3 Web address (URL)
    # 8 Globally Unique, citable and persistent (e.g. DOI, PURL, ARK or Handle)
    # score will always be 3 or 8 for catalogued resources in RDF
    f_value += 3

    # if the IRI looks like a persistent identifier (DOI etc.), +5 to reach 8:
    pid_indicators = (
        "doi:",
        "doi.org",
        "ark:",
        "purl.org",
        "linked.data.gov.au",
        "handle.net",
        "w3id.org",
    )
    resource_iri = str(resource)
    if any(indicator in resource_iri for indicator in pid_indicators):
        f_value += 5
    # TODO: should we test the URL/PID to see if it resolves?

    # Is the dataset identifier included in all metadata records/files describing the data?
    # 0 No
    # 1 Yes
    # always yes for now
    f_value += 1

    # How is the data described with metadata?
    # 0 The data is not described
    # 1 Brief title and description
    # 3 Comprehensively, but in a text-based, non-standard format
    # 4 Comprehensively (see suggestion) using a recognised formal machine-readable metadata schema
    # IDN CP data will always be at least +1 here, +3 if more DCTERMS elements are present other than title & desc,
    # and +4 if all the following are present: title, description, created, modified, type qualifiedAttribution (1+)
    f_value += 1
    rich_predicates = {
        DCTERMS.created,
        DCTERMS.modified,
        DCTERMS.type,
        PROV.qualifiedAttribution,
    }
    # Count each kind of property once. predicates() yields one entry per triple,
    # so repeated values (e.g. several qualifiedAttributions) must not be allowed
    # to inflate the richness score on their own.
    c = len(rich_predicates.intersection(metadata.predicates(resource, None)))
    # 1 kind -> +1, 2 kinds -> +2, 3 or more kinds -> +3
    f_value += min(c, 3)

    # What type of repository or registry is the metadata record in?
    # 0 The data is not described in any repository
    # 2 Local institutional repository
    # 2 Domain-specific repository
    # 2 Generalist public repository
    # 4 Data is in one place but discoverable through several registries
    # If a catalogue is indicated, +2. If the catalogue responds to a ping for RDF, +4
    catalogue = None
    for o in metadata.objects(resource, DCTERMS.isPartOf):
        # if several catalogues are declared, the last one iterated is used
        catalogue = str(o)

    if catalogue is not None:
        f_value += 2

        rdf_media_types = [
            "text/turtle",
            "text/n3",
            "application/ld+json",
            "application/n-triples",
            "application/n-quads",
            "application/rdf+xml",
        ]
        try:
            response = httpx.get(
                catalogue,
                headers={"Accept": ", ".join(rdf_media_types)},
                follow_redirects=True,
            )
        except (httpx.HTTPError, httpx.InvalidURL):
            # The ping is best effort: an unreachable catalogue or a value that
            # is not a usable URL simply earns no extra points. InvalidURL is
            # not a subclass of HTTPError, so it has to be named explicitly.
            pass
        else:
            if response.is_success:
                # changed to maximum of four to align with calculator here
                # https://github.com/au-research/FAIR-Data-Assessment-Tool
                f_value += 2

    return _create_observation(score_container, SCORES.fairFScore, Literal(f_value))
def calculate_a(metadata: Graph, resource: URIRef, score_container: Node) -> Graph:
    """
    Calculates the Accessibility ("A") score of a resource.

    A1 (meta)data are retrievable by their identifier using a standardized communications protocol.
    A1.1 the protocol is open, free, and universally implementable.
    A1.2 the protocol allows for an authentication and authorization procedure, where necessary.
    A2 metadata are accessible, even when the data are no longer available.

    The score is the sum of the points mapped to each dcat:theme value of the
    resource that is a known data-access-rights concept; any other theme
    contributes nothing.

    Returns whatever _create_observation() builds for SCORES.fairAScore and the
    computed value.
    """
    # How accessible is the data?
    # 0. No access to data or metadata
    # 1. Access to metadata only
    # 2. Unspecified conditional access e.g. contact the data custodian for access
    # 3. Embargoed access after a specified date
    # 4. A de-identified / modified subset of the data is publicly accessible
    # 5. Fully accessible to persons who meet explicitly stated conditions, e.g. ethics approval for sensitive data
    DAR = Namespace("https://linked.data.gov.au/def/data-access-rights/")

    # David: scores doubled to align with https://github.com/au-research/FAIR-Data-Assessment-Tool/
    # (option 4 above has no concept mapped to it here)
    points_by_access_right = {
        DAR.protected: 0,
        DAR.restricted: 0,
        DAR["metadata-only"]: 2,
        DAR.conditional: 4,
        DAR.embargoed: 6,
        DAR.open: 10,
    }

    # look for a declared availability classification
    themes = list(metadata.objects(resource, DCAT.theme))
    a_value = sum(points_by_access_right.get(theme, 0) for theme in themes)

    if not themes:
        # TODO: try some other method
        pass

    # Is the data available online without requiring specialised protocols or tools once access has been approved?
    # 0. No access to data
    # 1. By individual arrangement
    # 2. File download from online location
    # 3. Non-standard web service (e.g. OpenAPI/Swagger/informal API)
    # 4. Standard web service API (e.g. OGC)
    # (not scored yet)

    return _create_observation(score_container, SCORES.fairAScore, Literal(a_value))
def calculate_i(metadata: Graph, resource: URIRef, score_container: Node) -> Graph:
    """
    Calculates the Interoperability ("I") score of a resource.

    metadata: the graph describing the resource
    resource: the IRI of the resource being scored
    score_container: the node handed to _create_observation() to attach the score to

    Returns whatever _create_observation() builds for SCORES.fairIScore and the
    computed value.

    I score notes:

    I1. (meta)data use a formal, accessible, shared, and broadly applicable language for knowledge representation.
    I2. (meta)data use vocabularies that follow FAIR principles.
    I3. (meta)data include qualified references to other (meta)data.

    additionally ..

    3. Data Objects can be Interoperable only if:
    3.1. (Meta) data is machine-actionable [8]
        [8] in eScience, machine-readability of data is imminent. Metadata being machine readable is a conditio sine qua
        non for FAIRness. Having the actual data elements also machine-readable will make the Data Object of a higher
        level of interoperability and makes functional interlinking and analysis in broader context much easier, but it
        is not a pre-condition for FAIR data publishing. Some data elements, for instance images and ‘raw data’ can not
        always be made machine-processable. Being published with FAIR metadata is of very high value in its own right.
    3.2. (Meta) data formats utilize shared vocabularies and/or ontologies [9]
        [9] When the use of community adopted and public terminology systems is not possible, for instance for reasons
        described in explanatory note 5, or because the Data Objects contain concepts that have not yet been described
        in any public vocabulary or ontology known to the provider, the provider should nevertheless try to create a
        term vocabulary of their own and publish it publicly and openly, preferably in a machine-readable form. The
        vocabulary or ontology that constrains each constrained data field should be unambiguously identified either by
        the field itself or by the associated Data Object metadata. For non-constrained fields, whenever possible the
        value-type of the field should be annotated using a publicly-accessible vocabulary or ontology. This annotation
        should be clear in the Data Object metadata.
    3.3 (Meta) data within the Data Object should thus be both syntactically parseable and semantically
    machine-accessible [10]
        [10] Both syntax and semantics of data models and formats used for (Meta) data in Data Objects should be easy to
        identify and use, parse or translate by machines. As in the case of identifier schemes and vocabularies, a wide
        variety of data formats (ranging from URI-featuring spread-sheets such as RightField or OntoMaton to rich RDF) can
        be principally FAIR. It is obvious that any parsing and translation protocol is error-prone and the ideal situation
        is to restrict FAIR data publishing to as few community adopted formats and standards as possible. However, if a
        provider can prove that an alternative data model/format is unambiguously parsable to one of the community adopted
        FAIR formats, there is no particular reason why such a format could not be considered FAIR. Some data types may
        simply be not ‘capturable’ in one of the existing formats, and in that case maybe only part of the data elements can
        be parsed. FAIRports will increasingly offer guidance and assistance in such cases.
    """
    # points the metadata itself always earns (see 3.1 below)
    metadata_points = 2

    i_value = 0

    # 3.1 is the *data* machine-readable?
    i_value += machine_readability_score(metadata, resource)

    # 3.1 the metadata is assumed machine-readable in order to use this tool
    i_value += metadata_points

    # 3.2 "(Meta) data formats utilize shared vocabularies and/or ontologies"
    i_value += shared_vocabs_ontologies(metadata, resource)

    # 3.3 "(Meta) data within the Data Object should thus be both syntactically parseable and semantically machine-accessible"
    # If the data is both machine-readable and uses shared vocabularies and/or ontologies, then it *should* also be parseable
    # and machine-accessible.
    # The total possible score for the data machine readability and shared vocabularies and/or ontologies is 4, if at
    # least 3 points are scored, a further 2 points are added, if at least 1 point is scored, a further 1 point is added.
    i_value_ignoring_metadata = i_value - metadata_points
    if i_value_ignoring_metadata >= 3:
        i_value += 2
    elif i_value_ignoring_metadata >= 1:
        # partial credit only: this tier is worth 1 point, not the full 2
        i_value += 1

    return _create_observation(score_container, SCORES.fairIScore, Literal(i_value))
def calculate_r(metadata: Graph, resource: URIRef, score_container: Node) -> Graph:
    """
    Calculates the Reusability ("R") score of a resource as the sum of its
    licensing, provenance and data-source sub-scores.

    R1. (meta)data have a plurality of accurate and relevant attributes.
    R1.1. (meta)data are released with a clear and accessible data usage license.
    R1.2. (meta)data are associated with their provenance.
    R1.3. (meta)data meet domain-relevant community standards.

    4. For Data Objects to be Re-usable additional criteria are:
    4.1 Data Objects should be compliant with principles 1-3
    4.2 (Meta) data should be sufficiently well-described and rich that it can be automatically (or with minimal
        human effort) linked or integrated, like-with-like, with other data sources [11 and JDDCP 7 and JDDCP 8]
    4.3 Published Data Objects should refer to their sources with rich enough metadata and provenance to enable
        proper citation (ref to JDDCP 1-3).

    JDDCP 1-3:
    1. Importance
        Data should be considered legitimate, citable products of research. Data citations should be accorded the same
        importance in the scholarly record as citations of other research objects, such as publications[1].
    2. Credit and Attribution
        Data citations should facilitate giving scholarly credit and normative and legal attribution to all contributors
        to the data, recognizing that a single style or mechanism of attribution may not be applicable to all data[2].
    3. Evidence
        In scholarly literature, whenever and wherever a claim relies upon data, the corresponding data should be cited[3].

    David comment: Assume R1. / 4.1 is referring to "F", "A" and "I" principles. As these are scored separately it would be duplicative
    to score them again here (?).
    """
    # R1.1. "(meta)data are released with a clear and accessible data usage license."
    licence_points = licensing_score(metadata, resource)  # max score is 2

    # R1.2. "(meta)data are associated with their provenance."
    # Assume provenance is declared through the use of a standard set of properties, such those in the provenance
    # ontology
    provenance_points = provenance_score(metadata)  # max score is 3

    # R1.3. "(meta)data meet domain-relevant community standards."
    # interpreted as referring to 4.3, which in turn refers to JDDCP 1-3.
    # This has been interpreted that, if a dcterms:source is declared, it should ideally be a URI,
    # and additional provenance information for it should exist.
    # logic implemented: if a dcterms:source is declared, check its type: if URI 2 points, otherwise, if it is a literal
    # AND has a datatype of xsd:anyURI, 1 point, otherwise 0 points.
    source_points = data_source_score(metadata, resource)  # max score is 2

    r_value = 0 + licence_points + provenance_points + source_points
    return _create_observation(score_container, SCORES.fairRScore, Literal(r_value))
def calculate_fair(g: Graph, resource: URIRef) -> Graph:
    """
    Builds the score graph for one resource: an observation group of class
    SCORES.FairScore plus the F, A, I and R observations attached to it.
    """
    result = Graph(bind_namespaces="rdflib")
    _bind_extra_prefixes(result, EXTRA_PREFIXES)

    group_node, group_graph = _create_observation_group(resource, SCORES.FairScore)
    result += group_graph

    # one observation per FAIR dimension, always in F, A, I, R order
    for calculator in (calculate_f, calculate_a, calculate_i, calculate_r):
        result += calculator(g, resource, group_node)

    return result
def calculate_fair_per_resource(g: Graph) -> Graph:
    """
    Scores every subject in g that is typed dcat:Resource and returns all of
    the per-resource score graphs merged into a single graph.
    """
    combined = Graph(bind_namespaces="rdflib")
    _bind_extra_prefixes(combined, EXTRA_PREFIXES)

    catalogued_resources = g.subjects(RDF.type, DCAT.Resource)
    for catalogued_resource in catalogued_resources:
        combined += calculate_fair(g, catalogued_resource)  # type: ignore

    return combined
def _score_for_resource(g: Graph, resource: Node, score_property: URIRef) -> Node:
    """
    Returns the raw score stored under score_property for the given resource.

    The score is searched for on nodes one hop away from the resource's
    SCORES.hasScore groups.
    NOTE(review): this assumes _create_observation() links the score container
    to the observation node that carries the score - confirm against
    calculators.parser. If nothing is found that way, the first matching score
    anywhere in the graph is used, which is what this module did previously.
    """
    for group in g.objects(resource, SCORES.hasScore):
        for candidate in g.objects(group, None):
            value = g.value(candidate, score_property)
            if value is not None:
                return value
    return next(g.objects(subject=None, predicate=score_property))


def normalise_fair_scores(g: Graph) -> Graph:
    """
    Normalizes FAIR scores to a range between 0 and 1, where 0 is the lowest possible score and 1 is the highest
    possible score.

    For every subject of a SCORES.hasScore triple, a new observation group of
    class SCORES.FairScoreNormalised is added to g, holding each raw score
    divided by that dimension's maximum and formatted to two decimal places.
    g is modified in place and also returned.
    """
    # (raw score property, normalised score property, maximum raw score)
    dimensions = (
        (SCORES.fairFScore, SCORES.fairFScoreNormalised, 17),
        (SCORES.fairAScore, SCORES.fairAScoreNormalised, 10),
        (SCORES.fairIScore, SCORES.fairIScoreNormalised, 8),
        (SCORES.fairRScore, SCORES.fairRScoreNormalised, 7),
    )

    # Snapshot the subjects first: the loop body adds hasScore triples to g,
    # and the graph must not be iterated while it is being changed.
    for s in list(g.subjects(SCORES.hasScore, None)):
        # only the node is needed; its triples are added explicitly below
        og_node, _ = _create_observation_group(s, SCORES.FairScoreNormalised)

        # Read this resource's own raw scores before adding anything, so that
        # one resource's values are never reported against another resource.
        raw_values = [
            (_score_for_resource(g, s, raw_property), normalised_property, maximum)
            for raw_property, normalised_property, maximum in dimensions
        ]

        for raw_value, normalised_property, maximum in raw_values:
            g += _create_observation(
                og_node,
                normalised_property,
                Literal(f"{int(raw_value) / maximum:.2f}"),
            )

        g.add((og_node, RDF.type, SCORES.FairScoreNormalised))
        g.add((og_node, RDF.type, QB.ObservationGroup))
        g.add((og_node, SCORES.refResource, s))
        g.add((s, SCORES.hasScore, og_node))

    return g
def main(
    input: Union[Path, str, Graph],
    output: Optional[str] = "text/turtle",
    validate: bool = False,
):
    """The main function of this module. Accepts a path to an RDF file, a URL leading to RDF or an RDFLib graph
    as input and returns either an RDFLib Graph object, an RDF stream in the given format or writes RDF to a file with
    format specified by file ending

    input: an RDFLib Graph, or anything _load_input_graph() can load
    output: one of the media types in RDF_FILE_SUFFIXES (result is printed to
        standard out), a file path ending in one of the suffixes in
        RDF_FILE_SUFFIXES (result is written to that file), or anything else,
        including None (the score Graph is returned)
    validate: if true, the input is validated with pySHACL first

    Raises ValueError if validation is requested and the input does not conform.
    """
    # load input
    g = input if isinstance(input, Graph) else _load_input_graph(input)

    # build out input
    _forward_chain_dcat(g)

    # TODO point to https://data.idnau.org/pid/cp/validator.ttl as validator.
    # validate
    if validate:
        validator = Path(__file__).parent.parent.absolute().parent / "validator.ttl"
        conforms, report_graph, report_text = val(g, shacl_graph=str(validator))
        if not conforms:
            raise ValueError(
                f"Input is not valid IDN CP. Validation errors are:\n{report_text}"
            )

    # calculate
    scores = calculate_fair_per_resource(g)
    norm_scores = normalise_fair_scores(scores)
    scores += norm_scores

    # generate output
    # std out
    if output in RDF_FILE_SUFFIXES.values():
        if output == "application/ld+json":
            jsonld_context = {
                "@vocab": "https://linked.data.gov.au/def/scores/",
                "dcat": "http://www.w3.org/ns/dcat#",
                "qb": "http://purl.org/linked-data/cube#",
                "time": "http://www.w3.org/2006/time#",
                "xsd": "http://www.w3.org/2001/XMLSchema#",
            }
            # adding all prefixes bound to the graph to the JSON-LD context seems not to work
            print(
                scores.serialize(
                    format=output, indent=4, context=jsonld_context, auto_compact=True
                )
            )
        else:
            print(
                scores.serialize(
                    format="longturtle" if output == "text/turtle" else output
                )
            )
    # write to file
    # (None has no endswith(); guard it so output=None reaches the Graph return below)
    elif output is not None and output.endswith(tuple(RDF_FILE_SUFFIXES.keys())):
        p = Path(output)
        # Return value is not needed here; the call is kept for whatever checks
        # it performs. NOTE(review): presumably it rejects an unusable output
        # directory - confirm against calculators.parser.
        _get_valid_output_dir(p)
        _, output_format = _get_valid_output_file_and_type(p)
        return scores.serialize(
            destination=p,
            format="longturtle" if output_format == "text/turtle" else output_format,
        )
    # return Graph object
    else:
        return scores
if __name__ == "__main__":
    # command-line entry point: parse the arguments and hand them to main()
    cli_args = _create_parser().parse_args()
    main(
        cli_args.input,
        cli_args.output,
        cli_args.validate,
    )