-
Notifications
You must be signed in to change notification settings - Fork 0
/
canonicalization.ts
293 lines (263 loc) · 12.1 KB
/
canonicalization.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
/**
* Top level entry point for the canonicalization algorithm.
*
* @copyright Ivan Herman 2023
*
* @packageDocumentation
*/
import * as rdf from '@rdfjs/types';
import {
GlobalState, BNodeId, Hash, Quads, NDegreeHashResult, concatNquads,
quadsToNquads, parseNquads, InputDataset, C14nResult, InputQuads,
isQuads
} from './common';
import { computeFirstDegreeHash } from './hash1DegreeQuads';
import { computeNDegreeHash } from './hashNDegreeQuads';
import { IDIssuer } from './issueIdentifier';
import { bntqToLogItem, ndhrToLogItem, htbnToLogItem, LogItem } from './logging';
import { BnodeSet } from './common';
import * as n3 from 'n3';
/**
 * A trivial mapping from blank nodes to their IDs; the return value is used
 * as part of the canonicalization return structure
 */
const createBidMap = (graph: Quads): ReadonlyMap<rdf.BlankNode, BNodeId> => {
    // Gather every distinct blank node appearing in the graph; the BnodeSet
    // (a TermSet) de-duplicates automatically.
    const collected: BnodeSet = new BnodeSet();
    const collect = (term: rdf.Term): void => {
        if (term.termType === "BlankNode") {
            collected.add(term);
        }
    };
    // Predicates are deliberately skipped: the algorithm does not handle
    // generalized RDF, so a predicate is never a blank node.
    for (const q of graph) {
        collect(q.subject);
        collect(q.object);
        collect(q.graph);
    }
    // Each blank node is mapped onto its own label.
    const pairs: [rdf.BlankNode, BNodeId][] = [];
    for (const bnode of collected) {
        pairs.push([bnode, bnode.value]);
    }
    return new Map(pairs);
};
/**
 * Implementation of the main [steps on the top level](https://www.w3.org/TR/rdf-canon/#canon-algo-algo) of the algorithm specification.
 *
 * @param state - the overall canonicalization state + interface to the underlying RDF environment
 * @param input - the dataset to canonicalize: either an N-Quads document (string) or a set/iterable of Quads
 * @param copy - whether the input should be copied to a local store (e.g., if the input is a generator, or the uniqueness of quads are not guaranteed). If this
 * parameter is not used (i.e., value is `undefined`) the copy is always done _unless_ the input is an `rdf.DatasetCore` instance.
 * @returns - A semantically identical set of Quads using canonical BNode labels, plus other information.
 *
 * @async
 */
export async function computeCanonicalDataset(state: GlobalState, input: InputDataset, copy: boolean | undefined = undefined): Promise<C14nResult> {
    // Normalize the input into a set of Quads:
    // - an N-Quads string is parsed;
    // - an iterable of quads is copied into a local n3 store when `copy` requests it,
    //   or (if `copy` is undefined) when the input is not a proper dataset, i.e.,
    //   uniqueness of the quads cannot be relied upon;
    // - otherwise the input is used as-is.
    const finalInput = async (): Promise<InputQuads> => {
        if (typeof input === 'string') {
            return await parseNquads(input as string);
        } else {
            if (copy ?? !isQuads(input)) {
                const retval = new n3.Store();
                for (const quad of input) retval.add(quad);
                return retval;
            }
            return input;
        }
    }

    // Re-initialize the state information: canonicalization should always start with a clean state
    state.bnode_to_quads = {};
    state.hash_to_bnodes = {};
    state.canonical_issuer = new IDIssuer();
    state.current_n_degree_call = 0;

    // The input to the algorithm can be either an nQuads document, or a dataset
    // representation with Quads. This function makes the nQuad document "disappear" from
    // the rest of the processing.
    const input_dataset: InputQuads = await finalInput();
    // The canonicalized quads are accumulated here (filled in Step 6).
    const retval: Quads = new n3.Store();

    // Step 2
    // All quads are 'classified' depending on what bnodes they contain.
    // Results in a mapping from bnodes to all quads that they are part of.
    //
    // Note that the algorithm is slightly simpler than in the spec, because there is
    // no need for a separate "map entry" because in RDF-JS each term carries its
    // own ID directly
    {
        for (const quad of input_dataset) {
            const bnode_map = (t: rdf.Term): void => {
                if (t.termType === "BlankNode") {
                    const bnode: BNodeId = t.value;
                    if (state.bnode_to_quads[bnode] === undefined) {
                        state.bnode_to_quads[bnode] = [quad];
                    } else {
                        state.bnode_to_quads[bnode].push(quad);
                    }
                }
            };
            bnode_map(quad.subject);
            // The algorithm is not prepared for generalized graphs, ie, predicates cannot be bnodes
            bnode_map(quad.object);
            bnode_map(quad.graph);
        }
    }

    // "Side step" (not directly in the specification): the number of bnodes is used to set the maximum number of hash-n-degree-quads call
    // This is the mechanism used to avoid poison graphs: the recursion budget is
    // proportional to the number of bnodes, scaled by the configurable complexity number.
    state.maximum_n_degree_call = state.complexity_number * Object.keys(state.bnode_to_quads).length;

    /* @@@ */
    state.logger.info("ca.2", "Entering the canonicalization function (4.4.3 (2)).", {
        "Bnode to quads": bntqToLogItem(state.bnode_to_quads)
    });
    /* @@@ */

    // Step 3
    {
        /* @@@ */ state.logger.push("ca.3");
        /* @@@ */ state.logger.push("ca.3.1");
        // Compute a hash value for each bnode (depending on the quads it appears in)
        // In simple cases a hash value refers to one bnode only; in unlucky cases there
        // may be more. Hence the usage of the hash_to_bnodes map.
        // The code below serializes a series of Promise references which is not nice.
        // However, if I use a Promise.all, although that works, it messes up the log entries' order for some reasons.
        // Because, deep underneath all, the hash function operates on in-memory data in one block (as opposed to streaming),
        // it probably does not really matter speed-wise...
        for (const n of Object.keys(state.bnode_to_quads)) {
            const hfn: Hash = await computeFirstDegreeHash(state, n);
            // Step 3.2: group the bnodes by their first degree hash value
            if (state.hash_to_bnodes[hfn] === undefined) {
                state.hash_to_bnodes[hfn] = [n];
            } else {
                state.hash_to_bnodes[hfn].push(n);
            }
        }
        /* @@@ */ state.logger.pop();
        /* @@@ */
        state.logger.info("ca.3.2", "Calculated first degree hashes (4.4.3. (3))", {
            "Hash to bnodes": htbnToLogItem(state.hash_to_bnodes)
        });
        state.logger.pop();
        /* @@@ */
    }

    // Step 4
    {
        // For each hash take the corresponding bnode, and issue a new, canonical id in a sequence.
        // This works only for those hashes where there is one associated bnode; for the ones
        // where this is not the case, step 5 will kick in later.
        // It is important to order the hashes, because it influences the order of issuing the canonical ids.
        // If a bnode is "handled", ie, it does have a canonical ID, it is removed from the
        // state structure on hash->bnodes.
        const hashes: Hash[] = Object.keys(state.hash_to_bnodes).sort();
        const logItems: LogItem[] = [];
        for (const hash of hashes) {
            const identifier_list: BNodeId[] = state.hash_to_bnodes[hash];
            // Step 4.1
            // Filter out the nasty cases: shared hashes are deferred to Step 5
            if (identifier_list.length > 1) continue;

            // Step 4.2
            // Here is the essential part: issue the canonical identifier.
            // Note that the IdIssuer automatically stores the (existing, issued) pairs for the
            // bnode identifier; these are retrieved in the last step when a new, normalized
            // graph is created.
            const canon_id = state.canonical_issuer.issueID(identifier_list[0]);
            /* @@@ */
            logItems.push({
                "identifier": identifier_list[0],
                "hash": hash,
                "canonical id": canon_id
            });
            /* @@@ */

            // Step 4.3
            // Remove the corresponding hash, so that Step 5 only sees the shared-hash cases
            delete state.hash_to_bnodes[hash];
        }
        /* @@@ */
        state.logger.info("ca.4", "Canonicalization function (4.4.3 (4)).", ...logItems);
        /* @@@ */
    }

    // Step 5
    // This step takes care of the bnodes that have not been canonicalized in the previous step,
    // because their simple, first degree hashes are not unique.
    {
        /* @@@ */
        state.logger.push("ca.5", "Calculate hashes for identifiers with shared hashes (4.4.3. (5)).");
        state.logger.debug("ca.5.extra", "", {
            "Hash to bnodes": htbnToLogItem(state.hash_to_bnodes)
        });
        /* @@@ */
        // Sorting again fixes the order in which the remaining canonical ids are issued
        const hashes: Hash[] = Object.keys(state.hash_to_bnodes).sort();
        /* @@@ */ if (hashes.length > 0) state.logger.push("ca.5.1");
        for (const hash of hashes) {
            const identifier_list: BNodeId[] = state.hash_to_bnodes[hash];
            // This cycle takes care of all problematic cases that share the same hash
            // Step 5.1
            // This stores a calculated hash and its related identifier issuer for each
            // bnode related to this particular hash value
            const hash_path_list: NDegreeHashResult[] = [];

            // Step 5.2
            /* @@@ */ state.logger.push("ca.5.2");
            for (const n of identifier_list) {
                if (state.canonical_issuer.isSet(n)) {
                    // Step 5.2.1: this bnode has already received its canonical id
                    continue;
                } else {
                    // Step 5.2.2: each candidate gets its own, temporary issuer
                    const temporary_issuer = new IDIssuer('b');
                    // Step 5.2.3
                    // Note that bn is not used as a separate value; putting the variable declaration in comment
                    // to make eslint happy
                    /* const bn = */ temporary_issuer.issueID(n);
                    // Step 5.2.4: compute the (expensive) n-degree hash for this bnode
                    const result: NDegreeHashResult = await computeNDegreeHash(state, n, temporary_issuer);
                    hash_path_list.push(result);
                }
            }
            /* @@@ */ state.logger.pop();

            // Step 5.3: order the results by hash; this order determines canonical id issuance
            const ordered_hash_path_list = hash_path_list.sort((a: NDegreeHashResult, b: NDegreeHashResult): number => {
                if (a.hash < b.hash) return -1;
                else if (a.hash > b.hash) return 1;
                else return 0;
            });
            /* @@@ */
            state.logger.debug("ca.5.2.extra", "Canonicalization function, after (4.4.3 (5.2)), ordered hash path list.", {
                "computed for": hash,
                "hash path list": ndhrToLogItem(ordered_hash_path_list)
            });
            /* @@@ */
            for (const result of ordered_hash_path_list) {
                // Step 5.3.1: promote the temporary ids to canonical ones, in hash order
                for (const [existing, _temporary] of result.issuer) {
                    state.canonical_issuer.issueID(existing);
                }
            }
        }
        /* @@@ */ if (hashes.length > 0) state.logger.pop();
        /* @@@ */ state.logger.pop();
    }

    // Step 6
    {
        // This function replaces the term with its canonical equivalent, if applicable.
        // At this point every bnode has been issued a canonical id (in steps 4/5), and
        // the issuer returns the stored (existing, issued) pair for it.
        const replace_bnode = (term: rdf.Term): rdf.Term => {
            if (term.termType === "BlankNode") {
                const canonical = state.canonical_issuer.issueID(term.value);
                return state.dataFactory.blankNode(canonical);
            } else {
                return term;
            }
        };
        for (const quad of input_dataset) {
            // Step 6.1 & 6.2: rebuild each quad with canonically labeled bnodes.
            // Predicates are copied verbatim (they cannot be bnodes, see Step 2).
            const subject_copy = replace_bnode(quad.subject) as rdf.Quad_Subject;
            const object_copy = replace_bnode(quad.object) as rdf.Quad_Object;
            const graph_copy = replace_bnode(quad.graph) as rdf.Quad_Graph;
            retval.add(state.dataFactory.quad(subject_copy, quad.predicate, object_copy, graph_copy));
        }
    }

    /* @@@ */
    state.logger.info("ca.6", "Leaving the canonicalization function (4.4.3)", {
        "issuer": state.canonical_issuer.toLogItem(),
    });
    /* @@@ */

    // Step 7: assemble the full result: the canonical N-Quads serialization, the
    // canonicalized dataset itself, and the two identifier maps.
    const return_value: C14nResult = {
        canonical_form: concatNquads(quadsToNquads(retval)),
        canonicalized_dataset: retval,
        bnode_identifier_map: createBidMap(retval),
        issued_identifier_map: state.canonical_issuer.issued_identifier_map as ReadonlyMap<BNodeId, BNodeId>,
    };
    return return_value;
}