From 2c38e7c9611db018a0e7fd61be33d1d027e83545 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 12 May 2025 13:37:35 +0200 Subject: [PATCH 01/16] Add number of root nodes to graph storage statistics Default to "1" for already imported corpora. --- CHANGELOG.md | 1 + core/src/graph/storage/adjacencylist.rs | 2 ++ core/src/graph/storage/disk_adjacency.rs | 2 ++ core/src/graph/storage/mod.rs | 12 ++++++++++-- 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbdc41385..7497f2ac4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ configuration. Can be used to additionally output all log messages to the given file. - `Graph:ensure_loaded_parallel` returns the actually loaded components that did exist. +- Add number of root nodes to graph storage statistics. ### Fixed diff --git a/core/src/graph/storage/adjacencylist.rs b/core/src/graph/storage/adjacencylist.rs index 9a105acc9..6972c33a2 100644 --- a/core/src/graph/storage/adjacencylist.rs +++ b/core/src/graph/storage/adjacencylist.rs @@ -334,6 +334,7 @@ impl WriteableGraphStorage for AdjacencyListStorage { cyclic: false, rooted_tree: true, nodes: 0, + root_nodes: 0, dfs_visit_ratio: 0.0, }; @@ -370,6 +371,7 @@ impl WriteableGraphStorage for AdjacencyListStorage { } } } + stats.root_nodes = roots.len(); let fan_outs = get_fan_outs(&self.edges); let sum_fan_out: usize = fan_outs.iter().sum(); diff --git a/core/src/graph/storage/disk_adjacency.rs b/core/src/graph/storage/disk_adjacency.rs index d86fcee22..b61a22a26 100644 --- a/core/src/graph/storage/disk_adjacency.rs +++ b/core/src/graph/storage/disk_adjacency.rs @@ -368,6 +368,7 @@ impl WriteableGraphStorage for DiskAdjacencyListStorage { cyclic: false, rooted_tree: true, nodes: 0, + root_nodes: 0, dfs_visit_ratio: 0.0, }; @@ -404,6 +405,7 @@ impl WriteableGraphStorage for DiskAdjacencyListStorage { roots.remove(&e.target); } } + stats.root_nodes = roots.len(); let fan_outs = get_fan_outs(&self.edges)?; let 
sum_fan_out: usize = fan_outs.iter().sum(); diff --git a/core/src/graph/storage/mod.rs b/core/src/graph/storage/mod.rs index f9a426e33..f2c885398 100644 --- a/core/src/graph/storage/mod.rs +++ b/core/src/graph/storage/mod.rs @@ -29,6 +29,10 @@ pub struct GraphStatistic { /// Number of nodes in this graph storage (both source and target nodes). pub nodes: usize, + /// Number of root nodes in this graph storage. + #[serde(default = "default_number_root_nodes")] + pub root_nodes: usize, + /// Average fan out. pub avg_fan_out: f64, /// Max fan-out of 99% of the data. @@ -46,12 +50,16 @@ pub struct GraphStatistic { pub dfs_visit_ratio: f64, } +fn default_number_root_nodes() -> usize { + 1 +} + impl std::fmt::Display for GraphStatistic { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!( f, - "nodes={}, avg_fan_out={:.2}, max_fan_out={}, max_depth={}", - self.nodes, self.avg_fan_out, self.max_fan_out, self.max_depth + "nodes={}, avg_fan_out={:.2}, max_fan_out={}, fan_out_99%={}, inv_fan_out_99%={}, max_depth={}", + self.nodes, self.avg_fan_out, self.max_fan_out, self.fan_out_99_percentile, self.inverse_fan_out_99_percentile, self.max_depth )?; if self.cyclic { write!(f, ", cyclic")?; From bfb66e14eb537ce4892d21de167b13bfc7432e4e Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 12 May 2025 14:50:01 +0200 Subject: [PATCH 02/16] Adjust disk based graph storage to serialize their graph statistics as TOML file --- core/src/graph/storage/disk_adjacency.rs | 30 ++++++++----- core/src/graph/storage/disk_path.rs | 30 ++++++++----- core/src/graph/storage/legacy.rs | 54 ++++++++++++++++++++++++ core/src/graph/storage/mod.rs | 2 + 4 files changed, 96 insertions(+), 20 deletions(-) create mode 100644 core/src/graph/storage/legacy.rs diff --git a/core/src/graph/storage/disk_adjacency.rs b/core/src/graph/storage/disk_adjacency.rs index b61a22a26..65846aab9 100644 --- a/core/src/graph/storage/disk_adjacency.rs +++ b/core/src/graph/storage/disk_adjacency.rs 
@@ -13,6 +13,7 @@ use std::ops::Bound; use transient_btree_index::BtreeConfig; pub const SERIALIZATION_ID: &str = "DiskAdjacencyListV1"; +const STATISTICS_FILE_NAME: &str = "edge_stats.toml"; pub struct DiskAdjacencyListStorage { edges: DiskMap, @@ -143,11 +144,22 @@ impl GraphStorage for DiskAdjacencyListStorage { where Self: std::marker::Sized, { - // Read stats - let stats_path = location.join("edge_stats.bin"); - let f_stats = std::fs::File::open(stats_path)?; - let input = std::io::BufReader::new(f_stats); - let stats = bincode::deserialize_from(input)?; + // Read stats from file. + let stats_path_toml = location.join(STATISTICS_FILE_NAME); + let legacy_stats_path_bin = location.join("edge_stats.bin"); + + let stats = if stats_path_toml.is_file() { + let file_content = std::fs::read_to_string(stats_path_toml)?; + toml::from_str(&file_content)? + } else if legacy_stats_path_bin.is_file() { + let f_stats = std::fs::File::open(legacy_stats_path_bin)?; + let input = std::io::BufReader::new(f_stats); + // This is a legacy file which needs an older version of the struct + let legacy_stats: Option = bincode::deserialize_from(input)?; + legacy_stats.map(|s| s.into()) + } else { + None + }; let result = DiskAdjacencyListStorage { edges: DiskMap::new( @@ -179,11 +191,9 @@ impl GraphStorage for DiskAdjacencyListStorage { self.inverse_edges .write_to(&location.join("inverse_edges.bin"))?; self.annos.save_annotations_to(location)?; - // Write stats with bincode - let stats_path = location.join("edge_stats.bin"); - let f_stats = std::fs::File::create(stats_path)?; - let mut writer = std::io::BufWriter::new(f_stats); - bincode::serialize_into(&mut writer, &self.stats)?; + // Write stats as TOML file + let file_content = toml::to_string(&self.stats)?; + std::fs::write(location.join(STATISTICS_FILE_NAME), file_content)?; Ok(()) } diff --git a/core/src/graph/storage/disk_path.rs b/core/src/graph/storage/disk_path.rs index 04292c522..11457974c 100644 --- 
a/core/src/graph/storage/disk_path.rs +++ b/core/src/graph/storage/disk_path.rs @@ -16,12 +16,13 @@ use crate::{ util::disk_collections::{DiskMap, EvictionStrategy, DEFAULT_BLOCK_CACHE_CAPACITY}, }; -use super::{EdgeContainer, GraphStatistic, GraphStorage}; +use super::{legacy, EdgeContainer, GraphStatistic, GraphStorage}; use binary_layout::prelude::*; pub(crate) const MAX_DEPTH: usize = 15; pub(crate) const SERIALIZATION_ID: &str = "DiskPathV1_D15"; const ENTRY_SIZE: usize = (MAX_DEPTH * 8) + 1; +const STATISTICS_FILE_NAME: &str = "edge_stats.toml"; binary_layout!(node_path, LittleEndian, { length: u8, @@ -372,10 +373,21 @@ impl GraphStorage for DiskPathStorage { ))?; // Read stats - let stats_path = location.join("edge_stats.bin"); - let f_stats = std::fs::File::open(stats_path)?; - let input = std::io::BufReader::new(f_stats); - let stats = bincode::deserialize_from(input)?; + let stats_path_toml = location.join(STATISTICS_FILE_NAME); + let legacy_stats_path_bin = location.join("edge_stats.bin"); + + let stats = if stats_path_toml.is_file() { + let file_content = std::fs::read_to_string(stats_path_toml)?; + toml::from_str(&file_content)? 
+ } else if legacy_stats_path_bin.is_file() { + let f_stats = std::fs::File::open(legacy_stats_path_bin)?; + let input = std::io::BufReader::new(f_stats); + // This is a legacy file which needs an older version of the struct + let legacy_stats: Option = bincode::deserialize_from(input)?; + legacy_stats.map(|s| s.into()) + } else { + None + }; Ok(Self { paths, @@ -413,11 +425,9 @@ impl GraphStorage for DiskPathStorage { // Save edge annotations self.annos.save_annotations_to(location)?; - // Write stats with bincode - let stats_path = location.join("edge_stats.bin"); - let f_stats = std::fs::File::create(stats_path)?; - let mut writer = std::io::BufWriter::new(f_stats); - bincode::serialize_into(&mut writer, &self.stats)?; + // Write stats as TOML file + let file_content = toml::to_string(&self.stats)?; + std::fs::write(location.join(STATISTICS_FILE_NAME), file_content)?; Ok(()) } diff --git a/core/src/graph/storage/legacy.rs b/core/src/graph/storage/legacy.rs new file mode 100644 index 000000000..3340b8a6d --- /dev/null +++ b/core/src/graph/storage/legacy.rs @@ -0,0 +1,54 @@ +//! Legacy structures of graph storages. Old versions of graph storages need to +//! be kept for compatibility reasons, but are not further developed. If +//! possible, only the legacy data structure is kept, the graph storage is +//! converted into a newer version and there is no specific implementation for +//! the old data structure. + +use super::GraphStatistic; + +/// Some general statistical numbers specific to a graph component +#[derive(Serialize, Deserialize, Clone)] +pub(crate) struct GraphStatisticV1 { + /// True if the component contains any cycle. + pub cyclic: bool, + + /// True if the component consists of [rooted trees](https://en.wikipedia.org/wiki/Tree_(graph_theory)). + pub rooted_tree: bool, + + /// Number of nodes in this graph storage (both source and target nodes). + pub nodes: usize, + + /// Average fan out. + pub avg_fan_out: f64, + /// Max fan-out of 99% of the data. 
+ pub fan_out_99_percentile: usize, + + /// Max inverse fan-out of 99% of the data. + pub inverse_fan_out_99_percentile: usize, + + /// Maximal number of children of a node. + pub max_fan_out: usize, + /// Maximum length from a root node to a terminal node. + pub max_depth: usize, + + /// Only valid for acyclic graphs: the average number of times a DFS will visit each node. + pub dfs_visit_ratio: f64, +} + +impl From for GraphStatistic { + fn from(value: GraphStatisticV1) -> Self { + let root_nodes = if value.nodes > 0 { 1 } else { 0 }; + Self { + cyclic: value.cyclic, + rooted_tree: value.rooted_tree, + nodes: value.nodes, + root_nodes, + avg_fan_out: value.avg_fan_out, + fan_out_99_percentile: value.fan_out_99_percentile, + inverse_fan_out_99_percentile: value.inverse_fan_out_99_percentile, + max_fan_out: value.max_fan_out, + max_depth: value.max_depth, + dfs_visit_ratio: value.dfs_visit_ratio, + } + } +} diff --git a/core/src/graph/storage/mod.rs b/core/src/graph/storage/mod.rs index f2c885398..0e2de8b48 100644 --- a/core/src/graph/storage/mod.rs +++ b/core/src/graph/storage/mod.rs @@ -7,6 +7,8 @@ pub mod prepost; pub mod registry; pub mod union; +pub(crate) mod legacy; + use crate::annostorage::{EdgeAnnotationStorage, NodeAnnotationStorage}; use crate::{ annostorage::AnnotationStorage, From 686e4e29c5f7cd0f96915f07ae6967cd3cd668d0 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 12 May 2025 16:05:52 +0200 Subject: [PATCH 03/16] Serialize all fields of the graph storages separate Also add a fallback to deserialize the old component.bin with the legacy graph statistic field --- .../snapshots/cli__show_corpus_info.snap | 20 +++---- core/src/graph/storage/adjacencylist.rs | 43 +++++++++++++-- core/src/graph/storage/dense_adjacency.rs | 31 +++++++++-- core/src/graph/storage/disk_adjacency.rs | 24 +-------- core/src/graph/storage/disk_path.rs | 27 +++------- core/src/graph/storage/legacy.rs | 52 ++++++++++++++++++- core/src/graph/storage/linear.rs | 33 
++++++++++-- core/src/graph/storage/mod.rs | 50 +++++++++++++++--- core/src/graph/storage/prepost.rs | 34 ++++++++++-- 9 files changed, 238 insertions(+), 76 deletions(-) diff --git a/cli/tests/snapshots/cli__show_corpus_info.snap b/cli/tests/snapshots/cli__show_corpus_info.snap index e99870165..c663216a9 100644 --- a/cli/tests/snapshots/cli__show_corpus_info.snap +++ b/cli/tests/snapshots/cli__show_corpus_info.snap @@ -21,52 +21,52 @@ Status: "fully loaded" Token search shortcut possible: true ------------ Component Coverage//: 0 annnotations -Stats: nodes=92, avg_fan_out=2.17, max_fan_out=11, max_depth=1 +Stats: nodes=92, avg_fan_out=2.17, max_fan_out=11, fan_out_99%=11, inv_fan_out_99%=9, max_depth=1 Implementation: DiskAdjacencyListV1 Status: "fully loaded" ------------ Component Coverage/annis/: 0 annnotations -Stats: nodes=0, avg_fan_out=0.00, max_fan_out=0, max_depth=1, tree +Stats: nodes=0, avg_fan_out=0.00, max_fan_out=0, fan_out_99%=0, inv_fan_out_99%=0, max_depth=1, tree Implementation: DiskAdjacencyListV1 Status: "fully loaded" ------------ Component Coverage/default_ns/: 0 annnotations -Stats: nodes=56, avg_fan_out=0.93, max_fan_out=10, max_depth=1 +Stats: nodes=56, avg_fan_out=0.93, max_fan_out=10, fan_out_99%=10, inv_fan_out_99%=2, max_depth=1 Implementation: DiskAdjacencyListV1 Status: "fully loaded" ------------ Component Coverage/annis/inherited-coverage: 0 annnotations -Stats: nodes=0, avg_fan_out=0.00, max_fan_out=0, max_depth=1, tree +Stats: nodes=0, avg_fan_out=0.00, max_fan_out=0, fan_out_99%=0, inv_fan_out_99%=0, max_depth=1, tree Implementation: DiskAdjacencyListV1 Status: "fully loaded" ------------ Component Dominance/syntax/: 0 annnotations -Stats: nodes=92, avg_fan_out=0.96, max_fan_out=3, max_depth=9, tree +Stats: nodes=92, avg_fan_out=0.96, max_fan_out=3, fan_out_99%=3, inv_fan_out_99%=1, max_depth=9, tree Implementation: PrePostOrderO16L8V1 Status: "fully loaded" ------------ Component Pointing/default_ns/anaphoric: 0 annnotations 
-Stats: nodes=8, avg_fan_out=0.50, max_fan_out=1, max_depth=1, tree +Stats: nodes=8, avg_fan_out=0.50, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=1, max_depth=1, tree Implementation: DiskPathV1_D15 Status: "fully loaded" ------------ Component Ordering/annis/: 0 annnotations -Stats: nodes=44, avg_fan_out=0.91, max_fan_out=1, max_depth=10, tree +Stats: nodes=44, avg_fan_out=0.91, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=1, max_depth=10, tree Implementation: DiskPathV1_D15 Status: "fully loaded" ------------ Component LeftToken/annis/: 0 annnotations -Stats: nodes=92, avg_fan_out=0.65, max_fan_out=1, max_depth=1 +Stats: nodes=92, avg_fan_out=0.65, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=3, max_depth=1 Implementation: DiskPathV1_D15 Status: "fully loaded" ------------ Component RightToken/annis/: 0 annnotations -Stats: nodes=84, avg_fan_out=0.71, max_fan_out=1, max_depth=1 +Stats: nodes=84, avg_fan_out=0.71, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=8, max_depth=1 Implementation: DiskPathV1_D15 Status: "fully loaded" ------------ Component PartOf/annis/: 0 annnotations -Stats: nodes=115, avg_fan_out=0.99, max_fan_out=1, max_depth=4 +Stats: nodes=115, avg_fan_out=0.99, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=26, max_depth=4 Implementation: DiskPathV1_D15 Status: "fully loaded" ------------ diff --git a/core/src/graph/storage/adjacencylist.rs b/core/src/graph/storage/adjacencylist.rs index 6972c33a2..49c8a798d 100644 --- a/core/src/graph/storage/adjacencylist.rs +++ b/core/src/graph/storage/adjacencylist.rs @@ -7,7 +7,12 @@ use crate::{ types::{AnnoKey, Annotation, Edge, NodeID}, }; -use super::{EdgeContainer, GraphStatistic, GraphStorage, WriteableGraphStorage}; +use super::{ + deserialize_gs_field, + legacy::{self, AdjacencyListStorageV1}, + load_statistics_from_location, save_statistics_to_toml, serialize_gs_field, EdgeContainer, + GraphStatistic, GraphStorage, WriteableGraphStorage, +}; use itertools::Itertools; use rustc_hash::FxHashSet; use 
serde::Deserialize; @@ -123,13 +128,34 @@ impl GraphStorage for AdjacencyListStorage { where for<'de> Self: std::marker::Sized + Deserialize<'de>, { - let mut result: Self = super::default_deserialize_gs(location)?; + let legacy_path = location.join("component.bin"); + let mut result: Self = if legacy_path.is_file() { + let component: AdjacencyListStorageV1 = deserialize_gs_field(location, "component")?; + Self { + stats: component.stats.map(|s| GraphStatistic::from(s)), + edges: component.edges, + inverse_edges: component.inverse_edges, + annos: component.annos, + } + } else { + let stats = load_statistics_from_location(location)?; + Self { + edges: deserialize_gs_field(location, "edges")?, + inverse_edges: deserialize_gs_field(location, "inverse_edges")?, + annos: deserialize_gs_field(location, "annos")?, + stats: stats, + } + }; + result.annos.after_deserialization(); Ok(result) } fn save_to(&self, location: &Path) -> Result<()> { - super::default_serialize_gs(self, location)?; + serialize_gs_field(&self.edges, "edges", location)?; + serialize_gs_field(&self.inverse_edges, "inverse_edges", location)?; + serialize_gs_field(&self.annos, "annos", location)?; + save_statistics_to_toml(location, self.stats.as_ref())?; Ok(()) } @@ -448,5 +474,16 @@ impl WriteableGraphStorage for AdjacencyListStorage { } } +impl From for AdjacencyListStorage { + fn from(value: legacy::AdjacencyListStorageV1) -> Self { + Self { + edges: value.edges, + inverse_edges: value.inverse_edges, + annos: value.annos, + stats: value.stats.map(|s| GraphStatistic::from(s)), + } + } +} + #[cfg(test)] mod tests; diff --git a/core/src/graph/storage/dense_adjacency.rs b/core/src/graph/storage/dense_adjacency.rs index 1f5d6355e..c48b55026 100644 --- a/core/src/graph/storage/dense_adjacency.rs +++ b/core/src/graph/storage/dense_adjacency.rs @@ -1,4 +1,7 @@ -use super::{EdgeContainer, GraphStatistic, GraphStorage}; +use super::{ + deserialize_gs_field, legacy::DenseAdjacencyListStorageV1, 
load_statistics_from_location, + save_statistics_to_toml, serialize_gs_field, EdgeContainer, GraphStatistic, GraphStorage, +}; use crate::{ annostorage::{ inmemory::AnnoStorageImpl, AnnotationStorage, EdgeAnnotationStorage, NodeAnnotationStorage, @@ -219,13 +222,35 @@ impl GraphStorage for DenseAdjacencyListStorage { where for<'de> Self: std::marker::Sized + Deserialize<'de>, { - let mut result: Self = super::default_deserialize_gs(location)?; + let legacy_path = location.join("component.bin"); + let mut result: Self = if legacy_path.is_file() { + let component: DenseAdjacencyListStorageV1 = + deserialize_gs_field(location, "component")?; + Self { + edges: component.edges, + inverse_edges: component.inverse_edges, + annos: component.annos, + stats: component.stats.map(|s| GraphStatistic::from(s)), + } + } else { + let stats = load_statistics_from_location(location)?; + Self { + edges: deserialize_gs_field(location, "edges")?, + inverse_edges: deserialize_gs_field(location, "inverse_edges")?, + annos: deserialize_gs_field(location, "annos")?, + stats: stats, + } + }; + result.annos.after_deserialization(); Ok(result) } fn save_to(&self, location: &Path) -> Result<()> { - super::default_serialize_gs(self, location)?; + serialize_gs_field(&self.edges, "edges", location)?; + serialize_gs_field(&self.inverse_edges, "inverse_edges", location)?; + serialize_gs_field(&self.annos, "annos", location)?; + save_statistics_to_toml(location, self.stats.as_ref())?; Ok(()) } } diff --git a/core/src/graph/storage/disk_adjacency.rs b/core/src/graph/storage/disk_adjacency.rs index 65846aab9..f38b9bdfb 100644 --- a/core/src/graph/storage/disk_adjacency.rs +++ b/core/src/graph/storage/disk_adjacency.rs @@ -13,7 +13,6 @@ use std::ops::Bound; use transient_btree_index::BtreeConfig; pub const SERIALIZATION_ID: &str = "DiskAdjacencyListV1"; -const STATISTICS_FILE_NAME: &str = "edge_stats.toml"; pub struct DiskAdjacencyListStorage { edges: DiskMap, @@ -144,23 +143,7 @@ impl GraphStorage for 
DiskAdjacencyListStorage { where Self: std::marker::Sized, { - // Read stats from file. - let stats_path_toml = location.join(STATISTICS_FILE_NAME); - let legacy_stats_path_bin = location.join("edge_stats.bin"); - - let stats = if stats_path_toml.is_file() { - let file_content = std::fs::read_to_string(stats_path_toml)?; - toml::from_str(&file_content)? - } else if legacy_stats_path_bin.is_file() { - let f_stats = std::fs::File::open(legacy_stats_path_bin)?; - let input = std::io::BufReader::new(f_stats); - // This is a legacy file which needs an older version of the struct - let legacy_stats: Option = bincode::deserialize_from(input)?; - legacy_stats.map(|s| s.into()) - } else { - None - }; - + let stats = load_statistics_from_location(location)?; let result = DiskAdjacencyListStorage { edges: DiskMap::new( Some(&location.join("edges.bin")), @@ -191,10 +174,7 @@ impl GraphStorage for DiskAdjacencyListStorage { self.inverse_edges .write_to(&location.join("inverse_edges.bin"))?; self.annos.save_annotations_to(location)?; - // Write stats as TOML file - let file_content = toml::to_string(&self.stats)?; - std::fs::write(location.join(STATISTICS_FILE_NAME), file_content)?; - + save_statistics_to_toml(location, self.stats.as_ref())?; Ok(()) } diff --git a/core/src/graph/storage/disk_path.rs b/core/src/graph/storage/disk_path.rs index 11457974c..9264bd616 100644 --- a/core/src/graph/storage/disk_path.rs +++ b/core/src/graph/storage/disk_path.rs @@ -16,13 +16,15 @@ use crate::{ util::disk_collections::{DiskMap, EvictionStrategy, DEFAULT_BLOCK_CACHE_CAPACITY}, }; -use super::{legacy, EdgeContainer, GraphStatistic, GraphStorage}; +use super::{ + load_statistics_from_location, save_statistics_to_toml, EdgeContainer, GraphStatistic, + GraphStorage, +}; use binary_layout::prelude::*; pub(crate) const MAX_DEPTH: usize = 15; pub(crate) const SERIALIZATION_ID: &str = "DiskPathV1_D15"; const ENTRY_SIZE: usize = (MAX_DEPTH * 8) + 1; -const STATISTICS_FILE_NAME: &str = 
"edge_stats.toml"; binary_layout!(node_path, LittleEndian, { length: u8, @@ -372,22 +374,7 @@ impl GraphStorage for DiskPathStorage { location.join(crate::annostorage::ondisk::SUBFOLDER_NAME), ))?; - // Read stats - let stats_path_toml = location.join(STATISTICS_FILE_NAME); - let legacy_stats_path_bin = location.join("edge_stats.bin"); - - let stats = if stats_path_toml.is_file() { - let file_content = std::fs::read_to_string(stats_path_toml)?; - toml::from_str(&file_content)? - } else if legacy_stats_path_bin.is_file() { - let f_stats = std::fs::File::open(legacy_stats_path_bin)?; - let input = std::io::BufReader::new(f_stats); - // This is a legacy file which needs an older version of the struct - let legacy_stats: Option = bincode::deserialize_from(input)?; - legacy_stats.map(|s| s.into()) - } else { - None - }; + let stats = load_statistics_from_location(location)?; Ok(Self { paths, @@ -425,9 +412,7 @@ impl GraphStorage for DiskPathStorage { // Save edge annotations self.annos.save_annotations_to(location)?; - // Write stats as TOML file - let file_content = toml::to_string(&self.stats)?; - std::fs::write(location.join(STATISTICS_FILE_NAME), file_content)?; + save_statistics_to_toml(location, self.stats.as_ref())?; Ok(()) } diff --git a/core/src/graph/storage/legacy.rs b/core/src/graph/storage/legacy.rs index 3340b8a6d..ad011bd48 100644 --- a/core/src/graph/storage/legacy.rs +++ b/core/src/graph/storage/legacy.rs @@ -4,7 +4,20 @@ //! converted into a newer version and there is no specific implementation for //! the old data structure. 
-use super::GraphStatistic; +use std::collections::HashMap; + +use rustc_hash::FxHashMap; + +use crate::{ + annostorage::inmemory::AnnoStorageImpl, + types::{Edge, NodeID, NumValue}, +}; + +use super::{ + linear::RelativePosition, + prepost::{OrderVecEntry, PrePost}, + GraphStatistic, +}; /// Some general statistical numbers specific to a graph component #[derive(Serialize, Deserialize, Clone)] @@ -52,3 +65,40 @@ impl From for GraphStatistic { } } } + +/// An adjacency list based storage that uses the [`GraphStatisticV1`] +#[derive(Serialize, Deserialize, Clone)] +pub(crate) struct AdjacencyListStorageV1 { + pub(crate) edges: HashMap>, + pub(crate) inverse_edges: HashMap>, + pub(crate) annos: AnnoStorageImpl, + pub(crate) stats: Option, +} + +/// An adjacency list based storage that uses the [`GraphStatisticV1`] and is +/// optimized for graphs where almost all nodes have an outgoing edge. +#[derive(Serialize, Deserialize, Clone)] +pub(crate) struct DenseAdjacencyListStorageV1 { + pub(crate) edges: Vec>, + pub(crate) inverse_edges: HashMap>, + pub(crate) annos: AnnoStorageImpl, + pub(crate) stats: Option, +} + +/// A graph storage for linar graphs that uses the [`GraphStatisticV1`]. +#[derive(Serialize, Deserialize, Clone)] +pub(crate) struct LinearGraphStorageV1 { + pub(crate) node_to_pos: HashMap>, + pub(crate) node_chains: HashMap>, + pub(crate) annos: AnnoStorageImpl, + pub(crate) stats: Option, +} + +/// A graph storage for trees that uses the [`GraphStatisticV1`] and indexes graphs using the pre/post order. 
+#[derive(Serialize, Deserialize, Clone)] +pub(crate) struct PrePostOrderStorageV1 { + pub(crate) node_to_order: FxHashMap>>, + pub(crate) order_to_node: Vec>, + pub(crate) annos: AnnoStorageImpl, + pub(crate) stats: Option, +} diff --git a/core/src/graph/storage/linear.rs b/core/src/graph/storage/linear.rs index f3a9b155d..4b6776c1b 100644 --- a/core/src/graph/storage/linear.rs +++ b/core/src/graph/storage/linear.rs @@ -1,4 +1,7 @@ -use super::{EdgeContainer, GraphStatistic, GraphStorage}; +use super::{ + deserialize_gs_field, legacy::LinearGraphStorageV1, load_statistics_from_location, + save_statistics_to_toml, serialize_gs_field, EdgeContainer, GraphStatistic, GraphStorage, +}; use crate::{ annostorage::{ inmemory::AnnoStorageImpl, AnnotationStorage, EdgeAnnotationStorage, NodeAnnotationStorage, @@ -12,7 +15,7 @@ use serde::{Deserialize, Serialize}; use std::{clone::Clone, collections::HashMap, path::Path}; #[derive(Serialize, Deserialize, Clone)] -struct RelativePosition { +pub(crate) struct RelativePosition { pub root: NodeID, pub pos: PosT, } @@ -165,13 +168,35 @@ where where for<'de> Self: std::marker::Sized + Deserialize<'de>, { - let mut result: Self = super::default_deserialize_gs(location)?; + let legacy_path = location.join("component.bin"); + let mut result: Self = if legacy_path.is_file() { + let component: LinearGraphStorageV1 = + deserialize_gs_field(location, "component")?; + Self { + node_to_pos: component.node_to_pos, + node_chains: component.node_chains, + annos: component.annos, + stats: component.stats.map(|s| GraphStatistic::from(s)), + } + } else { + let stats = load_statistics_from_location(location)?; + Self { + node_to_pos: deserialize_gs_field(location, "node_to_pos")?, + node_chains: deserialize_gs_field(location, "node_chains")?, + annos: deserialize_gs_field(location, "annos")?, + stats: stats, + } + }; + result.annos.after_deserialization(); Ok(result) } fn save_to(&self, location: &Path) -> Result<()> { - 
super::default_serialize_gs(self, location)?; + serialize_gs_field(&self.node_to_pos, "node_to_pos", location)?; + serialize_gs_field(&self.node_chains, "node_chains", location)?; + serialize_gs_field(&self.annos, "annos", location)?; + save_statistics_to_toml(location, self.stats.as_ref())?; Ok(()) } diff --git a/core/src/graph/storage/mod.rs b/core/src/graph/storage/mod.rs index 0e2de8b48..04aefdd27 100644 --- a/core/src/graph/storage/mod.rs +++ b/core/src/graph/storage/mod.rs @@ -200,30 +200,64 @@ pub trait GraphStorage: EdgeContainer { fn save_to(&self, location: &Path) -> Result<()>; } -pub fn default_serialize_gs(gs: &GS, location: &Path) -> Result<()> +pub fn serialize_gs_field(field: &T, field_name: &str, location: &Path) -> Result<()> where - GS: Serialize, + T: Serialize, { - let data_path = location.join("component.bin"); + let data_path = location.join(format!("{field_name}.bin")); let f_data = std::fs::File::create(data_path)?; let mut writer = std::io::BufWriter::new(f_data); - bincode::serialize_into(&mut writer, gs)?; + bincode::serialize_into(&mut writer, field)?; Ok(()) } -pub fn default_deserialize_gs(location: &Path) -> Result +pub fn deserialize_gs_field(location: &Path, field_name: &str) -> Result where - for<'de> GS: std::marker::Sized + Deserialize<'de>, + for<'de> T: std::marker::Sized + Deserialize<'de>, { - let data_path = location.join("component.bin"); + let data_path = location.join(format!("{field_name}.bin")); let f_data = std::fs::File::open(data_path)?; let input = std::io::BufReader::new(f_data); let result = bincode::deserialize_from(input)?; - Ok(result) } +const STATISTICS_FILE_NAME: &str = "stats.toml"; + +pub fn load_statistics_from_location(location: &Path) -> Result> { + let stats_path_toml = location.join(STATISTICS_FILE_NAME); + let legacy_stats_path_bin = location.join("edge_stats.bin"); + + let stats = if stats_path_toml.is_file() { + let file_content = std::fs::read_to_string(stats_path_toml)?; + let stats: 
GraphStatistic = toml::from_str(&file_content)?; + Some(stats) + } else if legacy_stats_path_bin.is_file() { + let f_stats = std::fs::File::open(legacy_stats_path_bin)?; + let input = std::io::BufReader::new(f_stats); + // This is a legacy file which needs an older version of the struct + let legacy_stats: Option = bincode::deserialize_from(input)?; + legacy_stats.map(|s| s.into()) + } else { + None + }; + Ok(stats) +} + +pub fn save_statistics_to_toml(location: &Path, stats: Option<&GraphStatistic>) -> Result<()> { + let file_path = location.join(STATISTICS_FILE_NAME); + if file_path.is_file() { + std::fs::remove_file(&file_path)?; + } + + if let Some(stats) = stats { + let file_content = toml::to_string(stats)?; + std::fs::write(file_path, file_content)?; + } + Ok(()) +} + /// Trait for accessing graph storages which can be written to. pub trait WriteableGraphStorage: GraphStorage { /// Add an edge to this graph storage. diff --git a/core/src/graph/storage/prepost.rs b/core/src/graph/storage/prepost.rs index 3dd22cb81..0550dac34 100644 --- a/core/src/graph/storage/prepost.rs +++ b/core/src/graph/storage/prepost.rs @@ -1,4 +1,7 @@ -use super::{EdgeContainer, GraphStatistic, GraphStorage}; +use super::{ + deserialize_gs_field, legacy::PrePostOrderStorageV1, load_statistics_from_location, + save_statistics_to_toml, serialize_gs_field, EdgeContainer, GraphStatistic, GraphStorage, +}; use crate::{ annostorage::{ inmemory::AnnoStorageImpl, AnnotationStorage, EdgeAnnotationStorage, NodeAnnotationStorage, @@ -20,7 +23,7 @@ pub struct PrePost { } #[derive(Serialize, Deserialize, Clone, Debug)] -enum OrderVecEntry { +pub(crate) enum OrderVecEntry { None, Pre { post: OrderT, @@ -197,13 +200,36 @@ where where for<'de> Self: std::marker::Sized + Deserialize<'de>, { - let mut result: Self = super::default_deserialize_gs(location)?; + let legacy_path = location.join("component.bin"); + let mut result: Self = if legacy_path.is_file() { + let component: PrePostOrderStorageV1 = + 
deserialize_gs_field(location, "component")?; + Self { + node_to_order: component.node_to_order, + order_to_node: component.order_to_node, + annos: component.annos, + stats: component.stats.map(|s| GraphStatistic::from(s)), + } + } else { + let stats = load_statistics_from_location(location)?; + Self { + node_to_order: deserialize_gs_field(location, "node_to_order")?, + order_to_node: deserialize_gs_field(location, "order_to_node")?, + annos: deserialize_gs_field(location, "annos")?, + stats: stats, + } + }; + result.annos.after_deserialization(); + Ok(result) } fn save_to(&self, location: &Path) -> Result<()> { - super::default_serialize_gs(self, location)?; + serialize_gs_field(&self.node_to_order, "node_to_order", location)?; + serialize_gs_field(&self.order_to_node, "order_to_node", location)?; + serialize_gs_field(&self.annos, "annos", location)?; + save_statistics_to_toml(location, self.stats.as_ref())?; Ok(()) } From 0b6d18098e90b6e5c83f38a7f34cf71d2f6f734e Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 12 May 2025 16:57:41 +0200 Subject: [PATCH 04/16] Use "corpus" as node estimation for PartOf component --- core/src/graph/storage/mod.rs | 4 ++-- graphannis/src/annis/db/aql/operators/edge_op.rs | 13 +++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/core/src/graph/storage/mod.rs b/core/src/graph/storage/mod.rs index 04aefdd27..3bb627af8 100644 --- a/core/src/graph/storage/mod.rs +++ b/core/src/graph/storage/mod.rs @@ -60,8 +60,8 @@ impl std::fmt::Display for GraphStatistic { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!( f, - "nodes={}, avg_fan_out={:.2}, max_fan_out={}, fan_out_99%={}, inv_fan_out_99%={}, max_depth={}", - self.nodes, self.avg_fan_out, self.max_fan_out, self.fan_out_99_percentile, self.inverse_fan_out_99_percentile, self.max_depth + "nodes={}, root nodes={}, avg_fan_out={:.2}, max_fan_out={}, fan_out_99%={}, inv_fan_out_99%={}, max_depth={}", + self.nodes, self.root_nodes, 
self.avg_fan_out, self.max_fan_out, self.fan_out_99_percentile, self.inverse_fan_out_99_percentile, self.max_depth )?; if self.cyclic { write!(f, ", cyclic")?; diff --git a/graphannis/src/annis/db/aql/operators/edge_op.rs b/graphannis/src/annis/db/aql/operators/edge_op.rs index 3c56d0a60..dd9473cbb 100644 --- a/graphannis/src/annis/db/aql/operators/edge_op.rs +++ b/graphannis/src/annis/db/aql/operators/edge_op.rs @@ -43,14 +43,23 @@ impl BaseEdgeOp { })?; gs.push(gs_for_component); } + let all_part_of_components = spec + .components + .iter() + .all(|c| c.get_type() == AnnotationComponentType::PartOf); + let node_type = if all_part_of_components { + "corpus" + } else { + "node" + }; Ok(BaseEdgeOp { gs, spec, max_nodes_estimate: db.get_node_annos().guess_max_count( Some(&NODE_TYPE_KEY.ns), &NODE_TYPE_KEY.name, - "node", - "node", + node_type, + node_type, )?, inverse: false, }) From bfc07784b8f53f155ba120c77cf5360e5eddd3d3 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 12 May 2025 17:03:09 +0200 Subject: [PATCH 05/16] Fix clippy issues --- core/src/graph/storage/adjacencylist.rs | 6 +++--- core/src/graph/storage/dense_adjacency.rs | 4 ++-- core/src/graph/storage/linear.rs | 4 ++-- core/src/graph/storage/prepost.rs | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/core/src/graph/storage/adjacencylist.rs b/core/src/graph/storage/adjacencylist.rs index 49c8a798d..c70f19f2f 100644 --- a/core/src/graph/storage/adjacencylist.rs +++ b/core/src/graph/storage/adjacencylist.rs @@ -132,7 +132,7 @@ impl GraphStorage for AdjacencyListStorage { let mut result: Self = if legacy_path.is_file() { let component: AdjacencyListStorageV1 = deserialize_gs_field(location, "component")?; Self { - stats: component.stats.map(|s| GraphStatistic::from(s)), + stats: component.stats.map(GraphStatistic::from), edges: component.edges, inverse_edges: component.inverse_edges, annos: component.annos, @@ -143,7 +143,7 @@ impl GraphStorage for AdjacencyListStorage { edges: 
deserialize_gs_field(location, "edges")?, inverse_edges: deserialize_gs_field(location, "inverse_edges")?, annos: deserialize_gs_field(location, "annos")?, - stats: stats, + stats, } }; @@ -480,7 +480,7 @@ impl From for AdjacencyListStorage { edges: value.edges, inverse_edges: value.inverse_edges, annos: value.annos, - stats: value.stats.map(|s| GraphStatistic::from(s)), + stats: value.stats.map(GraphStatistic::from), } } } diff --git a/core/src/graph/storage/dense_adjacency.rs b/core/src/graph/storage/dense_adjacency.rs index c48b55026..3d86425a6 100644 --- a/core/src/graph/storage/dense_adjacency.rs +++ b/core/src/graph/storage/dense_adjacency.rs @@ -230,7 +230,7 @@ impl GraphStorage for DenseAdjacencyListStorage { edges: component.edges, inverse_edges: component.inverse_edges, annos: component.annos, - stats: component.stats.map(|s| GraphStatistic::from(s)), + stats: component.stats.map(GraphStatistic::from), } } else { let stats = load_statistics_from_location(location)?; @@ -238,7 +238,7 @@ impl GraphStorage for DenseAdjacencyListStorage { edges: deserialize_gs_field(location, "edges")?, inverse_edges: deserialize_gs_field(location, "inverse_edges")?, annos: deserialize_gs_field(location, "annos")?, - stats: stats, + stats, } }; diff --git a/core/src/graph/storage/linear.rs b/core/src/graph/storage/linear.rs index 4b6776c1b..d3f037002 100644 --- a/core/src/graph/storage/linear.rs +++ b/core/src/graph/storage/linear.rs @@ -176,7 +176,7 @@ where node_to_pos: component.node_to_pos, node_chains: component.node_chains, annos: component.annos, - stats: component.stats.map(|s| GraphStatistic::from(s)), + stats: component.stats.map(GraphStatistic::from), } } else { let stats = load_statistics_from_location(location)?; @@ -184,7 +184,7 @@ where node_to_pos: deserialize_gs_field(location, "node_to_pos")?, node_chains: deserialize_gs_field(location, "node_chains")?, annos: deserialize_gs_field(location, "annos")?, - stats: stats, + stats, } }; diff --git 
a/core/src/graph/storage/prepost.rs b/core/src/graph/storage/prepost.rs index 0550dac34..6f3ee559a 100644 --- a/core/src/graph/storage/prepost.rs +++ b/core/src/graph/storage/prepost.rs @@ -208,7 +208,7 @@ where node_to_order: component.node_to_order, order_to_node: component.order_to_node, annos: component.annos, - stats: component.stats.map(|s| GraphStatistic::from(s)), + stats: component.stats.map(GraphStatistic::from), } } else { let stats = load_statistics_from_location(location)?; @@ -216,7 +216,7 @@ where node_to_order: deserialize_gs_field(location, "node_to_order")?, order_to_node: deserialize_gs_field(location, "order_to_node")?, annos: deserialize_gs_field(location, "annos")?, - stats: stats, + stats, } }; From 73d81996d7bdcce13e7c631795475cbe7877f48f Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 12 May 2025 17:10:33 +0200 Subject: [PATCH 06/16] Add test corpus that uses the new statistic --- cli/tests/cli.rs | 2 +- .../cli__list_corpora_fully_loaded.snap | 1 + .../cli__list_corpora_not_loaded.snap | 2 +- .../cli__list_corpora_partially_loaded.snap | 1 + .../snapshots/cli__show_corpus_info.snap | 26 +++--- .../sample-disk-based-3.8/corpus-config.toml | 78 ++++++++++++++++++ .../current/global_statistics.toml | 6 ++ .../current/gs/Coverage/annis/edges.bin | Bin 0 -> 123 bytes .../current/gs/Coverage/annis/impl.cfg | 1 + .../annis/inherited-coverage/edges.bin | Bin 0 -> 123 bytes .../annis/inherited-coverage/impl.cfg | 1 + .../inherited-coverage/inverse_edges.bin | Bin 0 -> 123 bytes .../nodes_diskmap_v1/by_anno_qname.bin | Bin 0 -> 123 bytes .../nodes_diskmap_v1/by_container.bin | Bin 0 -> 123 bytes .../nodes_diskmap_v1/custom.bin | Bin 0 -> 33 bytes .../annis/inherited-coverage/stats.toml | 10 +++ .../gs/Coverage/annis/inverse_edges.bin | Bin 0 -> 123 bytes .../annis/nodes_diskmap_v1/by_anno_qname.bin | Bin 0 -> 123 bytes .../annis/nodes_diskmap_v1/by_container.bin | Bin 0 -> 123 bytes .../annis/nodes_diskmap_v1/custom.bin | Bin 0 -> 33 bytes 
.../current/gs/Coverage/annis/stats.toml | 10 +++ .../gs/Coverage/default_layer/edges.bin | Bin 0 -> 2211 bytes .../gs/Coverage/default_layer/impl.cfg | 1 + .../Coverage/default_layer/inverse_edges.bin | Bin 0 -> 2163 bytes .../nodes_diskmap_v1/by_anno_qname.bin | Bin 0 -> 123 bytes .../nodes_diskmap_v1/by_container.bin | Bin 0 -> 123 bytes .../default_layer/nodes_diskmap_v1/custom.bin | Bin 0 -> 33 bytes .../gs/Coverage/default_layer/stats.toml | 10 +++ .../current/gs/Coverage/default_ns/edges.bin | Bin 0 -> 694 bytes .../current/gs/Coverage/default_ns/impl.cfg | 1 + .../gs/Coverage/default_ns/inverse_edges.bin | Bin 0 -> 950 bytes .../nodes_diskmap_v1/by_anno_qname.bin | Bin 0 -> 123 bytes .../nodes_diskmap_v1/by_container.bin | Bin 0 -> 123 bytes .../default_ns/nodes_diskmap_v1/custom.bin | Bin 0 -> 33 bytes .../current/gs/Coverage/default_ns/stats.toml | 10 +++ .../current/gs/Dominance/syntax/annos.bin | Bin 0 -> 73 bytes .../current/gs/Dominance/syntax/impl.cfg | 1 + .../gs/Dominance/syntax/node_to_order.bin | Bin 0 -> 1940 bytes .../gs/Dominance/syntax/order_to_node.bin | Bin 0 -> 2768 bytes .../current/gs/Dominance/syntax/stats.toml | 10 +++ .../current/gs/LeftToken/annis/impl.cfg | 1 + .../gs/LeftToken/annis/inverse_edges.bin | Bin 0 -> 912 bytes .../annis/nodes_diskmap_v1/by_anno_qname.bin | Bin 0 -> 123 bytes .../annis/nodes_diskmap_v1/by_container.bin | Bin 0 -> 123 bytes .../annis/nodes_diskmap_v1/custom.bin | Bin 0 -> 33 bytes .../current/gs/LeftToken/annis/paths.bin | Bin 0 -> 13068 bytes .../current/gs/LeftToken/annis/stats.toml | 10 +++ .../current/gs/Ordering/annis/impl.cfg | 1 + .../gs/Ordering/annis/inverse_edges.bin | Bin 0 -> 804 bytes .../annis/nodes_diskmap_v1/by_anno_qname.bin | Bin 0 -> 123 bytes .../annis/nodes_diskmap_v1/by_container.bin | Bin 0 -> 123 bytes .../annis/nodes_diskmap_v1/custom.bin | Bin 0 -> 33 bytes .../current/gs/Ordering/annis/paths.bin | Bin 0 -> 10527 bytes .../current/gs/Ordering/annis/stats.toml | 10 +++ 
.../current/gs/PartOf/annis/impl.cfg | 1 + .../current/gs/PartOf/annis/inverse_edges.bin | Bin 0 -> 1213 bytes .../annis/nodes_diskmap_v1/by_anno_qname.bin | Bin 0 -> 123 bytes .../annis/nodes_diskmap_v1/by_container.bin | Bin 0 -> 123 bytes .../PartOf/annis/nodes_diskmap_v1/custom.bin | Bin 0 -> 33 bytes .../current/gs/PartOf/annis/paths.bin | Bin 0 -> 13915 bytes .../current/gs/PartOf/annis/stats.toml | 10 +++ .../gs/Pointing/default_ns/anaphoric/impl.cfg | 1 + .../default_ns/anaphoric/inverse_edges.bin | Bin 0 -> 234 bytes .../nodes_diskmap_v1/by_anno_qname.bin | Bin 0 -> 123 bytes .../nodes_diskmap_v1/by_container.bin | Bin 0 -> 123 bytes .../anaphoric/nodes_diskmap_v1/custom.bin | Bin 0 -> 33 bytes .../Pointing/default_ns/anaphoric/paths.bin | Bin 0 -> 5687 bytes .../Pointing/default_ns/anaphoric/stats.toml | 10 +++ .../current/gs/RightToken/annis/impl.cfg | 1 + .../gs/RightToken/annis/inverse_edges.bin | Bin 0 -> 832 bytes .../annis/nodes_diskmap_v1/by_anno_qname.bin | Bin 0 -> 123 bytes .../annis/nodes_diskmap_v1/by_container.bin | Bin 0 -> 123 bytes .../annis/nodes_diskmap_v1/custom.bin | Bin 0 -> 33 bytes .../current/gs/RightToken/annis/paths.bin | Bin 0 -> 13068 bytes .../current/gs/RightToken/annis/stats.toml | 10 +++ .../nodes_diskmap_v1/by_anno_qname.bin | Bin 0 -> 7167 bytes .../current/nodes_diskmap_v1/by_container.bin | Bin 0 -> 11833 bytes .../current/nodes_diskmap_v1/custom.bin | Bin 0 -> 11732 bytes 78 files changed, 211 insertions(+), 15 deletions(-) create mode 100644 graphannis/tests/data/sample-disk-based-3.8/corpus-config.toml create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/global_statistics.toml create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/edges.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/impl.cfg create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/edges.bin create mode 
100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/impl.cfg create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/inverse_edges.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/nodes_diskmap_v1/by_anno_qname.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/nodes_diskmap_v1/by_container.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/nodes_diskmap_v1/custom.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/stats.toml create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inverse_edges.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/nodes_diskmap_v1/by_anno_qname.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/nodes_diskmap_v1/by_container.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/nodes_diskmap_v1/custom.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/stats.toml create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/edges.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/impl.cfg create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/inverse_edges.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/nodes_diskmap_v1/by_anno_qname.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/nodes_diskmap_v1/by_container.bin create mode 100644 
graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/nodes_diskmap_v1/custom.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/stats.toml create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/edges.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/impl.cfg create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/inverse_edges.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/nodes_diskmap_v1/by_anno_qname.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/nodes_diskmap_v1/by_container.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/nodes_diskmap_v1/custom.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/stats.toml create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/annos.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/impl.cfg create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/node_to_order.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/order_to_node.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/stats.toml create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/impl.cfg create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/inverse_edges.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/nodes_diskmap_v1/by_anno_qname.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/nodes_diskmap_v1/by_container.bin 
create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/nodes_diskmap_v1/custom.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/paths.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/stats.toml create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/impl.cfg create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/inverse_edges.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/nodes_diskmap_v1/by_anno_qname.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/nodes_diskmap_v1/by_container.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/nodes_diskmap_v1/custom.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/paths.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/stats.toml create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/impl.cfg create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/inverse_edges.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/nodes_diskmap_v1/by_anno_qname.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/nodes_diskmap_v1/by_container.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/nodes_diskmap_v1/custom.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/paths.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/stats.toml create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/impl.cfg create mode 100644 
graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/inverse_edges.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/nodes_diskmap_v1/by_anno_qname.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/nodes_diskmap_v1/by_container.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/nodes_diskmap_v1/custom.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/paths.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/stats.toml create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/impl.cfg create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/inverse_edges.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/nodes_diskmap_v1/by_anno_qname.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/nodes_diskmap_v1/by_container.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/nodes_diskmap_v1/custom.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/paths.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/stats.toml create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/nodes_diskmap_v1/by_anno_qname.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/nodes_diskmap_v1/by_container.bin create mode 100644 graphannis/tests/data/sample-disk-based-3.8/current/nodes_diskmap_v1/custom.bin diff --git a/cli/tests/cli.rs b/cli/tests/cli.rs index 4830293c2..bf94d1680 100644 --- a/cli/tests/cli.rs +++ b/cli/tests/cli.rs @@ -26,7 +26,7 @@ fn 
show_corpus_info() -> Result<(), Box> { cmd.arg("../graphannis/tests/data/") .arg("-c") - .arg("corpus sample-disk-based-3.3") + .arg("corpus sample-disk-based-3.8") .arg("-c") .arg("preload") .arg("-c") diff --git a/cli/tests/snapshots/cli__list_corpora_fully_loaded.snap b/cli/tests/snapshots/cli__list_corpora_fully_loaded.snap index fac1ea16c..b7a158610 100644 --- a/cli/tests/snapshots/cli__list_corpora_fully_loaded.snap +++ b/cli/tests/snapshots/cli__list_corpora_fully_loaded.snap @@ -20,6 +20,7 @@ exit_code: 0 sample-disk-based-1.5 (not loaded) sample-disk-based-3.2 (not loaded) sample-disk-based-3.3 (fully loaded) +sample-disk-based-3.8 (not loaded) sample-memory-based-1.5 (not loaded) sample-memory-based-3.2 (not loaded) sample-memory-based-3.3 (not loaded) diff --git a/cli/tests/snapshots/cli__list_corpora_not_loaded.snap b/cli/tests/snapshots/cli__list_corpora_not_loaded.snap index 091f4f73c..7fd24b45b 100644 --- a/cli/tests/snapshots/cli__list_corpora_not_loaded.snap +++ b/cli/tests/snapshots/cli__list_corpora_not_loaded.snap @@ -13,10 +13,10 @@ exit_code: 0 sample-disk-based-1.5 (not loaded) sample-disk-based-3.2 (not loaded) sample-disk-based-3.3 (not loaded) +sample-disk-based-3.8 (not loaded) sample-memory-based-1.5 (not loaded) sample-memory-based-3.2 (not loaded) sample-memory-based-3.3 (not loaded) graphANNIS says good-bye! 
----- stderr ----- - diff --git a/cli/tests/snapshots/cli__list_corpora_partially_loaded.snap b/cli/tests/snapshots/cli__list_corpora_partially_loaded.snap index 58256a6c3..f8b2994b4 100644 --- a/cli/tests/snapshots/cli__list_corpora_partially_loaded.snap +++ b/cli/tests/snapshots/cli__list_corpora_partially_loaded.snap @@ -21,6 +21,7 @@ result: 44 matches in 4 documents sample-disk-based-1.5 (not loaded) sample-disk-based-3.2 (not loaded) sample-disk-based-3.3 (partially loaded) +sample-disk-based-3.8 (not loaded) sample-memory-based-1.5 (not loaded) sample-memory-based-3.2 (not loaded) sample-memory-based-3.3 (not loaded) diff --git a/cli/tests/snapshots/cli__show_corpus_info.snap b/cli/tests/snapshots/cli__show_corpus_info.snap index c663216a9..876b412df 100644 --- a/cli/tests/snapshots/cli__show_corpus_info.snap +++ b/cli/tests/snapshots/cli__show_corpus_info.snap @@ -5,7 +5,7 @@ info: args: - "../graphannis/tests/data/" - "-c" - - corpus sample-disk-based-3.3 + - corpus sample-disk-based-3.8 - "-c" - preload - "-c" @@ -14,59 +14,59 @@ info: success: true exit_code: 0 ----- stdout ----- -12:00:00[INFO] Loaded corpus sample-disk-based-3.3 -12:00:00[INFO] Corpus cache after preloading sample-disk-based-3.3: 100MB / 300MB - loaded corpora [sample-disk-based-3.3] +12:00:00[INFO] Loaded corpus sample-disk-based-3.8 +12:00:00[INFO] Corpus cache after preloading sample-disk-based-3.8: 100MB / 300MB - loaded corpora [sample-disk-based-3.8] 12:00:00[INFO] Preloaded corpus in 10 ms Status: "fully loaded" Token search shortcut possible: true ------------ Component Coverage//: 0 annnotations -Stats: nodes=92, avg_fan_out=2.17, max_fan_out=11, fan_out_99%=11, inv_fan_out_99%=9, max_depth=1 +Stats: nodes=92, root nodes=48, avg_fan_out=2.17, max_fan_out=11, fan_out_99%=11, inv_fan_out_99%=9, max_depth=1 Implementation: DiskAdjacencyListV1 Status: "fully loaded" ------------ Component Coverage/annis/: 0 annnotations -Stats: nodes=0, avg_fan_out=0.00, max_fan_out=0, 
fan_out_99%=0, inv_fan_out_99%=0, max_depth=1, tree +Stats: nodes=0, root nodes=0, avg_fan_out=0.00, max_fan_out=0, fan_out_99%=0, inv_fan_out_99%=0, max_depth=1, tree Implementation: DiskAdjacencyListV1 Status: "fully loaded" ------------ Component Coverage/default_ns/: 0 annnotations -Stats: nodes=56, avg_fan_out=0.93, max_fan_out=10, fan_out_99%=10, inv_fan_out_99%=2, max_depth=1 +Stats: nodes=56, root nodes=12, avg_fan_out=0.93, max_fan_out=10, fan_out_99%=10, inv_fan_out_99%=2, max_depth=1 Implementation: DiskAdjacencyListV1 Status: "fully loaded" ------------ Component Coverage/annis/inherited-coverage: 0 annnotations -Stats: nodes=0, avg_fan_out=0.00, max_fan_out=0, fan_out_99%=0, inv_fan_out_99%=0, max_depth=1, tree +Stats: nodes=0, root nodes=0, avg_fan_out=0.00, max_fan_out=0, fan_out_99%=0, inv_fan_out_99%=0, max_depth=1, tree Implementation: DiskAdjacencyListV1 Status: "fully loaded" ------------ Component Dominance/syntax/: 0 annnotations -Stats: nodes=92, avg_fan_out=0.96, max_fan_out=3, fan_out_99%=3, inv_fan_out_99%=1, max_depth=9, tree +Stats: nodes=92, root nodes=4, avg_fan_out=0.96, max_fan_out=3, fan_out_99%=3, inv_fan_out_99%=1, max_depth=9, tree Implementation: PrePostOrderO16L8V1 Status: "fully loaded" ------------ Component Pointing/default_ns/anaphoric: 0 annnotations -Stats: nodes=8, avg_fan_out=0.50, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=1, max_depth=1, tree +Stats: nodes=8, root nodes=4, avg_fan_out=0.50, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=1, max_depth=1, tree Implementation: DiskPathV1_D15 Status: "fully loaded" ------------ Component Ordering/annis/: 0 annnotations -Stats: nodes=44, avg_fan_out=0.91, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=1, max_depth=10, tree +Stats: nodes=44, root nodes=4, avg_fan_out=0.91, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=1, max_depth=10, tree Implementation: DiskPathV1_D15 Status: "fully loaded" ------------ Component LeftToken/annis/: 0 annnotations -Stats: nodes=92, 
avg_fan_out=0.65, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=3, max_depth=1 +Stats: nodes=92, root nodes=60, avg_fan_out=0.65, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=3, max_depth=1 Implementation: DiskPathV1_D15 Status: "fully loaded" ------------ Component RightToken/annis/: 0 annnotations -Stats: nodes=84, avg_fan_out=0.71, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=8, max_depth=1 +Stats: nodes=84, root nodes=60, avg_fan_out=0.71, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=8, max_depth=1 Implementation: DiskPathV1_D15 Status: "fully loaded" ------------ Component PartOf/annis/: 0 annnotations -Stats: nodes=115, avg_fan_out=0.99, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=26, max_depth=4 +Stats: nodes=115, root nodes=104, avg_fan_out=0.99, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=26, max_depth=4 Implementation: DiskPathV1_D15 Status: "fully loaded" ------------ diff --git a/graphannis/tests/data/sample-disk-based-3.8/corpus-config.toml b/graphannis/tests/data/sample-disk-based-3.8/corpus-config.toml new file mode 100644 index 000000000..97cc8d4f2 --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/corpus-config.toml @@ -0,0 +1,78 @@ +[context] +default = 5 +sizes = [0, 1, 2, 5, 10, 20, 25, 50] + +[view] +page_size = 10 + +[[visualizers]] +vis_type = "kwic" +display_name = "kwic" +visibility = "permanent" + +[[visualizers]] +element = "node" +layer = "default_ns" +vis_type = "grid" +display_name = "grid (default_ns)" +visibility = "hidden" + +[[visualizers]] +element = "node" +layer = "syntax" +vis_type = "tree" +display_name = "tree (syntax)" +visibility = "hidden" + +[visualizers.mappings] +edge_type = "null" +node_anno_ns = "default_ns" +node_key = "const" + +[[visualizers]] +element = "edge" +layer = "default_ns" +vis_type = "arch_dependency" +display_name = "anaphoric (default_ns)" +visibility = "hidden" + +[[visualizers]] +element = "node" +layer = "tiger" +vis_type = "tree" +display_name = "tree" +visibility = "hidden" + 
+[[visualizers]] +element = "node" +layer = "exmaralda" +vis_type = "grid" +display_name = "exmaralda" +visibility = "hidden" + +[[visualizers]] +element = "node" +layer = "mmax" +vis_type = "grid" +display_name = "mmax" +visibility = "hidden" + +[[visualizers]] +element = "edge" +layer = "mmax" +vis_type = "discourse" +display_name = "coref" +visibility = "hidden" + +[[visualizers]] +element = "node" +layer = "urml" +vis_type = "grid" +display_name = "urml" +visibility = "hidden" + +[corpus_size] +quantity = 44 + +[corpus_size.unit] +name = "tokens" diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/global_statistics.toml b/graphannis/tests/data/sample-disk-based-3.8/current/global_statistics.toml new file mode 100644 index 000000000..9054d2c28 --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/global_statistics.toml @@ -0,0 +1,6 @@ +all_token_in_order_component = true + +[corpus_size.Token] +base_token_count = 44 + +[corpus_size.Token.segmentation_count] diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/edges.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/edges.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/impl.cfg b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/impl.cfg new file mode 100644 index 000000000..449496de3 --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/impl.cfg @@ -0,0 +1 @@ +DiskAdjacencyListV1 \ No newline at end of file diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/edges.bin 
b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/edges.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/impl.cfg b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/impl.cfg new file mode 100644 index 000000000..449496de3 --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/impl.cfg @@ -0,0 +1 @@ +DiskAdjacencyListV1 \ No newline at end of file diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/inverse_edges.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/inverse_edges.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/nodes_diskmap_v1/by_anno_qname.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/nodes_diskmap_v1/by_anno_qname.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/nodes_diskmap_v1/by_container.bin 
b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/nodes_diskmap_v1/by_container.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/nodes_diskmap_v1/custom.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/nodes_diskmap_v1/custom.bin new file mode 100644 index 0000000000000000000000000000000000000000..ed24586ecab403102687114829631ec81693c9e4 GIT binary patch literal 33 KcmZQzAOHXWApijY literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/stats.toml b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/stats.toml new file mode 100644 index 000000000..5dc1a4696 --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inherited-coverage/stats.toml @@ -0,0 +1,10 @@ +cyclic = false +rooted_tree = true +nodes = 0 +root_nodes = 0 +avg_fan_out = 0.0 +fan_out_99_percentile = 0 +inverse_fan_out_99_percentile = 0 +max_fan_out = 0 +max_depth = 1 +dfs_visit_ratio = 0.0 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inverse_edges.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/inverse_edges.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git 
a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/nodes_diskmap_v1/by_anno_qname.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/nodes_diskmap_v1/by_anno_qname.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/nodes_diskmap_v1/by_container.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/nodes_diskmap_v1/by_container.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/nodes_diskmap_v1/custom.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/nodes_diskmap_v1/custom.bin new file mode 100644 index 0000000000000000000000000000000000000000..ed24586ecab403102687114829631ec81693c9e4 GIT binary patch literal 33 KcmZQzAOHXWApijY literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/stats.toml b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/stats.toml new file mode 100644 index 000000000..5dc1a4696 --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/annis/stats.toml @@ -0,0 +1,10 @@ +cyclic = false +rooted_tree = true +nodes = 0 +root_nodes = 0 +avg_fan_out = 0.0 +fan_out_99_percentile = 0 +inverse_fan_out_99_percentile = 0 +max_fan_out = 0 +max_depth = 1 +dfs_visit_ratio = 0.0 diff --git 
a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/edges.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/edges.bin new file mode 100644 index 0000000000000000000000000000000000000000..cd98d32e7aea5775d4686a46279409bce48665ce GIT binary patch literal 2211 zcmbtVYeE#dF+?Qt@Dgub-H8at^5PYY!BR;rg+RcC+a0rN zJlGLIcL-V*W1F3s)BAxzq@Bv){?LzNeW;>owOqHIc7)l1I)-s97Ei)>K@v+PTop$v zHyphOq9>vkqBo)sq6E4 zh**SJjHpE{K`ccqLp*?I?PpFb4p088F=tspExAA(SILfnD>)6xj_1WfW|&D8H4wG} zS8ZqN^tPZnXs2?R2J{cHK2$Zd5_BvU--026u$Bs!hCJ+qJ3={bR~@ZX_zHF-rXi*y z?nTT%%tXvW%tp*v`kmpeaxsqXe~sZvdg>tr0b|K^*taMjG zCSe)f2>A%B%bgJ4#ZAA@6@(-xT`v154bypv;m5KRGahrr(TdR08|EiUtxgt`Nh0fS z$DS{5{1NXlXt`xor$$E)h>0K)tY|fFJUk@}*oV|W2v`VZS zPJW&(^G}uZ+FBK#Y_%`r<{}DAol&Pn<7HJ!?TfsgD@tjn+8wde=|rN8hLGzxIbu_1*n8m@kbhwpwRR`9T%4 zrZ;K%pC#TmQ>2ClO{OMba&+Xe#J#TCFgK=n=65pKb$ zQBk{12U8$q0hSNB^I^P;gt;0m*5igYmDSK;ur}yb8SPM8nlr2|EloMtQV&XdVX%bY s%pVL5MiGA)Fh0TyA;czKWKh#zW9hgci+?H$;Z!SOy*KyDwtFUlNw-B6RcAdT`f?*IJy|n=_+8=64+%B&XUBU0AjTo*Z?U4^3piv zfldcHO$MJl!Wl4S5T9y+waVht3iKgdEs(7Z4j~<|J~?9bA*_dKfg1xfQWtEPJi;)b zZxo=sV6b%|U@{y`MuN#GFc}TjsRvb?02WIGlgVH*6-=gq$#mqTX^XA}Xr(>67|>}+ zU<({@$~aBSab~aeIJ3=?m`JPZ7JTie|-soZo=fI>w!AAL@D+F2sNeK~Pd0ztZemLcU zDGBHhf9&!I=fD&oC%hQ2)&T5Ufeu7d3lxb3hix2KUm#(92}FFmMXPxYTH^A z#nmPFy82f6HJ!2$)o*LyT4*?@mP`2uleeS#Z@Hfmdk*VxD6nuboUn*kFw^X*x4wjv z&`RUum20f^xjegDT})MbwYK)8tq>P%T$Se|e*Nj8uB8*FzcH;UNYZXwJV&OEN&J^^ zXQ1xlE$wj=rT%dxOHNq9%DPymsQR5fvH4K8l00skCZfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/nodes_diskmap_v1/by_container.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/nodes_diskmap_v1/by_container.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX 
g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/nodes_diskmap_v1/custom.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/nodes_diskmap_v1/custom.bin new file mode 100644 index 0000000000000000000000000000000000000000..ed24586ecab403102687114829631ec81693c9e4 GIT binary patch literal 33 KcmZQzAOHXWApijY literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/stats.toml b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/stats.toml new file mode 100644 index 000000000..59af7ef1f --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_layer/stats.toml @@ -0,0 +1,10 @@ +cyclic = false +rooted_tree = false +nodes = 92 +root_nodes = 48 +avg_fan_out = 2.1739130434782608 +fan_out_99_percentile = 11 +inverse_fan_out_99_percentile = 9 +max_fan_out = 11 +max_depth = 1 +dfs_visit_ratio = 2.6956521739130435 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/edges.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/edges.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bd7597dcef50c4b8e785770a8d4745bb61cf105 GIT binary patch literal 694 zcmZP&U}As(7AVci$jHvg#0KTDGcxisGI4-OPB6&@Cb_{R518ZylYC&3A502>%@9O4 z18A-=R7@PKP6Dh>1S%%V$jAV9ju=!>2&@pU9OxKHu!t0xlm?SBU{V%L%7ICFu)&H@ zGrYlKVPG+RsF*!i%mFNB0~NCc^X#BJJ!I#=1mW6%4siq<;RGg~!K4eAbOn=cVA36I zqzBYUPcY95%2NQVfH=+v%=QJ7eqhoc*?O3k0I*abm<$4wKwp83>jz>vAZsoVgJV*@ z{ziDn1tAlb8JB*D9pFACAr+7yG@7L?A+s(;T$>OO za6`5=_)eW`Lqx^tpea9+M1>}SQ-KrEXl@4m*TEJHO3Z1QIVGt@dO4|OsW~Z0dQPQ4 zerBFiPJVu_8(7NdF0(htL10%M`f<#kL68~52LhNU!WbBCfW^RS%V6vl)qzax literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/impl.cfg 
b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/impl.cfg new file mode 100644 index 000000000..449496de3 --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/impl.cfg @@ -0,0 +1 @@ +DiskAdjacencyListV1 \ No newline at end of file diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/inverse_edges.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/inverse_edges.bin new file mode 100644 index 0000000000000000000000000000000000000000..986cbd5ef6d8b2c819db546c539676f746edd918 GIT binary patch literal 950 zcmZP&U}As(Rw&KF$jHvg#17@LF*5QqG6{mkIIxIuBE*2YxKMc9C_Ek%9xn=y4~55% z!V^H@2|;;cU>Au(c_Lt*IFu*M$jAWq7EBPLK?0!!=q-qtB#MzzC_HHto(u|47KJB= z!jnhgDL{E%U}x$i=GjAe`ru%PsBl1tA-n|>geY-@DuM8vPCqVW7sc>XB700a-=y+DK@Fj)nm@WP-x zxaA=Jbs%2H$iUDGOwKGIfrjphcO*2!%UKo}shhVug<1L?)0?*b(8`$R!^S#o|DCq2 zzIl1gFF7aCrdLl_w);2gF$*)iZ`kmhbFzA`{*8sK-0K$Dxw3&w0tF{G!@UVB?lLGb zr)B1pq!#Jrq?V=Tq$KG%l>+&hc}_X``MGXjDWkir;LriOg@Ix5qmOeK1erlhAb>gC li-F+=IJbe-9J!rjv6ju?5gRPcKonyo!+$q)tCYIm1^`PzQ~>}0 literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/nodes_diskmap_v1/by_anno_qname.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/nodes_diskmap_v1/by_anno_qname.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/nodes_diskmap_v1/by_container.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/nodes_diskmap_v1/by_container.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch 
literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/nodes_diskmap_v1/custom.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/nodes_diskmap_v1/custom.bin new file mode 100644 index 0000000000000000000000000000000000000000..ed24586ecab403102687114829631ec81693c9e4 GIT binary patch literal 33 KcmZQzAOHXWApijY literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/stats.toml b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/stats.toml new file mode 100644 index 000000000..5dc6ca90e --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Coverage/default_ns/stats.toml @@ -0,0 +1,10 @@ +cyclic = false +rooted_tree = false +nodes = 56 +root_nodes = 12 +avg_fan_out = 0.9285714285714286 +fan_out_99_percentile = 10 +inverse_fan_out_99_percentile = 2 +max_fan_out = 10 +max_depth = 1 +dfs_visit_ratio = 1.1428571428571428 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/annos.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/annos.bin new file mode 100644 index 0000000000000000000000000000000000000000..a2ab2f0b81bcdef9dbba8db5adcdbc23431674da GIT binary patch literal 73 LcmZQzpb`K807(D= literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/impl.cfg b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/impl.cfg new file mode 100644 index 000000000..e96ea914e --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/impl.cfg @@ -0,0 +1 @@ +PrePostOrderO16L8V1 \ No newline at end of file diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/node_to_order.bin 
b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/node_to_order.bin new file mode 100644 index 0000000000000000000000000000000000000000..952be524aa3e9050cf2616ddce55acd4769b4e41 GIT binary patch literal 1940 zcmZA1)l*hs6vg4+Iq!SUcRr-VBt%qNL`51g2vO+FF_HqKPRRrKSp7o5dByfne(9L8NG*hD+-M8UJ1 z!|4&c$|JmZ!2=w`b_jmv3#Li%4tEKn1Y;SA9}-L>9aAMZhq?I0f*x^rd4iM4z|9d{ z%|=2S1oJ7!PZ!+FA#{|PK;of2*P!O?|6?53nnuWyFjprV#4zT zn`yz63WkU#lrNY_3MN;uf=c{;!9q$1G6h$#6T4P$CJQl>1$#ITy@EHmO^_tGiOu-k zf|q%PX%)P{Ux*c)N)ArF;2YjzBLtI}fUObiW(m%G!8+=(Wr6`C2nqxZ379Owt?a<- z6U?I;J4J9AXK+>s=CYK5Q3o^psMoWBV2R)ro@2)e9_Iw6P4FVWa2f<(@De*ha6Iu? zPcV}#+zP?P)M5(-E#t5=1YzKWvjtbO1#g|;e5&wL1&?qNcctKTx^R*O*V0FLV0-^F wyt>Cc!y7Mnlw+87!B2e0jT79)cKk-c2Ry`&6dcVM{6&I4_=#I7xPTh$KVDN<)c^nh literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/order_to_node.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/order_to_node.bin new file mode 100644 index 0000000000000000000000000000000000000000..8a4a431cbde04465ef24982ff9ad8b5b2fe346b1 GIT binary patch literal 2768 zcmY+F^_LY#6ohN0d*;nU77Nazixb?Jg>B zX)<9;6t#pcP;L`2t-Q;Mg_>KVs3LPK#Zqi!QMr!M4tZ;;ZL65+5jr885X`o#j>ub4 z?FWh-qS^X-T~1f{mQ>qC(dlA)!;KAQ>st-qf@*t2G_@Cenbe+&-D8wZmFfo_h;($Q zwWu1nf};9GG<5)6iPZjzeKl$@(u~Z56cfE>!;rKk^H9Z#!dLVopedMbsx?S95iUl$ zg>Ww1Ho{r(+X?HD?If&2zKd`QO1lXsq1Zz>5#`wdZLt#+$E&#(#Vj&URvZ@_S=3k^ zV;b_AR6A91j2>YovK_%}%b9_EIMq&9Y=~y-J6e}B2Yv?C&Q=_yi(Le_HJGh$Bm8u# zogdNEh42leE>N5oqim{?`axGB-5hEyY8hO8Kr=6mXzB{MX{0V!T%u8{k=BuUmEs7! 
zX6um5A@f?r%FdgDYGa%O`VxMNq#xmzNc$6h3OA7O2>c+z{m6z8?m<43a5qZB2zQ{U z{Jb|mHpX_8dj_;!ZByK;=3OYN$-Gl>OKfCOn{|wR$a_%jUd2s%gu}=N2eS=%2zh&| zJ*apfnyqh>F6R^Y?o@kJaicEw3%CKnY<)k2??$!9Bbs^wzALGpD;|qcHq{3Gpl6WQ zgj$RG8mY{$ozmcvCwHEak+$xHC5z*9FaE+wC dRD7;c|0117=6@6~=rwzTWEq)XE1r*6^nWzDa6te7 literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/stats.toml b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/stats.toml new file mode 100644 index 000000000..170f24e0b --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Dominance/syntax/stats.toml @@ -0,0 +1,10 @@ +cyclic = false +rooted_tree = true +nodes = 92 +root_nodes = 4 +avg_fan_out = 0.9565217391304348 +fan_out_99_percentile = 3 +inverse_fan_out_99_percentile = 1 +max_fan_out = 3 +max_depth = 9 +dfs_visit_ratio = 1.0 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/impl.cfg b/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/impl.cfg new file mode 100644 index 000000000..bd39cc239 --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/impl.cfg @@ -0,0 +1 @@ +DiskPathV1_D15 \ No newline at end of file diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/inverse_edges.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/inverse_edges.bin new file mode 100644 index 0000000000000000000000000000000000000000..bfdae90fe44959c37e21f15127fa3b214684d327 GIT binary patch literal 912 zcmaKrNh||l7=~x6t;nbC>2mQtED>f|6|4Z+PpB5^_!kvI^E zh?|2(;w*9Iki zNem^CaM74Nk~m5-;41CT0e>%R7{0y z#~eV?Nr?&04A(_&5Y$aH0_q_eMc)F~OKwb9LpbNcWf7hf!p~DGiYor4S#iH}cekkw z@Hvvz+Qrtq+xYw0leUM`HEp)+{M~wRnRzX9?yjiK7UTXE=k>pGCPi%jvDvV~KW!iF zclwuI4Hp5OYJ+=_i{BLY<`~uE#~YIt5vEXpqZQ4ekz^=JQx5qQvf~e?B vHXy!*{Q{c?4WkMYSvbGTGt3G8N9fcT_YT+P`Wt!jck91t`dr%LeXY-5-F{e7 literal 0 HcmV?d00001 diff --git 
a/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/nodes_diskmap_v1/by_anno_qname.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/nodes_diskmap_v1/by_anno_qname.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/nodes_diskmap_v1/by_container.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/nodes_diskmap_v1/by_container.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/nodes_diskmap_v1/custom.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/nodes_diskmap_v1/custom.bin new file mode 100644 index 0000000000000000000000000000000000000000..ed24586ecab403102687114829631ec81693c9e4 GIT binary patch literal 33 KcmZQzAOHXWApijY literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/paths.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/paths.bin new file mode 100644 index 0000000000000000000000000000000000000000..0c6f66210a6300f26d75a40df7374e8cc615d3b8 GIT binary patch literal 13068 zcmeI1OA5kJ3`LuPi1-6RMEu|i+<}Pr0a2{(e>2VtC3Fx@+uY7tLQd{^@LFDrLRa4n z*r4wTtNCnLeWH6AkO3Kxff@t(woF^qsEB@S%VV9RfB2+e^%RN_NAa7iWprUU<|#MBvxVPNdnz=c|441A>$AJc(vRN{L& I@B>Qx2V9#90ssI2 literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/stats.toml 
b/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/stats.toml new file mode 100644 index 000000000..29d71f537 --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/LeftToken/annis/stats.toml @@ -0,0 +1,10 @@ +cyclic = false +rooted_tree = false +nodes = 92 +root_nodes = 60 +avg_fan_out = 0.6521739130434783 +fan_out_99_percentile = 1 +inverse_fan_out_99_percentile = 3 +max_fan_out = 1 +max_depth = 1 +dfs_visit_ratio = 1.3043478260869565 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/impl.cfg b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/impl.cfg new file mode 100644 index 000000000..bd39cc239 --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/impl.cfg @@ -0,0 +1 @@ +DiskPathV1_D15 \ No newline at end of file diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/inverse_edges.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/inverse_edges.bin new file mode 100644 index 0000000000000000000000000000000000000000..87efaf3e8beb5ec72f9b2794a27349ed5cb41c68 GIT binary patch literal 804 zcmZ|M-7AAp90%~{+57C(iftHcc_}Xsk=)RD*tE=Eq@k3OM=e9ixLwgoWPgBCT)9#1 z`~i|HQj+4Hf56Rh!Sj55x7_&E>738+{7&b0I)Ym#{DzscQ54Nqou!h6tds?+p{$UN zs)6j34RTO+$VoXM7v+T9lne4ujgXhBgM5@1@>4#jf$~F*RD&oA)n_&_=|M?=YJ!@n z0MtS?LqRG5Nz^zLrbeJqDh!QLqmWFELF1GRO;8dPp(dax6@ezHC^SXI{ysC#WCkVE zR2)iB(@>HMLMbW>pE8UN``32-u7}!-7lBZ5-lZ#D&YBKP+R*`Z*o4KwL_`SbSF^coWlPE_ zJ4$Y4S(5Xrvm0`5b8|h4t^QZTuyzV9yvIIogjzJ9rmFo3!n2`TbN8LZfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/nodes_diskmap_v1/by_container.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/nodes_diskmap_v1/by_container.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 
zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/nodes_diskmap_v1/custom.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/nodes_diskmap_v1/custom.bin new file mode 100644 index 0000000000000000000000000000000000000000..ed24586ecab403102687114829631ec81693c9e4 GIT binary patch literal 33 KcmZQzAOHXWApijY literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/paths.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Ordering/annis/paths.bin new file mode 100644 index 0000000000000000000000000000000000000000..5498715275a8d3cb536a8365b1e936c4995a5e19 GIT binary patch literal 10527 zcmeI0H&O#Z5JlII!H6P3Lbwxf1q4DO0AvVs|D%vRVii?rzqa_BykN~tzwYUsnxgP! z0|zpY10Rks!3SMNs9kQx#9Es@7VyyR1~(+0y&xpLKR<=66Qe!4LQeSMUK2 m{y;9NA`L2~ohpZ|s%UD}adD+OpDukMPJ=T3+(1;QvtGa7IIxivKsPjY_A_kHJ{@0`;LI+gJE$QQ*@ zQKiNL)Swm%aRnA(F_z#;EJYob;eMvP*?KiNQOBW}XYxCOW3HY~>qtVBKT#9g=>_n=|9c2$)2Vl~!qgY4sP zC$|G_Xvfod1|8_cRy>R6@O*CD?6wyuwP8EDumd~sB6i^=yo}v=1+U^Yj-01FdM&wq zxoxwh6Ol$57g{;6Kc(=NZA zHnX7om-NlCsGE@TB9&3m_MrV`?A>j9zh&B<-<_D8cTSmOjxRHQVPR=pdKQfajq_pQ zt-8eNv)%Wmo?DW=aWNRS=#!7fCYwCg*$p+VV{5|D*FqcvK&1`ScrjNQ~kD literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/nodes_diskmap_v1/by_anno_qname.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/nodes_diskmap_v1/by_anno_qname.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/nodes_diskmap_v1/by_container.bin 
b/graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/nodes_diskmap_v1/by_container.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/nodes_diskmap_v1/custom.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/nodes_diskmap_v1/custom.bin new file mode 100644 index 0000000000000000000000000000000000000000..ed24586ecab403102687114829631ec81693c9e4 GIT binary patch literal 33 KcmZQzAOHXWApijY literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/paths.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/PartOf/annis/paths.bin new file mode 100644 index 0000000000000000000000000000000000000000..c0a9c2b9977c39e3c0407aa268d69cab285548cb GIT binary patch literal 13915 zcmeI2y%NJ941-DFar^)NLxCZrTxPfm0 zXpi_+6!8e$=ALVSn@eO+4b_?Ea*MbSmt0ioaBT7B-VgxXTzfC5hU(0BxkX%vOD?K( zIQDpRpBn3csmGMXl)u`tR zClGTpJXJj`&!EJVmYGwMTBMhgT9%rVlBDNU3glZfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/nodes_diskmap_v1/by_container.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/nodes_diskmap_v1/by_container.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/nodes_diskmap_v1/custom.bin 
b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/nodes_diskmap_v1/custom.bin new file mode 100644 index 0000000000000000000000000000000000000000..ed24586ecab403102687114829631ec81693c9e4 GIT binary patch literal 33 KcmZQzAOHXWApijY literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/paths.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/paths.bin new file mode 100644 index 0000000000000000000000000000000000000000..350cf3b3da35ffca44fa7e87f77368b73e8e9fdd GIT binary patch literal 5687 zcmeIu!3h8`2m~-4p>%>y(EYdT3_=AT@qMR~2t$AX0Rqzna!voxe*~%mxvLF%0t5&U YAkZz4r~88a1PBlyK%gp+x7zrWe_5CSc>n+a literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/stats.toml b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/stats.toml new file mode 100644 index 000000000..599f469e7 --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/Pointing/default_ns/anaphoric/stats.toml @@ -0,0 +1,10 @@ +cyclic = false +rooted_tree = true +nodes = 8 +root_nodes = 4 +avg_fan_out = 0.5 +fan_out_99_percentile = 1 +inverse_fan_out_99_percentile = 1 +max_fan_out = 1 +max_depth = 1 +dfs_visit_ratio = 1.0 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/impl.cfg b/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/impl.cfg new file mode 100644 index 000000000..bd39cc239 --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/impl.cfg @@ -0,0 +1 @@ +DiskPathV1_D15 \ No newline at end of file diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/inverse_edges.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/inverse_edges.bin new file mode 100644 index 
0000000000000000000000000000000000000000..eb64a463c43b6e285fd63dc27b802b90892d0cf1 GIT binary patch literal 832 zcmZvaNiRcD7>4g@duzBv)ex~T)I3$yJccHEX)0oDYgWDG(h_1=No=raB8GI0mDsTo z3-J?J60x$8u(3ws^z{)7CppjU``qt6JxwYYG{W~1>8WEZDn=7cDUq=Rrm@1slB+_B zqg0EOK&b-rm0&ek1J;4{U<25Qo`HILI18MSToGf!@AXnhCSxRxQW9#paOvdokuoUd zU?LAJ01Lrlummgx%h1cDUMidkE{ohSl9|%<-|ID#Y{Ez@rE(nNg|m_KA=xQSVqywZ zKtDJGs^Ba*hn`Hm7PtwxPI4VcU6kB^uh&DeA2k=~0f)d5a1-@c@n!1>mz46`IZFLSokcMw0$1CfjZXnXa{WjC9iiJdNB=hz26Zrc(+!iw*XXB0w W(ef(({0F{=|E24FBb2)$KYRgI>Qq7i literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/nodes_diskmap_v1/by_anno_qname.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/nodes_diskmap_v1/by_anno_qname.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/nodes_diskmap_v1/by_container.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/nodes_diskmap_v1/by_container.bin new file mode 100644 index 0000000000000000000000000000000000000000..b02d64e9ea8c6d80351f24c76f290ac1252d084b GIT binary patch literal 123 zcmZQzU|`^8=z4MMB7+iBT4qj3YLQ+}YFTPdN|K&aDUhF;=aiG5pX&yeGGbr_s%KyX g5vea?SHL+3J}un9rEkXp5+DS^e>Zfil)B#r0Hf|2XaE2J literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/nodes_diskmap_v1/custom.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/nodes_diskmap_v1/custom.bin new file mode 100644 index 0000000000000000000000000000000000000000..ed24586ecab403102687114829631ec81693c9e4 GIT binary patch literal 33 KcmZQzAOHXWApijY literal 0 HcmV?d00001 diff --git 
a/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/paths.bin b/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/paths.bin new file mode 100644 index 0000000000000000000000000000000000000000..292e96c477f57db30e3f77a10b9991dff8550b12 GIT binary patch literal 13068 zcmeI1F$w}P5JmSFf{369f>@|0@CJfdC>9o4c>kl=Z`kRQhm&vT4CLp%2|;&G}4x}gV3~vg0WZ~v1kX|fDPEdzZuXM+A(%l>p&4> zKLjx1J*65W#)3Ci-N*oUfl!C5k=Qau*Rs~6f<4*OstI2ig8eb)h!>@WCKZktJ6y{q zV#|;YRbw;8f-mLVb651p&1X6ruz`#Q^vp;VG3gU^-LnB3uz|n;c7f;}?5bu9dnZv7 zDqNdW{R;P0ugMW(r}|XSX1H>Ps<9bk!PFB0-NUn2ugSd{z=&`4Y=$dyGsaY(Lm~B= L9jeA=Oa=b{t<4J< literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/stats.toml b/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/stats.toml new file mode 100644 index 000000000..10adbd93f --- /dev/null +++ b/graphannis/tests/data/sample-disk-based-3.8/current/gs/RightToken/annis/stats.toml @@ -0,0 +1,10 @@ +cyclic = false +rooted_tree = false +nodes = 84 +root_nodes = 60 +avg_fan_out = 0.7142857142857143 +fan_out_99_percentile = 1 +inverse_fan_out_99_percentile = 8 +max_fan_out = 1 +max_depth = 1 +dfs_visit_ratio = 1.4285714285714286 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/nodes_diskmap_v1/by_anno_qname.bin b/graphannis/tests/data/sample-disk-based-3.8/current/nodes_diskmap_v1/by_anno_qname.bin new file mode 100644 index 0000000000000000000000000000000000000000..993a85532dbc7af56830e37b79dcbf827e54ecd4 GIT binary patch literal 7167 zcmchbdvH|c8Hdj~B1u^PO|P z?|py2_dB2sGqfN6xSURRfz#F2(Qck`=z3I)F|oa)_S*Rj^TH}!9}#9utasL_Q?0t5 z7-b|C&0p5uW^Yxe=ji%)n~`bWSgh+w;YRy>hGd!NM(KKTm{HlV)~)U`bbWM);cQVS zigZ22W~7_9R7j|<&eQd=UN4y|p?ZFmgxTi3B@#|D!#gCLY=)&0PBFs-UDq=FWXLNg zl7*!9N@q)sc~_Y{WU3jKv%o0B?dqs=ceonVtmo@`s@IxoBvep_3(43!NgN zT4;{05BE|=Pp z9W4M?+qbS_B4f(gP=!bD+` z5F>LH|Mv4vtju*B3^6XmuMnF;+6|=3K$;6gvJme<_Akih0yBkdCH!UwfovT8ypuhI 
z>uep69RsrUK{heS-UVp@5dT8l53wi2ZV;Ejd?B9VH{O9*rk{6WeXiqpI7e6{EEdic zmI&_i-py~JB3SxHNvIBWy0k`{K|Z5g}9U7 zm=b!t=wG*3jq5lJVk}65fb<1OJFvtUMtwu0y~E*N*4l1b20LIAL=O=zM6wXcLWBts z7o_x%@<0lssedQZFC3SuaTp&*8W7z$!2h@l{cg2){rLx=z&0)z+HpYhJhHS-?w5TMqnd|ji6nKjrfg?AU1;72x1wC zWgwP;SO#Jl=&{UTV?dp=)$OvkyHgvTb-n)(#y0U{5^bcJ_Ye17@%q0ftvQs zKuwFaYT6}^njlR(%Q4HQX~#HHgEeh0M@opMy~vRks%bBAObFAoeH;_RH7(Bb;I-es zwnmFKR~XRyVyqV>_JGLQ8e!Pm+8XRGH8Y$$NIVAd0Yul#y;g0+_?eq=9>g6GwOeD1 zI_Ii3M{}Ls-B9nDg69v7VVIZ~Vm*jqcxS^}J7d)o=_T4=HQ7EFX#-3G6GM-cf{kYP zwWn(u&4$<0RA7P-`}5sEGf@u@l0Hfsp_|eKAl7A!yUE^aK8M!DdD;u4IY2CJ<4r0p z7N)5BcYwR8_X=%{@w6I9YY4I#PV=#0G9@|{)1#X}x&ulHyn#}}e<+Ii9BP8D>3fix zz`ix{=t-43uy3ud75!Sp=#0$cTHO!;zi+mBks(I2Q>CoGLQ#RzJdeO%-Zzl^}>OaPHD z#0nOhVKHyy$y%h@OVnAa6d^qej+EaxfZudyhy$RfN85}9^Fhpj#!c}c35BErQVK}P zAo+vL5_-}QrCzKagRr^C5+E9Y)CH1JNc|v{hb$11HApca#eggsq8vyyA$5RM-QNol z5Esc1r0GEN1Q8vicn~?jQX%={H;oA*5KEY0%_}Tc5*aOxZH$nw2n@tm1YF45RYUZ- zW072E*Jf0`Xr)v=Em?FpPM9g2B3vNkdjXZQ>%$e?8EVWcD^os7lqaSO`3B+X%cWxT z&5EUXC&a2w-y%!n%Z3F`74mh$?;2sFu$jgkV&vOh=Cj7i3nmE{3mc@jSC|8mWR`qm zF;~8Ypk3BlrPey0>8PG;?L9w1MyCpMgwuq%!X-k!h`+( zQZieg|G*FS`aeCVt`n*oR!gY<3&F)K`E!V)oL^sXJc^1Z@kR0njuyU3zU6T7WpaVT z&ezG;92>D#nCHeFEg5@PJhJCpPxkPS_MQ23$g>%r>@f--{~%y?+_;7Nj!ztZ>Q5U- z?oQjJA6si|JeU9Rxj$b_35@*9_9BbzTg_pAwqotyzO0GwGER5KZFoNK!`VTvCq$3P zsUM|Rt{w8#{eMr$scKzR`1?r#g&$2m`prW3<-6YsT2!Kq=&XMAbmwk+!OOo2ICLg` z^^ULpy5M9<;2-NlW<1<6!Ts-lJycQMlXE8b(nnEsb{Dn^_(7z+;Q|z14Hypx*}c zbY<*Z{N=dL&cQtg+$Gze*!@D;)E)H$=6pYS+I!LK2bXTXc-8Up{SnDeZSB@F#^}cG zpPr3vsiZljPc;^uxcuOj{c#gBv|l(L9a=oCV0p&M z;`9w$^f%ihQ}%rKlaaMidsm#hGyIcZzFan_Am2LdH9d7p#EIPGt$CIa%e$B6IunxT zrVdOVxo`gm@sU^cX(tcdlRjeqm1pmLDRoQz&z^qa>E`iUhkW)#>#++lVW;jpvH8J* zCfj>oo^E=kaQzNj&gB6;2lL;X^KfEAQ|;4^H@Agsxi9kA5XR_TKKJ!=~?doO(TOqGf5)nHL@_K0k9s;?B;p zwpi<~f`pWnVMj`=p$m|- literal 0 HcmV?d00001 diff --git a/graphannis/tests/data/sample-disk-based-3.8/current/nodes_diskmap_v1/by_container.bin b/graphannis/tests/data/sample-disk-based-3.8/current/nodes_diskmap_v1/by_container.bin 
new file mode 100644 index 0000000000000000000000000000000000000000..2d8ced0af904f2d84f9fd81d6e63880de715aa02 GIT binary patch literal 11833 zcmb_i32+nF8Q!PU@+lLrsWD*57~4P$J|M;%!NRfefh`{~cZ@8_w=JO~F-+Pqw+*<^ zi_n&&LkLYs8A54*wi!B(fk39t0bw#N9i|+ONr6xrI>RKHWcsh<_jXrW@4kh2#+GF1 zec${4egF3#E5?}0g!~|r$K~=Zc6r=>?@X`1G59Qdro&~=N%ht_yM5VlC>o(ihs|g6 zy8IryQ*TH`7)5+}4!b;e^yMn)X5qLvN8^-^$4Zt4QV<;OJP-xlvR2)()wKP zRy!D@C#Z79ghBF+(P@yz*U}0u(`oTDjSi`b>WBg62x_WaZug8adSn@14ws0}Y|tRR zv)k6`?f|D5w7eHAe`O`dZ=mMS-IP-)cRGb;3c?4$+!4o}rBv-lG zALZb2SL6x8BYH(JkIcS7$;T2PQ6(P%dgS~xj=;=%n(*=SQW7;6FE4Rt11}Fb z8zE^b4#mjjr3uvgcks{|RBQ!~9G9p@uQwGRWHZHm_Mu-mzi^^Drsmd65dIklaIAppqNP#qkLe^Sr9@u4u zGDu#9!Y(`xefTAaynjORdj%TBhtMS6fm*Ks1gHS|nR@60nxMvC0Yz~uRKjPWI1SXN zLz}*~bB@HD;90GXK&Te>m#J)j0*U0C%$}scg`cJjP@s9Cfc0>lGo};3sZ`-3P&Yo} zh{+i<6aw-Bl5;5r;klG`L14#EDkP8$PZrUfMG!?LFd!l9K;&^Ha3Dz$xbGmXYQ%TpzjF3q}A*lSLBM(#1YCeYKmrT_ z>@Wm40rzhI9KkSyFbMb~31tj>Y0y2)zMtqK+b**fxZ(X9rK04RtESuSJ5cd$JaAfOr|Ac3LuM5s?op+0>H z>eGM0jjCLzPd7q+`XSV(Nl=?YiOMiV?T(|3hhFrYRo!zucWr;}n0wQT-m4oP9P_`$ zZ+BL_TKq}R2W3C++@LFYcwlJi+JwnfNuANnga27{yS=`sS)2N~?ciq1MQeiH+N-*+ z@elZot?s&-%FdO|gUI?`gZ-LmPtNnP zi;O!OE9Okm+?u(k_@x6)nt}TZh8DhZz9(rIVX#pC){nfc1-$Fou{FE>-m}s zYs&O7T|M>FDtBJWuh2D^)#hUxa{8tw}#>B&4SbtGrm}fYfz4c<*o~u7OZE8%s zdV1ecLt(}e)6)1e3l49++uQf%$zRXk_-(|W()+*c)tC-s-55a2Z@d_Lb6>CTNPK_k zr?qE)`9$?!`&%X@A6=eoPMQ0&bNS`*Jt*n7gWv7j6`%Iy-j|tz>LcB|(qi06(|gpP zR}W=XB);+1=J&VN_pXoIdGFAn7e2juuI3eWX(g)OY@gYCY2*F(8rnTO7G2obsoHz@ z)}{it`Y*pbHD=RCcfanRc>830jCI$-*p$r&cO%23wjJ-L{WMe8J;UQNej6HU8)#{FBrn$NyHk5%#?bY+Y2A&MEYqF0i(bE! 
zmN~ZYkY8K8{rCyMJ)@XkFA%XST+Xc=6`n zQ1||IrEi&r8k&cXoo^eu5da^p4&pSIDjae2yyl(BXbhuod#euhS zUVU~L+Bs<0*gq$}J>_NMA%p@y86%JIR&R^5e5J1VMY11~-$sxJql#Hi6MTtczA3k;BWq^Q^e% znY(_QnylD@?fS!a{B~qgI74OGr1-v;W~#1qzDA7egr3b5oK~o*edd9m{5CTqeV(bA zUQ1JKqwLjuI|bdHo2rNSQV)ExTu)bb(fLQ`#l&Q5SByObxvcrPb{zWs&~xSS&ZqIj z>SC@^-BA~|HWF~cO+Kby{YA=;$>yrCoePsLiqCI_JF;dzsXkfZ4l`xoGk*L2@Z?Lx z-}xaqI=P()`gnSLJe0&RKL(!}z_kYZDwo7?w}U@r(OES2?e=Z8%uGHXo2mKx3ht|IX-C89@BcUxe(>fmt}?4G+r21iN8-OU3#xCd zrcxIwhChe^Zb;4NTIKRRC<;F4+lioBP0egRouta@k*w){RkHdM%L4mH-=;5B*{Fkd z;dUmLj-YlZ&8S0SJ60L+viSPvj~zG1&|7QhNcQ^&DuCP=`>w1D7R4h!rq?Ymip2+a ztoxWc_+nXt?eN6G;Jo}zlLes3Q$A}kD0B8C;80)M?h`@Aq50SqB`mpJDJZ>iBJ5#L z$n}NXOCk44$h{VFZ-m_5Z;|`kIzinWELKk`7P;>J9s=(M(P6ohkZuRTy^zfP5cp*X z{3--~9Rj}zf%jNNC9I;79kHS+lvh-;BUDrg$`rK(C5rMKuP6`ligF;YD97=NavZNH z$MK4C9Iq(H@rrUBuPDdyiYhs|qA<)W3e&uzFwI*B(Lpe;D3qC36oPq0A(&Sbf_X(D zm{$~n_oxypq5PFBzutP3zdRxF*E&l2%cH!%0A&4DfsDT}NBRqcq`weI`U`QSzYs_I z3vr~s5J&n8aiqTxNBRqKq`!a@`D+6v{k1`p{@S2PCDrL5nDkdYGwH87O!})1lm4p1 zq`&Gg>90CWmRKve&nl`fDzSVrZ!=y|eL;!kbMhrt1xgfEUU^xH@rv>wuP6udigFyU zD97=NavZNH$MK4C9Iq(H@rrUBuc(rfD+$^JfL2A!fhTDbOf&r z-K)_eXcsbbuSaF?IAs)cuzl<4W(Ix}`p{37a^A>RPr`(|=@us9lQxK=8x8_Y`*^Kt z-&nJ|et4s(k3kfhEBj(7x~V11ag%ed6W14DRHxLwwqo%#ta3N3evL)etol)3Bnh+n EUn)f5g8%>k literal 0 HcmV?d00001 From 33f43116b6cf8a21ba6426f4f68859c9168298b6 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 12 May 2025 17:29:34 +0200 Subject: [PATCH 07/16] Get popullatiion estimate from the component statistics --- .../src/annis/db/aql/operators/edge_op.rs | 39 ++++++++++++++----- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/graphannis/src/annis/db/aql/operators/edge_op.rs b/graphannis/src/annis/db/aql/operators/edge_op.rs index dd9473cbb..175245a7c 100644 --- a/graphannis/src/annis/db/aql/operators/edge_op.rs +++ b/graphannis/src/annis/db/aql/operators/edge_op.rs @@ -41,26 +41,47 @@ impl BaseEdgeOp { let gs_for_component = db.get_graphstorage(c).ok_or_else(|| { 
GraphAnnisError::ImpossibleSearch(format!("Component {} does not exist", &c)) })?; + gs.push(gs_for_component); } + let any_node_count = db.get_node_annos().guess_max_count( + Some(&NODE_TYPE_KEY.ns), + &NODE_TYPE_KEY.name, + "", + &char::MAX.to_string(), + )?; let all_part_of_components = spec .components .iter() .all(|c| c.get_type() == AnnotationComponentType::PartOf); - let node_type = if all_part_of_components { - "corpus" + // Use the single graph storage to get an estimate of population of nodes that can be found. + let max_nodes_estimate = if gs.len() == 1 { + gs[0] + .get_statistics() + .map(|s| s.nodes.saturating_sub(s.root_nodes)) + .unwrap_or(any_node_count) } else { - "node" + // Use all nodes regardless of the component as population estimate + if all_part_of_components { + db.get_node_annos().guess_max_count( + Some(&NODE_TYPE_KEY.ns), + &NODE_TYPE_KEY.name, + "corpus", + "text", + )? + } else { + db.get_node_annos().guess_max_count( + Some(&NODE_TYPE_KEY.ns), + &NODE_TYPE_KEY.name, + "node", + "node", + )? + } }; Ok(BaseEdgeOp { gs, spec, - max_nodes_estimate: db.get_node_annos().guess_max_count( - Some(&NODE_TYPE_KEY.ns), - &NODE_TYPE_KEY.name, - node_type, - node_type, - )?, + max_nodes_estimate, inverse: false, }) } From a79de1cccf4db21470ed820b22babcf3a69efbe6 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 12 May 2025 17:41:27 +0200 Subject: [PATCH 08/16] Add changelog entry --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7497f2ac4..57f086dbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,9 +10,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New optional `file` option for the `[logging]` section in the webservice configuration. Can be used to additionally output all log messages to the given file. +- Add number of root nodes to graph storage statistics. 
This changes the way +most of the graph storages store their statistics. You can use old imported data +files, but to make use of the new information you queries, you have to +**reimport** your corpora. - `Graph:ensure_loaded_parallel` returns the actually loaded components that did exist. -- Add number of root nodes to graph storage statistics. ### Fixed From 29cdb024c81dab048b0b7215d5faf2dda102bce3 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 12 May 2025 17:52:13 +0200 Subject: [PATCH 09/16] Only run verify workflow on pull requests --- .github/workflows/verify.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/verify.yml b/.github/workflows/verify.yml index f3410a9ac..195ef6073 100644 --- a/.github/workflows/verify.yml +++ b/.github/workflows/verify.yml @@ -1,9 +1,6 @@ name: Verify on: - push: - branches: - - main pull_request: jobs: @@ -66,4 +63,3 @@ jobs: ``` ${{env.COVERAGE_INFO}} ``` - \ No newline at end of file From 41db5069f78b6cea8fa130229a817e3ade5afb79 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 12 May 2025 17:53:48 +0200 Subject: [PATCH 10/16] No manual default value for serde needed When an old statistics struct is loaded, it is derived from a different struct and then manually copied over --- core/src/graph/storage/mod.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/core/src/graph/storage/mod.rs b/core/src/graph/storage/mod.rs index 3bb627af8..52f0498fc 100644 --- a/core/src/graph/storage/mod.rs +++ b/core/src/graph/storage/mod.rs @@ -32,7 +32,6 @@ pub struct GraphStatistic { pub nodes: usize, /// Number of root nodes in this graph storage. - #[serde(default = "default_number_root_nodes")] pub root_nodes: usize, /// Average fan out. 
@@ -52,10 +51,6 @@ pub struct GraphStatistic { pub dfs_visit_ratio: f64, } -fn default_number_root_nodes() -> usize { - 1 -} - impl std::fmt::Display for GraphStatistic { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!( From e380fdd485d876d30fc71037b6fbd964f852a6fd Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 12 May 2025 18:04:11 +0200 Subject: [PATCH 11/16] Mark test data folder as generated --- .gitattributes | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 3c507eb27..7500e2c01 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,3 @@ -*.graphml text eol=lf \ No newline at end of file +*.graphml text eol=lf +third-party-licenses.html linguist-generated +/graphannis/tests/data/ linguist-generated=true From 32be65405281e4557be84c3865dd2e635fb9d82d Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 13 May 2025 11:02:22 +0200 Subject: [PATCH 12/16] Document fields of the cost estimate --- graphannis/src/annis/db/exec/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/graphannis/src/annis/db/exec/mod.rs b/graphannis/src/annis/db/exec/mod.rs index eb02fb017..8965e29ed 100644 --- a/graphannis/src/annis/db/exec/mod.rs +++ b/graphannis/src/annis/db/exec/mod.rs @@ -12,8 +12,12 @@ use std::sync::Arc; #[derive(Debug, Clone)] pub struct CostEstimate { + /// The estimated number of tuples produced by this execution step. pub output: usize, + /// Sum of all processed tuples including the ones of the sub-steps. pub intermediate_sum: usize, + /// The estimated number of tuples that are processed in a join in this + /// execution step.
pub processed_in_step: usize, } From 1a6a11f2b9fcbaeca2f2eb2fb7ff3a78b42658d0 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 13 May 2025 13:09:48 +0200 Subject: [PATCH 13/16] assume all annotation can be matched in regex search without a prefix --- CHANGELOG.md | 2 ++ core/src/annostorage/inmemory.rs | 9 ++++++--- core/src/annostorage/ondisk.rs | 9 ++++++--- graphannis/src/annis/db/aql/conjunction.rs | 1 + graphannis/src/annis/db/exec/mod.rs | 9 ++++----- graphannis/src/annis/db/exec/nodesearch.rs | 10 +++++----- 6 files changed, 24 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e904a8319..c722a843d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,8 @@ exist. - Less frequent corpus cache status updates in log. Before, every corpus access could trigger an entry into the log which is not desired under heavy load. +- Improve query execution planning by assuming all annotations can be matched in +regular expressions without a prefix. ## [3.7.1] - 2025-04-14 diff --git a/core/src/annostorage/inmemory.rs b/core/src/annostorage/inmemory.rs index 6d881061f..f75599652 100644 --- a/core/src/annostorage/inmemory.rs +++ b/core/src/annostorage/inmemory.rs @@ -752,6 +752,10 @@ where fn guess_max_count_regex(&self, ns: Option<&str>, name: &str, pattern: &str) -> Result { let full_match_pattern = util::regex_full_match(pattern); + // Get the total number of annotations with the namespace/name. We + // can't get larger than this number + let total = self.number_of_annotations_by_name(ns, name)?; + // Try to parse the regular expression let parsed = regex_syntax::Parser::new().parse(&full_match_pattern); if let Ok(parsed) = parsed { @@ -770,11 +774,10 @@ where guessed_count += self.guess_max_count(ns, name, lower_val, &upper_val)?; } } + } else { + guessed_count = total; } - // Get the total number of annotations with the namespace/name. 
We - // can't get larger than this number - let total = self.number_of_annotations_by_name(ns, name)?; Ok(guessed_count.min(total)) } else { Ok(0) diff --git a/core/src/annostorage/ondisk.rs b/core/src/annostorage/ondisk.rs index f947dce6d..140c00b09 100644 --- a/core/src/annostorage/ondisk.rs +++ b/core/src/annostorage/ondisk.rs @@ -895,6 +895,10 @@ where fn guess_max_count_regex(&self, ns: Option<&str>, name: &str, pattern: &str) -> Result { let full_match_pattern = util::regex_full_match(pattern); + // Get the total number of annotations with the namespace/name. We + // can't get larger than this number + let total = self.number_of_annotations_by_name(ns, name)?; + // Try to parse the regular expression let parsed = Parser::new().parse(&full_match_pattern); if let Ok(parsed) = parsed { @@ -913,11 +917,10 @@ where guessed_count += self.guess_max_count(ns, name, lower_val, &upper_val)?; } } + } else { + guessed_count = total; } - // Get the total number of annotations with the namespace/name. We - // can't get larger than this number - let total = self.number_of_annotations_by_name(ns, name)?; Ok(guessed_count.min(total)) } else { Ok(0) diff --git a/graphannis/src/annis/db/aql/conjunction.rs b/graphannis/src/annis/db/aql/conjunction.rs index d7adc63e7..5551f63c7 100644 --- a/graphannis/src/annis/db/aql/conjunction.rs +++ b/graphannis/src/annis/db/aql/conjunction.rs @@ -119,6 +119,7 @@ fn get_cost_estimates<'a>( } } +/// Returns true if it is estimated to switch the operands in a join. fn should_switch_operand_order( op_spec: &BinaryOperatorSpecEntry, node2cost: &BTreeMap, diff --git a/graphannis/src/annis/db/exec/mod.rs b/graphannis/src/annis/db/exec/mod.rs index 8965e29ed..fe27ea482 100644 --- a/graphannis/src/annis/db/exec/mod.rs +++ b/graphannis/src/annis/db/exec/mod.rs @@ -16,8 +16,7 @@ pub struct CostEstimate { pub output: usize, /// Sum of all processed tuples including the ones of the sub-steps. 
pub intermediate_sum: usize, - /// The estimated number of tuples that are processed in a join in this - /// execution step. + /// Simplistic estimated number of tuples that are processed in a join. pub processed_in_step: usize, } @@ -58,13 +57,13 @@ impl ExecutionNodeDesc { pub fn empty_with_fragment( node_nr: usize, query_fragment: String, - est_size: Option, + estimated_output: usize, ) -> ExecutionNodeDesc { let mut node_pos = BTreeMap::new(); node_pos.insert(node_nr, 0); - let cost = est_size.map(|output| CostEstimate { - output, + let cost = Some(CostEstimate { + output: estimated_output, intermediate_sum: 0, processed_in_step: 0, }); diff --git a/graphannis/src/annis/db/exec/nodesearch.rs b/graphannis/src/annis/db/exec/nodesearch.rs index 1048c0538..206d3d7dc 100644 --- a/graphannis/src/annis/db/exec/nodesearch.rs +++ b/graphannis/src/annis/db/exec/nodesearch.rs @@ -569,7 +569,7 @@ impl<'a> NodeSearch<'a> { desc: Some(ExecutionNodeDesc::empty_with_fragment( common_args.node_nr, common_args.query_fragment, - Some(est_output), + est_output, )), node_search_desc: Arc::new(NodeSearchDesc { qname: ( @@ -676,7 +676,7 @@ impl<'a> NodeSearch<'a> { desc: Some(ExecutionNodeDesc::empty_with_fragment( common_args.node_nr, common_args.query_fragment.clone(), - Some(est_output), + est_output, )), node_search_desc: Arc::new(NodeSearchDesc { qname: (qname.0, Some(qname.1)), @@ -772,7 +772,7 @@ impl<'a> NodeSearch<'a> { desc: Some(ExecutionNodeDesc::empty_with_fragment( common_args.node_nr, common_args.query_fragment, - Some(est_output), + est_output, )), node_search_desc: Arc::new(NodeSearchDesc { qname: (qname.0, Some(qname.1)), @@ -920,7 +920,7 @@ impl<'a> NodeSearch<'a> { desc: Some(ExecutionNodeDesc::empty_with_fragment( common_args.node_nr, common_args.query_fragment.clone(), - Some(est_output), + est_output, )), node_search_desc: Arc::new(NodeSearchDesc { qname: ( @@ -978,7 +978,7 @@ impl<'a> NodeSearch<'a> { desc: Some(ExecutionNodeDesc::empty_with_fragment( 
common_args.node_nr, common_args.query_fragment.clone(), - Some(est_output), + est_output, )), node_search_desc: Arc::new(NodeSearchDesc { qname: ( From 540a8176635b14b21a4db79b890e80bcb6fcfa2a Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 13 May 2025 14:33:30 +0200 Subject: [PATCH 14/16] Only assume non-root nodes as population for PartOf searches, not any other base edge operator searches --- cli/src/bin/annis.rs | 4 +-- .../src/annis/db/aql/operators/edge_op.rs | 31 +++++++++---------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/cli/src/bin/annis.rs b/cli/src/bin/annis.rs index a055918e2..9205f43e2 100644 --- a/cli/src/bin/annis.rs +++ b/cli/src/bin/annis.rs @@ -321,7 +321,7 @@ impl AnnisRunner { format = ExportFormat::GraphMLZip; } else if file_ext.to_string_lossy() == ".graphml" && self.current_corpus.len() != 1 { bail!( - r##"You need to select a *single* corpus first with the \"corpus\" command when exporting to a GraphML file. + r##"You need to select a *single* corpus first with the \"corpus\" command when exporting to a GraphML file. To export multiple corpora, select a directory as output or a ZIP file (ending with .zip)"## ); } @@ -442,7 +442,7 @@ impl AnnisRunner { "unsorted" => ResultOrder::NotSorted, _ => { return Err(anyhow!( - "Non-existing order with name {}. + "Non-existing order with name {}. Must be one of \"normal\", \"inverted\", \"random\", \"unsorted\"", args )); diff --git a/graphannis/src/annis/db/aql/operators/edge_op.rs b/graphannis/src/annis/db/aql/operators/edge_op.rs index 175245a7c..c5722d6d5 100644 --- a/graphannis/src/annis/db/aql/operators/edge_op.rs +++ b/graphannis/src/annis/db/aql/operators/edge_op.rs @@ -54,29 +54,28 @@ impl BaseEdgeOp { .components .iter() .all(|c| c.get_type() == AnnotationComponentType::PartOf); - // Use the single graph storage to get an estimate of population of nodes that can be found. 
- let max_nodes_estimate = if gs.len() == 1 { - gs[0] - .get_statistics() - .map(|s| s.nodes.saturating_sub(s.root_nodes)) - .unwrap_or(any_node_count) - } else { - // Use all nodes regardless of the component as population estimate - if all_part_of_components { + let max_nodes_estimate = if all_part_of_components { + // PartOf components have a very skewed distribution of root nodes + // vs. the actual possible targets, thus do not use all nodes as + // population but only the non-roots. + if gs.len() == 1 { + gs[0] + .get_statistics() + .map(|s| s.nodes.saturating_sub(s.root_nodes)) + .unwrap_or(any_node_count) + } else { + // If multiple PartOf graph storages are combined, we can guess + // the non-root nodes by estimating the number of nodes in the + // corpus grah. db.get_node_annos().guess_max_count( Some(&NODE_TYPE_KEY.ns), &NODE_TYPE_KEY.name, "corpus", "text", )? - } else { - db.get_node_annos().guess_max_count( - Some(&NODE_TYPE_KEY.ns), - &NODE_TYPE_KEY.name, - "node", - "node", - )? 
} + } else { + any_node_count }; Ok(BaseEdgeOp { gs, From 2f353b5f8fc41a617adcb81a59f14a51fdb3ddec Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 13 May 2025 15:17:15 +0200 Subject: [PATCH 15/16] Only estimate 5% of regex without prefix actually do match --- core/src/annostorage/inmemory.rs | 6 ++- core/src/annostorage/ondisk.rs | 5 ++- .../src/annis/db/aql/operators/edge_op.rs | 40 ++++++++----------- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/core/src/annostorage/inmemory.rs b/core/src/annostorage/inmemory.rs index f75599652..7b8446a38 100644 --- a/core/src/annostorage/inmemory.rs +++ b/core/src/annostorage/inmemory.rs @@ -720,7 +720,6 @@ where if let Some(anno_key) = self.anno_keys.get_symbol(&anno_key) { if let Some(histo) = self.histogram_bounds.get(&anno_key) { // find the range in which the value is contained - // we need to make sure the histogram is not empty -> should have at least two bounds if histo.len() >= 2 { sum_histogram_buckets += histo.len() - 1; @@ -775,7 +774,10 @@ where } } } else { - guessed_count = total; + // For regular expressions without a prefix the worst case would be `.*[X].*` where `[X]` are the most common characters. + // Assume that a generic percentage (here 5%) of all nodes match the regex. + // TODO: find better ways of estimating this constant + guessed_count = (0.05 * (total as f64)) as usize; } Ok(guessed_count.min(total)) diff --git a/core/src/annostorage/ondisk.rs b/core/src/annostorage/ondisk.rs index 140c00b09..069588830 100644 --- a/core/src/annostorage/ondisk.rs +++ b/core/src/annostorage/ondisk.rs @@ -918,7 +918,10 @@ where } } } else { - guessed_count = total; + // For regular expressions without a prefix the worst case would be `.*[X].*` where `[X]` are the most common characters. + // Assume that a generic percentage (here 5%) of all nodes match the regex. 
+ // TODO: find better ways of estimating this constant + guessed_count = (0.05 * (total as f64)) as usize; } Ok(guessed_count.min(total)) diff --git a/graphannis/src/annis/db/aql/operators/edge_op.rs b/graphannis/src/annis/db/aql/operators/edge_op.rs index c5722d6d5..6c5ab066b 100644 --- a/graphannis/src/annis/db/aql/operators/edge_op.rs +++ b/graphannis/src/annis/db/aql/operators/edge_op.rs @@ -44,12 +44,7 @@ impl BaseEdgeOp { gs.push(gs_for_component); } - let any_node_count = db.get_node_annos().guess_max_count( - Some(&NODE_TYPE_KEY.ns), - &NODE_TYPE_KEY.name, - "", - &char::MAX.to_string(), - )?; + let all_part_of_components = spec .components .iter() @@ -57,25 +52,22 @@ impl BaseEdgeOp { let max_nodes_estimate = if all_part_of_components { // PartOf components have a very skewed distribution of root nodes // vs. the actual possible targets, thus do not use all nodes as - // population but only the non-roots. - if gs.len() == 1 { - gs[0] - .get_statistics() - .map(|s| s.nodes.saturating_sub(s.root_nodes)) - .unwrap_or(any_node_count) - } else { - // If multiple PartOf graph storages are combined, we can guess - // the non-root nodes by estimating the number of nodes in the - // corpus grah. - db.get_node_annos().guess_max_count( - Some(&NODE_TYPE_KEY.ns), - &NODE_TYPE_KEY.name, - "corpus", - "text", - )? - } + // population but only the non-roots. We can guess the non-root + // nodes by estimating the number of nodes in the corpus grah. + let result = db.get_node_annos().guess_max_count( + Some(&NODE_TYPE_KEY.ns), + &NODE_TYPE_KEY.name, + "corpus", + "datasource", + )?; + result } else { - any_node_count + db.get_node_annos().guess_max_count( + Some(&NODE_TYPE_KEY.ns), + &NODE_TYPE_KEY.name, + "node", + "node", + )? 
}; Ok(BaseEdgeOp { gs, From 7966e8ddbecbbc610a55f8d1fc5e70d43869a4b1 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 13 May 2025 15:41:21 +0200 Subject: [PATCH 16/16] Use more accurate root_node statistic --- .../src/annis/db/aql/operators/edge_op.rs | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/graphannis/src/annis/db/aql/operators/edge_op.rs b/graphannis/src/annis/db/aql/operators/edge_op.rs index 6c5ab066b..c22f204ac 100644 --- a/graphannis/src/annis/db/aql/operators/edge_op.rs +++ b/graphannis/src/annis/db/aql/operators/edge_op.rs @@ -49,18 +49,22 @@ impl BaseEdgeOp { .components .iter() .all(|c| c.get_type() == AnnotationComponentType::PartOf); - let max_nodes_estimate = if all_part_of_components { + + let max_nodes_estimate = if all_part_of_components && gs.len() == 1 { // PartOf components have a very skewed distribution of root nodes // vs. the actual possible targets, thus do not use all nodes as - // population but only the non-roots. We can guess the non-root - // nodes by estimating the number of nodes in the corpus grah. - let result = db.get_node_annos().guess_max_count( - Some(&NODE_TYPE_KEY.ns), - &NODE_TYPE_KEY.name, - "corpus", - "datasource", - )?; - result + // population but only the non-roots. + if let Some(stats) = gs[0].get_statistics() { + stats.nodes - stats.root_nodes + } else { + // Fallback to guessing by using the node type + db.get_node_annos().guess_max_count( + Some(&NODE_TYPE_KEY.ns), + &NODE_TYPE_KEY.name, + "corpus", + "datasource", + )? + } } else { db.get_node_annos().guess_max_count( Some(&NODE_TYPE_KEY.ns),