Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitattributes
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
*.graphml text eol=lf
*.graphml text eol=lf
third-party-licenses.html linguist-generated
/graphannis/tests/data/ linguist-generated=true
4 changes: 0 additions & 4 deletions .github/workflows/verify.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
name: Verify

on:
push:
branches:
- main
pull_request:

jobs:
Expand Down Expand Up @@ -66,4 +63,3 @@ jobs:
```
${{env.COVERAGE_INFO}}
```

6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- New optional `file` option for the `[logging]` section in the webservice
configuration. Can be used to additionally output all log messages to the given
file.
- Add number of root nodes to graph storage statistics. This changes the way
most of the graph storages store their statistics. You can use old imported data
files, but to make use of the new information in your queries, you have to
**reimport** your corpora.
- `Graph::ensure_loaded_parallel` now returns the components that existed and
were actually loaded.

### Fixed

- Less frequent corpus cache status updates in the log. Previously, every corpus
access could trigger a log entry, which is undesirable under heavy load.
- Improve query execution planning by assuming all annotations can be matched in
regular expressions without a prefix.

## [3.7.1] - 2025-04-14

Expand Down
4 changes: 2 additions & 2 deletions cli/src/bin/annis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ impl AnnisRunner {
format = ExportFormat::GraphMLZip;
} else if file_ext.to_string_lossy() == ".graphml" && self.current_corpus.len() != 1 {
bail!(
r##"You need to select a *single* corpus first with the \"corpus\" command when exporting to a GraphML file.
r##"You need to select a *single* corpus first with the \"corpus\" command when exporting to a GraphML file.
To export multiple corpora, select a directory as output or a ZIP file (ending with .zip)"##
);
}
Expand Down Expand Up @@ -442,7 +442,7 @@ impl AnnisRunner {
"unsorted" => ResultOrder::NotSorted,
_ => {
return Err(anyhow!(
"Non-existing order with name {}.
"Non-existing order with name {}.
Must be one of \"normal\", \"inverted\", \"random\", \"unsorted\"",
args
));
Expand Down
2 changes: 1 addition & 1 deletion cli/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ fn show_corpus_info() -> Result<(), Box<dyn std::error::Error>> {

cmd.arg("../graphannis/tests/data/")
.arg("-c")
.arg("corpus sample-disk-based-3.3")
.arg("corpus sample-disk-based-3.8")
.arg("-c")
.arg("preload")
.arg("-c")
Expand Down
1 change: 1 addition & 0 deletions cli/tests/snapshots/cli__list_corpora_fully_loaded.snap
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ exit_code: 0
sample-disk-based-1.5 (not loaded)
sample-disk-based-3.2 (not loaded)
sample-disk-based-3.3 (fully loaded)
sample-disk-based-3.8 (not loaded)
sample-memory-based-1.5 (not loaded)
sample-memory-based-3.2 (not loaded)
sample-memory-based-3.3 (not loaded)
Expand Down
2 changes: 1 addition & 1 deletion cli/tests/snapshots/cli__list_corpora_not_loaded.snap
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ exit_code: 0
sample-disk-based-1.5 (not loaded)
sample-disk-based-3.2 (not loaded)
sample-disk-based-3.3 (not loaded)
sample-disk-based-3.8 (not loaded)
sample-memory-based-1.5 (not loaded)
sample-memory-based-3.2 (not loaded)
sample-memory-based-3.3 (not loaded)
graphANNIS says good-bye!

----- stderr -----

Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ result: 44 matches in 4 documents
sample-disk-based-1.5 (not loaded)
sample-disk-based-3.2 (not loaded)
sample-disk-based-3.3 (partially loaded)
sample-disk-based-3.8 (not loaded)
sample-memory-based-1.5 (not loaded)
sample-memory-based-3.2 (not loaded)
sample-memory-based-3.3 (not loaded)
Expand Down
26 changes: 13 additions & 13 deletions cli/tests/snapshots/cli__show_corpus_info.snap
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ info:
args:
- "../graphannis/tests/data/"
- "-c"
- corpus sample-disk-based-3.3
- corpus sample-disk-based-3.8
- "-c"
- preload
- "-c"
Expand All @@ -14,59 +14,59 @@ info:
success: true
exit_code: 0
----- stdout -----
12:00:00[INFO] Loaded corpus sample-disk-based-3.3
12:00:00[INFO] Corpus cache after preloading sample-disk-based-3.3: 100MB / 300MB - loaded corpora [sample-disk-based-3.3]
12:00:00[INFO] Loaded corpus sample-disk-based-3.8
12:00:00[INFO] Corpus cache after preloading sample-disk-based-3.8: 100MB / 300MB - loaded corpora [sample-disk-based-3.8]
12:00:00[INFO] Preloaded corpus in 10 ms
Status: "fully loaded"
Token search shortcut possible: true
------------
Component Coverage//: 0 annnotations
Stats: nodes=92, avg_fan_out=2.17, max_fan_out=11, max_depth=1
Stats: nodes=92, root nodes=48, avg_fan_out=2.17, max_fan_out=11, fan_out_99%=11, inv_fan_out_99%=9, max_depth=1
Implementation: DiskAdjacencyListV1
Status: "fully loaded"
------------
Component Coverage/annis/: 0 annnotations
Stats: nodes=0, avg_fan_out=0.00, max_fan_out=0, max_depth=1, tree
Stats: nodes=0, root nodes=0, avg_fan_out=0.00, max_fan_out=0, fan_out_99%=0, inv_fan_out_99%=0, max_depth=1, tree
Implementation: DiskAdjacencyListV1
Status: "fully loaded"
------------
Component Coverage/default_ns/: 0 annnotations
Stats: nodes=56, avg_fan_out=0.93, max_fan_out=10, max_depth=1
Stats: nodes=56, root nodes=12, avg_fan_out=0.93, max_fan_out=10, fan_out_99%=10, inv_fan_out_99%=2, max_depth=1
Implementation: DiskAdjacencyListV1
Status: "fully loaded"
------------
Component Coverage/annis/inherited-coverage: 0 annnotations
Stats: nodes=0, avg_fan_out=0.00, max_fan_out=0, max_depth=1, tree
Stats: nodes=0, root nodes=0, avg_fan_out=0.00, max_fan_out=0, fan_out_99%=0, inv_fan_out_99%=0, max_depth=1, tree
Implementation: DiskAdjacencyListV1
Status: "fully loaded"
------------
Component Dominance/syntax/: 0 annnotations
Stats: nodes=92, avg_fan_out=0.96, max_fan_out=3, max_depth=9, tree
Stats: nodes=92, root nodes=4, avg_fan_out=0.96, max_fan_out=3, fan_out_99%=3, inv_fan_out_99%=1, max_depth=9, tree
Implementation: PrePostOrderO16L8V1
Status: "fully loaded"
------------
Component Pointing/default_ns/anaphoric: 0 annnotations
Stats: nodes=8, avg_fan_out=0.50, max_fan_out=1, max_depth=1, tree
Stats: nodes=8, root nodes=4, avg_fan_out=0.50, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=1, max_depth=1, tree
Implementation: DiskPathV1_D15
Status: "fully loaded"
------------
Component Ordering/annis/: 0 annnotations
Stats: nodes=44, avg_fan_out=0.91, max_fan_out=1, max_depth=10, tree
Stats: nodes=44, root nodes=4, avg_fan_out=0.91, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=1, max_depth=10, tree
Implementation: DiskPathV1_D15
Status: "fully loaded"
------------
Component LeftToken/annis/: 0 annnotations
Stats: nodes=92, avg_fan_out=0.65, max_fan_out=1, max_depth=1
Stats: nodes=92, root nodes=60, avg_fan_out=0.65, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=3, max_depth=1
Implementation: DiskPathV1_D15
Status: "fully loaded"
------------
Component RightToken/annis/: 0 annnotations
Stats: nodes=84, avg_fan_out=0.71, max_fan_out=1, max_depth=1
Stats: nodes=84, root nodes=60, avg_fan_out=0.71, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=8, max_depth=1
Implementation: DiskPathV1_D15
Status: "fully loaded"
------------
Component PartOf/annis/: 0 annnotations
Stats: nodes=115, avg_fan_out=0.99, max_fan_out=1, max_depth=4
Stats: nodes=115, root nodes=104, avg_fan_out=0.99, max_fan_out=1, fan_out_99%=1, inv_fan_out_99%=26, max_depth=4
Implementation: DiskPathV1_D15
Status: "fully loaded"
------------
Expand Down
13 changes: 9 additions & 4 deletions core/src/annostorage/inmemory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,6 @@ where
if let Some(anno_key) = self.anno_keys.get_symbol(&anno_key) {
if let Some(histo) = self.histogram_bounds.get(&anno_key) {
// find the range in which the value is contained

// we need to make sure the histogram is not empty -> should have at least two bounds
if histo.len() >= 2 {
sum_histogram_buckets += histo.len() - 1;
Expand Down Expand Up @@ -752,6 +751,10 @@ where
fn guess_max_count_regex(&self, ns: Option<&str>, name: &str, pattern: &str) -> Result<usize> {
let full_match_pattern = util::regex_full_match(pattern);

// Get the total number of annotations with the namespace/name. We
// can't get larger than this number
let total = self.number_of_annotations_by_name(ns, name)?;

// Try to parse the regular expression
let parsed = regex_syntax::Parser::new().parse(&full_match_pattern);
if let Ok(parsed) = parsed {
Expand All @@ -770,11 +773,13 @@ where
guessed_count += self.guess_max_count(ns, name, lower_val, &upper_val)?;
}
}
} else {
// For regular expressions without a prefix the worst case would be `.*[X].*` where `[X]` are the most common characters.
// Assume that a generic percentage (here 5%) of all nodes match the regex.
// TODO: find better ways of estimating this constant
guessed_count = (0.05 * (total as f64)) as usize;
}

// Get the total number of annotations with the namespace/name. We
// can't get larger than this number
let total = self.number_of_annotations_by_name(ns, name)?;
Ok(guessed_count.min(total))
} else {
Ok(0)
Expand Down
12 changes: 9 additions & 3 deletions core/src/annostorage/ondisk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -895,6 +895,10 @@ where
fn guess_max_count_regex(&self, ns: Option<&str>, name: &str, pattern: &str) -> Result<usize> {
let full_match_pattern = util::regex_full_match(pattern);

// Get the total number of annotations with the namespace/name. We
// can't get larger than this number
let total = self.number_of_annotations_by_name(ns, name)?;

// Try to parse the regular expression
let parsed = Parser::new().parse(&full_match_pattern);
if let Ok(parsed) = parsed {
Expand All @@ -913,11 +917,13 @@ where
guessed_count += self.guess_max_count(ns, name, lower_val, &upper_val)?;
}
}
} else {
// For regular expressions without a prefix the worst case would be `.*[X].*` where `[X]` are the most common characters.
// Assume that a generic percentage (here 5%) of all nodes match the regex.
// TODO: find better ways of estimating this constant
guessed_count = (0.05 * (total as f64)) as usize;
}

// Get the total number of annotations with the namespace/name. We
// can't get larger than this number
let total = self.number_of_annotations_by_name(ns, name)?;
Ok(guessed_count.min(total))
} else {
Ok(0)
Expand Down
45 changes: 42 additions & 3 deletions core/src/graph/storage/adjacencylist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@ use crate::{
types::{AnnoKey, Annotation, Edge, NodeID},
};

use super::{EdgeContainer, GraphStatistic, GraphStorage, WriteableGraphStorage};
use super::{
deserialize_gs_field,
legacy::{self, AdjacencyListStorageV1},
load_statistics_from_location, save_statistics_to_toml, serialize_gs_field, EdgeContainer,
GraphStatistic, GraphStorage, WriteableGraphStorage,
};
use itertools::Itertools;
use rustc_hash::FxHashSet;
use serde::Deserialize;
Expand Down Expand Up @@ -123,13 +128,34 @@ impl GraphStorage for AdjacencyListStorage {
where
for<'de> Self: std::marker::Sized + Deserialize<'de>,
{
let mut result: Self = super::default_deserialize_gs(location)?;
let legacy_path = location.join("component.bin");
let mut result: Self = if legacy_path.is_file() {
let component: AdjacencyListStorageV1 = deserialize_gs_field(location, "component")?;
Self {
stats: component.stats.map(GraphStatistic::from),
edges: component.edges,
inverse_edges: component.inverse_edges,
annos: component.annos,
}
} else {
let stats = load_statistics_from_location(location)?;
Self {
edges: deserialize_gs_field(location, "edges")?,
inverse_edges: deserialize_gs_field(location, "inverse_edges")?,
annos: deserialize_gs_field(location, "annos")?,
stats,
}
};

result.annos.after_deserialization();
Ok(result)
}

fn save_to(&self, location: &Path) -> Result<()> {
super::default_serialize_gs(self, location)?;
serialize_gs_field(&self.edges, "edges", location)?;
serialize_gs_field(&self.inverse_edges, "inverse_edges", location)?;
serialize_gs_field(&self.annos, "annos", location)?;
save_statistics_to_toml(location, self.stats.as_ref())?;
Ok(())
}

Expand Down Expand Up @@ -334,6 +360,7 @@ impl WriteableGraphStorage for AdjacencyListStorage {
cyclic: false,
rooted_tree: true,
nodes: 0,
root_nodes: 0,
dfs_visit_ratio: 0.0,
};

Expand Down Expand Up @@ -370,6 +397,7 @@ impl WriteableGraphStorage for AdjacencyListStorage {
}
}
}
stats.root_nodes = roots.len();

let fan_outs = get_fan_outs(&self.edges);
let sum_fan_out: usize = fan_outs.iter().sum();
Expand Down Expand Up @@ -446,5 +474,16 @@ impl WriteableGraphStorage for AdjacencyListStorage {
}
}

/// Converts a legacy (V1) adjacency list storage into the current
/// representation.
///
/// The edge list, inverse edge list and edge annotations are moved over
/// unchanged; the optional statistics are converted with
/// `GraphStatistic::from`.
impl From<legacy::AdjacencyListStorageV1> for AdjacencyListStorage {
    fn from(value: legacy::AdjacencyListStorageV1) -> Self {
        // Take the legacy value apart once so every field is moved exactly
        // one time into the new storage.
        let legacy::AdjacencyListStorageV1 {
            edges,
            inverse_edges,
            annos,
            stats,
            ..
        } = value;

        // Only the statistics need an actual conversion step.
        let stats = stats.map(GraphStatistic::from);

        Self {
            edges,
            inverse_edges,
            annos,
            stats,
        }
    }
}

#[cfg(test)]
mod tests;
31 changes: 28 additions & 3 deletions core/src/graph/storage/dense_adjacency.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use super::{EdgeContainer, GraphStatistic, GraphStorage};
use super::{
deserialize_gs_field, legacy::DenseAdjacencyListStorageV1, load_statistics_from_location,
save_statistics_to_toml, serialize_gs_field, EdgeContainer, GraphStatistic, GraphStorage,
};
use crate::{
annostorage::{
inmemory::AnnoStorageImpl, AnnotationStorage, EdgeAnnotationStorage, NodeAnnotationStorage,
Expand Down Expand Up @@ -219,13 +222,35 @@ impl GraphStorage for DenseAdjacencyListStorage {
where
for<'de> Self: std::marker::Sized + Deserialize<'de>,
{
let mut result: Self = super::default_deserialize_gs(location)?;
let legacy_path = location.join("component.bin");
let mut result: Self = if legacy_path.is_file() {
let component: DenseAdjacencyListStorageV1 =
deserialize_gs_field(location, "component")?;
Self {
edges: component.edges,
inverse_edges: component.inverse_edges,
annos: component.annos,
stats: component.stats.map(GraphStatistic::from),
}
} else {
let stats = load_statistics_from_location(location)?;
Self {
edges: deserialize_gs_field(location, "edges")?,
inverse_edges: deserialize_gs_field(location, "inverse_edges")?,
annos: deserialize_gs_field(location, "annos")?,
stats,
}
};

result.annos.after_deserialization();
Ok(result)
}

fn save_to(&self, location: &Path) -> Result<()> {
super::default_serialize_gs(self, location)?;
serialize_gs_field(&self.edges, "edges", location)?;
serialize_gs_field(&self.inverse_edges, "inverse_edges", location)?;
serialize_gs_field(&self.annos, "annos", location)?;
save_statistics_to_toml(location, self.stats.as_ref())?;
Ok(())
}
}
Expand Down
Loading
Loading