feat: Refactor configuring query parsers, add initial inflection support
ppodolsky committed Jun 8, 2023
1 parent 2008d69 commit 164310a
Showing 20 changed files with 499 additions and 338 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -33,7 +33,7 @@ pest_derive = "2.5"
prost = "0.11"
rand = { version = "0.8", features = ["small_rng"] }
rayon = "1.6.1"
safe-regex = "0.2.5"
regex = "1.8"
serde = { version = "1.0", default_features = false, features = ["derive", "std"] }
serde_bytes = "0.11"
serde_cbor = "0.11"
5 changes: 4 additions & 1 deletion examples/proto_client.rs
@@ -92,7 +92,10 @@ async fn main() -> Result<(), tonic::Status> {
query: Some(proto::Query {
query: Some(proto::query::Query::Match(proto::MatchQuery {
value: "game".to_string(),
default_fields: vec!["title".to_string(), "body".to_string()],
query_parser_config: Some(proto::QueryParserConfig {
default_fields: vec!["title".to_string(), "body".to_string()],
..Default::default()
}),
..Default::default()
})),
}),
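The client example above shows the new shape: per-query parser settings now travel in a nested `proto::QueryParserConfig` instead of sitting directly on `MatchQuery`. A minimal sketch of building such a query under the same `proto` module as the example; `field_aliases` being a plain string map on the config message is an assumption based on the `resolve_field_name` change further down, and the alias mapping itself is hypothetical:

```rust
use std::collections::HashMap;

// Sketch only: a match query with the new nested parser configuration.
// The field_aliases map is assumed from the resolve_field_name change in
// proto_query_parser.rs below; it is not part of this example file.
fn game_query() -> proto::Query {
    proto::Query {
        query: Some(proto::query::Query::Match(proto::MatchQuery {
            value: "game".to_string(),
            query_parser_config: Some(proto::QueryParserConfig {
                default_fields: vec!["title".to_string(), "body".to_string()],
                field_aliases: HashMap::from([("heading".to_string(), "title".to_string())]),
                ..Default::default()
            }),
            ..Default::default()
        })),
    }
}
```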
3 changes: 2 additions & 1 deletion summa-core/Cargo.toml
@@ -40,10 +40,11 @@ opentelemetry = { version = "0.19", features = ["metrics", "rt-tokio"], optional
parking_lot = { workspace = true }
pest = { workspace = true }
pest_derive = { workspace = true }
pluralize-rs = "0.1"
prost = { workspace = true }
rand = { workspace = true }
rayon = { workspace = true }
safe-regex = { workspace = true }
regex = { workspace = true }
rustc-hash = "1.1.0"
serde = { workspace = true }
serde_bytes = { workspace = true }
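The new `pluralize-rs` dependency backs the "initial inflection support" from the commit title; the actual integration is in files not shown in this excerpt. The sketch below only illustrates the idea of expanding a term to its inflected variant, and the `is_plural`/`to_plural`/`to_singular` names are assumptions about the crate's API rather than usage taken from this diff:

```rust
// Hypothetical helper: expand a query term with its inflected form so that
// "game" can also match "games" and vice versa. Function names from
// pluralize-rs are assumed, not confirmed by this commit.
use pluralize_rs::{is_plural, to_plural, to_singular};

fn inflection_variants(term: &str) -> Vec<String> {
    let variant = if is_plural(term) { to_singular(term) } else { to_plural(term) };
    let mut variants = vec![term.to_string()];
    if variant != term {
        variants.push(variant);
    }
    variants
}
```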
3 changes: 2 additions & 1 deletion summa-core/src/collectors/reservoir_sampling_collector.rs
@@ -8,6 +8,7 @@ use tantivy::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader};
///
/// ```rust
/// use summa_core::collectors::ReservoirSampling;
/// use summa_core::configs::core::QueryParserConfig;
/// use tantivy::collector::Count;
/// use summa_core::components::QueryParser;
/// use tantivy::schema::{Schema, TEXT};
@@ -29,7 +30,7 @@ use tantivy::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader};
/// let searcher = reader.searcher();
///
/// // Here comes the important part
/// let query_parser = QueryParser::for_index(&index, vec!["title".to_string()]).unwrap();
/// let query_parser = QueryParser::for_index(&index, QueryParserConfig::from_default_fields(vec!["title".to_string()])).unwrap();
/// let query = query_parser.parse_query("diary").unwrap();
/// let documents = searcher.search(&query, &ReservoirSampling::with_limit(2)).unwrap();
///
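The doc-test now builds its parser through `QueryParserConfig::from_default_fields`. The constructor itself is not shown in this excerpt; a plausible reading, given that `QueryParserConfig` is a newtype over the generated `proto::QueryParserConfig` (see `QueryParserConfig(query_parser_config)` in `proto_query_parser.rs` below), is roughly:

```rust
// Rough sketch of the assumed helper; the real definition lives in
// summa-core/src/configs/core and may set additional defaults.
impl QueryParserConfig {
    pub fn from_default_fields(default_fields: Vec<String>) -> QueryParserConfig {
        QueryParserConfig(proto::QueryParserConfig {
            default_fields,
            ..Default::default()
        })
    }
}
```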
51 changes: 19 additions & 32 deletions summa-core/src/components/index_holder.rs
@@ -1,4 +1,4 @@
use std::collections::{HashMap, HashSet};
use std::collections::HashSet;
use std::fmt::Debug;
use std::hash::{Hash, Hasher};
use std::path::Path;
@@ -160,7 +160,7 @@ impl IndexHolder {
index_name: Option<&str>,
index_engine_config: Arc<dyn ConfigProxy<proto::IndexEngineConfig>>,
merge_policy: Option<proto::MergePolicy>,
field_aliases: HashMap<String, String>,
query_parser_config: proto::QueryParserConfig,
driver: Driver,
) -> SummaResult<IndexHolder> {
register_default_tokenizers(&index);
@@ -192,12 +192,7 @@ impl IndexHolder {
.expect("no index name")
});

let query_parser = ProtoQueryParser::for_index(
&index_name,
&index,
cached_index_attributes.as_ref().map(|a| a.default_fields.clone()).unwrap_or_else(Vec::new),
field_aliases,
)?;
let query_parser = ProtoQueryParser::for_index(&index_name, &index, query_parser_config)?;
let index_reader = index
.reader_builder()
.doc_store_cache_num_blocks(core_config.doc_store_cache_num_blocks)
@@ -357,35 +352,27 @@ impl IndexHolder {
}

/// Load term dictionaries into memory
pub async fn partial_warmup(&self, load_dictionaries: bool) -> SummaResult<()> {
pub async fn partial_warmup<T: AsRef<str>>(&self, load_dictionaries: bool, fields: &[T]) -> SummaResult<()> {
let searcher = self.index_reader().searcher();
let mut warm_up_futures = Vec::new();
let index_attributes = self.index_attributes();
let default_fields = index_attributes
.map(|index_attributes| {
index_attributes
.default_fields
.iter()
.map(|field_name| self.cached_schema.get_field(field_name))
.collect::<Result<Vec<_>, _>>()
})
.transpose()?;
if let Some(default_fields) = default_fields {
for field in default_fields {
for segment_reader in searcher.segment_readers() {
let inverted_index = segment_reader.inverted_index_async(field).await?.clone();
if load_dictionaries {
warm_up_futures.push(async move {
let dict = inverted_index.terms();
info!(action = "warming_up_dictionary", index_name = ?self.index_name());
dict.warm_up_dictionary().await
});
}
let default_fields = fields
.iter()
.map(|field_name| self.cached_schema.get_field(field_name.as_ref()))
.collect::<Result<Vec<_>, _>>()?;
for field in default_fields {
for segment_reader in searcher.segment_readers() {
let inverted_index = segment_reader.inverted_index_async(field).await?.clone();
if load_dictionaries {
warm_up_futures.push(async move {
let dict = inverted_index.terms();
info!(action = "warming_up_dictionary", index_name = ?self.index_name());
dict.warm_up_dictionary().await
});
}
}
info!(action = "warming_up");
try_join_all(warm_up_futures).await?;
}
info!(action = "warming_up");
try_join_all(warm_up_futures).await?;
Ok(())
}

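`partial_warmup` no longer reads `default_fields` out of the index attributes; the caller now passes the fields whose term dictionaries should be loaded, which matches the move of default fields into `QueryParserConfig`. A hypothetical call site matching the new signature (the `index_holder` value stands in for an existing `IndexHolder`):

```rust
// Warm up the term dictionaries only for the fields queries are expected to hit.
index_holder.partial_warmup(true, &["title", "body"]).await?;
```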
3 changes: 2 additions & 1 deletion summa-core/src/components/query_parser/mod.rs
@@ -1,6 +1,7 @@
mod proto_query_parser;
mod summa_ql;
mod term_field_mappers;

pub use proto_query_parser::ProtoQueryParser;

pub use self::summa_ql::{MissingFieldPolicy, QueryParser, QueryParserError};
pub use self::summa_ql::{QueryParser, QueryParserError};
69 changes: 24 additions & 45 deletions summa-core/src/components/query_parser/proto_query_parser.rs
@@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::ops::Bound;
use std::ops::Bound::Unbounded;
use std::str::FromStr;
@@ -17,10 +16,11 @@ use tantivy::query::{
};
use tantivy::schema::{Field, FieldEntry, FieldType, IndexRecordOption, Schema};
use tantivy::{DateTime, Index, Score, Term};
use tracing::{info, warn};
use tracing::info;

use crate::components::queries::ExistsQuery;
use crate::components::query_parser::{QueryParser, QueryParserError};
use crate::configs::core::QueryParserConfig;
use crate::errors::{Error, SummaResult, ValidationError};
#[cfg(feature = "metrics")]
use crate::metrics::ToLabel;
@@ -35,21 +35,20 @@ pub struct ProtoQueryParser {
query_counter: Counter<u64>,
#[cfg(feature = "metrics")]
subquery_counter: Counter<u64>,
index_default_fields: Vec<String>,
field_aliases: HashMap<String, String>,
query_parser_config: QueryParserConfig,
}

pub enum MatchQueryDefaultMode {
pub enum QueryParserDefaultMode {
Boolean,
DisjuctionMax { tie_breaker: Score },
}

impl From<Option<proto::match_query::DefaultMode>> for MatchQueryDefaultMode {
fn from(value: Option<proto::match_query::DefaultMode>) -> Self {
impl From<Option<proto::query_parser_config::DefaultMode>> for QueryParserDefaultMode {
fn from(value: Option<proto::query_parser_config::DefaultMode>) -> Self {
match value {
Some(proto::match_query::DefaultMode::BooleanShouldMode(_)) | None => MatchQueryDefaultMode::Boolean,
Some(proto::match_query::DefaultMode::DisjuctionMaxMode(proto::MatchQueryDisjuctionMaxMode { tie_breaker })) => {
MatchQueryDefaultMode::DisjuctionMax { tie_breaker }
Some(proto::query_parser_config::DefaultMode::BooleanShouldMode(_)) | None => QueryParserDefaultMode::Boolean,
Some(proto::query_parser_config::DefaultMode::DisjuctionMaxMode(proto::MatchQueryDisjuctionMaxMode { tie_breaker })) => {
QueryParserDefaultMode::DisjuctionMax { tie_breaker }
}
}
}
@@ -107,12 +106,7 @@ fn cast_value_to_bound_term(field: Field, full_path: &str, field_type: &FieldTyp
}

impl ProtoQueryParser {
pub fn for_index(
index_name: &str,
index: &Index,
index_default_fields: Vec<String>,
field_aliases: HashMap<String, String>,
) -> SummaResult<ProtoQueryParser> {
pub fn for_index(index_name: &str, index: &Index, query_parser_config: proto::QueryParserConfig) -> SummaResult<ProtoQueryParser> {
#[cfg(feature = "metrics")]
let query_counter = global::meter("summa").u64_counter("query_counter").with_description("Queries counter").init();
#[cfg(feature = "metrics")]
@@ -129,13 +123,17 @@ impl ProtoQueryParser {
query_counter,
#[cfg(feature = "metrics")]
subquery_counter,
index_default_fields,
field_aliases,
query_parser_config: QueryParserConfig(query_parser_config),
})
}

pub fn resolve_field_name<'a>(&'a self, field_name: &'a str) -> &str {
self.field_aliases.get(field_name).map(|s| s.as_str()).unwrap_or(field_name)
self.query_parser_config
.0
.field_aliases
.get(field_name)
.map(|s| s.as_str())
.unwrap_or(field_name)
}

#[inline]
@@ -188,32 +186,13 @@ impl ProtoQueryParser {
},
)),
proto::query::Query::Match(match_query_proto) => {
let default_fields = if !match_query_proto.default_fields.is_empty() {
match_query_proto.default_fields
} else {
self.index_default_fields.clone()
};
if default_fields.is_empty() {
warn!(
action = "missing_default_fields",
hint = "Add `default_fields` to match query, otherwise you match nothing"
)
}
let mut nested_query_parser = QueryParser::for_index(&self.index, default_fields)?;
nested_query_parser.set_default_mode(match_query_proto.default_mode.into());

if !match_query_proto.field_boosts.is_empty() {
nested_query_parser.set_field_boosts(match_query_proto.field_boosts)
}

if let Some(exact_matches_promoter) = match_query_proto.exact_matches_promoter {
nested_query_parser.set_exact_match_promoter(exact_matches_promoter)
}

if !self.field_aliases.is_empty() {
nested_query_parser.set_field_aliases(self.field_aliases.clone())
}

let nested_query_parser = QueryParser::for_index(
&self.index,
match_query_proto
.query_parser_config
.map(QueryParserConfig)
.unwrap_or_else(|| self.query_parser_config.clone()),
)?;
match nested_query_parser.parse_query(&match_query_proto.value) {
Ok(parsed_query) => {
info!(parsed_match_query = ?parsed_query);
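Two behaviours of the refactored parser are visible in the hunks above: a per-query `query_parser_config` on `MatchQuery` takes precedence, and only when it is absent does the parser fall back to the config the index was opened with; field aliases are likewise resolved from that config. A small illustration of the alias resolution (the `parser` value and the alias mapping are hypothetical, not taken from this commit):

```rust
// Assuming a ProtoQueryParser whose config maps "heading" -> "title":
// unknown names resolve to themselves, aliased names to their target field.
assert_eq!(parser.resolve_field_name("heading"), "title");
assert_eq!(parser.resolve_field_name("body"), "body");
```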
2 changes: 2 additions & 0 deletions summa-core/src/components/query_parser/summa_ql.pest
@@ -2,10 +2,12 @@ quote = _{ "\"" | "'" | "`" | "'" | "“" | "”" | "‘" | "«" | "»" | "„"
star = { "*" }
statement_sep = _{ WHITE_SPACE | "," | "." | ";" | "!" | "?" | (":" ~ WHITE_SPACE) | "(" | ")" | "/" }
allowed_chars = _{"_" | "+" | "#" | "-"}
url_end = _{ WHITE_SPACE | "," }

isbn = @{ "978" ~ ("-"? ~ ASCII_DIGIT){7,10} ~ EOI }
doi = @{ "10." ~ ASCII_DIGIT{4,9} ~ WHITE_SPACE? ~ "/" ~ WHITE_SPACE? ~ (!WHITE_SPACE ~ ANY)+ }
wrapped_doi = _{ ("http" ~ "s"? ~ "://")? ~ "doi.org/"? ~ doi }
// url = _{ ("http" ~ "s"? ~ "://")? ~ (!url_end ~ ANY)+ }

slop = @{ DECIMAL_NUMBER+ }
boost = { (DECIMAL_NUMBER | ".")+ }