From 50842984ad3b28582c92208b03bb968bad49ebf6 Mon Sep 17 00:00:00 2001 From: James Little Date: Sat, 4 Feb 2023 12:03:55 -0500 Subject: [PATCH] Wire up UI config values excerpt length, number of results, number of excerpts --- Cargo.lock | 1 + dev/site/all-options.html | 32 ++++++++++++++++---- js/config.ts | 5 ++- js/entity.ts | 10 +++++- js/entityDomManager.ts | 4 +++ readme.md | 18 +++++++---- stork-cli/Cargo.toml | 1 + stork-cli/src/app/search.rs | 52 +++++++++++++++++++++++++++----- stork-cli/src/main.rs | 21 ++++++++++++- stork-lib/Cargo.toml | 13 ++++++-- stork-lib/src/index_v4/search.rs | 19 ++++++------ stork-lib/src/lib.rs | 30 +++++++++++++++--- stork-wasm/src/lib.rs | 17 +++++++++-- 13 files changed, 183 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 182f59be..8d6b1bba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1569,6 +1569,7 @@ dependencies = [ "colored", "hyper", "indicatif", + "lazy_static", "num-format", "pretty_assertions", "serde", diff --git a/dev/site/all-options.html b/dev/site/all-options.html index cc7d6da5..64252094 100644 --- a/dev/site/all-options.html +++ b/dev/site/all-options.html @@ -23,9 +23,20 @@ +
- +
@@ -48,9 +59,11 @@ }); document.getElementById("btn-2").addEventListener("click", () => { - stork.initialize("http://127.0.0.1:8025/does-not-exist.wasm").then((value) => { - console.log("WASM initialized", value); - }); + stork + .initialize("http://127.0.0.1:8025/does-not-exist.wasm") + .then((value) => { + console.log("WASM initialized", value); + }); }); document.getElementById("btn-3").addEventListener("click", () => { @@ -66,7 +79,10 @@ document.getElementById("btn-4").addEventListener("click", () => { stork - .downloadIndex("federalist", "http://127.0.0.1:8025/does-not-exist.st") + .downloadIndex( + "federalist", + "http://127.0.0.1:8025/does-not-exist.st" + ) .then(() => { console.log("Index loaded"); }) @@ -76,7 +92,11 @@ }); document.getElementById("btn-5").addEventListener("click", () => { - stork.attach("federalist"); + console.log(document.getElementById("attach-config").value); + stork.attach( + "federalist", + JSON.parse(document.getElementById("attach-config").value) + ); }); document.getElementById("btn-6").addEventListener("click", () => { diff --git a/js/config.ts b/js/config.ts index 985de284..bf0d8841 100644 --- a/js/config.ts +++ b/js/config.ts @@ -50,7 +50,10 @@ const defaultUIConfig = { onQueryUpdate: (_query: string) => {}, // eslint-disable-next-line @typescript-eslint/no-empty-function onResultSelected: (_query: string, _result: SearchResult) => {}, - transformResultUrl: (url: string) => url + transformResultUrl: (url: string) => url, + excerptLength: 150, + numberOfResults: 10, + numberOfExcerpts: 5 }; export type UIConfig = Readonly; diff --git a/js/entity.ts b/js/entity.ts index 43b13af9..d58a0082 100644 --- a/js/entity.ts +++ b/js/entity.ts @@ -22,6 +22,7 @@ export default class Entity implements EntityDomDelegate { readonly config: RegisterConfiguration; readonly indexLoadPromise: Promise; + private uiConfig: UIConfig; private domManager: EntityDomManager; private loadManager: LoadManager; private indexLoader: IndexLoader; @@ -103,6 
+104,7 @@ export default class Entity implements EntityDomDelegate { } attach(uiConfig: UIConfig) { + this.uiConfig = uiConfig; this.domManager.attach(uiConfig); } @@ -115,7 +117,13 @@ export default class Entity implements EntityDomDelegate { log(`Performing search for index "${this.name}" with query "${query}"`); try { - const v = perform_search(this.name, query); + const v = perform_search( + this.name, + query, + this.uiConfig?.excerptLength, + this.uiConfig?.numberOfResults, + this.uiConfig?.numberOfExcerpts + ); const value = JSON.parse(v); return { success: true, diff --git a/js/entityDomManager.ts b/js/entityDomManager.ts index 1a131abd..dbefb487 100644 --- a/js/entityDomManager.ts +++ b/js/entityDomManager.ts @@ -82,6 +82,10 @@ export default class EntityDomManager { this.attachedToDom = true; this.resetElements(); + + if (this.input.value.length > 0) { + this.performSearchFromInputValue(); + } } setProgress(number: number) { diff --git a/readme.md b/readme.md index c97e08b0..eb68ba8b 100644 --- a/readme.md +++ b/readme.md @@ -171,7 +171,7 @@ $ stork search --index federalist.st --query "liberty" To build an interactive, online search interface, you can use the Stork Javascript library to load your search index and attach it to HTML on your webpage. -You'll need to take the output file generated in the previous step and make it accessible on a web server. I've already done that and uploaded the search index to `https://files.stork-search.net/federalist.st`. +You'll need to take the output file generated in the previous step and make it accessible on a web server. I've already done that and uploaded the search index to `https://files.stork-search.net/releases/latest/federalist.st`. Stork looks for the `data-stork` attributes on two tags: an `` tag where your users will type their search query, and a `
` tag where Stork will render the search results. Here, we're setting up our input and output elements with the name "federalist"—we'll use that name later to point the Javascript library at the correct HTML tags. @@ -195,7 +195,10 @@ By default, Stork's output is completely unstyled, letting you customize the out Search - +
@@ -215,7 +218,10 @@ Finally, we'll load the Stork Javascript library and register our search index: Search - +
@@ -223,11 +229,11 @@ Finally, we'll load the Stork Javascript library and register our search index:
- + @@ -237,7 +243,7 @@ Finally, we'll load the Stork Javascript library and register our search index: > **Warning** > -> The files at the root of `files.stork-search.net` point directly to the build artifacts from most recent release. Linking to these files from your webpage can result in unexpected behavior when a new version is released. To pin to a specific release, use URLs in the following the format: +> The files linked in this demo point directly to the build artifacts from the most recent release. Linking to these files from your webpage can result in unexpected behavior when a new version is released. To pin to a specific release, use URLs in the following format: > > `https://files.stork-search.net/releases/v2.0.0/stork.js` > diff --git a/stork-cli/Cargo.toml b/stork-cli/Cargo.toml index 59dd36f8..a757a03f 100644 --- a/stork-cli/Cargo.toml +++ b/stork-cli/Cargo.toml @@ -27,6 +27,7 @@ hyper = { version = "0.14.17", optional = true, features = [ "tcp" ] } indicatif = "0.17.0" +lazy_static = "1.4.0" num-format = "0.4.0" serde = "1.0.130" serde_json = "1.0.68" diff --git a/stork-cli/src/app/search.rs b/stork-cli/src/app/search.rs index 47ce9918..c2499c48 100644 --- a/stork-cli/src/app/search.rs +++ b/stork-cli/src/app/search.rs @@ -1,6 +1,18 @@ use clap::{Arg, Command}; +use lazy_static::lazy_static; +use stork_lib::SearchConfig; pub(super) fn search_subcommand() -> Command<'static> { + lazy_static! 
{ + static ref DEFAULT_SEARCH_CONFIG: SearchConfig = SearchConfig::default(); + static ref DEFAULT_NUMBER_OF_EXCERPTS: String = + DEFAULT_SEARCH_CONFIG.number_of_excerpts.to_string(); + static ref DEFAULT_EXCERPT_LENGTH: String = + DEFAULT_SEARCH_CONFIG.excerpt_length.to_string(); + static ref DEFAULT_NUMBER_OF_RESULTS: String = + DEFAULT_SEARCH_CONFIG.number_of_results.to_string(); + } + Command::new("search") .about("Search an index for a query.") .arg( @@ -21,21 +33,45 @@ pub(super) fn search_subcommand() -> Command<'static> { .help("The text with which to search the index") .required(true), ) - .next_help_heading("DIAGNOSTICS") - .arg( - Arg::with_name("timing") - .short('t') - .long("timing") - .help("Displays the duration of the search operation"), - ) + .next_help_heading("DISPLAY") .arg( Arg::with_name("format") .long("format") - .display_order(100) .takes_value(true) .value_name("FORMAT") .possible_values(["json", "pretty", "none"]) .default_value("pretty") .help("The output format for the returned search results"), ) + .arg( + Arg::with_name("number_of_excerpts") + .long("number-of-excerpts") + .takes_value(true) + .value_name("NUMBER_OF_EXCERPTS") + .default_value(&DEFAULT_NUMBER_OF_EXCERPTS) + .help("The maximum number of excerpts to return for each result."), + ) + .arg( + Arg::with_name("number_of_results") + .long("number-of-results") + .takes_value(true) + .value_name("NUMBER_OF_RESULTS") + .default_value(&DEFAULT_NUMBER_OF_RESULTS) + .help("The maximum number of documents to return in the search output."), + ) + .arg( + Arg::with_name("excerpt_length") + .long("excerpt-length") + .takes_value(true) + .value_name("EXCERPT_LENGTH") + .default_value(&DEFAULT_EXCERPT_LENGTH) + .help("The length, in characters, of each text excerpt returned in the search output."), + ) + .next_help_heading("DIAGNOSTICS") + .arg( + Arg::with_name("timing") + .short('t') + .long("timing") + .help("Displays the duration of the search operation"), + ) } diff --git 
a/stork-cli/src/main.rs b/stork-cli/src/main.rs index 643d25e4..d6cc8c68 100644 --- a/stork-cli/src/main.rs +++ b/stork-cli/src/main.rs @@ -11,6 +11,7 @@ mod timings; use clap::ArgMatches; use colored::Colorize; +use lib::SearchConfig; use num_format::{Locale, ToFormattedString}; use std::time::Instant; @@ -89,7 +90,25 @@ fn search(submatches: &ArgMatches) -> CommandOutput { let read_time = Instant::now(); - let results = lib::search(&index, query).unwrap(); + let search_config = SearchConfig { + excerpt_length: submatches + .value_of("excerpt_length") + .unwrap() + .parse() + .unwrap(), + number_of_results: submatches + .value_of("number_of_results") + .unwrap() + .parse() + .unwrap(), + number_of_excerpts: submatches + .value_of("number_of_excerpts") + .unwrap() + .parse() + .unwrap(), + }; + + let results = lib::search(&index, query, &search_config).unwrap(); let search_time = Instant::now(); diff --git a/stork-lib/Cargo.toml b/stork-lib/Cargo.toml index 0e885abd..62e8b38a 100644 --- a/stork-lib/Cargo.toml +++ b/stork-lib/Cargo.toml @@ -4,7 +4,16 @@ version = "2.0.0" edition = "2021" [features] -build = ['thiserror', 'toml', 'serde_json', 'smart-default', 'srtparse', 'frontmatter', 'kuchiki', 'pulldown-cmark', 'rust-tfidf'] +build = [ + 'thiserror', + 'toml', + 'serde_json', + 'srtparse', + 'frontmatter', + 'kuchiki', + 'pulldown-cmark', + 'rust-tfidf' +] build-remote-fetch = ['build', 'mime', 'reqwest'] # json-search-output = ['serde'] @@ -17,12 +26,12 @@ wasm-bindgen = "0.2.83" rust-stemmers = "1.2.0" itertools = "0.10.3" unicode-segmentation = "1.8.0" +smart-default = { version = "0.6.0" } ### build deps thiserror = { version = "1.0.29", optional = true } toml = { version = "0.5.8", optional = true } serde = { version = "1.0.147", features = ["derive"], optional = false } serde_json = { version = "1.0.72", optional = true } -smart-default = { version = "0.6.0", optional = true } srtparse = { version = "0.2.0", optional = true } frontmatter = { version = 
"0.4.0", optional = true } kuchiki = { version = "0.8.1", optional = true } diff --git a/stork-lib/src/index_v4/search.rs b/stork-lib/src/index_v4/search.rs index d1388e0a..c48bbf5b 100644 --- a/stork-lib/src/index_v4/search.rs +++ b/stork-lib/src/index_v4/search.rs @@ -92,6 +92,7 @@ pub(crate) fn get_search_values( pub(crate) fn render_search_values( index: &Index, search_values: Vec, + config: &crate::SearchConfig, ) -> Result { type ValuesArrayIndex = usize; @@ -180,7 +181,7 @@ pub(crate) fn render_search_values( vec![], |mut accumulator, (contents_excerpt, highlight_length)| { if let Some(last_grouping) = accumulator.last_mut() { - if last_grouping.can_swallow(contents_excerpt) { + if last_grouping.can_swallow(contents_excerpt, config.excerpt_length) { last_grouping.push(contents_excerpt, **highlight_length); return accumulator; } @@ -203,14 +204,14 @@ pub(crate) fn render_search_values( .iter() .fold(0.0, |acc, g| acc + g.score()); - contents_excerpts_groupings.truncate(10); + contents_excerpts_groupings.truncate(config.number_of_excerpts); // ---- sum all grouping scores for each document to determine document scoring // ---- sort groupings by aggregated score // ---- create output excerpts for top n groupings let mut excerpts = contents_excerpts_groupings .iter() - .map(|g| g.as_excerpt(document)) + .map(|g| g.as_excerpt(document, config.excerpt_length)) .collect_vec(); excerpts.sort_by(|a, b| a.score.partial_cmp(&b.score).unwrap()); @@ -227,7 +228,7 @@ pub(crate) fn render_search_values( let total_hit_count = results.len(); results.sort_by(|a, b| a.score.partial_cmp(&b.score).unwrap()); results.reverse(); - results.truncate(10); + results.truncate(config.number_of_results); Ok(SearchOutput { results, @@ -249,12 +250,12 @@ impl ContentExcerptGrouping { self.0.push((contents_excerpt.clone(), highlight_length)); } - fn can_swallow(&self, other: &ContentsExcerpt) -> bool { + fn can_swallow(&self, other: &ContentsExcerpt, excerpt_length: usize) -> bool { match 
(self.0.first(), self.0.last()) { (Some((first_excerpt_in_self, _)), Some((last_excerpt_in_self, _))) => { assert!(last_excerpt_in_self.byte_offset <= other.byte_offset); let diff = other.byte_offset - last_excerpt_in_self.byte_offset; - diff < 150 - 3 + diff < excerpt_length - 3 } _ => unreachable!("Grouping should always have at least one element"), } @@ -300,15 +301,15 @@ impl ContentExcerptGrouping { self.0.len() } - fn as_excerpt(&self, document: &super::Document) -> Excerpt { + fn as_excerpt(&self, document: &super::Document, excerpt_length: usize) -> Excerpt { let first_byte = self.first().byte_offset.saturating_sub( - 147_usize + (excerpt_length - 3) .saturating_sub(self.last().byte_offset - self.first().byte_offset) .div(2), ); // TODO: Trim to word bounds let last_byte = std::cmp::min( - first_byte + 150, + first_byte + excerpt_length, document.contents.first().unwrap().contents.len(), ); diff --git a/stork-lib/src/lib.rs b/stork-lib/src/lib.rs index 4b4c6393..86fff39d 100644 --- a/stork-lib/src/lib.rs +++ b/stork-lib/src/lib.rs @@ -6,6 +6,7 @@ use std::collections::HashMap; use bytes::Bytes; use itertools::Itertools; use search_query::SearchQuery; +use smart_default::SmartDefault; mod envelope; mod string_utils; @@ -68,19 +69,39 @@ pub fn get_search_values( } } +#[derive(Debug, Clone, PartialEq, Eq, Hash, SmartDefault)] +pub struct SearchConfig { + /// The length, in characters, that a rendered excerpt will be. Controls + /// excerpt merging as well as display length. + /// Defaults to 150. + #[default = 150] + pub excerpt_length: usize, + + /// The maximum number of documents returned in the search results. + /// Defaults to 10. + #[default = 10] + pub number_of_results: usize, + + /// The maximum number of excerpts returned for each document. + /// Defaults to 5. 
+ #[default = 5] + pub number_of_excerpts: usize, +} + pub fn merge_search_values( index: &parse_index::ParsedIndex, - lists_of_search_values: Vec>, + search_values: Vec>, + config: &SearchConfig, ) -> Result { match &index.value { parse_index::IndexType::V4Index(v4_index) => { - let search_values = lists_of_search_values + let search_values = search_values .iter() .flatten() .filter_map(|sv| sv.v4_value.clone()) // TODO: Throw a user-visible error if there are non-v4 search values .collect_vec(); - index_v4::search::render_search_values(v4_index, search_values) + index_v4::search::render_search_values(v4_index, search_values, config) } } } @@ -88,6 +109,7 @@ pub fn merge_search_values( pub fn search( index: &parse_index::ParsedIndex, query: &str, + config: &SearchConfig, ) -> Result { let terms = query .parse::() @@ -99,5 +121,5 @@ pub fn search( .flat_map(|term| get_search_values(index, term)) .collect_vec(); - merge_search_values(index, search_values) + merge_search_values(index, search_values, config) } diff --git a/stork-wasm/src/lib.rs b/stork-wasm/src/lib.rs index db7d2555..57673faa 100644 --- a/stork-wasm/src/lib.rs +++ b/stork-wasm/src/lib.rs @@ -9,6 +9,7 @@ use stork_lib::{ parse_index::ParsedIndex, search, search_value::{SearchValue, SearchValueCacheKey}, + SearchConfig, }; lazy_static! 
{ @@ -45,14 +46,26 @@ pub fn append_chunk_to_index(name: &str, chunk_data: &[u8]) -> Result<(), JsErro } #[wasm_bindgen] -pub fn perform_search(name: &str, query: &str) -> Result { +pub fn perform_search( + name: &str, + query: &str, + excerpt_length: usize, + number_of_results: usize, + number_of_excerpts: usize, +) -> Result { if cfg!(debug_assertions) { console_error_panic_hook::set_once(); } + let config = SearchConfig { + excerpt_length, + number_of_results, + number_of_excerpts, + }; + let mut index_cache = INDEX_CACHE.lock().unwrap(); let index = index_cache.get_mut(name).unwrap(); // TODO: map_err() - search(index, query) + search(index, query, &config) .map(|output| serde_json::to_string(&output).unwrap()) .map_err(|_e| JsError::new("Error")) }