diff --git a/bin/src/task_queue.rs b/bin/src/task_queue.rs
index 27cd139..a9baf18 100644
--- a/bin/src/task_queue.rs
+++ b/bin/src/task_queue.rs
@@ -49,7 +49,7 @@ pub async fn process_tasks(queue: Arc, location: Arc, links: G
                 let note = patch.clone().into();
                 update_global_store(&patch.title, &note, links.clone()).await;
-                patch_search_from_update(&note).await;
+                patch_search_from_update(&note);
 
                 if !patch.old_title.is_empty() && patch.old_title != patch.title {
                     rename_in_global_store(&patch.title, &patch.old_title, links.clone())
@@ -103,7 +103,7 @@ pub async fn process_tasks(queue: Arc, location: Arc, links: G
                 write(&patch).await.unwrap();
                 let note = patch.clone().into();
                 update_global_store(&patch.title, &note, links.clone()).await;
-                patch_search_from_update(&note).await;
+                patch_search_from_update(&note);
                 update_mru_cache(&patch.old_title, &patch.title).await;
             }
             Message::ArchiveBody { title, body } => {
diff --git a/libs/persistance/src/fs/utils.rs b/libs/persistance/src/fs/utils.rs
index 9eac41d..323d963 100644
--- a/libs/persistance/src/fs/utils.rs
+++ b/libs/persistance/src/fs/utils.rs
@@ -9,6 +9,15 @@ pub fn get_data_dir_location() -> PathBuf {
     data_dir.to_owned()
 }
 
+pub fn get_search_index_location() -> PathBuf {
+    let data_location = get_data_dir_location();
+    data_location.join("search-index")
+}
+pub fn get_search_file_index_location() -> PathBuf {
+    let data_location = get_data_dir_location();
+    data_location.join("search-index").join("file_index")
+}
+
 pub fn get_config_location() -> (PathBuf, PathBuf) {
     let project_dir = ProjectDirs::from("", "", "tendril").unwrap();
     let config_dir = project_dir.config_dir();
diff --git a/libs/search-engine/src/indexer/archive.rs b/libs/search-engine/src/indexer/archive.rs
index 9546692..0339d35 100644
--- a/libs/search-engine/src/indexer/archive.rs
+++ b/libs/search-engine/src/indexer/archive.rs
@@ -5,20 +5,21 @@ use std::{
     path::Path,
 };
 
-use crate::{tokenizer::tokenize, Doc, Tokens};
+use crate::Tokens;
 
-use super::Proccessor;
+use super::{tokenize_document, Proccessor};
 
 #[derive(Default, Debug)]
 pub(crate) struct Archive {
     pub(crate) tokens: Tokens,
+    pub(crate) file_index: HashMap<String, Vec<String>>,
 }
 
 impl Proccessor for Archive {
     fn load(&mut self, location: &Path) {
         let entries = read_dir(location).unwrap();
         let mut tokens: Tokens = HashMap::new();
-        let mut doc_token_counter: HashMap<String, f32> = HashMap::new();
+        let mut term_index: HashMap<String, Vec<String>> = HashMap::new();
         entries.for_each(|entry| {
             let entry = entry.unwrap();
             if let Some(fname) = entry.file_name().to_str() {
@@ -38,27 +39,20 @@ impl Proccessor for Archive {
                         fname
                     );
                 });
-                let mut total_tokens = 0;
-                for line in text_content.lines() {
-                    let raw_tokens = tokenize(line);
-                    total_tokens += raw_tokens.len();
-                    for token in raw_tokens {
-                        doc_token_counter
-                            .entry(token)
-                            .and_modify(|v| *v += 1.)
-                            .or_insert(1.);
-                    }
-                    for (term, count) in doc_token_counter.iter() {
-                        tokens
-                            .entry(term.to_owned())
-                            .and_modify(|v| {
-                                v.push((fname.to_string(), *count / total_tokens as f32))
-                            })
-                            .or_insert(vec![(fname.to_string(), *count / total_tokens as f32)]);
-                    }
-                    doc_token_counter.clear();
+                let doc_token_counter = tokenize_document(text_content);
+                for (term, score) in doc_token_counter.iter() {
+                    tokens
+                        .entry(term.to_owned())
+                        .and_modify(|v| v.push((fname.to_string(), *score)))
+                        .or_insert(vec![(fname.to_string(), *score)]);
+                    term_index
+                        .entry(fname.to_owned())
+                        .and_modify(|v| v.push(term.clone()))
+                        .or_insert(vec![term.clone()]);
+                }
             }
         });
+        self.tokens = tokens;
+        self.file_index = term_index;
     }
 }
diff --git a/libs/search-engine/src/indexer/mod.rs b/libs/search-engine/src/indexer/mod.rs
index 51113bb..e8b3e42 100644
--- a/libs/search-engine/src/indexer/mod.rs
+++ b/libs/search-engine/src/indexer/mod.rs
@@ -1,4 +1,6 @@
-use std::path::Path;
+use std::{path::Path, collections::HashMap};
+
+use crate::tokenizer::tokenize;
 
 pub(crate) mod archive;
 pub(crate) mod notebook;
@@ -6,3 +8,23 @@ pub(crate) mod notebook;
 pub(crate) trait Proccessor {
     fn load(&mut self, location: &Path);
 }
+pub type DocTokenCount = HashMap<String, f32>;
+
+pub fn tokenize_document(content: String) -> DocTokenCount {
+    let mut token_counter: DocTokenCount = HashMap::new();
+    let mut total_tokens = 0.0;
+    for line in content.lines() {
+        let raw_tokens = tokenize(line);
+        total_tokens += raw_tokens.len() as f32;
+        for token in raw_tokens {
+            token_counter
+                .entry(token)
+                .and_modify(|v| *v += 1.0)
+                .or_insert(1.0);
+        }
+    }
+    for (_, val) in token_counter.iter_mut() {
+        *val /= total_tokens;
+    }
+    token_counter
+}
diff --git a/libs/search-engine/src/indexer/notebook.rs b/libs/search-engine/src/indexer/notebook.rs
index 844480a..fc399bf 100644
--- a/libs/search-engine/src/indexer/notebook.rs
+++ b/libs/search-engine/src/indexer/notebook.rs
@@ -1,5 +1,5 @@
-use super::Proccessor;
-use crate::{tokenizer::tokenize, Tokens};
+use super::{Proccessor, tokenize_document};
+use crate::Tokens;
 use persistance::fs::path_to_string;
 use serde::{Deserialize, Serialize};
 use std::{collections::HashMap, fs::read_dir, path::Path};
@@ -7,13 +7,14 @@ use std::{collections::HashMap, fs::read_dir, path::Path};
 #[derive(Default, Debug, Serialize, Deserialize)]
 pub(crate) struct Notebook {
     pub(crate) tokens: Tokens,
+    // filename, Vec<term>
+    pub(crate) file_index: HashMap<String, Vec<String>>,
 }
 
 impl Proccessor for Notebook {
     fn load(&mut self, location: &Path) {
         let mut tokens: Tokens = HashMap::new();
-        let mut doc_token_counter: HashMap<String, f32> = HashMap::new();
-        // For some reason using tokio::read_dir never returns in the while loop
+        let mut term_index: HashMap<String, Vec<String>> = HashMap::new();
         let entries = read_dir(location).unwrap();
         entries.for_each(|entry| {
             let entry = entry.unwrap();
@@ -21,27 +22,21 @@ impl Proccessor for Notebook {
                 if fname.ends_with(".txt") {
                     let title = fname.strip_suffix(".txt").unwrap();
                     let content = path_to_string(&entry.path()).unwrap();
-                    let mut total_tokens = 0;
-                    for line in content.lines() {
-                        let raw_tokens = tokenize(line);
-                        total_tokens += raw_tokens.len();
-                        for token in raw_tokens {
-                            doc_token_counter
-                                .entry(token)
-                                .and_modify(|v| *v += 1.)
-                                .or_insert(1.);
-                        }
-                    }
-                    for (term, count) in doc_token_counter.iter() {
                         tokens
                             .entry(term.to_owned())
-                            .and_modify(|v| v.push((title.to_string(), *count / total_tokens as f32)))
-                            .or_insert(vec![(title.to_string(), *count / total_tokens as f32)]);
+                            .and_modify(|v| v.push((title.to_string(), *score)))
+                            .or_insert(vec![(title.to_string(), *score)]);
+                        term_index
+                            .entry(fname.to_owned())
+                            .and_modify(|v| v.push(term.clone()))
+                            .or_insert(vec![term.clone()]);
                     }
-                    doc_token_counter.clear();
                 }
             }
         });
         self.tokens = tokens;
+        self.file_index = term_index;
     }
 }
diff --git a/libs/search-engine/src/lib.rs b/libs/search-engine/src/lib.rs
index 340a1f3..03df740 100644
--- a/libs/search-engine/src/lib.rs
+++ b/libs/search-engine/src/lib.rs
@@ -1,22 +1,20 @@
-use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
-use indexer::notebook::Notebook;
-use persistance::fs::utils::{get_archive_location, get_data_dir_location};
-use searcher::{highlight_matches, search};
+use indexer::{notebook::Notebook, tokenize_document};
+use persistance::fs::utils::{
+    get_archive_location, get_search_file_index_location, get_search_index_location,
+};
+use searcher::search;
 use serde::{Deserialize, Serialize};
 use std::{
     collections::HashMap,
     fs::{create_dir, read, write},
-    io::Write,
-    path::PathBuf,
+    path::{Path, PathBuf},
     process::exit,
     usize,
 };
 use thiserror::Error;
-use tokenizer::tokenize;
 use wikitext::parsers::Note;
-/// Heavy inspiration / code taken from: https://github.com/thesephist/monocle
-use tokio::fs::{read_to_string, remove_file};
+use tokio::fs::remove_file;
 
 use crate::indexer::{archive::Archive, Proccessor};
 
@@ -24,16 +22,19 @@ mod indexer;
 mod searcher;
 mod tokenizer;
 
-pub type Tokens = HashMap<String, Vec<(String, f32)>>;
-type DocIdx = HashMap<String, Doc>;
-type SearchIdx = HashMap<String, Vec<String>>;
+type SearchTerm = String;
+type DocTitle = String;
+type NormalizedFrequency = f32;
+pub type Tokens = HashMap<SearchTerm, Vec<(DocTitle, NormalizedFrequency)>>;
 
 #[derive(Error, Debug)]
-pub enum SearchIndexReadErr {
+pub enum SearchIndexErr {
     #[error("Could not find file")]
     NotExistErr,
     #[error("Could not deserialize file")]
     DeserErr(bincode::Error),
+    #[error("Could not write serialized file")]
+    WriteErr(String),
 }
 
 #[derive(Serialize, Deserialize, Debug, Clone)]
@@ -42,13 +43,12 @@ pub(crate) struct Doc {
     tokens: Tokens,
 }
 
-#[derive(Serialize, Deserialize, Debug)]
-pub struct Indicies {
-    search_idx: SearchIdx,
-    doc_idx: DocIdx,
-}
-
 pub fn build_search_index(location: &str) {
+    let loc = get_search_index_location();
+    if !loc.exists() {
+        create_dir(&loc).unwrap();
+        create_dir(&get_search_file_index_location()).unwrap();
+    }
     let archive_location = get_archive_location();
     let mut n = Notebook::default();
     let mut a = Archive::default();
@@ -62,23 +62,18 @@ pub fn build_search_index(location: &str) {
             n.tokens.insert(key.to_owned(), value.to_owned());
         }
     }
-    write_search_index(&n.tokens);
-}
-
-pub async fn dump_search_index() -> Result<Tokens, SearchIndexReadErr> {
-    todo!("REMOVE");
-    // read_search_index()
+    write_search_index(&n.tokens, vec![n.file_index, a.file_index]);
 }
 
 pub async fn semantic_search(term: &str) -> Vec<String> {
     search(term).await
 }
 
-pub(crate) fn write_search_index(search_idx: &Tokens) {
-    // TODO: Handle file/dir existing already.
-    let stored_location = get_data_dir_location();
-    let mut loc = stored_location.join("search-index");
-    create_dir(&loc).unwrap();
+pub(crate) fn write_search_index(
+    search_idx: &Tokens,
+    term_indicies: Vec<HashMap<String, Vec<String>>>,
+) {
+    let loc = get_search_index_location();
     for (key, value) in search_idx.iter() {
         let bytes = bincode::serialize(value).unwrap();
         let file_loc = loc.join(key);
@@ -90,58 +85,152 @@ pub(crate) fn write_search_index(search_idx: &Tokens) {
             }
         }
     }
+    // write the term_index for easy deletion
+    let term_index_loc = get_search_file_index_location();
+    for index in term_indicies.iter() {
+        for (file, terms) in index.iter() {
+            let bytes = bincode::serialize(terms).unwrap();
+            let index_loc = term_index_loc.join(file);
+            match write(index_loc, bytes) {
+                Ok(()) => {}
+                Err(e) => {
+                    eprintln!("Could not write file index -> {}\n{}", file, e);
+                    exit(1);
+                }
+            }
+        }
+    }
 }
 
-pub(crate) fn read_search_index(filename: &str) -> Result<Vec<(String, f32)>, SearchIndexReadErr> {
-    let index_location = get_data_dir_location();
-    let read_loc = index_location.join("search-index").join(filename);
+fn read_file_term_index(location: &Path) -> Result<Vec<String>, SearchIndexErr> {
+    let term_index_loc = get_search_file_index_location();
+    match read(term_index_loc.join(location)) {
+        Ok(content) => {
+            let deserialized_terms = bincode::deserialize(&content);
+            match deserialized_terms {
+                Ok(terms) => Ok(terms),
+                Err(e) => Err(SearchIndexErr::DeserErr(e)),
+            }
+        }
+        Err(_) => Err(SearchIndexErr::NotExistErr),
+    }
+}
+
+fn write_file_term_index(location: &Path, content: Vec<String>) -> Result<(), SearchIndexErr> {
+    let serialized_terms = bincode::serialize(&content);
+    match serialized_terms {
+        Ok(terms) => {
+            write(location, terms).unwrap();
+            Ok(())
+        }
+        Err(e) => Err(SearchIndexErr::DeserErr(e)),
+    }
+}
+
+pub(crate) fn read_search_index(
+    filename: &str,
+) -> Result<Vec<(DocTitle, NormalizedFrequency)>, SearchIndexErr> {
+    let index_location = get_search_index_location();
+    let read_loc = index_location.join(filename);
     match read(read_loc) {
         Ok(content) => {
             let deserialized_freqs = bincode::deserialize(&content);
-            match deserialized_freqs {
+            match deserialized_freqs {
                 Ok(tokens) => Ok(tokens),
-                Err(e) => Err(SearchIndexReadErr::DeserErr(e)),
+                Err(e) => Err(SearchIndexErr::DeserErr(e)),
             }
         }
-        Err(_) => Err(SearchIndexReadErr::NotExistErr),
+        Err(_) => Err(SearchIndexErr::NotExistErr),
     }
 }
 
-pub async fn patch_search_from_update(note: &Note) {
-    todo!();
-    // let search_idx = read_search_index().await;
-    // let doc_idx = read_doc_index().await;
-    // let doc = tokenize_note_meta(note);
-    // if let Some((search_idx, doc_idx)) = patch_search_index(doc, search_idx, doc_idx).await {
-    //     write_search_index(&search_idx);
-    // }
+pub fn patch_search_from_update(note: &Note) {
+    let mut content = note.content.clone();
+    let title = note.header.get("title").unwrap();
+    content.push('\n');
+    content.push_str(title);
+    let doc_token_count = tokenize_document(content);
+    patch(doc_token_count, title.to_owned());
+}
+
+pub fn patch(doc_token_count: HashMap<String, f32>, title: String) {
+    let term_index_loc = get_search_file_index_location();
+    let index_loc = term_index_loc.join(&title);
+    let term_index_doc = read_file_term_index(&index_loc).unwrap();
+    let mut file_terms = Vec::with_capacity(doc_token_count.len());
+    for (term, score) in doc_token_count.iter() {
+        file_terms.push(term.to_owned());
+        if let Ok(mut tokens) = read_search_index(term) {
+            let mut found = false;
+            for data in tokens.iter_mut() {
+                if data.0 == *title {
+                    found = true;
+                    *data = (title.clone(), *score);
+                }
+            }
+            if !found {
+                tokens.push((title.clone(), *score));
+            }
+            write_search_entry(term, &tokens).unwrap();
+        } else {
+            let tokens = vec![(title.to_owned(), *score)];
+            // The term we've parsed doesn't yet exist.
+            write_search_entry(term, &tokens).unwrap();
+        }
+    }
+    for term in term_index_doc.iter() {
+        if file_terms.contains(term) {
+            continue;
+        }
+
+        let tokens = read_search_index(term).unwrap();
+        let tokens = tokens.into_iter().filter(|t| t.0 != *title).collect();
+        write_search_entry(term, &tokens).unwrap();
+    }
+    write_file_term_index(&index_loc, file_terms).unwrap();
 }
 
 type Title = String;
 type Content = String;
 type ArchivePatch = (Title, Content);
 
-pub async fn patch_search_from_archive(patch: ArchivePatch) {
-    todo!();
-    // let search_idx = read_search_index().await;
-    // let doc_idx = read_doc_index().await;
-    // let tokens = tokenize(&patch.1);
-    // let doc = Doc {
-    //     id: patch.0,
-    //     tokens,
-    //     content: patch.1,
-    // };
-    // if let Some((search_idx, doc_idx)) = patch_search_index(doc, search_idx, doc_idx).await {
-    //     write_search_index(&search_idx);
-    // }
+pub async fn patch_search_from_archive(archive_patch: ArchivePatch) {
+    let content = [archive_patch.0.clone(), archive_patch.1].join("\n");
+    let doc_token_count = tokenize_document(content);
+    patch(doc_token_count, archive_patch.0);
+}
+
+fn write_search_entry(
+    entry: &str,
+    content: &Vec<(DocTitle, NormalizedFrequency)>,
+) -> Result<(), SearchIndexErr> {
+    let bytes = bincode::serialize(content);
+    let path = get_search_index_location();
+    match bytes {
+        Ok(b) => match write(path.join(entry), b) {
+            Ok(()) => Ok(()),
+            Err(e) => Err(SearchIndexErr::WriteErr(format!(
                "Could not write {}\n {}",
+                entry, e
+            ))),
+        },
+        Err(e) => Err(SearchIndexErr::DeserErr(e)),
+    }
 }
 
 pub async fn delete_entry_from_update(entry: &str) {
-    todo!();
-    // let search_idx = read_search_index().await;
-    // let doc_idx = read_doc_index().await;
-    // let (search_idx, doc_idx) = delete_entry_from_index(search_idx, doc_idx, entry).await;
-    // write_search_index(&search_idx);
+    let search_file_idx = get_search_file_index_location();
+    let entry_file = search_file_idx.join(entry);
+    let entries = read_file_term_index(&entry_file).unwrap();
+    for e in entries.iter() {
+        let contents = read_search_index(e).unwrap();
+        let filtered_contents = contents
+            .into_iter()
+            .filter(|c| c.0 == entry)
+            .collect::<Vec<_>>();
+        let bytes = bincode::serialize(&filtered_contents).unwrap();
+        write(e, bytes).unwrap();
+    }
 }
 
 pub async fn delete_archived_file(entry: &str) {
@@ -153,162 +242,3 @@ pub async fn delete_archived_file(entry: &str) {
             .expect("Could not delete archive file");
     }
 }
-
-async fn delete_entry_from_index(
-    mut search_idx: SearchIdx,
-    mut doc_idx: DocIdx,
-    entry: &str,
-) -> (SearchIdx, DocIdx) {
-    let doc = doc_idx
-        .get(entry)
-        .unwrap_or_else(|| panic!("Could not find doc marked for removal -- {}", entry));
-    for token in doc.tokens.keys() {
-        let matched_documents = search_idx
-            .get_mut(token)
-            .expect("Improperly index search term.");
-        *matched_documents = matched_documents
-            .iter()
-            .filter(|i| *i != entry)
-            .map(|i| i.to_owned())
-            .collect::<Vec<String>>();
-        if matched_documents.is_empty() {
-            search_idx.remove(token).unwrap();
-        }
-    }
-    doc_idx.remove(entry).unwrap();
-    (search_idx, doc_idx)
-}
-
-async fn patch_search_index(
-    doc: Doc,
-    mut search_idx: SearchIdx,
-    mut doc_idx: DocIdx,
-) -> Option<(SearchIdx, DocIdx)> {
-    todo!();
-    // let mut removed_tokens = Vec::new();
-    // let mut added_tokens = Vec::new();
-    // // TODO: Don't clone so much
-    // if let Some(old_version) = doc_idx.get_mut(&doc.id) {
-    //     let old_tokens = old_version.tokens.clone();
-    //     for token in old_tokens.keys() {
-    //         if !doc.tokens.keys().any(|f| f == token) {
-    //             removed_tokens.push(token);
-    //         }
-    //     }
-    //     for token in doc.tokens.keys() {
-    //         if !old_tokens.keys().any(|f| f == token) {
-    //             added_tokens.push(token)
-    //         }
-    //     }
-
-    //     for token in removed_tokens {
-    //         old_version.tokens.remove(token).unwrap();
-    //         if let Some(search_token) = search_idx.get_mut(token) {
-    //             *search_token = search_token
-    //                 .iter()
-    //                 .filter(|&f| f != &doc.id)
-    //                 .map(|t| t.to_owned())
-    //                 .collect::<Vec<String>>();
-    //             if search_token.is_empty() {
-    //                 search_idx.remove(token).unwrap();
-    //             }
-    //         }
-    //     }
-
-    //     for token in added_tokens {
-    //         let doc_id = doc.id.clone();
-    //         if let Some(search_token) = old_version.tokens.get_mut(token) {
-    //             *search_token += 1;
-    //         } else {
-    //             old_version.tokens.insert(token.clone(), 1);
-    //         }
-    //         if let Some(search_token) = search_idx.get_mut(token) {
-    //             search_token.push(doc_id);
-    //         } else {
-    //             search_idx.insert(token.clone(), vec![doc_id]);
-    //         }
-    //     }
-    //     doc_idx.insert(doc.id.clone(), doc);
-    //     Some((search_idx, doc_idx))
-    // } else {
-    //     for token in doc.tokens.keys() {
-    //         let doc_id = doc.id.clone();
-    //         if let Some(search_token) = search_idx.get_mut(token) {
-    //             search_token.push(doc_id);
-    //         } else {
-    //             search_idx.insert(token.clone(), vec![doc_id]);
-    //         }
-    //     }
-    //     doc_idx.insert(doc.id.clone(), doc);
-    //     Some((search_idx, doc_idx))
-    // }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[tokio::test]
-    async fn delete_entry_successfully() {
-        let mut search_idx: SearchIdx = HashMap::new();
-        search_idx.insert("test".into(), vec!["test_doc".into(), "another_doc".into()]);
-        search_idx.insert("token".into(), vec!["test_doc".into()]);
-        let mut doc_idx: DocIdx = HashMap::new();
-        let doc = Doc {
-            id: "test_doc".into(),
-            tokens: HashMap::from([("test".into(), 1), ("token".into(), 1)]),
-            content: "test token".into(),
-        };
-        doc_idx.insert("test_doc".into(), doc);
-        let (new_search, new_doc) = delete_entry_from_index(search_idx, doc_idx, "test_doc").await;
-        assert_eq!(new_search.get("token"), None);
-        assert_eq!(new_search.get("test"), Some(&vec!["another_doc".into()]));
-        assert!(new_doc.is_empty());
-    }
-    #[tokio::test]
-    async fn patches_entry_successfully() {
-        let mut search_idx: SearchIdx = HashMap::new();
-        search_idx.insert("test".into(), vec!["test_doc".into(), "another_doc".into()]);
-        search_idx.insert("token".into(), vec!["test_doc".into()]);
-        let mut doc_idx: DocIdx = HashMap::new();
-        let doc = Doc {
-            id: "test_doc".into(),
-            tokens: HashMap::from([("test".into(), 1), ("token".into(), 1)]),
-            content: "test token".into(),
-        };
-        doc_idx.insert("test_doc".into(), doc);
-
-        let updated_doc = Doc {
-            id: "test_doc".into(),
-            tokens: HashMap::from([("cool".into(), 1), ("info".into(), 1)]),
-            content: "cool info".into(),
-        };
-        let (new_search, new_docs) = patch_search_index(updated_doc, search_idx, doc_idx)
-            .await
-            .unwrap();
-
-        let added_doc = Doc {
-            id: "added_doc".into(),
-            tokens: HashMap::from([("added".into(), 1), ("doc".into(), 1)]),
-            content: "added doc".into(),
-        };
-
-        let (new_search, new_docs) = patch_search_index(added_doc.clone(), new_search, new_docs)
-            .await
-            .unwrap();
-        let updated_search_term_info = new_search.get("info");
-        let updated_search_term_cool = new_search.get("cool");
-        let updated_doc_id_added = new_docs.get("added_doc");
-        let search_term_test = new_search.get("test");
-        let search_term_token = new_search.get("token");
-        assert_eq!(updated_search_term_info, Some(&vec!["test_doc".into()]));
-        assert_eq!(updated_search_term_cool, Some(&vec!["test_doc".into()]));
-        assert_eq!(search_term_test, Some(&vec!["another_doc".into()]));
-        assert_eq!(search_term_token, None);
-        assert!(updated_doc_id_added.is_some());
-        let updated_doc_added = updated_doc_id_added.unwrap();
-        assert_eq!(updated_doc_added.id, added_doc.id);
-        assert_eq!(updated_doc_added.content, added_doc.content);
-        assert_eq!(updated_doc_added.tokens, added_doc.tokens);
-    }
-}
diff --git a/libs/search-engine/src/searcher.rs b/libs/search-engine/src/searcher.rs
index 8df754d..bdbbe97 100644
--- a/libs/search-engine/src/searcher.rs
+++ b/libs/search-engine/src/searcher.rs
@@ -1,6 +1,6 @@
 use std::collections::HashMap;
 
-use crate::{read_search_index, tokenizer::tokenize, SearchIndexReadErr};
+use crate::{read_search_index, tokenizer::tokenize, SearchIndexErr};
 
 fn tokenize_query(query: &str) -> Vec<String> {
     tokenize(query)
@@ -26,12 +26,15 @@ pub(crate) async fn search(query: &str) -> Vec<String> {
                 }
             }
             Err(e) => match e {
-                SearchIndexReadErr::NotExistErr => {
+                SearchIndexErr::NotExistErr => {
                     continue;
                 }
-                SearchIndexReadErr::DeserErr(e) => {
+                SearchIndexErr::DeserErr(e) => {
                     eprintln!("Could not deserialize: {}", e);
                 }
+                SearchIndexErr::WriteErr(e) => {
+                    eprintln!("{}", e);
+                }
             },
         }
     }
@@ -96,21 +99,21 @@ const WORD_ENDINGS: [&str; 17] = [
     "n",
     "ian",
 ];
-const OPEN_TAG_LENGTH: usize = 6;
-const CLOSE_TAG_LENGTH: usize = 7;
+// const OPEN_TAG_LENGTH: usize = 6;
+// const CLOSE_TAG_LENGTH: usize = 7;
 
-pub(crate) fn highlight_matches(mut line: String, term: &str) -> String {
-    let readline = line.clone().to_lowercase();
-    let matches = readline
-        .match_indices(&term.trim().to_lowercase())
-        .collect::<Vec<(usize, &str)>>();
-    if !matches.is_empty() {
-        for (pointer, (idx, t)) in matches.into_iter().enumerate() {
-            let current_pos = idx + (pointer * (OPEN_TAG_LENGTH + CLOSE_TAG_LENGTH));
-            let closing_tag = current_pos + OPEN_TAG_LENGTH + t.len();
-            line.insert_str(current_pos, "<mark>");
-            line.insert_str(closing_tag, "</mark>");
-        }
-    }
-    line
-}
+// pub(crate) fn highlight_matches(mut line: String, term: &str) -> String {
+//     let readline = line.clone().to_lowercase();
+//     let matches = readline
+//         .match_indices(&term.trim().to_lowercase())
+//         .collect::<Vec<(usize, &str)>>();
+//     if !matches.is_empty() {
+//         for (pointer, (idx, t)) in matches.into_iter().enumerate() {
+//             let current_pos = idx + (pointer * (OPEN_TAG_LENGTH + CLOSE_TAG_LENGTH));
+//             let closing_tag = current_pos + OPEN_TAG_LENGTH + t.len();
+//             line.insert_str(current_pos, "<mark>");
+//             line.insert_str(closing_tag, "</mark>");
+//         }
+//     }
+//     line
+// }
diff --git a/libs/search-engine/src/tokenizer.rs b/libs/search-engine/src/tokenizer.rs
index 10781cc..6c162c6 100644
--- a/libs/search-engine/src/tokenizer.rs
+++ b/libs/search-engine/src/tokenizer.rs
@@ -1,4 +1,4 @@
-use std::{collections::HashMap, usize};
+use std::collections::HashMap;
 
 use regex::Regex;
 
@@ -22,8 +22,8 @@ lazy_static::lazy_static! {
 }
 
 pub(crate) fn tokenize(slice: &str) -> Vec<String> {
-    let stripped_whitespace = PUNCT_RGX.replace_all(slice, " ");
-    stripped_whitespace
+    let punct_to_whitespace = PUNCT_RGX.replace_all(slice, " ");
+    punct_to_whitespace
         .split(' ')
         .map(|w| {
             let word = w.to_lowercase();
diff --git a/libs/task-runners/src/runners/api_runner.rs b/libs/task-runners/src/runners/api_runner.rs
index 53e3f1a..da4e9c3 100644
--- a/libs/task-runners/src/runners/api_runner.rs
+++ b/libs/task-runners/src/runners/api_runner.rs
@@ -3,7 +3,7 @@ use std::{collections::HashMap, io, time::Instant};
 use bytes::Bytes;
 use persistance::fs::{read, utils::get_config_location, write_media};
 use render::{search_results_page::SearchResultsPage, Render};
-use search_engine::{semantic_search, Tokens};
+use search_engine::semantic_search;
 use thiserror::Error;
 use urlencoding::decode;
 use wikitext::parsers::Note;
@@ -53,11 +53,6 @@ impl APIRunner {
         ctx.render().await
     }
 
-    // TODO: Better error handling
-    pub async fn dump_search_index() -> Tokens {
-        search_engine::dump_search_index().await.unwrap()
-    }
-
     pub async fn update_styles(form_body: HashMap<String, String>) -> Result<(), io::Error> {
         let (path, _) = get_config_location();
         let style_location = path.join("userstyles.css");
diff --git a/libs/wikitext/src/parsers/headers.rs b/libs/wikitext/src/parsers/headers.rs
index 9ad96ee..ff74eef 100644
--- a/libs/wikitext/src/parsers/headers.rs
+++ b/libs/wikitext/src/parsers/headers.rs
@@ -182,6 +182,21 @@ impl Into<String> for Note {
         formatted_string
     }
 }
+#[allow(clippy::from_over_into)]
+impl Into<String> for &Note {
+    fn into(self) -> String {
+        let mut formatted_string = String::new();
+        for key in self.header.keys() {
+            formatted_string.push_str(key);
+            formatted_string.push_str(": ");
+            formatted_string.push_str(self.header.get(key).unwrap());
+            formatted_string.push('\n');
+        }
+        formatted_string.push('\n');
+        formatted_string.push_str(&self.content);
+        formatted_string
+    }
+}
 
 pub fn parse_meta<'a>(lines: impl Iterator<Item = &'a str>, debug_marker: &str) -> Note {
     let mut parser = HeaderParserMachine::new();
diff --git a/libs/www/src/handlers/api_handler.rs b/libs/www/src/handlers/api_handler.rs
index 5558321..3300421 100644
--- a/libs/www/src/handlers/api_handler.rs
+++ b/libs/www/src/handlers/api_handler.rs
@@ -35,7 +35,6 @@ impl APIRouter {
             .or(self.mru())
             .or(self.json_page())
             .or(self.search_from_qs())
-            .or(self.search_indicies())
             .or(self.version())
             .boxed()
     }
@@ -55,15 +54,6 @@ impl APIRouter {
             .with(warp::cors().allow_any_origin())
             .boxed()
     }
-    fn search_indicies(&self) -> BoxedFilter<(impl Reply,)> {
-        warp::get()
-            .and(with_auth())
-            .and(warp::path("search-idx").then(|| async {
-                let indicies = APIRunner::dump_search_index().await;
-                warp::reply::json(&indicies)
-            }))
-            .boxed()
-    }
     fn titles(&self) -> BoxedFilter<(impl Reply,)> {
         warp::get()
             .and(with_auth())
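For reference, the scoring that the new tokenize_document helper in libs/search-engine/src/indexer/mod.rs applies is a plain normalized term frequency: each token's count divided by the document's total token count. Below is a minimal, self-contained sketch of that calculation (not part of the patch); the whitespace tokenizer is a simplified stand-in for the crate's regex-based tokenize.

use std::collections::HashMap;

// Simplified stand-in for the crate's regex-based `tokenize`:
// lowercases and splits on whitespace only.
fn tokenize(line: &str) -> Vec<String> {
    line.split_whitespace().map(|w| w.to_lowercase()).collect()
}

// Mirrors the patch's `tokenize_document`: count each token, then divide
// every count by the total number of tokens in the document.
fn tokenize_document(content: String) -> HashMap<String, f32> {
    let mut counts: HashMap<String, f32> = HashMap::new();
    let mut total = 0.0;
    for line in content.lines() {
        let tokens = tokenize(line);
        total += tokens.len() as f32;
        for token in tokens {
            *counts.entry(token).or_insert(0.0) += 1.0;
        }
    }
    for value in counts.values_mut() {
        *value /= total;
    }
    counts
}

fn main() {
    // "wiki" appears 2 of 3 times (0.667), "search" 1 of 3 (0.333).
    let scores = tokenize_document("wiki search wiki".to_string());
    println!("{:?}", scores);
}

write_search_index then serializes one file per term holding these (title, frequency) pairs, while the per-file term index under search-index/file_index records which terms each document contributed, which is what lets patch and delete_entry_from_update update or remove a single document without rebuilding the whole index.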