Search: Add methods for updating search indices when notes update
jamestthompson3 committed Jul 13, 2023
1 parent e536e40 commit 5ade237
Showing 11 changed files with 257 additions and 304 deletions.
4 changes: 2 additions & 2 deletions bin/src/task_queue.rs
@@ -49,7 +49,7 @@ pub async fn process_tasks(queue: Arc<JobQueue>, location: Arc<String>, links: G
let note = patch.clone().into();

update_global_store(&patch.title, &note, links.clone()).await;
patch_search_from_update(&note).await;
patch_search_from_update(&note);

if !patch.old_title.is_empty() && patch.old_title != patch.title {
rename_in_global_store(&patch.title, &patch.old_title, links.clone())
@@ -103,7 +103,7 @@ pub async fn process_tasks(queue: Arc<JobQueue>, location: Arc<String>, links: G
write(&patch).await.unwrap();
let note = patch.clone().into();
update_global_store(&patch.title, &note, links.clone()).await;
patch_search_from_update(&note).await;
patch_search_from_update(&note);
update_mru_cache(&patch.old_title, &patch.title).await;
}
Message::ArchiveBody { title, body } => {
9 changes: 9 additions & 0 deletions libs/persistance/src/fs/utils.rs
@@ -9,6 +9,15 @@ pub fn get_data_dir_location() -> PathBuf {
data_dir.to_owned()
}

pub fn get_search_index_location() -> PathBuf {
let data_location = get_data_dir_location();
data_location.join("search-index")
}
pub fn get_search_file_index_location() -> PathBuf {
let data_location = get_data_dir_location();
data_location.join("search-index").join("file_index")
}

pub fn get_config_location() -> (PathBuf, PathBuf) {
let project_dir = ProjectDirs::from("", "", "tendril").unwrap();
let config_dir = project_dir.config_dir();
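The two path helpers added in this file just build locations under the data directory. A minimal usage sketch follows; the module path, call site, and file contents here are assumptions for illustration, not part of this diff:

use std::fs;

// Module path assumed from the file layout (libs/persistance/src/fs/utils.rs).
use persistance::fs::utils::{get_search_file_index_location, get_search_index_location};

fn main() -> std::io::Result<()> {
    // Make sure the search-index directory under the data dir exists before writing into it.
    let index_dir = get_search_index_location();
    fs::create_dir_all(&index_dir)?;

    // Hypothetical write of a serialized file_index; the contents are placeholders.
    fs::write(get_search_file_index_location(), b"...serialized file_index...")?;
    Ok(())
}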
38 changes: 16 additions & 22 deletions libs/search-engine/src/indexer/archive.rs
@@ -5,20 +5,21 @@ use std::{
path::Path,
};

use crate::{tokenizer::tokenize, Doc, Tokens};
use crate::Tokens;

use super::Proccessor;
use super::{tokenize_document, Proccessor};

#[derive(Default, Debug)]
pub(crate) struct Archive {
pub(crate) tokens: Tokens,
pub(crate) file_index: HashMap<String, Vec<String>>,
}

impl Proccessor for Archive {
fn load(&mut self, location: &Path) {
let entries = read_dir(location).unwrap();
let mut tokens: Tokens = HashMap::new();
let mut doc_token_counter: HashMap<String, f32> = HashMap::new();
let mut term_index: HashMap<String, Vec<String>> = HashMap::new();
entries.for_each(|entry| {
let entry = entry.unwrap();
if let Some(fname) = entry.file_name().to_str() {
@@ -38,27 +39,20 @@ impl Proccessor for Archive {
fname
);
});
let mut total_tokens = 0;
for line in text_content.lines() {
let raw_tokens = tokenize(line);
total_tokens += raw_tokens.len();
for token in raw_tokens {
doc_token_counter
.entry(token)
.and_modify(|v| *v += 1.)
.or_insert(1.);
}
for (term, count) in doc_token_counter.iter() {
tokens
.entry(term.to_owned())
.and_modify(|v| {
v.push((fname.to_string(), *count / total_tokens as f32))
})
.or_insert(vec![(fname.to_string(), *count / total_tokens as f32)]);
}
doc_token_counter.clear();
let doc_token_counter = tokenize_document(text_content);
for (term, score) in doc_token_counter.iter() {
tokens
.entry(term.to_owned())
.and_modify(|v| v.push((fname.to_string(), *score)))
.or_insert(vec![(fname.to_string(), *score)]);
term_index
.entry(fname.to_owned())
.and_modify(|v| v.push(term.clone()))
.or_insert(vec![term.clone()]);
}
}
});
self.tokens = tokens;
self.file_index = term_index;
}
}
24 changes: 23 additions & 1 deletion libs/search-engine/src/indexer/mod.rs
@@ -1,8 +1,30 @@
use std::path::Path;
use std::{path::Path, collections::HashMap};

use crate::tokenizer::tokenize;

pub(crate) mod archive;
pub(crate) mod notebook;

pub(crate) trait Proccessor {
fn load(&mut self, location: &Path);
}
pub type DocTokenCount = HashMap<String, f32>;

pub fn tokenize_document(content: String) -> DocTokenCount {
let mut token_counter: DocTokenCount = HashMap::new();
let mut total_tokens = 0.0;
for line in content.lines() {
let raw_tokens = tokenize(line);
total_tokens += raw_tokens.len() as f32;
for token in raw_tokens {
token_counter
.entry(token)
.and_modify(|v| *v += 1.0)
.or_insert(1.0);
}
}
for (_, val) in token_counter.iter_mut() {
*val /= total_tokens;
}
token_counter
}
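tokenize_document normalizes per-document term counts by the total number of tokens, so every score is a term frequency in [0, 1]. A small usage sketch; the public module path and the tokenizer's whitespace/lowercasing behavior are assumptions:

// Module path assumed; tokenize_document is the function defined above.
use search_engine::indexer::tokenize_document;

fn main() {
    let doc = String::from("ash tree\nash grove");
    let counts = tokenize_document(doc);
    // If tokenize() lowercases and splits on whitespace, "ash" accounts for 2 of the
    // 4 tokens, so it would score 0.5 while "tree" and "grove" score 0.25 each.
    for (term, score) in &counts {
        println!("{term}: {score}");
    }
}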
33 changes: 14 additions & 19 deletions libs/search-engine/src/indexer/notebook.rs
@@ -1,47 +1,42 @@
use super::Proccessor;
use crate::{tokenizer::tokenize, Tokens};
use super::{Proccessor, tokenize_document};
use crate::Tokens;
use persistance::fs::path_to_string;
use serde::{Deserialize, Serialize};
use std::{collections::HashMap, fs::read_dir, path::Path};

#[derive(Default, Debug, Serialize, Deserialize)]
pub(crate) struct Notebook {
pub(crate) tokens: Tokens,
// filename, Vec<search_terms>
pub(crate) file_index: HashMap<String, Vec<String>>,
}

impl Proccessor for Notebook {
fn load(&mut self, location: &Path) {
let mut tokens: Tokens = HashMap::new();
let mut doc_token_counter: HashMap<String, f32> = HashMap::new();
// For some reason using tokio::read_dir never returns in the while loop
let mut term_index: HashMap<String, Vec<String>> = HashMap::new();
let entries = read_dir(location).unwrap();
entries.for_each(|entry| {
let entry = entry.unwrap();
if let Some(fname) = entry.file_name().to_str() {
if fname.ends_with(".txt") {
let title = fname.strip_suffix(".txt").unwrap();
let content = path_to_string(&entry.path()).unwrap();
let mut total_tokens = 0;
for line in content.lines() {
let raw_tokens = tokenize(line);
total_tokens += raw_tokens.len();
for token in raw_tokens {
doc_token_counter
.entry(token)
.and_modify(|v| *v += 1.)
.or_insert(1.);
}
}
for (term, count) in doc_token_counter.iter() {
let doc_token_counter = tokenize_document(content);
for (term, score) in doc_token_counter.iter() {
tokens
.entry(term.to_owned())
.and_modify(|v| v.push((title.to_string(), *count / total_tokens as f32)))
.or_insert(vec![(title.to_string(), *count / total_tokens as f32)]);
.and_modify(|v| v.push((title.to_string(), *score)))
.or_insert(vec![(title.to_string(), *score)]);
term_index
.entry(fname.to_owned())
.and_modify(|v| v.push(term.clone()))
.or_insert(vec![term.clone()]);
}
doc_token_counter.clear();
}
}
});
self.tokens = tokens;
self.file_index = term_index;
}
}
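Both indexers now record a file_index alongside the inverted token index, mapping each document to the terms it contributed. That reverse map is what makes incremental updates cheap: when a note changes, its old postings can be looked up and removed before the note is re-tokenized. A hedged sketch of such a removal step; this helper is illustrative and not part of the commit, and the Tokens shape is inferred from the code above:

use std::collections::HashMap;

// Assumed shapes, inferred from the indexer code in this diff.
type Tokens = HashMap<String, Vec<(String, f32)>>; // term -> (document, score) postings
type FileIndex = HashMap<String, Vec<String>>;     // document -> terms it contains

// Hypothetical helper: drop every posting for `doc_id` before re-indexing it.
fn remove_doc(tokens: &mut Tokens, file_index: &mut FileIndex, doc_id: &str) {
    if let Some(terms) = file_index.remove(doc_id) {
        for term in terms {
            let now_empty = match tokens.get_mut(&term) {
                Some(postings) => {
                    postings.retain(|(doc, _)| doc != doc_id);
                    postings.is_empty()
                }
                None => false,
            };
            if now_empty {
                tokens.remove(&term);
            }
        }
    }
}

After the stale postings are gone, the note can be run through tokenize_document again and its fresh scores inserted.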
