Search: Add methods for updating search indices when notes update
jamestthompson3 committed Jul 13, 2023
1 parent e536e40 commit 5ade237
Showing 11 changed files with 257 additions and 304 deletions.
4 changes: 2 additions & 2 deletions bin/src/task_queue.rs
@@ -49,7 +49,7 @@ pub async fn process_tasks(queue: Arc<JobQueue>, location: Arc<String>, links: G
let note = patch.clone().into();

update_global_store(&patch.title, &note, links.clone()).await;
patch_search_from_update(&note).await;
patch_search_from_update(&note);

if !patch.old_title.is_empty() && patch.old_title != patch.title {
rename_in_global_store(&patch.title, &patch.old_title, links.clone())
@@ -103,7 +103,7 @@ pub async fn process_tasks(queue: Arc<JobQueue>, location: Arc<String>, links: G
write(&patch).await.unwrap();
let note = patch.clone().into();
update_global_store(&patch.title, &note, links.clone()).await;
patch_search_from_update(&note).await;
patch_search_from_update(&note);
update_mru_cache(&patch.old_title, &patch.title).await;
}
Message::ArchiveBody { title, body } => {
9 changes: 9 additions & 0 deletions libs/persistance/src/fs/utils.rs
@@ -9,6 +9,15 @@ pub fn get_data_dir_location() -> PathBuf {
data_dir.to_owned()
}

pub fn get_search_index_location() -> PathBuf {
let data_location = get_data_dir_location();
data_location.join("search-index")
}
pub fn get_search_file_index_location() -> PathBuf {
let data_location = get_data_dir_location();
data_location.join("search-index").join("file_index")
}

pub fn get_config_location() -> (PathBuf, PathBuf) {
let project_dir = ProjectDirs::from("", "", "tendril").unwrap();
let config_dir = project_dir.config_dir();
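The two path helpers added in this file just build locations under the data directory. A minimal usage sketch follows; the module path, call site, and file contents here are assumptions for illustration, not part of this diff:

use std::fs;

// Module path assumed from the file layout (libs/persistance/src/fs/utils.rs).
use persistance::fs::utils::{get_search_file_index_location, get_search_index_location};

fn main() -> std::io::Result<()> {
    // Make sure the search-index directory under the data dir exists before writing into it.
    let index_dir = get_search_index_location();
    fs::create_dir_all(&index_dir)?;

    // Hypothetical write of a serialized file_index; the contents are placeholders.
    fs::write(get_search_file_index_location(), b"...serialized file_index...")?;
    Ok(())
}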
38 changes: 16 additions & 22 deletions libs/search-engine/src/indexer/archive.rs
@@ -5,20 +5,21 @@ use std::{
path::Path,
};

use crate::{tokenizer::tokenize, Doc, Tokens};
use crate::Tokens;

use super::Proccessor;
use super::{tokenize_document, Proccessor};

#[derive(Default, Debug)]
pub(crate) struct Archive {
pub(crate) tokens: Tokens,
pub(crate) file_index: HashMap<String, Vec<String>>,
}

impl Proccessor for Archive {
fn load(&mut self, location: &Path) {
let entries = read_dir(location).unwrap();
let mut tokens: Tokens = HashMap::new();
let mut doc_token_counter: HashMap<String, f32> = HashMap::new();
let mut term_index: HashMap<String, Vec<String>> = HashMap::new();
entries.for_each(|entry| {
let entry = entry.unwrap();
if let Some(fname) = entry.file_name().to_str() {
@@ -38,27 +39,20 @@ impl Proccessor for Archive {
fname
);
});
let mut total_tokens = 0;
for line in text_content.lines() {
let raw_tokens = tokenize(line);
total_tokens += raw_tokens.len();
for token in raw_tokens {
doc_token_counter
.entry(token)
.and_modify(|v| *v += 1.)
.or_insert(1.);
}
for (term, count) in doc_token_counter.iter() {
tokens
.entry(term.to_owned())
.and_modify(|v| {
v.push((fname.to_string(), *count / total_tokens as f32))
})
.or_insert(vec![(fname.to_string(), *count / total_tokens as f32)]);
}
doc_token_counter.clear();
let doc_token_counter = tokenize_document(text_content);
for (term, score) in doc_token_counter.iter() {
tokens
.entry(term.to_owned())
.and_modify(|v| v.push((fname.to_string(), *score)))
.or_insert(vec![(fname.to_string(), *score)]);
term_index
.entry(fname.to_owned())
.and_modify(|v| v.push(term.clone()))
.or_insert(vec![term.clone()]);
}
}
});
self.tokens = tokens;
self.file_index = term_index;
}
}
24 changes: 23 additions & 1 deletion libs/search-engine/src/indexer/mod.rs
@@ -1,8 +1,30 @@
use std::path::Path;
use std::{path::Path, collections::HashMap};

use crate::tokenizer::tokenize;

pub(crate) mod archive;
pub(crate) mod notebook;

pub(crate) trait Proccessor {
fn load(&mut self, location: &Path);
}
pub type DocTokenCount = HashMap<String, f32>;

pub fn tokenize_document(content: String) -> DocTokenCount {
let mut token_counter: DocTokenCount = HashMap::new();
let mut total_tokens = 0.0;
for line in content.lines() {
let raw_tokens = tokenize(line);
total_tokens += raw_tokens.len() as f32;
for token in raw_tokens {
token_counter
.entry(token)
.and_modify(|v| *v += 1.0)
.or_insert(1.0);
}
}
for (_, val) in token_counter.iter_mut() {
*val /= total_tokens;
}
token_counter
}
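tokenize_document normalizes per-document term counts by the total number of tokens, so every score is a term frequency in [0, 1]. A small usage sketch; the public module path and the tokenizer's whitespace/lowercasing behavior are assumptions:

// Module path assumed; tokenize_document is the function defined above.
use search_engine::indexer::tokenize_document;

fn main() {
    let doc = String::from("ash tree\nash grove");
    let counts = tokenize_document(doc);
    // If tokenize() lowercases and splits on whitespace, "ash" accounts for 2 of the
    // 4 tokens, so it would score 0.5 while "tree" and "grove" score 0.25 each.
    for (term, score) in &counts {
        println!("{term}: {score}");
    }
}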
33 changes: 14 additions & 19 deletions libs/search-engine/src/indexer/notebook.rs
@@ -1,47 +1,42 @@
use super::Proccessor;
use crate::{tokenizer::tokenize, Tokens};
use super::{Proccessor, tokenize_document};
use crate::Tokens;
use persistance::fs::path_to_string;
use serde::{Deserialize, Serialize};
use std::{collections::HashMap, fs::read_dir, path::Path};

#[derive(Default, Debug, Serialize, Deserialize)]
pub(crate) struct Notebook {
pub(crate) tokens: Tokens,
// filename, Vec<search_terms>
pub(crate) file_index: HashMap<String, Vec<String>>,
}

impl Proccessor for Notebook {
fn load(&mut self, location: &Path) {
let mut tokens: Tokens = HashMap::new();
let mut doc_token_counter: HashMap<String, f32> = HashMap::new();
// For some reason using tokio::read_dir never returns in the while loop
let mut term_index: HashMap<String, Vec<String>> = HashMap::new();
let entries = read_dir(location).unwrap();
entries.for_each(|entry| {
let entry = entry.unwrap();
if let Some(fname) = entry.file_name().to_str() {
if fname.ends_with(".txt") {
let title = fname.strip_suffix(".txt").unwrap();
let content = path_to_string(&entry.path()).unwrap();
let mut total_tokens = 0;
for line in content.lines() {
let raw_tokens = tokenize(line);
total_tokens += raw_tokens.len();
for token in raw_tokens {
doc_token_counter
.entry(token)
.and_modify(|v| *v += 1.)
.or_insert(1.);
}
}
for (term, count) in doc_token_counter.iter() {
let doc_token_counter = tokenize_document(content);
for (term, score) in doc_token_counter.iter() {
tokens
.entry(term.to_owned())
.and_modify(|v| v.push((title.to_string(), *count / total_tokens as f32)))
.or_insert(vec![(title.to_string(), *count / total_tokens as f32)]);
.and_modify(|v| v.push((title.to_string(), *score)))
.or_insert(vec![(title.to_string(), *score)]);
term_index
.entry(fname.to_owned())
.and_modify(|v| v.push(term.clone()))
.or_insert(vec![term.clone()]);
}
doc_token_counter.clear();
}
}
});
self.tokens = tokens;
self.file_index = term_index;
}
}
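Both indexers now record a file_index alongside the inverted token index, mapping each document to the terms it contributed. That reverse map is what makes incremental updates cheap: when a note changes, its old postings can be looked up and removed before the note is re-tokenized. A hedged sketch of such a removal step; this helper is illustrative and not part of the commit, and the Tokens shape is inferred from the code above:

use std::collections::HashMap;

// Assumed shapes, inferred from the indexer code in this diff.
type Tokens = HashMap<String, Vec<(String, f32)>>; // term -> (document, score) postings
type FileIndex = HashMap<String, Vec<String>>;     // document -> terms it contains

// Hypothetical helper: drop every posting for `doc_id` before re-indexing it.
fn remove_doc(tokens: &mut Tokens, file_index: &mut FileIndex, doc_id: &str) {
    if let Some(terms) = file_index.remove(doc_id) {
        for term in terms {
            let now_empty = match tokens.get_mut(&term) {
                Some(postings) => {
                    postings.retain(|(doc, _)| doc != doc_id);
                    postings.is_empty()
                }
                None => false,
            };
            if now_empty {
                tokens.remove(&term);
            }
        }
    }
}

After the stale postings are gone, the note can be run through tokenize_document again and its fresh scores inserted.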
