From a4596d9adf22eaced0c3c2a0450af487e370218a Mon Sep 17 00:00:00 2001
From: Joshua Fleming
Date: Fri, 22 Dec 2023 08:56:37 -0500
Subject: [PATCH] Initial Rust project setup and POC implementation of the Forthic tokenizer

---
Review notes (this commentary area is ignored when the patch is applied):
  * The DLE quote character is '\x10' (decimal 16); '\x16' is ASCII SYN.
  * interpreter_error! names the struct through $crate so it expands in other crates.
  * The invalid-definition-name error reports the offending character.
  * main.rs uses the library crate instead of re-declaring its modules.
  * Tests fail on tokenizer errors instead of silently dropping them.
  * The blob ids on the "index" lines of the edited files predate these edits.

 Makefile                             |   6 +
 forthic-rs/.gitignore                |   2 +
 forthic-rs/Cargo.lock                |  86 ++++++++
 forthic-rs/Cargo.toml                |  11 +
 forthic-rs/src/errors.rs             |  37 ++++
 forthic-rs/src/lib.rs                |   3 +
 forthic-rs/src/main.rs               |  15 ++
 forthic-rs/src/token.rs              |  26 +++
 forthic-rs/src/tokenizer.rs          | 314 +++++++++++++++++++++++++++
 tests/tests_rs/.gitignore            |   2 +
 tests/tests_rs/Cargo.lock            |  93 +++++++++
 tests/tests_rs/Cargo.toml            |   9 +
 tests/tests_rs/src/lib.rs            |   1 +
 tests/tests_rs/src/test_tokenizer.rs | 115 ++++++++++
 14 files changed, 720 insertions(+)
 create mode 100644 forthic-rs/.gitignore
 create mode 100644 forthic-rs/Cargo.lock
 create mode 100644 forthic-rs/Cargo.toml
 create mode 100644 forthic-rs/src/errors.rs
 create mode 100644 forthic-rs/src/lib.rs
 create mode 100644 forthic-rs/src/main.rs
 create mode 100644 forthic-rs/src/token.rs
 create mode 100644 forthic-rs/src/tokenizer.rs
 create mode 100644 tests/tests_rs/.gitignore
 create mode 100644 tests/tests_rs/Cargo.lock
 create mode 100644 tests/tests_rs/Cargo.toml
 create mode 100644 tests/tests_rs/src/lib.rs
 create mode 100644 tests/tests_rs/src/test_tokenizer.rs

diff --git a/Makefile b/Makefile
index f57f280..2a62623 100644
--- a/Makefile
+++ b/Makefile
@@ -43,6 +43,12 @@ test-react:
 	@echo "============"
 	cd forthic-react/v1 && npm install && CI=1 npm run test
 
+test-rs:
+	@echo
+	@echo "Forthic Rust tests"
+	@echo "============"
+	cargo test --manifest-path tests/tests_rs/Cargo.toml
+
 test-all: test-py test-react test-js
 
 
diff --git a/forthic-rs/.gitignore b/forthic-rs/.gitignore
new file mode 100644
index 0000000..5f32e70
--- /dev/null
+++ b/forthic-rs/.gitignore
@@ -0,0 +1,2 @@
+target/
+.env
\ No newline at end of file
diff --git a/forthic-rs/Cargo.lock b/forthic-rs/Cargo.lock
new file mode 100644
index 0000000..a2ee47d
--- /dev/null
+++ b/forthic-rs/Cargo.lock
@@ -0,0 +1,86 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "either"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
+
+[[package]]
+name = "enum_dispatch"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e"
+dependencies = [
+ "once_cell",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "forthic-rs"
+version = "0.1.0"
+dependencies = [
+ "enum_dispatch",
+ "itertools",
+ "lazy_static",
+]
+
+[[package]]
+name = "itertools"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25db6b064527c5d482d0423354fcd07a89a2dfe07b67892e62411946db7f07b0"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "once_cell"
+version = "1.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.71"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75cb1540fadbd5b8fbccc4dddad2734eba435053f725621c070711a14bb5f4b8"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.42"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b7d0a2c048d661a1a59fcd7355baa232f7ed34e0ee4df2eef3c1c1c0d3852d8"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
diff --git a/forthic-rs/Cargo.toml b/forthic-rs/Cargo.toml
new file mode 100644
index 0000000..6eadace
--- /dev/null
+++ b/forthic-rs/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "forthic-rs"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+enum_dispatch = "0.3.12"
+itertools = "0.12.0"
+lazy_static = "1.4"
diff --git a/forthic-rs/src/errors.rs b/forthic-rs/src/errors.rs
new file mode 100644
index 0000000..97131ef
--- /dev/null
+++ b/forthic-rs/src/errors.rs
@@ -0,0 +1,37 @@
+use std::fmt;
+
+/// Error type returned by the tokenizer.
+///
+/// `filename` and `line` hold the Rust source location where the error was
+/// built (the `interpreter_error!` macro fills them from `file!()` and
+/// `line!()`); they are not a position in the Forthic input.
+#[derive(Debug)]
+pub struct InterpreterError {
+    pub message: String,
+    pub filename: String,
+    pub line: u32,
+}
+
+impl fmt::Display for InterpreterError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{} ({}:{})", self.message, self.filename, self.line)
+    }
+}
+
+impl std::error::Error for InterpreterError {}
+
+/// Builds an `InterpreterError` whose message is `$msg.to_string()`, recording
+/// the call site with `file!()` and `line!()`.
+///
+/// The struct is named through `$crate` so the macro also expands in crates
+/// that have not imported `InterpreterError` themselves.
+#[macro_export]
+macro_rules! interpreter_error {
+    ($msg:expr) => {
+        $crate::errors::InterpreterError {
+            message: $msg.to_string(),
+            filename: file!().to_string(),
+            line: line!(),
+        }
+    };
+}
diff --git a/forthic-rs/src/lib.rs b/forthic-rs/src/lib.rs
new file mode 100644
index 0000000..ff01605
--- /dev/null
+++ b/forthic-rs/src/lib.rs
@@ -0,0 +1,3 @@
+pub mod errors;
+pub mod token;
+pub mod tokenizer;
diff --git a/forthic-rs/src/main.rs b/forthic-rs/src/main.rs
new file mode 100644
index 0000000..448bdca
--- /dev/null
+++ b/forthic-rs/src/main.rs
@@ -0,0 +1,15 @@
+use forthic_rs::tokenizer::Tokenizer;
+
+/// Demo entry point: prints the first token of a fixed sample string.
+fn main() {
+    // The modules come from the library crate; declaring them again here with
+    // `mod` would compile a second copy of them into the binary.
+    let mut lexer = Tokenizer::new("Hello, world!");
+    match lexer.next_token() {
+        Ok(token) => println!("{:?}", token),
+        Err(err) => {
+            eprintln!("{}", err);
+            std::process::exit(1);
+        }
+    }
+}
diff --git a/forthic-rs/src/token.rs b/forthic-rs/src/token.rs
new file mode 100644
index 0000000..d620acb
--- /dev/null
+++ b/forthic-rs/src/token.rs
@@ -0,0 +1,26 @@
+/// A lexical token produced by the tokenizer.
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub enum Token {
+    /// Quoted string; the payload excludes the quote delimiters.
+    String(String),
+    /// `#` comment; the payload is the rest of the line after `#`.
+    Comment(String),
+    /// `[`
+    StartArray,
+    /// `]`
+    EndArray,
+    /// `{` followed by a module name (empty when none is given).
+    StartModule(String),
+    /// `}`
+    EndModule,
+    /// `:` followed by the definition name.
+    StartDefinition(String),
+    /// `;`
+    EndDefinition,
+    /// `@:` followed by the memo name.
+    StartMemo(String),
+    /// Any other run of characters.
+    Word(String),
+    /// End of the input string.
+    EOS,
+}
diff --git a/forthic-rs/src/tokenizer.rs b/forthic-rs/src/tokenizer.rs
new file mode 100644
index 0000000..96a7bd2
--- /dev/null
+++ b/forthic-rs/src/tokenizer.rs
@@ -0,0 +1,314 @@
+use itertools::Itertools;
+use lazy_static::lazy_static;
+use std::collections::HashSet;
+
+use crate::errors::InterpreterError;
+use crate::interpreter_error;
+use crate::token::Token;
+
+lazy_static! {
+    /// Characters that open and close string literals. `'\x10'` is ASCII DLE
+    /// (Data Link Escape, decimal 16), a quote character that cannot be typed.
+    static ref QUOTE_CHARS: HashSet<char> = HashSet::from(['"', '\'', '^', '\x10']);
+    /// Characters skipped between tokens; parentheses are treated as whitespace.
+    static ref WHITESPACE_CHARS: HashSet<char> = HashSet::from([' ', '\t', '\n', '\r', '(', ')']);
+}
+
+/// Splits Forthic source text into tokens, one per `next_token` call.
+pub struct Tokenizer {
+    /// Input as individual chars so it can be indexed by position.
+    source: Vec<char>,
+    /// Index into `source` of the next char to read.
+    position: usize,
+    /// Chars gathered so far for the token currently being built.
+    token_buffer: Vec<char>,
+}
+
+impl Tokenizer {
+    /// A Tokenizer is constructed with an input string and returns the next available token on request.
+    pub fn new(source: &str) -> Tokenizer {
+        Tokenizer {
+            source: source.chars().collect_vec(),
+            position: 0,
+            token_buffer: vec![],
+        }
+    }
+
+    /// Returns the next token in the input string.
+    /// If the end of the string is reached, returns Token::EOS.
+    /// If an error occurs, returns an InterpreterError.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use forthic_rs::tokenizer::Tokenizer;
+    /// use forthic_rs::token::Token;
+    ///
+    /// let mut tokenizer = Tokenizer::new("1 2 3");
+    /// assert_eq!(tokenizer.next_token().unwrap(), Token::Word("1".to_string()));
+    /// assert_eq!(tokenizer.next_token().unwrap(), Token::Word("2".to_string()));
+    /// assert_eq!(tokenizer.next_token().unwrap(), Token::Word("3".to_string()));
+    /// assert_eq!(tokenizer.next_token().unwrap(), Token::EOS);
+    /// ```
+    pub fn next_token(&mut self) -> Result<Token, InterpreterError> {
+        self.clear_token_buffer();
+        self.transition_from_start()
+    }
+
+    fn clear_token_buffer(&mut self) {
+        self.token_buffer.clear();
+    }
+
+    /// Returns the gathered chars as a `String`; the buffer is left untouched.
+    fn flatten_token_buffer(&self) -> String {
+        self.token_buffer.iter().collect()
+    }
+
+    fn is_whitespace(&self, c: &char) -> bool {
+        WHITESPACE_CHARS.contains(c)
+    }
+
+    fn is_quote(&self, c: &char) -> bool {
+        QUOTE_CHARS.contains(c)
+    }
+
+    /// True when `c` is a quote char and the two chars after `index` repeat it.
+    fn is_triple_quote(&self, index: usize, c: &char) -> bool {
+        if !self.is_quote(c) {
+            return false;
+        }
+
+        if index + 2 >= self.source.len() {
+            return false;
+        }
+
+        self.source[index + 1] == *c && self.source[index + 2] == *c
+    }
+
+    /// True when the chars at `index` and `index + 1` are `@:`.
+    fn is_start_memo(&self, index: usize) -> bool {
+        if index + 1 >= self.source.len() {
+            return false;
+        }
+
+        self.source[index] == '@' && self.source[index + 1] == ':'
+    }
+
+    /// Tokenization is implemented as a state machine. This is the entry point.
+    fn transition_from_start(&mut self) -> Result<Token, InterpreterError> {
+        while self.position < self.source.len() {
+            let c = self.source[self.position];
+            self.position += 1;
+            if self.is_whitespace(&c) {
+                continue;
+            } else if c == '#' {
+                return self.transition_from_comment();
+            } else if c == ':' {
+                return self.transition_from_start_definition();
+            } else if self.is_start_memo(self.position - 1) {
+                // Skip over ":" in "@:"
+                self.position += 1;
+                return self.transition_from_start_memo();
+            } else if c == ';' {
+                return Ok(Token::EndDefinition);
+            } else if c == '[' {
+                return Ok(Token::StartArray);
+            } else if c == ']' {
+                return Ok(Token::EndArray);
+            } else if c == '{' {
+                return self.transition_from_gather_module();
+            } else if c == '}' {
+                return Ok(Token::EndModule);
+            } else if self.is_triple_quote(self.position - 1, &c) {
+                // Skip over 2nd and 3rd quote chars
+                self.position += 2;
+                return self.transition_from_gather_triple_quote(c);
+            } else if self.is_quote(&c) {
+                return self.transition_from_gather_string(c);
+            } else {
+                // Back up to beginning of word
+                self.position -= 1;
+                return self.transition_from_gather_word();
+            }
+        }
+        Ok(Token::EOS)
+    }
+
+    /// Gathers the rest of the line, including the newline if present.
+    fn transition_from_comment(&mut self) -> Result<Token, InterpreterError> {
+        while self.position < self.source.len() {
+            let c = self.source[self.position];
+            self.token_buffer.push(c);
+            self.position += 1;
+            if c == '\n' {
+                break;
+            }
+        }
+
+        Ok(Token::Comment(self.flatten_token_buffer()))
+    }
+
+    /// Skips whitespace after `:`, then gathers the definition name.
+    fn transition_from_start_definition(&mut self) -> Result<Token, InterpreterError> {
+        while self.position < self.source.len() {
+            let c = self.source[self.position];
+            self.position += 1;
+
+            if self.is_whitespace(&c) {
+                continue;
+            } else {
+                self.position -= 1;
+                return self.transition_from_gather_definition_name();
+            }
+        }
+
+        Err(interpreter_error!("Got EOS in START_DEFINITION"))
+    }
+
+    /// Skips whitespace after `@:`, then gathers the memo name.
+    fn transition_from_start_memo(&mut self) -> Result<Token, InterpreterError> {
+        while self.position < self.source.len() {
+            let c = self.source[self.position];
+            self.position += 1;
+
+            if self.is_whitespace(&c) {
+                continue;
+            } else {
+                self.position -= 1;
+                return self.transition_from_gather_memo_name();
+            }
+        }
+
+        Err(interpreter_error!("Got EOS in START_MEMO"))
+    }
+
+    /// Gathers a definition or memo name into the token buffer, stopping at
+    /// whitespace or end of input.
+    ///
+    /// Quotes and the characters `[`, `]`, `{`, `}` are rejected; the error
+    /// message names the offending character.
+    fn gather_definition_name(&mut self) -> Result<(), InterpreterError> {
+        while self.position < self.source.len() {
+            let c = self.source[self.position];
+            self.position += 1;
+
+            if self.is_whitespace(&c) {
+                break;
+            } else if self.is_quote(&c) {
+                return Err(interpreter_error!("Definitions can't have quotes in them"));
+            } else if matches!(c, '[' | ']' | '{' | '}') {
+                return Err(interpreter_error!(format!("Definitions can't have '{}' in them", c)));
+            } else {
+                self.token_buffer.push(c);
+            }
+        }
+
+        Ok(())
+    }
+
+    fn transition_from_gather_definition_name(&mut self) -> Result<Token, InterpreterError> {
+        self.gather_definition_name()?;
+        Ok(Token::StartDefinition(self.flatten_token_buffer()))
+    }
+
+    fn transition_from_gather_memo_name(&mut self) -> Result<Token, InterpreterError> {
+        self.gather_definition_name()?;
+        Ok(Token::StartMemo(self.flatten_token_buffer()))
+    }
+
+    /// Gathers the module name after `{`, stopping at whitespace or before `}`.
+    fn transition_from_gather_module(&mut self) -> Result<Token, InterpreterError> {
+        while self.position < self.source.len() {
+            let c = self.source[self.position];
+            self.position += 1;
+
+            if self.is_whitespace(&c) {
+                break;
+            } else if c == '}' {
+                self.position -= 1;
+                break;
+            } else {
+                self.token_buffer.push(c);
+            }
+        }
+
+        Ok(Token::StartModule(self.flatten_token_buffer()))
+    }
+
+    /// Gathers chars until three consecutive `delimiter` chars close the string.
+    fn transition_from_gather_triple_quote(
+        &mut self,
+        delimiter: char,
+    ) -> Result<Token, InterpreterError> {
+        while self.position < self.source.len() {
+            let c = self.source[self.position];
+
+            if c == delimiter && self.is_triple_quote(self.position, &c) {
+                self.position += 3;
+                return Ok(Token::String(self.flatten_token_buffer()));
+            } else {
+                self.position += 1;
+                self.token_buffer.push(c);
+            }
+        }
+
+        Err(interpreter_error!("Unterminated triple quoted string"))
+    }
+
+    /// Gathers chars until the next `delimiter` closes the string.
+    fn transition_from_gather_string(
+        &mut self,
+        delimiter: char,
+    ) -> Result<Token, InterpreterError> {
+        while self.position < self.source.len() {
+            let c = self.source[self.position];
+            self.position += 1;
+
+            if c == delimiter {
+                return Ok(Token::String(self.flatten_token_buffer()));
+            } else {
+                self.token_buffer.push(c);
+            }
+        }
+
+        Err(interpreter_error!(format!(
+            "Unterminated string ({}), {}",
+            delimiter,
+            self.flatten_token_buffer()
+        )))
+    }
+
+    /// Gathers a word, stopping at whitespace or before `;`, `[`, `]`, `}`.
+    fn transition_from_gather_word(&mut self) -> Result<Token, InterpreterError> {
+        while self.position < self.source.len() {
+            let c = self.source[self.position];
+            self.position += 1;
+
+            if self.is_whitespace(&c) {
+                break;
+            } else if matches!(c, ';' | '[' | ']' | '}') {
+                self.position -= 1;
+                break;
+            } else {
+                self.token_buffer.push(c);
+            }
+        }
+
+        Ok(Token::Word(self.flatten_token_buffer()))
+    }
+}
+
+/// Iterator implementation for Tokenizer.
+///
+/// Yields each token in turn and ends at `Token::EOS`; errors are yielded as
+/// `Err` items rather than ending iteration.
+impl Iterator for Tokenizer {
+    type Item = Result<Token, InterpreterError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.next_token() {
+            Ok(Token::EOS) => None,
+            result => Some(result),
+        }
+    }
+}
diff --git a/tests/tests_rs/.gitignore b/tests/tests_rs/.gitignore
new file mode 100644
index 0000000..5f32e70
--- /dev/null
+++ b/tests/tests_rs/.gitignore
@@ -0,0 +1,2 @@
+target/
+.env
\ No newline at end of file
diff --git a/tests/tests_rs/Cargo.lock b/tests/tests_rs/Cargo.lock
new file mode 100644
index 0000000..fdbe012
--- /dev/null
+++ b/tests/tests_rs/Cargo.lock
@@ -0,0 +1,93 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "either"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
+
+[[package]]
+name = "enum_dispatch"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e"
+dependencies = [
+ "once_cell",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "forthic-rs"
+version = "0.1.0"
+dependencies = [
+ "enum_dispatch",
+ "itertools",
+ "lazy_static",
+]
+
+[[package]]
+name = "itertools"
+version = "0.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25db6b064527c5d482d0423354fcd07a89a2dfe07b67892e62411946db7f07b0"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "once_cell"
+version = "1.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.71"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75cb1540fadbd5b8fbccc4dddad2734eba435053f725621c070711a14bb5f4b8"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.42"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b7d0a2c048d661a1a59fcd7355baa232f7ed34e0ee4df2eef3c1c1c0d3852d8"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "tests_rs"
+version = "0.1.0"
+dependencies = [
+ "forthic-rs",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
diff --git a/tests/tests_rs/Cargo.toml b/tests/tests_rs/Cargo.toml
new file mode 100644
index 0000000..f8976d9
--- /dev/null
+++ b/tests/tests_rs/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "tests_rs"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+forthic-rs = { path="../../forthic-rs" }
diff --git a/tests/tests_rs/src/lib.rs b/tests/tests_rs/src/lib.rs
new file mode 100644
index 0000000..474e7ff
--- /dev/null
+++ b/tests/tests_rs/src/lib.rs
@@ -0,0 +1 @@
+mod test_tokenizer;
diff --git a/tests/tests_rs/src/test_tokenizer.rs b/tests/tests_rs/src/test_tokenizer.rs
new file mode 100644
index 0000000..61d6892
--- /dev/null
+++ b/tests/tests_rs/src/test_tokenizer.rs
@@ -0,0 +1,115 @@
+#[cfg(test)]
+mod tests {
+    use forthic_rs::token::Token;
+    use forthic_rs::tokenizer::Tokenizer;
+
+    /// Tokenizes `source` to the end, panicking on the first tokenizer error
+    /// so that failures are reported instead of being silently dropped.
+    fn tokenize(source: &str) -> Vec<Token> {
+        Tokenizer::new(source)
+            .collect::<Result<Vec<Token>, _>>()
+            .expect("tokenizer returned an error")
+    }
+
+    #[test]
+    fn test_basic() {
+        assert_eq!(
+            tokenize("[ ] : DEFINITION ; { } '' WORD @: MEMO"),
+            vec![
+                Token::StartArray,
+                Token::EndArray,
+                Token::StartDefinition("DEFINITION".to_string()),
+                Token::EndDefinition,
+                Token::StartModule("".to_string()),
+                Token::EndModule,
+                Token::String("".to_string()),
+                Token::Word("WORD".to_string()),
+                Token::StartMemo("MEMO".to_string()),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_end_definition() {
+        assert_eq!(
+            tokenize("WORD; WORD2"),
+            vec![
+                Token::Word("WORD".to_string()),
+                Token::EndDefinition,
+                Token::Word("WORD2".to_string()),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_start_module() {
+        assert_eq!(
+            tokenize("{ {my-mod"),
+            vec![
+                Token::StartModule("".to_string()),
+                Token::StartModule("my-mod".to_string()),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_strings() {
+        // "\x10" is ASCII DLE (Data Link Escape).
+        assert_eq!(
+            tokenize(
+                "'Single' ^Caret^ '''Triple Single''' ^^^Triple Caret^^^ \x10Single DLE\x10"
+            ),
+            vec![
+                Token::String("Single".to_string()),
+                Token::String("Caret".to_string()),
+                Token::String("Triple Single".to_string()),
+                Token::String("Triple Caret".to_string()),
+                Token::String("Single DLE".to_string()),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_arrays() {
+        assert_eq!(
+            tokenize("[1 2] [3[4]]"),
+            vec![
+                Token::StartArray,
+                Token::Word("1".to_string()),
+                Token::Word("2".to_string()),
+                Token::EndArray,
+                Token::StartArray,
+                Token::Word("3".to_string()),
+                Token::StartArray,
+                Token::Word("4".to_string()),
+                Token::EndArray,
+                Token::EndArray,
+            ]
+        );
+    }
+
+    #[test]
+    fn test_end_module() {
+        assert_eq!(
+            tokenize("WORD1}WORD2"),
+            vec![
+                Token::Word("WORD1".to_string()),
+                Token::EndModule,
+                Token::Word("WORD2".to_string()),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_invalid_definition_name() {
+        let mut tokenizer = Tokenizer::new(": BAD[NAME");
+        let err = tokenizer.next_token().unwrap_err();
+        assert_eq!(err.message, "Definitions can't have '[' in them");
+    }
+
+    #[test]
+    fn test_unterminated_string() {
+        let mut tokenizer = Tokenizer::new("'abc");
+        assert!(tokenizer.next_token().is_err());
+    }
+}