diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4e16833..11ac80a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,8 +25,8 @@ jobs: target: x86_64 - runner: ubuntu-latest target: x86 - # - runner: ubuntu-latest - # target: aarch64 + - runner: ubuntu-latest + target: aarch64 - runner: ubuntu-latest target: armv7 - runner: ubuntu-latest @@ -171,7 +171,7 @@ jobs: with: subject-path: 'wheels-*/*' - name: Publish to PyPI - if: "startsWith(github.ref, 'refs/tags/')" + if: startsWith(github.ref, 'refs/tags/') uses: PyO3/maturin-action@v1 env: MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} diff --git a/Cargo.lock b/Cargo.lock index 74fd9af..8008ca5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -19,9 +19,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.91" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c042108f3ed77fd83760a5fd79b53be043192bb3b9dba91d8c574c0ada7850c8" +checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" [[package]] name = "autocfg" @@ -88,9 +88,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" dependencies = [ "csv-core", "itoa", @@ -255,6 +255,12 @@ dependencies = [ "encoding_rs", ] +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.3.9" @@ -319,6 +325,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "hashbrown" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" + [[package]] name = "heck" version = "0.5.0" @@ -341,6 +353,16 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "indexmap" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +dependencies = [ + "equivalent", + "hashbrown", +] + [[package]] name = "indoc" version = "2.0.5" @@ -370,9 +392,9 @@ checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" [[package]] name = "lindera" -version = "0.35.0" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7739050058bc965a56d9f5c8bf5fb2896420eadc105602394da47576d9438a3d" +checksum = "63eb3476710bb8a29b92486ee926048a72e9e2b8c2a7dbfece4176466abfa309" dependencies = [ "anyhow", "bincode", @@ -389,6 +411,7 @@ dependencies = [ "regex", "serde", "serde_json", + "serde_yaml", "strum", "strum_macros", "unicode-blocks", @@ -399,9 +422,9 @@ dependencies = [ [[package]] name = "lindera-cc-cedict" -version = "0.35.0" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf09154d3491be7f81ca2b74f7850b8d501d5a7890f578947663a690cfa19988" +checksum = "20470166621d673664b06bb15981ccbf2539ca8ed65365942ae9bf05b844bdb4" dependencies = [ "bincode", "byteorder", @@ -411,9 +434,9 @@ dependencies = [ [[package]] name = 
"lindera-dictionary" -version = "0.35.0" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a39d058c9749a7f16e91c2e94c7abfe8c3a57f884eb51ce201b53287ae52130" +checksum = "1bc3fbf42ddaccebe1e0d2c2ea44e9e8965ae36e927d01b9a6e473f539cd3052" dependencies = [ "anyhow", "bincode", @@ -436,9 +459,9 @@ dependencies = [ [[package]] name = "lindera-ipadic" -version = "0.35.0" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40926f3d1462a31a88d6a225271abebd369d24e20f18d7c4e14ffb28e8bfd719" +checksum = "106272982af9e0c66f2a81e2f68bbb2b6e5529546e3c87230838100d1722e31f" dependencies = [ "bincode", "byteorder", @@ -448,9 +471,9 @@ dependencies = [ [[package]] name = "lindera-ipadic-neologd" -version = "0.35.0" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6686b379594d64dcaa4eb877a2f6652b031232e7a1bc6f5d63cde9b060705225" +checksum = "2e033e29de8ba3e98229a4ec97253cee9a8d726ed648568c4863a81f4f9b40c1" dependencies = [ "bincode", "byteorder", @@ -460,9 +483,9 @@ dependencies = [ [[package]] name = "lindera-ko-dic" -version = "0.35.0" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f7105d218a89c3a21f1b04fb855d4ef50c7e2a0588575c22d8ecb1fba8508bc" +checksum = "f68eae55c3a78a9c6e0c949e945d5fa441cc55c0b449848a02a181a63c5be233" dependencies = [ "bincode", "byteorder", @@ -472,7 +495,7 @@ dependencies = [ [[package]] name = "lindera-py" -version = "0.35.0" +version = "0.38.0" dependencies = [ "lindera", "pyo3", @@ -482,9 +505,9 @@ dependencies = [ [[package]] name = "lindera-unidic" -version = "0.35.0" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db0ccb99aa1cc9ab3bb268b69a3e7f255951b3a611a4f8de609246c5a3be238c" +checksum = "39bdf52908175074aea27619c170cf07c8429c51aa4f76abc100df1a357671fc" dependencies = [ "bincode", "byteorder", @@ -557,9 +580,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d922163ba1f79c04bc49073ba7b32fd5a8d3b76a87c955921234b8e77333c51" +checksum = "f402062616ab18202ae8319da13fa4279883a2b8a9d9f83f20dbade813ce1884" dependencies = [ "cfg-if", "indoc", @@ -575,9 +598,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc38c5feeb496c8321091edf3d63e9a6829eab4b863b4a6a65f26f3e9cc6b179" +checksum = "b14b5775b5ff446dd1056212d778012cbe8a0fbffd368029fd9e25b514479c38" dependencies = [ "once_cell", "target-lexicon", @@ -585,9 +608,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94845622d88ae274d2729fcefc850e63d7a3ddff5e3ce11bd88486db9f1d357d" +checksum = "9ab5bcf04a2cdcbb50c7d6105de943f543f9ed92af55818fd17b660390fc8636" dependencies = [ "libc", "pyo3-build-config", @@ -595,9 +618,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e655aad15e09b94ffdb3ce3d217acf652e26bbc37697ef012f5e5e348c716e5e" +checksum = "0fd24d897903a9e6d80b968368a34e1525aeb719d568dba8b3d4bfa5dc67d453" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -607,9 +630,9 @@ dependencies = [ [[package]] name = 
"pyo3-macros-backend" -version = "0.22.5" +version = "0.22.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1e3f09eecd94618f60a455a23def79f79eba4dc561a97324bf9ac8c6df30ce" +checksum = "36c011a03ba1e50152b4b394b479826cad97e7a21eb52df179cd91ac411cbfbe" dependencies = [ "heck", "proc-macro2", @@ -739,18 +762,18 @@ checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" [[package]] name = "serde" -version = "1.0.213" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ea7893ff5e2466df8d720bb615088341b295f849602c6956047f8f80f0e9bc1" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.213" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e85ad2009c50b58e87caa8cd6dac16bdf511bbfb7af6c33df902396aa480fa5" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", @@ -769,6 +792,19 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "shlex" version = "1.3.0" @@ -817,9 +853,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.82" +version = "2.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83540f837a8afc019423a8edb95b52a8effe46957ee402287f4292fae35be021" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" dependencies = [ "proc-macro2", "quote", @@ -828,9 +864,9 @@ dependencies = [ [[package]] name = "tar" -version = "0.4.41" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb797dad5fb5b76fcf519e702f4a589483b5ef06567f160c392832c1f5e44909" +checksum = "c65998313f8e17d0d553d28f91a0df93e4dbbbf770279c7bc21ca0f09ea1a1f6" dependencies = [ "filetime", "libc", @@ -845,18 +881,18 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "thiserror" -version = "1.0.65" +version = "2.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d11abd9594d9b38965ef50805c5e469ca9cc6f197f883f717e0269a3057b3d5" +checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.65" +version = "2.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae71770322cbd277e69d762a16c444af02aa0575ac0d174f0b9562d3b37f8602" +checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568" dependencies = [ "proc-macro2", "quote", @@ -917,6 +953,12 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "untrusted" version = "0.9.0" diff --git a/Cargo.toml b/Cargo.toml index a5970a0..b6e79c2 100644 --- 
a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lindera-py" -version = "0.35.0" +version = "0.38.0" edition = "2021" description = "Python binding for Lindera." documentation = "https://docs.rs/lindera-py" @@ -25,8 +25,8 @@ cc-cedict = ["lindera/cc-cedict"] # Include CC-CEDICT dictionary (Chinese) compress = ["lindera/compress"] # Compress dictionaries [dependencies] -pyo3 = { version = "0.22.5", features = ["extension-module"] } -serde = { version = "1.0.213", features = ["derive"] } +pyo3 = { version = "0.22.6", features = ["extension-module"] } +serde = { version = "1.0.214", features = ["derive"] } serde_json = "1.0.132" -lindera = "0.35.0" +lindera = "0.38.0" diff --git a/README.md b/README.md index cc7cbcc..883644c 100644 --- a/README.md +++ b/README.md @@ -46,13 +46,18 @@ This command takes a long time because it builds a library that includes all the ## Example code ```python -from lindera import load_dictionary # type: ignore -from lindera import Tokenizer +from lindera import Segmenter, Tokenizer, load_dictionary def main(): + # load the dictionary dictionary = load_dictionary("ipadic") - tokenizer = Tokenizer("normal", dictionary) + + # create a segmenter + segmenter = Segmenter("normal", dictionary) + + # create a tokenizer + tokenizer = Tokenizer(segmenter) text = "関西国際空港限定トートバッグを東京スカイツリーの最寄り駅であるとうきょうスカイツリー駅で買う" print(f"text: {text}\n") diff --git a/examples/tokenize_ipadic.py b/examples/tokenize.py similarity index 64% rename from examples/tokenize_ipadic.py rename to examples/tokenize.py index c166965..a090c55 100644 --- a/examples/tokenize_ipadic.py +++ b/examples/tokenize.py @@ -1,10 +1,15 @@ -from lindera import load_dictionary # type: ignore -from lindera import Tokenizer +from lindera import Segmenter, Tokenizer, load_dictionary def main(): + # load the dictionary dictionary = load_dictionary("ipadic") - tokenizer = Tokenizer("normal", dictionary) + + # create a segmenter + segmenter = Segmenter("normal", dictionary) + + # create a tokenizer + tokenizer = Tokenizer(segmenter) text = "関西国際空港限定トートバッグを東京スカイツリーの最寄り駅であるとうきょうスカイツリー駅で買う" print(f"text: {text}\n") diff --git a/examples/tokenize_ipadic_decompose.py b/examples/tokenize_with_decompose.py similarity index 63% rename from examples/tokenize_ipadic_decompose.py rename to examples/tokenize_with_decompose.py index be4d464..560f9e1 100644 --- a/examples/tokenize_ipadic_decompose.py +++ b/examples/tokenize_with_decompose.py @@ -1,10 +1,15 @@ -from lindera import load_dictionary # type: ignore -from lindera import Tokenizer +from lindera import Segmenter, Tokenizer, load_dictionary def main(): + # load the dictionary dictionary = load_dictionary("ipadic") - tokenizer = Tokenizer("decompose", dictionary) + + # create a segmenter + segmenter = Segmenter("decompose", dictionary) + + # create a tokenizer + tokenizer = Tokenizer(segmenter) text = "関西国際空港限定トートバッグを東京スカイツリーの最寄り駅であるとうきょうスカイツリー駅で買う" print(f"text: {text}\n") diff --git a/examples/tokenize_ipadic_filters.py b/examples/tokenize_with_filters.py similarity index 81% rename from examples/tokenize_ipadic_filters.py rename to examples/tokenize_with_filters.py index f945f15..13c2841 100644 --- a/examples/tokenize_ipadic_filters.py +++ b/examples/tokenize_with_filters.py @@ -1,17 +1,22 @@ -from lindera import load_dictionary # type: ignore -from lindera import Tokenizer +from lindera import Segmenter, Tokenizer, load_dictionary def main(): + # load the dictionary dictionary = load_dictionary("ipadic") - tokenizer = Tokenizer("normal", dictionary) + # create a 
segmenter + segmenter = Segmenter("normal", dictionary) + + # create a tokenizer + tokenizer = Tokenizer(segmenter) + + # append character filters tokenizer.append_character_filter("unicode_normalize", **{"kind": "nfkc"}) - tokenizer.append_character_filter( - "japanese_iteration_mark", **{"normalize_kanji": True, "normalize_kana": True} - ) + tokenizer.append_character_filter("japanese_iteration_mark", **{"normalize_kanji": True, "normalize_kana": True}) tokenizer.append_character_filter("mapping", **{"mapping": {"リンデラ": "lindera"}}) + # append token filters tokenizer.append_token_filter("japanese_katakana_stem", **{"min": 3}) tokenizer.append_token_filter( "japanese_stop_tags", diff --git a/examples/tokenize_ipadic_userdict.py b/examples/tokenize_with_userdict.py similarity index 58% rename from examples/tokenize_ipadic_userdict.py rename to examples/tokenize_with_userdict.py index 7375352..61d4f5b 100644 --- a/examples/tokenize_ipadic_userdict.py +++ b/examples/tokenize_with_userdict.py @@ -1,18 +1,23 @@ from pathlib import Path -from lindera import load_dictionary # type: ignore -from lindera import Tokenizer, load_user_dictionary +from lindera import Segmenter, Tokenizer, load_dictionary, load_user_dictionary project_root = Path(__file__).resolve().parent.parent def main(): + # load the dictionary dictionary = load_dictionary("ipadic") - user_dictionary_path = str( - project_root / Path("./resources/ipadic_simple_userdic.csv") - ) + + # load the user dictionary + user_dictionary_path = str(project_root / Path("./resources/ipadic_simple_userdic.csv")) user_dictionary = load_user_dictionary(user_dictionary_path, "ipadic") - tokenizer = Tokenizer("normal", dictionary, user_dictionary) + + # create a segmenter + segmenter = Segmenter("normal", dictionary, user_dictionary) + + # create a tokenizer + tokenizer = Tokenizer(segmenter) text = "関西国際空港限定トートバッグを東京スカイツリーの最寄り駅であるとうきょうスカイツリー駅で買う" print(f"text: {text}\n") diff --git a/poetry.lock b/poetry.lock index a0a82c2..ff4e285 100644 --- a/poetry.lock +++ b/poetry.lock @@ -179,43 +179,43 @@ files = [ [[package]] name = "mypy" -version = "1.12.1" +version = "1.13.0" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" files = [ - {file = "mypy-1.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3d7d4371829184e22fda4015278fbfdef0327a4b955a483012bd2d423a788801"}, - {file = "mypy-1.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f59f1dfbf497d473201356966e353ef09d4daec48caeacc0254db8ef633a28a5"}, - {file = "mypy-1.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b947097fae68004b8328c55161ac9db7d3566abfef72d9d41b47a021c2fba6b1"}, - {file = "mypy-1.12.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:96af62050971c5241afb4701c15189ea9507db89ad07794a4ee7b4e092dc0627"}, - {file = "mypy-1.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:d90da248f4c2dba6c44ddcfea94bb361e491962f05f41990ff24dbd09969ce20"}, - {file = "mypy-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1230048fec1380faf240be6385e709c8570604d2d27ec6ca7e573e3bc09c3735"}, - {file = "mypy-1.12.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:02dcfe270c6ea13338210908f8cadc8d31af0f04cee8ca996438fe6a97b4ec66"}, - {file = "mypy-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5a437c9102a6a252d9e3a63edc191a3aed5f2fcb786d614722ee3f4472e33f6"}, - {file = "mypy-1.12.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:186e0c8346efc027ee1f9acf5ca734425fc4f7dc2b60144f0fbe27cc19dc7931"}, - {file = "mypy-1.12.1-cp311-cp311-win_amd64.whl", hash = "sha256:673ba1140a478b50e6d265c03391702fa11a5c5aff3f54d69a62a48da32cb811"}, - {file = "mypy-1.12.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9fb83a7be97c498176fb7486cafbb81decccaef1ac339d837c377b0ce3743a7f"}, - {file = "mypy-1.12.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:389e307e333879c571029d5b93932cf838b811d3f5395ed1ad05086b52148fb0"}, - {file = "mypy-1.12.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:94b2048a95a21f7a9ebc9fbd075a4fcd310410d078aa0228dbbad7f71335e042"}, - {file = "mypy-1.12.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ee5932370ccf7ebf83f79d1c157a5929d7ea36313027b0d70a488493dc1b179"}, - {file = "mypy-1.12.1-cp312-cp312-win_amd64.whl", hash = "sha256:19bf51f87a295e7ab2894f1d8167622b063492d754e69c3c2fed6563268cb42a"}, - {file = "mypy-1.12.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d34167d43613ffb1d6c6cdc0cc043bb106cac0aa5d6a4171f77ab92a3c758bcc"}, - {file = "mypy-1.12.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:427878aa54f2e2c5d8db31fa9010c599ed9f994b3b49e64ae9cd9990c40bd635"}, - {file = "mypy-1.12.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fcde63ea2c9f69d6be859a1e6dd35955e87fa81de95bc240143cf00de1f7f81"}, - {file = "mypy-1.12.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d54d840f6c052929f4a3d2aab2066af0f45a020b085fe0e40d4583db52aab4e4"}, - {file = "mypy-1.12.1-cp313-cp313-win_amd64.whl", hash = "sha256:20db6eb1ca3d1de8ece00033b12f793f1ea9da767334b7e8c626a4872090cf02"}, - {file = "mypy-1.12.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b16fe09f9c741d85a2e3b14a5257a27a4f4886c171d562bc5a5e90d8591906b8"}, - {file = "mypy-1.12.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0dcc1e843d58f444fce19da4cce5bd35c282d4bde232acdeca8279523087088a"}, - {file = "mypy-1.12.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e10ba7de5c616e44ad21005fa13450cd0de7caaa303a626147d45307492e4f2d"}, - {file = "mypy-1.12.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0e6fe449223fa59fbee351db32283838a8fee8059e0028e9e6494a03802b4004"}, - {file = "mypy-1.12.1-cp38-cp38-win_amd64.whl", hash = "sha256:dc6e2a2195a290a7fd5bac3e60b586d77fc88e986eba7feced8b778c373f9afe"}, - {file = "mypy-1.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:de5b2a8988b4e1269a98beaf0e7cc71b510d050dce80c343b53b4955fff45f19"}, - {file = "mypy-1.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:843826966f1d65925e8b50d2b483065c51fc16dc5d72647e0236aae51dc8d77e"}, - {file = "mypy-1.12.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9fe20f89da41a95e14c34b1ddb09c80262edcc295ad891f22cc4b60013e8f78d"}, - {file = "mypy-1.12.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8135ffec02121a75f75dc97c81af7c14aa4ae0dda277132cfcd6abcd21551bfd"}, - {file = "mypy-1.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:a7b76fa83260824300cc4834a3ab93180db19876bce59af921467fd03e692810"}, - {file = "mypy-1.12.1-py3-none-any.whl", hash = "sha256:ce561a09e3bb9863ab77edf29ae3a50e65685ad74bba1431278185b7e5d5486e"}, - {file = "mypy-1.12.1.tar.gz", hash = "sha256:f5b3936f7a6d0e8280c9bdef94c7ce4847f5cdfc258fbb2c29a8c1711e8bb96d"}, + {file = "mypy-1.13.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:6607e0f1dd1fb7f0aca14d936d13fd19eba5e17e1cd2a14f808fa5f8f6d8f60a"}, + {file = "mypy-1.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a21be69bd26fa81b1f80a61ee7ab05b076c674d9b18fb56239d72e21d9f4c80"}, + {file = "mypy-1.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b2353a44d2179846a096e25691d54d59904559f4232519d420d64da6828a3a7"}, + {file = "mypy-1.13.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0730d1c6a2739d4511dc4253f8274cdd140c55c32dfb0a4cf8b7a43f40abfa6f"}, + {file = "mypy-1.13.0-cp310-cp310-win_amd64.whl", hash = "sha256:c5fc54dbb712ff5e5a0fca797e6e0aa25726c7e72c6a5850cfd2adbc1eb0a372"}, + {file = "mypy-1.13.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:581665e6f3a8a9078f28d5502f4c334c0c8d802ef55ea0e7276a6e409bc0d82d"}, + {file = "mypy-1.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3ddb5b9bf82e05cc9a627e84707b528e5c7caaa1c55c69e175abb15a761cec2d"}, + {file = "mypy-1.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:20c7ee0bc0d5a9595c46f38beb04201f2620065a93755704e141fcac9f59db2b"}, + {file = "mypy-1.13.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3790ded76f0b34bc9c8ba4def8f919dd6a46db0f5a6610fb994fe8efdd447f73"}, + {file = "mypy-1.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:51f869f4b6b538229c1d1bcc1dd7d119817206e2bc54e8e374b3dfa202defcca"}, + {file = "mypy-1.13.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:5c7051a3461ae84dfb5dd15eff5094640c61c5f22257c8b766794e6dd85e72d5"}, + {file = "mypy-1.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:39bb21c69a5d6342f4ce526e4584bc5c197fd20a60d14a8624d8743fffb9472e"}, + {file = "mypy-1.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:164f28cb9d6367439031f4c81e84d3ccaa1e19232d9d05d37cb0bd880d3f93c2"}, + {file = "mypy-1.13.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a4c1bfcdbce96ff5d96fc9b08e3831acb30dc44ab02671eca5953eadad07d6d0"}, + {file = "mypy-1.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0affb3a79a256b4183ba09811e3577c5163ed06685e4d4b46429a271ba174d2"}, + {file = "mypy-1.13.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a7b44178c9760ce1a43f544e595d35ed61ac2c3de306599fa59b38a6048e1aa7"}, + {file = "mypy-1.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5d5092efb8516d08440e36626f0153b5006d4088c1d663d88bf79625af3d1d62"}, + {file = "mypy-1.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2904956dac40ced10931ac967ae63c5089bd498542194b436eb097a9f77bc8"}, + {file = "mypy-1.13.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7bfd8836970d33c2105562650656b6846149374dc8ed77d98424b40b09340ba7"}, + {file = "mypy-1.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:9f73dba9ec77acb86457a8fc04b5239822df0c14a082564737833d2963677dbc"}, + {file = "mypy-1.13.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:100fac22ce82925f676a734af0db922ecfea991e1d7ec0ceb1e115ebe501301a"}, + {file = "mypy-1.13.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7bcb0bb7f42a978bb323a7c88f1081d1b5dee77ca86f4100735a6f541299d8fb"}, + {file = "mypy-1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bde31fc887c213e223bbfc34328070996061b0833b0a4cfec53745ed61f3519b"}, + {file = "mypy-1.13.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = 
"sha256:07de989f89786f62b937851295ed62e51774722e5444a27cecca993fc3f9cd74"}, + {file = "mypy-1.13.0-cp38-cp38-win_amd64.whl", hash = "sha256:4bde84334fbe19bad704b3f5b78c4abd35ff1026f8ba72b29de70dda0916beb6"}, + {file = "mypy-1.13.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0246bcb1b5de7f08f2826451abd947bf656945209b140d16ed317f65a17dc7dc"}, + {file = "mypy-1.13.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7f5b7deae912cf8b77e990b9280f170381fdfbddf61b4ef80927edd813163732"}, + {file = "mypy-1.13.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7029881ec6ffb8bc233a4fa364736789582c738217b133f1b55967115288a2bc"}, + {file = "mypy-1.13.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3e38b980e5681f28f033f3be86b099a247b13c491f14bb8b1e1e134d23bb599d"}, + {file = "mypy-1.13.0-cp39-cp39-win_amd64.whl", hash = "sha256:a6789be98a2017c912ae6ccb77ea553bbaf13d27605d2ca20a76dfbced631b24"}, + {file = "mypy-1.13.0-py3-none-any.whl", hash = "sha256:9c250883f9fd81d212e0952c92dbfcc96fc237f4b7c92f56ac81fd48460b3e5a"}, + {file = "mypy-1.13.0.tar.gz", hash = "sha256:0291a61b6fbf3e6673e3405cfcc0e7650bebc7939659fdca2702958038bd835e"}, ] [package.dependencies] @@ -224,6 +224,7 @@ typing-extensions = ">=4.6.0" [package.extras] dmypy = ["psutil (>=4.0)"] +faster-cache = ["orjson"] install-types = ["pip"] mypyc = ["setuptools (>=50)"] reports = ["lxml"] @@ -241,13 +242,13 @@ files = [ [[package]] name = "packaging" -version = "24.1" +version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, - {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, + {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] [[package]] diff --git a/pyproject.toml b/pyproject.toml index 9b51fd6..afaad82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "lindera" -version = "0.35.0" +version = "0.38.0" description = "" authors = ["Minoru Osuka "] license = "MIT" @@ -11,7 +11,6 @@ python = "^3.12" maturin = "^1.7.1" patchelf = "^0.17.2.1" - [tool.poetry.group.dev.dependencies] pytest = "^8.3.3" black = "^24.10.0" @@ -24,3 +23,17 @@ mypy = "^1.12.1" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" + +[tool.black] +line-length = 119 + +[tool.flake8] +ignore = "E203,E501" +max-line-length = 119 + +[tool.isort] +profile = "black" +line_length = 119 + +[tool.mypy] +ignore_missing_imports = true diff --git a/resources/lindera.yml b/resources/lindera.yml new file mode 100644 index 0000000..df3f713 --- /dev/null +++ b/resources/lindera.yml @@ -0,0 +1,67 @@ +segmenter: + mode: "normal" + dictionary: + kind: "ipadic" + # user_dictionary: + # path: "./resources/ipadic_simple.csv" + # kind: "ipadic" + +character_filters: + - kind: "unicode_normalize" + args: + kind: "nfkc" + - kind: "japanese_iteration_mark" + args: + normalize_kanji: true + normalize_kana: true + - kind: mapping + args: + mapping: + リンデラ: Lindera + +token_filters: + - kind: "japanese_compound_word" + args: + kind: "ipadic" + tags: + - "名詞,数" + - "名詞,接尾,助数詞" + new_tag: "名詞,数" + - kind: "japanese_number" + args: + tags: + 
- "名詞,数" + - kind: "japanese_stop_tags" + args: + tags: + - "接続詞" + - "助詞" + - "助詞,格助詞" + - "助詞,格助詞,一般" + - "助詞,格助詞,引用" + - "助詞,格助詞,連語" + - "助詞,係助詞" + - "助詞,副助詞" + - "助詞,間投助詞" + - "助詞,並立助詞" + - "助詞,終助詞" + - "助詞,副助詞/並立助詞/終助詞" + - "助詞,連体化" + - "助詞,副詞化" + - "助詞,特殊" + - "助動詞" + - "記号" + - "記号,一般" + - "記号,読点" + - "記号,句点" + - "記号,空白" + - "記号,括弧閉" + - "その他,間投" + - "フィラー" + - "非言語音" + - kind: "japanese_katakana_stem" + args: + min: 3 + - kind: "remove_diacritical_mark" + args: + japanese: false diff --git a/resources/lindera_ipadic_conf.json b/resources/lindera_ipadic_conf.json deleted file mode 100644 index e845ab1..0000000 --- a/resources/lindera_ipadic_conf.json +++ /dev/null @@ -1,66 +0,0 @@ -{ - "character_filters": [ - { - "kind": "unicode_normalize", - "args": { - "kind": "nfkc" - } - } - ], - "tokenizer": { - "dictionary": { - "kind": "ipadic" - }, - "mode": "normal" - }, - "token_filters": [ - { - "kind": "japanese_compound_word", - "args": { - "kind": "ipadic", - "tags": [ - "名詞,数" - ], - "new_tag": "名詞,数" - } - }, - { - "kind": "japanese_stop_tags", - "args": { - "tags": [ - "接続詞", - "助詞", - "助詞,格助詞", - "助詞,格助詞,一般", - "助詞,格助詞,引用", - "助詞,格助詞,連語", - "助詞,係助詞", - "助詞,副助詞", - "助詞,間投助詞", - "助詞,並立助詞", - "助詞,終助詞", - "助詞,副助詞/並立助詞/終助詞", - "助詞,連体化", - "助詞,副詞化", - "助詞,特殊", - "助動詞", - "記号", - "記号,一般", - "記号,読点", - "記号,句点", - "記号,空白", - "記号,括弧閉", - "その他,間投", - "フィラー", - "非言語音" - ] - } - }, - { - "kind": "japanese_katakana_stem", - "args": { - "min": 3 - } - } - ] -} diff --git a/src/dictionary.rs b/src/dictionary.rs index 27561df..dc989f4 100644 --- a/src/dictionary.rs +++ b/src/dictionary.rs @@ -1,4 +1,4 @@ -use std::path::PathBuf; +use std::path::Path; use std::str::FromStr; use pyo3::{exceptions::PyValueError, prelude::*}; @@ -33,8 +33,8 @@ pub fn load_dictionary(kind: Option<&str>, path: Option<&str>) -> PyResult { - let p = PathBuf::from(path_str); - let dictionary = load_dictionary_from_path(p.as_path()).map_err(|err| { + let p = Path::new(path_str); + let dictionary = load_dictionary_from_path(p).map_err(|err| { PyValueError::new_err(format!("Failed to load dictionary: {}", err)) })?; @@ -47,7 +47,7 @@ pub fn load_dictionary(kind: Option<&str>, path: Option<&str>) -> PyResult) -> PyResult { - let p = PathBuf::from(path); + let p = Path::new(path); let ext = p .extension() .and_then(|ext| ext.to_str()) diff --git a/src/lib.rs b/src/lib.rs index d2cabeb..83ba0c7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,20 +1,74 @@ +// pub mod character_filter; pub mod dictionary; +pub mod segmenter; pub mod token; +// pub mod token_filter; pub mod tokenizer; pub mod util; use pyo3::prelude::*; +// use crate::character_filter::japanese_iteration_mark::PyJapaneseIterationMarkCharacterFilter; +// use crate::character_filter::mapping::PyMappingCharacterFilter; +// use crate::character_filter::regex::PyRegexCharacterFilter; +// use crate::character_filter::unicode_normalize::PyUnicodeNormalizeCharacterFilter; +// use crate::character_filter::PyCharacterFilter; use crate::dictionary::{load_dictionary, load_user_dictionary, PyDictionary, PyUserDictionary}; +use crate::segmenter::PySegmenter; use crate::token::PyToken; -use crate::tokenizer::PyTokenizer; +// use crate::token_filter::japanese_base_form::PyJapaneseBaseFormTokenFilter; +// use crate::token_filter::japanese_compound_word::PyJapaneseCompoundWordTokenFilter; +// use crate::token_filter::japanese_kana::PyJapaneseKanaTokenFilter; +// use crate::token_filter::japanese_katakana_stem::PyJapaneseKatakanaStemTokenFilter; +// use 
crate::token_filter::japanese_keep_tags::PyJapaneseKeepTagsTokenFilter; +// use crate::token_filter::japanese_number::PyJapaneseNumberTokenFilter; +// use crate::token_filter::japanese_reading_form::PyJapaneseReadingFormTokenFilter; +// use crate::token_filter::japanese_stop_tags::PyJapaneseStopTagsTokenFilter; +// use crate::token_filter::keep_words::PyKeepWordsTokenFilter; +// use crate::token_filter::korean_keep_tags::PyKoreanKeepTagsTokenFilter; +// use crate::token_filter::korean_reading_form::PyKoreanReadingFormTokenFilter; +// use crate::token_filter::korean_stop_tags::PyKoreanStopTagsTokenFilter; +// use crate::token_filter::length::PyLengthTokenFilter; +// use crate::token_filter::lowercase::PyLowercaseTokenFilter; +// use crate::token_filter::mapping::PyMappingTokenFilter; +// use crate::token_filter::remove_diacritical_mark::PyRemoveDiacriticalMarkTokenFilter; +// use crate::token_filter::stop_words::PyStopWordsTokenFilter; +// use crate::token_filter::uppercase::PyUppercaseTokenFilter; +// use crate::token_filter::PyTokenFilter; +use crate::tokenizer::{PyTokenizer, PyTokenizerBuilder}; #[pymodule] fn lindera(module: &Bound<'_, PyModule>) -> PyResult<()> { module.add_class::()?; module.add_class::()?; module.add_class::()?; + module.add_class::()?; module.add_class::()?; + module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; + // module.add_class::()?; module.add_function(wrap_pyfunction!(load_dictionary, module)?)?; module.add_function(wrap_pyfunction!(load_user_dictionary, module)?)?; diff --git a/src/segmenter.rs b/src/segmenter.rs new file mode 100644 index 0000000..c4b4785 --- /dev/null +++ b/src/segmenter.rs @@ -0,0 +1,70 @@ +use std::borrow::Cow; +use std::str::FromStr; + +use pyo3::exceptions::PyValueError; +use pyo3::prelude::*; +use pyo3::types::PyDict; + +use lindera::mode::Mode; +use lindera::segmenter::Segmenter; + +use crate::dictionary::{PyDictionary, PyUserDictionary}; +use crate::token::PyToken; +use crate::util::pydict_to_value; + +#[pyclass(name = "Segmenter")] +#[derive(Clone)] +pub struct PySegmenter { + pub inner: Segmenter, +} + +#[pymethods] +impl PySegmenter { + #[new] + #[pyo3(signature = (mode, dictionary, user_dictionary=None))] + fn new( + mode: &str, + dictionary: PyDictionary, + user_dictionary: Option, + ) -> PyResult { + let m = Mode::from_str(mode) + .map_err(|err| PyValueError::new_err(format!("Failed to create mode: {}", err)))?; + let d = dictionary.inner; + let u = user_dictionary.map(|d| d.inner); + + let segmenter = Segmenter::new(m, d, u); + + Ok(Self { inner: segmenter }) + } + + #[pyo3(signature = (config))] + #[allow(clippy::wrong_self_convention)] + fn from_config(&self, config: &Bound<'_, PyDict>) -> PyResult { + let config_value = pydict_to_value(config)?; + let segmenter = Segmenter::from_config(&config_value) + .map_err(|err| PyValueError::new_err(format!("Failed to create tokenizer: {}", err)))?; + + Ok(Self { inner: segmenter 
}) + } + + #[pyo3(signature = (text))] + fn segment(&self, text: &str) -> PyResult> { + let mut tokens = self + .inner + .segment(Cow::Borrowed(text)) + .map_err(|err| PyValueError::new_err(format!("Failed to tokenize text: {}", err)))?; + + Ok(tokens + .iter_mut() + .map(|t| PyToken { + #[allow(clippy::suspicious_to_owned)] + text: t.text.to_owned().to_string(), + byte_start: t.byte_start, + byte_end: t.byte_end, + position: t.position, + position_length: t.position_length, + details: t.details().iter().map(|d| d.to_string()).collect(), + }) + .collect()) + } +} diff --git a/src/tokenizer.rs b/src/tokenizer.rs index b01c930..3d110e3 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,19 +1,147 @@ +use std::path::Path; use std::str::FromStr; -use lindera::token_filter::TokenFilterLoader; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::PyDict; use serde_json::json; use lindera::character_filter::CharacterFilterLoader; +use lindera::dictionary::DictionaryKind; use lindera::mode::Mode; -use lindera::tokenizer::Tokenizer; +use lindera::token_filter::TokenFilterLoader; +use lindera::tokenizer::{Tokenizer, TokenizerBuilder}; -use crate::dictionary::{PyDictionary, PyUserDictionary}; +use crate::segmenter::PySegmenter; use crate::token::PyToken; use crate::util::pydict_to_value; +#[pyclass(name = "TokenizerBuilder")] +pub struct PyTokenizerBuilder { + pub inner: TokenizerBuilder, +} + +#[pymethods] +impl PyTokenizerBuilder { + #[new] + #[pyo3(signature = ())] + fn new() -> PyResult { + let inner = TokenizerBuilder::new().map_err(|err| { + PyValueError::new_err(format!("Failed to create TokenizerBuilder: {}", err)) + })?; + + Ok(Self { inner }) + } + + #[pyo3(signature = (file_path))] + #[allow(clippy::wrong_self_convention)] + fn from_file(&self, file_path: &str) -> PyResult { + let inner = TokenizerBuilder::from_file(Path::new(file_path)).map_err(|err| { + PyValueError::new_err(format!("Failed to load config from file: {}", err)) + })?; + + Ok(Self { inner }) + } + + #[pyo3(signature = (mode))] + fn set_mode<'a>(mut slf: PyRefMut<'a, Self>, mode: &str) -> PyResult> { + let m = Mode::from_str(mode) + .map_err(|err| PyValueError::new_err(format!("Failed to create mode: {}", err)))?; + + slf.inner.set_segmenter_mode(&m); + + Ok(slf) + } + + #[pyo3(signature = (kind))] + fn set_dictionary_kind<'a>( + mut slf: PyRefMut<'a, Self>, + kind: &str, + ) -> PyResult> { + let k = DictionaryKind::from_str(kind) + .map_err(|err| PyValueError::new_err(format!("Failed to create kind: {}", err)))?; + + slf.inner.set_segmenter_dictionary_kind(&k); + + Ok(slf) + } + + #[pyo3(signature = (path))] + fn set_dictionary_path<'a>( + mut slf: PyRefMut<'a, Self>, + path: &str, + ) -> PyResult> { + slf.inner.set_segmenter_dictionary_path(Path::new(path)); + + Ok(slf) + } + + #[pyo3(signature = (path))] + fn set_user_dictionary_path<'a>( + mut slf: PyRefMut<'a, Self>, + path: &str, + ) -> PyResult> { + slf.inner + .set_segmenter_user_dictionary_path(Path::new(path)); + + Ok(slf) + } + + #[pyo3(signature = (kind))] + fn set_user_dictionary_kind<'a>( + mut slf: PyRefMut<'a, Self>, + kind: &str, + ) -> PyResult> { + let k = DictionaryKind::from_str(kind) + .map_err(|err| PyValueError::new_err(format!("Failed to create kind: {}", err)))?; + + slf.inner.set_segmenter_user_dictionary_kind(&k); + + Ok(slf) + } + + #[pyo3(signature = (name, **args))] + fn append_character_filter<'a>( + mut slf: PyRefMut<'a, Self>, + name: &str, + args: Option<&Bound<'_, PyDict>>, + ) -> PyResult> { + let 
character_filter_args = match args { + Some(a) => pydict_to_value(a)?, + None => json!({}), + }; + + slf.inner + .append_character_filter(name, &character_filter_args); + + Ok(slf) + } + + #[pyo3(signature = (name, **args))] + fn append_token_filter<'a>( + mut slf: PyRefMut<'a, Self>, + name: &str, + args: Option<&Bound<'_, PyDict>>, + ) -> PyResult> { + let token_filter_args = match args { + Some(a) => pydict_to_value(a)?, + None => json!({}), + }; + + slf.inner.append_token_filter(name, &token_filter_args); + + Ok(slf) + } + + #[pyo3(signature = ())] + fn build(&self) -> PyResult { + self.inner + .build() + .map_err(|err| PyValueError::new_err(format!("Failed to build tokenizer: {}", err))) + .map(|t| PyTokenizer { inner: t }) + } +} + #[pyclass(name = "Tokenizer")] pub struct PyTokenizer { inner: Tokenizer, @@ -22,34 +150,37 @@ pub struct PyTokenizer { #[pymethods] impl PyTokenizer { #[new] - #[pyo3(signature = (mode, dictionary, user_dictionary=None))] - fn new( - mode: &str, - dictionary: PyDictionary, - user_dictionary: Option, - ) -> PyResult { - let m = Mode::from_str(mode) - .map_err(|err| PyValueError::new_err(format!("Failed to create mode: {}", err)))?; - let u = user_dictionary.map(|d| d.inner); + #[pyo3(signature = (segmenter))] + fn new(segmenter: PySegmenter) -> PyResult { Ok(Self { - inner: Tokenizer::new(m, dictionary.inner, u), + inner: Tokenizer::new(segmenter.inner), }) } + #[pyo3(signature = (config))] + #[allow(clippy::wrong_self_convention)] + fn from_config(&self, config: &Bound<'_, PyDict>) -> PyResult { + let config_value = pydict_to_value(config)?; + let tokenizer = Tokenizer::from_config(&config_value) + .map_err(|err| PyValueError::new_err(format!("Failed to create tokenizer: {}", err)))?; + + Ok(Self { inner: tokenizer }) + } + #[pyo3(signature = (name, **args))] fn append_character_filter( &mut self, name: &str, args: Option<&Bound<'_, PyDict>>, ) -> PyResult<()> { - let character_filter_args = match args { - Some(a) => pydict_to_value(a)?, + let value = match args { + Some(pydict) => pydict_to_value(pydict)?, None => json!({}), }; - let filter = CharacterFilterLoader::load_from_value(name, &character_filter_args).map_err( - |err| PyValueError::new_err(format!("Failed to load character filter: {}", err)), - )?; + let filter = CharacterFilterLoader::load_from_value(name, &value).map_err(|err| { + PyValueError::new_err(format!("Failed to load character filter: {}", err)) + })?; self.inner.append_character_filter(filter); Ok(()) @@ -61,15 +192,14 @@ impl PyTokenizer { name: &str, args: Option<&Bound<'_, PyDict>>, ) -> PyResult<()> { - let token_filter_args = match args { - Some(a) => pydict_to_value(a)?, + let value = match args { + Some(pydict) => pydict_to_value(pydict)?, None => json!({}), }; - let filter = - TokenFilterLoader::load_from_value(name, &token_filter_args).map_err(|err| { - PyValueError::new_err(format!("Failed to load token filter: {}", err)) - })?; + let filter = TokenFilterLoader::load_from_value(name, &value).map_err(|err| { + PyValueError::new_err(format!("Failed to load token filter: {}", err)) + })?; self.inner.append_token_filter(filter); Ok(()) diff --git a/src/util.rs b/src/util.rs index 0c1373b..6f98b62 100644 --- a/src/util.rs +++ b/src/util.rs @@ -44,6 +44,37 @@ pub fn pydict_to_value(pydict: &Bound<'_, PyDict>) -> PyResult { Ok(json!(map)) } +pub fn value_to_pydict(py: Python, value: &Value) -> PyResult { + match value { + Value::Null => Ok(py.None()), + Value::Bool(b) => Ok(PyBool::new_bound(py, *b).into_py(py)), + 
Value::Number(num) => { + if let Some(i) = num.as_i64() { + Ok(i.into_py(py)) + } else if let Some(f) = num.as_f64() { + Ok(f.into_py(py)) + } else { + Err(PyTypeError::new_err("Unsupported number type")) + } + } + Value::String(s) => Ok(PyString::new_bound(py, s).into_py(py)), + Value::Array(arr) => { + let py_list = PyList::empty_bound(py); + for item in arr { + py_list.append(value_to_pydict(py, item)?)?; + } + Ok(py_list.into()) + } + Value::Object(obj) => { + let py_dict = PyDict::new_bound(py); + for (key, val) in obj { + py_dict.set_item(key, value_to_pydict(py, val)?)?; + } + Ok(py_dict.into()) + } + } +} + #[cfg(test)] mod tests { // use pyo3::types::IntoPyDict;
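
The hunks above register a new `TokenizerBuilder` class alongside `Segmenter` and `Tokenizer`, but none of the updated examples exercise it, so here is a minimal sketch of how it might be driven from Python. The method names and filter arguments are taken directly from the `#[pymethods]` in `src/tokenizer.rs` and from `examples/tokenize_with_filters.py`; the final `tokenize` call is an assumption based on the pre-existing examples and is not itself part of this diff.

```python
from lindera import TokenizerBuilder

# Configure a tokenizer programmatically instead of constructing
# Segmenter/Tokenizer by hand (values mirror examples/tokenize_with_filters.py).
builder = TokenizerBuilder()
builder.set_mode("normal")
builder.set_dictionary_kind("ipadic")
builder.append_character_filter("unicode_normalize", kind="nfkc")
builder.append_token_filter("japanese_katakana_stem", min=3)
tokenizer = builder.build()

# Alternatively, the builder can load the YAML config added under resources/:
# builder = TokenizerBuilder().from_file("./resources/lindera.yml")

# Assumed API: tokenize() is what the existing example scripts call on Tokenizer.
tokens = tokenizer.tokenize("関西国際空港限定トートバッグを東京スカイツリーの最寄り駅で買う")
```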