diff --git a/CHANGELOG.md b/CHANGELOG.md index a73a226..5948d01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,13 @@ ## Changelog 🔄 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [3.2.4] - 2025-10-26 +### Fixed +- Fixed splitters being sorted lexicographically rather than by length, which should improve the meaningfulness of chunks. + +### Fixed +- Fixed broken Python download count shield ([crflynn/pypistats.org#82](https://github.com/crflynn/pypistats.org/issues/82#issue-3285911460)). + ## [3.2.3] - 2025-08-13 ### Fixed - Fixed broken Python download count shield ([crflynn/pypistats.org#82](https://github.com/crflynn/pypistats.org/issues/82#issue-3285911460)). diff --git a/pyproject.toml b/pyproject.toml index bf2cccf..09ac0ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "semchunk" -version = "3.2.3" +version = "3.2.4" authors = [ {name="Isaacus", email="support@isaacus.com"}, {name="Umar Butler", email="umar@umar.au"}, @@ -122,6 +122,8 @@ target-version = "py312" dev = [ "build>=1.2.2.post1", "hatch>=1.14.1", + "ipykernel>=6.31.0", + "isort>=6.1.0", "nltk>=3.9.1", "pytest>=8.4.0", "pytest-cov>=6.1.1", diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py index 0f05e1d..73eb84b 100644 --- a/src/semchunk/semchunk.py +++ b/src/semchunk/semchunk.py @@ -5,9 +5,9 @@ import inspect from typing import Callable, Sequence, TYPE_CHECKING +from functools import lru_cache from itertools import accumulate from contextlib import suppress -from functools import lru_cache import mpire @@ -68,13 +68,13 @@ def _split_text(text: str) -> tuple[str, bool, list[str]]: # - The largest sequence of whitespace characters or, if the largest such sequence is only a single character and there exists a whitespace character preceded by a 
semantically meaningful non-whitespace splitter, then that whitespace character; # - A semantically meaningful non-whitespace splitter. if "\n" in text or "\r" in text: - splitter = max(re.findall(r"[\r\n]+", text)) + splitter = max(re.findall(r"[\r\n]+", text), key=len) elif "\t" in text: - splitter = max(re.findall(r"\t+", text)) + splitter = max(re.findall(r"\t+", text), key=len) elif re.search(r"\s", text): - splitter = max(re.findall(r"\s+", text)) + splitter = max(re.findall(r"\s+", text), key=len) # If the splitter is only a single character, see if we can target whitespace characters that are preceded by semantically meaningful non-whitespace splitters to avoid splitting in the middle of sentences. if len(splitter) == 1: @@ -216,7 +216,7 @@ def chunk( text=split, chunk_size=local_chunk_size, token_counter=token_counter, - offsets=return_offsets, + offsets=True, _recursion_depth=_recursion_depth + 1, _start=split_start, )