From 877884f474a9f971e6a6f78ed052ef09db11dcc6 Mon Sep 17 00:00:00 2001 From: Umar Butler <8473183+umarbutler@users.noreply.github.com> Date: Sun, 26 Oct 2025 14:44:29 +1100 Subject: [PATCH 1/7] fix: splitters not sorted by length --- src/semchunk/semchunk.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py index 0f05e1d..466fed5 100644 --- a/src/semchunk/semchunk.py +++ b/src/semchunk/semchunk.py @@ -68,13 +68,13 @@ def _split_text(text: str) -> tuple[str, bool, list[str]]: # - The largest sequence of whitespace characters or, if the largest such sequence is only a single character and there exists a whitespace character preceded by a semantically meaningful non-whitespace splitter, then that whitespace character; # - A semantically meaningful non-whitespace splitter. if "\n" in text or "\r" in text: - splitter = max(re.findall(r"[\r\n]+", text)) + splitter = max(re.findall(r"[\r\n]+", text), key=len) elif "\t" in text: - splitter = max(re.findall(r"\t+", text)) + splitter = max(re.findall(r"\t+", text), key=len) elif re.search(r"\s", text): - splitter = max(re.findall(r"\s+", text)) + splitter = max(re.findall(r"\s+", text), key=len) # If the splitter is only a single character, see if we can target whitespace characters that are preceded by semantically meaningful non-whitespace splitters to avoid splitting in the middle of sentences. 
if len(splitter) == 1: From 97cde00f61a010a5c0b93e3fc1fe5f8960697a11 Mon Sep 17 00:00:00 2001 From: Umar Butler <8473183+umarbutler@users.noreply.github.com> Date: Sun, 26 Oct 2025 14:45:59 +1100 Subject: [PATCH 2/7] chore: add ipykernel as dev dep --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index bf2cccf..0b27abc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -122,6 +122,7 @@ target-version = "py312" dev = [ "build>=1.2.2.post1", "hatch>=1.14.1", + "ipykernel>=6.31.0", "nltk>=3.9.1", "pytest>=8.4.0", "pytest-cov>=6.1.1", From abd30be1ee56ca3858930308ac7aea40700bb521 Mon Sep 17 00:00:00 2001 From: Umar Butler <8473183+umarbutler@users.noreply.github.com> Date: Sun, 26 Oct 2025 14:47:48 +1100 Subject: [PATCH 3/7] fix: potential future bug --- src/semchunk/semchunk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py index 466fed5..29ccb1f 100644 --- a/src/semchunk/semchunk.py +++ b/src/semchunk/semchunk.py @@ -216,7 +216,7 @@ def chunk( text=split, chunk_size=local_chunk_size, token_counter=token_counter, - offsets=return_offsets, + offsets=True, _recursion_depth=_recursion_depth + 1, _start=split_start, ) From d1747a8c6ba5f5dea9f77f8fb80b98c6cdc3c2a4 Mon Sep 17 00:00:00 2001 From: Umar Butler <8473183+umarbutler@users.noreply.github.com> Date: Sun, 26 Oct 2025 14:49:02 +1100 Subject: [PATCH 4/7] docs: documented changes --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a73a226..5948d01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,13 @@ ## Changelog 🔄 All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [3.2.4] - 2025-10-26 +### Fixed +- Fixed splitters being sorted lexicographically rather than by length, which should improve the meaningfulness of chunks. + +### Fixed +- Fixed broken Python download count shield ([crflynn/pypistats.org#82](https://github.com/crflynn/pypistats.org/issues/82#issue-3285911460)). + ## [3.2.3] - 2025-08-13 ### Fixed - Fixed broken Python download count shield ([crflynn/pypistats.org#82](https://github.com/crflynn/pypistats.org/issues/82#issue-3285911460)). From 84c59c5f2f9bb5498cdb676f78a9cdc1bfd8b202 Mon Sep 17 00:00:00 2001 From: Umar Butler <8473183+umarbutler@users.noreply.github.com> Date: Sun, 26 Oct 2025 14:49:42 +1100 Subject: [PATCH 5/7] chore: sort imports --- src/semchunk/semchunk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semchunk/semchunk.py b/src/semchunk/semchunk.py index 29ccb1f..73eb84b 100644 --- a/src/semchunk/semchunk.py +++ b/src/semchunk/semchunk.py @@ -5,9 +5,9 @@ import inspect from typing import Callable, Sequence, TYPE_CHECKING +from functools import lru_cache from itertools import accumulate from contextlib import suppress -from functools import lru_cache import mpire From de440406cdb7fcdaae645439b756f589a28516ea Mon Sep 17 00:00:00 2001 From: Umar Butler <8473183+umarbutler@users.noreply.github.com> Date: Sun, 26 Oct 2025 14:49:49 +1100 Subject: [PATCH 6/7] chore: add isort as dev dep --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 0b27abc..0b9f56b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -123,6 +123,7 @@ dev = [ "build>=1.2.2.post1", "hatch>=1.14.1", "ipykernel>=6.31.0", + "isort>=6.1.0", "nltk>=3.9.1", "pytest>=8.4.0", "pytest-cov>=6.1.1", From 292e4bae10be2a7c975e54d3f47d0b8da2653c26 Mon Sep 17 00:00:00 2001 From: Umar Butler <8473183+umarbutler@users.noreply.github.com> Date: Sun, 26 Oct 2025 14:50:24 +1100 Subject: [PATCH 7/7] chore: bump version --- pyproject.toml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0b9f56b..09ac0ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "semchunk" -version = "3.2.3" +version = "3.2.4" authors = [ {name="Isaacus", email="support@isaacus.com"}, {name="Umar Butler", email="umar@umar.au"},