Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
## Changelog 🔄
All notable changes to `semchunk` will be documented here. This project adheres to [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [3.2.4] - 2025-10-26
### Fixed
- Fixed splitters being sorted lexicographically rather than by length, which should improve the meaningfulness of chunks.

### Fixed
- Fixed broken Python download count shield ([crflynn/pypistats.org#82](https://github.com/crflynn/pypistats.org/issues/82#issue-3285911460)).

## [3.2.3] - 2025-08-13
### Fixed
- Fixed broken Python download count shield ([crflynn/pypistats.org#82](https://github.com/crflynn/pypistats.org/issues/82#issue-3285911460)).
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "semchunk"
version = "3.2.3"
version = "3.2.4"
authors = [
{name="Isaacus", email="support@isaacus.com"},
{name="Umar Butler", email="umar@umar.au"},
Expand Down Expand Up @@ -122,6 +122,8 @@ target-version = "py312"
dev = [
"build>=1.2.2.post1",
"hatch>=1.14.1",
"ipykernel>=6.31.0",
"isort>=6.1.0",
"nltk>=3.9.1",
"pytest>=8.4.0",
"pytest-cov>=6.1.1",
Expand Down
10 changes: 5 additions & 5 deletions src/semchunk/semchunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
import inspect

from typing import Callable, Sequence, TYPE_CHECKING
from functools import lru_cache
from itertools import accumulate
from contextlib import suppress
from functools import lru_cache

import mpire

Expand Down Expand Up @@ -68,13 +68,13 @@ def _split_text(text: str) -> tuple[str, bool, list[str]]:
# - The largest sequence of whitespace characters or, if the largest such sequence is only a single character and there exists a whitespace character preceded by a semantically meaningful non-whitespace splitter, then that whitespace character;
# - A semantically meaningful non-whitespace splitter.
if "\n" in text or "\r" in text:
splitter = max(re.findall(r"[\r\n]+", text))
splitter = max(re.findall(r"[\r\n]+", text), key=len)

elif "\t" in text:
splitter = max(re.findall(r"\t+", text))
splitter = max(re.findall(r"\t+", text), key=len)

elif re.search(r"\s", text):
splitter = max(re.findall(r"\s+", text))
splitter = max(re.findall(r"\s+", text), key=len)

# If the splitter is only a single character, see if we can target whitespace characters that are preceded by semantically meaningful non-whitespace splitters to avoid splitting in the middle of sentences.
if len(splitter) == 1:
Expand Down Expand Up @@ -216,7 +216,7 @@ def chunk(
text=split,
chunk_size=local_chunk_size,
token_counter=token_counter,
offsets=return_offsets,
offsets=True,
_recursion_depth=_recursion_depth + 1,
_start=split_start,
)
Expand Down