From a4b662d88ff7c9af63d7dbea7effb9315b63d910 Mon Sep 17 00:00:00 2001 From: krassowski <5832902+krassowski@users.noreply.github.com> Date: Sat, 29 Nov 2025 09:19:28 +0000 Subject: [PATCH 1/5] Add test with expectation of granular reload and append --- tests/test_ynotebook.py | 21 +----- tests/test_yunicode.py | 163 ++++++++++++++++++++++++++++++++++++++++ tests/utils.py | 26 +++++++ 3 files changed, 190 insertions(+), 20 deletions(-) diff --git a/tests/test_ynotebook.py b/tests/test_ynotebook.py index c196880..545a974 100644 --- a/tests/test_ynotebook.py +++ b/tests/test_ynotebook.py @@ -1,10 +1,9 @@ # Copyright (c) Jupyter Development Team. # Distributed under the terms of the Modified BSD License. -from dataclasses import dataclass - from pycrdt import ArrayEvent, Map, MapEvent, TextEvent from pytest import mark +from utils import ExpectedEvent from jupyter_ydoc import YNotebook @@ -119,24 +118,6 @@ def record_changes(topic, event): ] -@dataclass -class ExpectedEvent: - kind: type - path: str | None = None - - def __eq__(self, other): - if not isinstance(other, self.kind): - return False - if self.path is not None and self.path != other.path: - return False - return True - - def __repr__(self): - if self.path is not None: - return f"ExpectedEvent({self.kind.__name__}, path={self.path!r})" - return f"ExpectedEvent({self.kind.__name__})" - - @mark.parametrize( "modifications, expected_events", [ diff --git a/tests/test_yunicode.py b/tests/test_yunicode.py index ef19131..4329ad6 100644 --- a/tests/test_yunicode.py +++ b/tests/test_yunicode.py @@ -1,6 +1,9 @@ # Copyright (c) Jupyter Development Team. # Distributed under the terms of the Modified BSD License. +from pycrdt import TextEvent +from utils import ExpectedEvent + from jupyter_ydoc import YUnicode @@ -25,3 +28,163 @@ def record_changes(topic, event): # No changes should be observed at all assert changes == [] + + +def test_set_granular_changes(): + text = YUnicode() + + text.set( + "\n".join( + [ + "Mary had a little lamb,", + "Its fleece was white as snow.", + "And everywhere that Mary went,", + "The lamb was sure to go.", + ] + ) + ) + + changes = [] + + def record_changes(topic, event): + changes.append((topic, event)) # pragma: nocover + + text.observe(record_changes) + + # Call set with the bunny version + text.set( + "\n".join( + [ + "Mary had a little bunny,", + "Its fur was white as snow.", + "And everywhere that Mary went,", + "The bunny was sure to hop.", + ] + ) + ) + + assert len(changes) == 1 + source_events = [e for t, e in changes if t == "source"] + assert source_events == [ + ExpectedEvent( + TextEvent, + delta=[ + # "Mary had a little b" + {"retain": 18}, + {"delete": 3}, + {"retain": 1}, + # "Mary had a little b" + {"insert": "unny"}, + # ",↵ Its f" + {"retain": 7}, + {"delete": 5}, + # ",↵ Its f" + {"insert": "ur"}, + # " was white as snow.↵" + # "And everywhere that Mary went,↵" + # "The b" + {"retain": 55}, + {"delete": 3}, + {"retain": 1}, + # "The b was sure to" + {"insert": "unny"}, + {"retain": 13}, + # "o" + {"delete": 1}, + {"insert": "h"}, + {"retain": 1}, + {"insert": "p"}, + ], + ) + ] + + +def test_set_granular_append(): + text = YUnicode() + + text.set( + "\n".join( + [ + "Mary had a little lamb,", + "Its fleece was white as snow.", + ] + ) + ) + + changes = [] + + def record_changes(topic, event): + changes.append((topic, event)) # pragma: nocover + + text.observe(record_changes) + + # append a line + text.set( + "\n".join( + [ + "Mary had a little lamb,", + "Its fleece was white as snow.", + "And everywhere that Mary went,", + ] + ) + ) + + # append one more line + text.set( + "\n".join( + [ + "Mary had a little lamb,", + "Its fleece was white as snow.", + "And everywhere that Mary went,", + "The lamb was sure to go.", + ] + ) + ) + + assert len(changes) == 2 + source_events = [e for t, e in changes if t == "source"] + assert source_events == [ + ExpectedEvent( + TextEvent, delta=[{"retain": 53}, {"insert": "\nAnd everywhere that Mary went,"}] + ), + ExpectedEvent(TextEvent, delta=[{"retain": 84}, {"insert": "\nThe lamb was sure to go."}]), + ] + + +def test_set_hard_reload_if_very_different(): + text = YUnicode() + + text.set( + "\n".join( + [ + "Mary had a little lamb,", + "Its fleece was white as snow.", + "And everywhere that Mary went,", + "The lamb was sure to go.", + ] + ) + ) + + changes = [] + + def record_changes(topic, event): + changes.append((topic, event)) # pragma: nocover + + text.observe(record_changes) + + # Call set with a very different nursery rhyme + twinkle_lyrics = "\n".join( + [ + "Twinkle, twinkle, little star,", + "How I wonder what you are!", + "Up above the world so high,", + "Like a diamond in the sky.", + ] + ) + text.set(twinkle_lyrics) + + assert len(changes) == 1 + source_events = [e for t, e in changes if t == "source"] + assert source_events == [ + ExpectedEvent(TextEvent, delta=[{"delete": 109}, {"insert": twinkle_lyrics}]) + ] diff --git a/tests/utils.py b/tests/utils.py index 5797997..679833e 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,6 +1,8 @@ # Copyright (c) Jupyter Development Team. # Distributed under the terms of the Modified BSD License. +from dataclasses import dataclass + from anyio import Lock, connect_tcp @@ -41,3 +43,27 @@ async def ensure_server_running(host: str, port: int) -> None: pass else: break + + +@dataclass +class ExpectedEvent: + kind: type + path: str | None = None + delta: list[dict] | None = None + + def __eq__(self, other): + if not isinstance(other, self.kind): + return False + if self.path is not None and self.path != other.path: + return False + if self.delta is not None and self.delta != other.delta: + return False + return True + + def __repr__(self): + fragments = [self.kind.__name__] + if self.path is not None: + fragments.append(f"path={self.path!r}") + if self.delta is not None: + fragments.append(f"delta={self.delta!r}") + return f"ExpectedEvent({', '.join(fragments)})" From ba2102606df7e5870eb36f8e49b94885cb79339a Mon Sep 17 00:00:00 2001 From: krassowski <5832902+krassowski@users.noreply.github.com> Date: Sat, 29 Nov 2025 09:29:55 +0000 Subject: [PATCH 2/5] Use stdlib sequence matcher to perform granular text updates --- jupyter_ydoc/yunicode.py | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/jupyter_ydoc/yunicode.py b/jupyter_ydoc/yunicode.py index 6a9dc6e..941c53e 100644 --- a/jupyter_ydoc/yunicode.py +++ b/jupyter_ydoc/yunicode.py @@ -2,6 +2,7 @@ # Distributed under the terms of the Modified BSD License. from collections.abc import Callable +from difflib import SequenceMatcher from functools import partial from typing import Any @@ -64,17 +65,40 @@ def set(self, value: str) -> None: :param value: The content of the document. :type value: str """ - if self.get() == value: + old_value = self.get() + if old_value == value: # no-op if the values are already the same, # to avoid side-effects such as cursor jumping to the top return with self._ydoc.transaction(): - # clear document - self._ysource.clear() - # initialize document - if value: - self._ysource += value + matcher = SequenceMatcher(a=old_value, b=value) + + # for very different strings, just replace the whole content; + # this avoids generating a huge number of operations + if matcher.ratio() < 0.6: + # clear document + self._ysource.clear() + # initialize document + if value: + self._ysource += value + else: + operations = matcher.get_opcodes() + offset = 0 + for tag, i1, i2, j1, j2 in operations: + if tag == "replace": + self._ysource[i1 + offset : i2 + offset] = value[j1:j2] + offset += (j2 - j1) - (i2 - i1) + elif tag == "delete": + del self._ysource[i1 + offset : i2 + offset] + offset -= i2 - i1 + elif tag == "insert": + self._ysource[i1 + offset : i2 + offset] = value[j1:j2] + offset += j2 - j1 + elif tag == "equal": + pass + else: + raise ValueError(f"Unknown tag '{tag}' in sequence matcher") def observe(self, callback: Callable[[str, Any], None]) -> None: """ From 7e0627e5437da9f4c011c6c092a89c318edabb7a Mon Sep 17 00:00:00 2001 From: krassowski <5832902+krassowski@users.noreply.github.com> Date: Sun, 30 Nov 2025 10:49:49 +0000 Subject: [PATCH 3/5] Use `real_quick_ratio` to fast-reject very dissimilar updates --- jupyter_ydoc/yunicode.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/jupyter_ydoc/yunicode.py b/jupyter_ydoc/yunicode.py index 941c53e..911c85c 100644 --- a/jupyter_ydoc/yunicode.py +++ b/jupyter_ydoc/yunicode.py @@ -10,6 +10,9 @@ from .ybasedoc import YBaseDoc +# Heuristic threshold as recommended in difflib documentation +SIMILARITY_THREESHOLD = 0.6 + class YUnicode(YBaseDoc): """ @@ -74,15 +77,10 @@ def set(self, value: str) -> None: with self._ydoc.transaction(): matcher = SequenceMatcher(a=old_value, b=value) - # for very different strings, just replace the whole content; - # this avoids generating a huge number of operations - if matcher.ratio() < 0.6: - # clear document - self._ysource.clear() - # initialize document - if value: - self._ysource += value - else: + if ( + matcher.real_quick_ratio() >= SIMILARITY_THREESHOLD + and matcher.ratio() >= SIMILARITY_THREESHOLD + ): operations = matcher.get_opcodes() offset = 0 for tag, i1, i2, j1, j2 in operations: @@ -99,6 +97,15 @@ def set(self, value: str) -> None: pass else: raise ValueError(f"Unknown tag '{tag}' in sequence matcher") + else: + # for very different strings, just replace the whole content; + # this avoids generating a huge number of operations + + # clear document + self._ysource.clear() + # initialize document + if value: + self._ysource += value def observe(self, callback: Callable[[str, Any], None]) -> None: """ From ca32120cdf4d8d529d1b8e40240e690b3a9ece90 Mon Sep 17 00:00:00 2001 From: krassowski <5832902+krassowski@users.noreply.github.com> Date: Mon, 1 Dec 2025 09:33:39 +0000 Subject: [PATCH 4/5] Use `insert()` which skips some of the checks as `__setitem__` also checks if index is a number or slice and then checks the range of the slice; we can skip those knowing that `i1 == i2` in the `insert` opcode. --- jupyter_ydoc/yunicode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jupyter_ydoc/yunicode.py b/jupyter_ydoc/yunicode.py index 911c85c..7690db9 100644 --- a/jupyter_ydoc/yunicode.py +++ b/jupyter_ydoc/yunicode.py @@ -39,7 +39,7 @@ def __init__(self, ydoc: Doc | None = None, awareness: Awareness | None = None): :type awareness: :class:`pycrdt.Awareness`, optional. """ super().__init__(ydoc, awareness) - self._ysource = self._ydoc.get("source", type=Text) + self._ysource: Text = self._ydoc.get("source", type=Text) self.undo_manager.expand_scope(self._ysource) @property @@ -91,7 +91,7 @@ def set(self, value: str) -> None: del self._ysource[i1 + offset : i2 + offset] offset -= i2 - i1 elif tag == "insert": - self._ysource[i1 + offset : i2 + offset] = value[j1:j2] + self._ysource.insert(i1 + offset, value[j1:j2]) offset += j2 - j1 elif tag == "equal": pass From b61975ef55b4e8011016e7fbdd83f601e5ae84df Mon Sep 17 00:00:00 2001 From: krassowski <5832902+krassowski@users.noreply.github.com> Date: Mon, 1 Dec 2025 10:01:56 +0000 Subject: [PATCH 5/5] Use match-case instead of elif --- jupyter_ydoc/yunicode.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/jupyter_ydoc/yunicode.py b/jupyter_ydoc/yunicode.py index 7690db9..d1adbdb 100644 --- a/jupyter_ydoc/yunicode.py +++ b/jupyter_ydoc/yunicode.py @@ -84,19 +84,20 @@ def set(self, value: str) -> None: operations = matcher.get_opcodes() offset = 0 for tag, i1, i2, j1, j2 in operations: - if tag == "replace": - self._ysource[i1 + offset : i2 + offset] = value[j1:j2] - offset += (j2 - j1) - (i2 - i1) - elif tag == "delete": - del self._ysource[i1 + offset : i2 + offset] - offset -= i2 - i1 - elif tag == "insert": - self._ysource.insert(i1 + offset, value[j1:j2]) - offset += j2 - j1 - elif tag == "equal": - pass - else: - raise ValueError(f"Unknown tag '{tag}' in sequence matcher") + match tag: + case "replace": + self._ysource[i1 + offset : i2 + offset] = value[j1:j2] + offset += (j2 - j1) - (i2 - i1) + case "delete": + del self._ysource[i1 + offset : i2 + offset] + offset -= i2 - i1 + case "insert": + self._ysource.insert(i1 + offset, value[j1:j2]) + offset += j2 - j1 + case "equal": + pass + case _: + raise ValueError(f"Unknown tag '{tag}' in sequence matcher") else: # for very different strings, just replace the whole content; # this avoids generating a huge number of operations