From ca729de7c130718dd1b59ac1a103aa45161d8788 Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Wed, 8 Oct 2025 04:42:49 -0700 Subject: [PATCH 1/3] Adding function to detect cycles within a graph and hooking it up to the _closure function --- linkml_runtime/utils/schemaview.py | 84 ++++++++++++- tests/test_utils/input/cycles.yaml | 189 ++++++++++++++++++++++++++++ tests/test_utils/test_schemaview.py | 139 ++++++++++++++++++++ 3 files changed, 407 insertions(+), 5 deletions(-) create mode 100644 tests/test_utils/input/cycles.yaml diff --git a/linkml_runtime/utils/schemaview.py b/linkml_runtime/utils/schemaview.py index d56cd19c..be67592e 100644 --- a/linkml_runtime/utils/schemaview.py +++ b/linkml_runtime/utils/schemaview.py @@ -8,7 +8,6 @@ import uuid import warnings from collections import defaultdict, deque -from collections.abc import Callable, Mapping from copy import copy, deepcopy from dataclasses import dataclass from enum import Enum @@ -45,7 +44,7 @@ from linkml_runtime.utils.pattern import PatternResolver if TYPE_CHECKING: - from collections.abc import Mapping + from collections.abc import Callable, Iterable, Mapping from types import NotImplementedType from linkml_runtime.utils.metamodelcore import URI, URIorCURIE @@ -92,13 +91,88 @@ class OrderedBy(Enum): """ +WHITE = 0 +GREY = 1 +BLACK = 2 + + +def detect_cycles(f: Callable[[Any], Iterable[Any] | None], x: Any) -> None: + """Detect cycles in a graph, using function `f` to walk the graph. + + Uses the classic white/grey/black colour coding algorithm to track which nodes have been explored. + + WHITE: unexplored + GREY: node is being processed; processing includes exploring all neighbours reachable via f(node) + BLACK: node and all of its neighbours (and their neighbours, etc.) have been processed + + A directed cycle reachable from node `x` raises a CycleError. + + :param f: function that returns an iterable of neighbouring nodes (parents or children) + :type f: Callable[[Any], Iterable[Any] | None] + :param x: graph node + :type x: Any + :raises ValueError: if a cycle is discovered through repeated calls to f(x) + """ + # keep track of the processing state of nodes in the graph + processing_state: dict[Any, int] = {} + + # Stack entries are (node, processed_flag). + # processed_flag == True means all neighbours (nodes generated by running + # `f(node)`) have been added to the todo stack and the node can be marked BLACK. + todo: list[tuple[Any, bool]] = [(x, False)] + + while todo: + node, processed_flag = todo.pop() + + if processed_flag: + # all neighbours have been processed + processing_state[node] = BLACK + continue + + # check the state of this node + node_state = processing_state.get(node, WHITE) + + if node_state == GREY: + # this node was already being processed + # we have discovered an edge back to that node - i.e. a cycle + err_msg = f"Cycle detected at node {node!r}" + raise ValueError(err_msg) + + if node_state == BLACK: + # already fully explored - nothing to do + continue + + # mark the node as being processed (GREY) and set the processed_flag to True + processing_state[node] = GREY + todo.append((node, True)) + + # push the neighbours on to the processing stack + todo.extend((child, False) for child in f(node) or []) + + def _closure( - f: Callable, - x, + f: Callable[[Any], Iterable[Any] | None], + x: Any, reflexive: bool = True, depth_first: bool = True, - **kwargs: dict[str, Any] | None, # noqa: ARG001 + **kwargs: dict[str, Any] | None, ) -> list[str | ElementName | ClassDefinitionName | EnumDefinitionName | SlotDefinitionName | TypeDefinitionName]: + """Walk the graph using function `f` and generate the closure. + + :param f: function that returns an iterable of neighbouring nodes (parents or children) + :type f: Callable[[Any], Iterable[Any] | None] + :param x: start node + :type x: Any + :param reflexive: assume the graph is reflexive, defaults to True + :type reflexive: bool, optional + :param depth_first: depth first traversal, defaults to True + :type depth_first: bool, optional + :return: list of nodes + :rtype: list[str | ElementName | ClassDefinitionName | EnumDefinitionName | SlotDefinitionName | TypeDefinitionName] + """ + if kwargs and kwargs.get("detect_cycles"): + detect_cycles(f, x) + rv = [x] if reflexive else [] visited = [] todo = [x] diff --git a/tests/test_utils/input/cycles.yaml b/tests/test_utils/input/cycles.yaml new file mode 100644 index 00000000..50e22833 --- /dev/null +++ b/tests/test_utils/input/cycles.yaml @@ -0,0 +1,189 @@ +# yaml-language-server: $schema=https://linkml.io/linkml-model/linkml_model/jsonschema/meta.schema.json +id: https://example.org/test-cycle-schema +name: TestCycleSchema +description: | + Schema with intentional cycles in classes, mixins, slot ranges, and types. +prefixes: + ex: https://example.org/schema/ + +default_prefix: ex +default_range: string + +slots: + identifier_slot: + range: string + identifier: true + noncycle_slot: + slot_a: + slot_b: + slot_c: + mixed_in_slot_a: + mixed_in_slot_b: + +classes: + # Non‑cyclic base class with some children + BaseClass: + description: Simple base class with no cycles. + slots: + - noncycle_slot + - identifier_slot + + MixinA: + mixin: true + slots: + - mixed_in_slot_a + + MixinB: + mixin: true + slots: + - mixed_in_slot_b + + NonCycleClassA: + is_a: BaseClass + + NonCycleClassB: + mixins: + - MixinA + is_a: NonCycleClassA + + NonCycleClassC: + mixins: + - MixinB + is_a: NonCycleClassA + + # cycle in the range of a slot + IdentifierCycleClassA: + description: Class with a cycle in the range for the identifier slot. + attributes: + id_slot: + range: IdentifierCycleClassB + identifier: true + + IdentifierCycleClassB: + description: Class with a cycle in the range for the identifier slot. + attributes: + id_slot: + range: IdentifierCycleClassC + identifier: true + + IdentifierCycleClassC: + description: Class with a cycle in the range for the identifier slot. + attributes: + id_slot: + range: IdentifierCycleClassA + identifier: true + + IdentifierCycleClassD: + description: Class with itself as the range for the identifier slot. + attributes: + id_slot: + range: IdentifierCycleClassD + identifier: true + + # Cycle: ClassA -> ClassB -> ClassC -> ClassA + # ClassD and ClassE have the misfortune of inheriting all this nonsense. + ClassA: + is_a: ClassB + description: Part of a subclass inheritance cycle (A -> B). + slots: + - slot_a + + ClassB: + is_a: ClassC + description: Part of a subclass inheritance cycle (B -> C). + slots: + - slot_b + + ClassC: + is_a: ClassA + description: Part of a subclass inheritance cycle (C -> A). + slots: + - slot_c + + ClassD: + is_a: ClassA + + ClassE: + is_a: ClassD + + # Cycle: ClassF -> ClassF + # ClassG is_a ClassF so inherits the cycle + ClassF: + is_a: ClassF + + ClassG: + is_a: ClassF + + # Mixin cycle (mixins reference each other) + Mixin1: + description: Mixin that pulls in Mixin2, forming a mixin cycle. + mixin: true + mixins: + - Mixin2 + + Mixin2: + description: Mixin that pulls in Mixin1, forming a mixin cycle. + mixin: true + mixins: + - Mixin1 + + MixedClass: + description: Class that applies both Mixin1 and Mixin2, thus inheriting the mixin cycle. + mixins: + - Mixin1 + - Mixin2 + + +types: + # string and subtypes + string: + uri: ex:string + base: str + description: A character string + + super_string: + typeof: string + description: Type with ancestors, no cycles + + supreme_string: + typeof: super_string + description: Type with ancestors, no cycles + + integer: + uri: ex:integer + base: int + description: An integer + + boolean: + uri: ex:boolean + base: Bool + repr: bool + description: Your basic bool. + + # in a cycle with itself! + circle: + typeof: circle + uri: ex:circle + + circle_of_life: + typeof: circle + + # cycle between type_circular and circular_type + circular_type: + typeof: type_circular + uri: ex:circ + description: Type in a cycle + + type_circular: + typeof: circular_type + uri: ex:circ + description: Type in a cycle + + # inherit the type_circular/circular_type confusion + semi_circular_type: + typeof: circular_type + description: Type with cyclic ancestors + + curve_type: + typeof: semi_circular_type + description: Type with cyclic ancestors diff --git a/tests/test_utils/test_schemaview.py b/tests/test_utils/test_schemaview.py index fb067960..fd6e3816 100644 --- a/tests/test_utils/test_schemaview.py +++ b/tests/test_utils/test_schemaview.py @@ -36,7 +36,9 @@ TYPES, SchemaUsage, SchemaView, + detect_cycles, ) +from linkml_runtime.utils.schemaview import _closure as graph_closure from tests.test_utils import INPUT_DIR INPUT_DIR_PATH = Path(INPUT_DIR) @@ -2281,3 +2283,140 @@ def test_class_name_mappings() -> None: assert set(view.all_slots()) == set(slot_names) assert set(view.slot_name_mappings()) == set(slot_names.values()) assert {snm_def.name: snm for snm, snm_def in view.slot_name_mappings().items()} == slot_names + + +@pytest.fixture(scope="module") +def sv_cycles_schema() -> SchemaView: + """A schema containing cycles!""" + return SchemaView(INPUT_DIR_PATH / "cycles.yaml") + + +CYCLES = { + TYPES: { + # these types are all involved in cycles, either directly or via ancestors + 0: { + "curve_type": "circular_type", + "semi_circular_type": "circular_type", + "circular_type": "circular_type", + "type_circular": "type_circular", + "circle": "circle", + "circle_of_life": "circle", + }, + # type_ancestors of types not involved in cycles + 1: { + "supreme_string": {"supreme_string", "super_string", "string"}, + "super_string": {"super_string", "string"}, + "string": {"string"}, + "integer": {"integer"}, + "boolean": {"boolean"}, + }, + }, + CLASSES: { + # classes involved in cycles + 0: { + "ClassA": "ClassA", + "ClassB": "ClassB", + "ClassC": "ClassC", + "ClassD": "ClassA", + "ClassE": "ClassA", + "ClassF": "ClassF", + "ClassG": "ClassF", + "Mixin1": "Mixin1", + "Mixin2": "Mixin2", + "MixedClass": "Mixin2", + }, + # class ancestors for classes not in cycles + 1: { + "BaseClass": {"BaseClass"}, + "MixinA": {"MixinA"}, + "MixinB": {"MixinB"}, + "NonCycleClassA": {"NonCycleClassA", "BaseClass"}, + "NonCycleClassB": {"MixinA", "NonCycleClassB", "NonCycleClassA", "BaseClass"}, + "NonCycleClassC": {"MixinB", "NonCycleClassC", "NonCycleClassA", "BaseClass"}, + "IdentifierCycleClassA": {"IdentifierCycleClassA"}, + "IdentifierCycleClassB": {"IdentifierCycleClassB"}, + "IdentifierCycleClassC": {"IdentifierCycleClassC"}, + "IdentifierCycleClassD": {"IdentifierCycleClassD"}, + }, + }, +} + + +@pytest.mark.parametrize(("target", "cycle_start_node"), [(k, v) for k, v in CYCLES[TYPES][0].items()]) +@pytest.mark.parametrize("fn", ["detect_cycles", "graph_closure", "type_ancestors"]) +def test_detect_type_cycles_error(sv_cycles_schema: SchemaView, target: str, cycle_start_node: str, fn: str) -> None: + """Test detection of cycles in the types segment of the cycles schema.""" + if fn == "detect_cycles": + with pytest.raises(ValueError, match=f"Cycle detected at node '{cycle_start_node}'"): + detect_cycles(lambda x: sv_cycles_schema.type_parents(x), target) + elif fn == "graph_closure": + with pytest.raises(ValueError, match=f"Cycle detected at node '{cycle_start_node}'"): + graph_closure(lambda x: sv_cycles_schema.type_parents(x), target, detect_cycles=True) + else: + with pytest.raises(ValueError, match=f"Cycle detected at node '{cycle_start_node}'"): + sv_cycles_schema.type_ancestors(type_name=target, detect_cycles=True) + + +@pytest.mark.parametrize(("target", "expected"), [(k, v) for k, v in CYCLES[TYPES][1].items()]) +@pytest.mark.parametrize("fn", ["detect_cycles", "graph_closure", "type_ancestors"]) +def test_detect_type_cycles_no_cycles(sv_cycles_schema: SchemaView, target: str, expected: set[str], fn: str) -> None: + """Ensure that types without cycles in their ancestry do not throw an error.""" + if fn == "detect_cycles": + detect_cycles(lambda x: sv_cycles_schema.type_parents(x), target) + elif fn == "graph_closure": + got = graph_closure(lambda x: sv_cycles_schema.type_parents(x), target, detect_cycles=True) + assert set(got) == expected + else: + got = sv_cycles_schema.type_ancestors(target, detect_cycles=True) + assert set(got) == expected + + +@pytest.mark.parametrize(("target", "cycle_start_node"), [(k, v) for k, v in CYCLES[CLASSES][0].items()]) +@pytest.mark.parametrize("fn", ["detect_cycles", "graph_closure", "class_ancestors"]) +def test_detect_class_cycles_error(sv_cycles_schema: SchemaView, target: str, cycle_start_node: str, fn: str) -> None: + """Test detection of class cycles in the cycles schema.""" + if fn == "detect_cycles": + with pytest.raises(ValueError, match=f"Cycle detected at node '{cycle_start_node}'"): + detect_cycles(lambda x: sv_cycles_schema.class_parents(x), target) + + elif fn == "graph_closure": + with pytest.raises(ValueError, match=f"Cycle detected at node '{cycle_start_node}'"): + graph_closure(lambda x: sv_cycles_schema.class_parents(x), target, detect_cycles=True) + else: + with pytest.raises(ValueError, match=f"Cycle detected at node '{cycle_start_node}'"): + sv_cycles_schema.class_ancestors(target, detect_cycles=True) + + +@pytest.mark.parametrize(("target", "expected"), [(k, v) for k, v in CYCLES[CLASSES][1].items()]) +@pytest.mark.parametrize("fn", ["detect_cycles", "graph_closure", "class_ancestors"]) +def test_detect_class_cycles_no_cycles(sv_cycles_schema: SchemaView, target: str, expected: set[str], fn: str) -> None: + """Ensure that classes without cycles in their ancestry do not throw an error.""" + if fn == "detect_cycles": + detect_cycles(lambda x: sv_cycles_schema.class_parents(x), target) + elif fn == "graph_closure": + got = graph_closure(lambda x: sv_cycles_schema.class_parents(x), target, detect_cycles=True) + assert set(got) == expected + else: + got = sv_cycles_schema.class_ancestors(target, detect_cycles=True) + assert set(got) == expected + + +@pytest.mark.parametrize("target", CYCLES[CLASSES][1].keys()) +def test_detect_class_as_range_cycles(sv_cycles_schema: SchemaView, target: str) -> None: + """Test cycle detection in cases where a class is used as a range.""" + + def check_recursive_id_slots(class_name: str) -> list[str]: + """Given a class, retrieve any classes used as the range for the class identifier slot.""" + id_slot = sv_cycles_schema.get_identifier_slot(class_name, use_key=True) + if not id_slot: + return [] + ind_range = sv_cycles_schema.slot_range_as_union(id_slot) + return [sv_cycles_schema.get_class(x).name for x in ind_range if sv_cycles_schema.get_class(x)] or [] + + # classes with a cycle in the class identifier slot range are cunningly named + if "IdentifierCycle" in target: + with pytest.raises(ValueError, match="Cycle detected at node "): + detect_cycles(lambda x: check_recursive_id_slots(x), target) + + else: + detect_cycles(lambda x: check_recursive_id_slots(x), target) From 333ba2b669d29475656704f797f9ffcbc3b05985 Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Wed, 8 Oct 2025 04:57:59 -0700 Subject: [PATCH 2/3] Adding comments --- tests/test_utils/test_schemaview.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_utils/test_schemaview.py b/tests/test_utils/test_schemaview.py index fd6e3816..40cd401d 100644 --- a/tests/test_utils/test_schemaview.py +++ b/tests/test_utils/test_schemaview.py @@ -2293,7 +2293,8 @@ def sv_cycles_schema() -> SchemaView: CYCLES = { TYPES: { - # these types are all involved in cycles, either directly or via ancestors + # types in cycles, either directly or via ancestors + # key: type, value: node where the cycle starts 0: { "curve_type": "circular_type", "semi_circular_type": "circular_type", @@ -2302,7 +2303,8 @@ def sv_cycles_schema() -> SchemaView: "circle": "circle", "circle_of_life": "circle", }, - # type_ancestors of types not involved in cycles + # types not involved in cycles + # key: the type, value: ancestors of the type 1: { "supreme_string": {"supreme_string", "super_string", "string"}, "super_string": {"super_string", "string"}, @@ -2313,6 +2315,7 @@ def sv_cycles_schema() -> SchemaView: }, CLASSES: { # classes involved in cycles + # key: class name, value: node where the cycle starts 0: { "ClassA": "ClassA", "ClassB": "ClassB", @@ -2326,6 +2329,7 @@ def sv_cycles_schema() -> SchemaView: "MixedClass": "Mixin2", }, # class ancestors for classes not in cycles + # key: class name, value: class ancestors 1: { "BaseClass": {"BaseClass"}, "MixinA": {"MixinA"}, From b09e72ee61b81a8d000d91f57727e7780e3160ea Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Wed, 8 Oct 2025 05:20:50 -0700 Subject: [PATCH 3/3] Clarify function docs --- linkml_runtime/utils/schemaview.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/linkml_runtime/utils/schemaview.py b/linkml_runtime/utils/schemaview.py index be67592e..e4ba6873 100644 --- a/linkml_runtime/utils/schemaview.py +++ b/linkml_runtime/utils/schemaview.py @@ -97,18 +97,20 @@ class OrderedBy(Enum): def detect_cycles(f: Callable[[Any], Iterable[Any] | None], x: Any) -> None: - """Detect cycles in a graph, using function `f` to walk the graph. + """Detect cycles in a graph, using function `f` to walk the graph, starting at node `x`. - Uses the classic white/grey/black colour coding algorithm to track which nodes have been explored. + Uses the classic white/grey/black colour coding algorithm to track which nodes have been explored. In this + case, "node" refers to any element in a schema and "neighbours" are elements that can be reached from that + node by executing function `f`. WHITE: unexplored GREY: node is being processed; processing includes exploring all neighbours reachable via f(node) BLACK: node and all of its neighbours (and their neighbours, etc.) have been processed - A directed cycle reachable from node `x` raises a CycleError. + A directed cycle reachable from node `x` raises a ValueError. :param f: function that returns an iterable of neighbouring nodes (parents or children) - :type f: Callable[[Any], Iterable[Any] | None] + :type f: Callable[[Any], Iterable[Any] | None] :param x: graph node :type x: Any :raises ValueError: if a cycle is discovered through repeated calls to f(x) @@ -117,8 +119,8 @@ def detect_cycles(f: Callable[[Any], Iterable[Any] | None], x: Any) -> None: processing_state: dict[Any, int] = {} # Stack entries are (node, processed_flag). - # processed_flag == True means all neighbours (nodes generated by running - # `f(node)`) have been added to the todo stack and the node can be marked BLACK. + # processed_flag == True means all neighbours (nodes generated by running `f(node)`) + # have been added to the todo stack and the node can be marked BLACK. todo: list[tuple[Any, bool]] = [(x, False)] while todo: