Fix dataset factory patterns in Experiment Tracking (#1588)
* add dataset factory resolver

Signed-off-by: ravi-kumar-pilla <ravi_kumar_pilla@mckinsey.com>

* fix lint errors

Signed-off-by: ravi-kumar-pilla <ravi_kumar_pilla@mckinsey.com>

* add release notes

Signed-off-by: ravi-kumar-pilla <ravi_kumar_pilla@mckinsey.com>

* fix lint

Signed-off-by: ravi-kumar-pilla <ravi_kumar_pilla@mckinsey.com>

* add pytest for resolve dataset factory patterns

Signed-off-by: ravi-kumar-pilla <ravi_kumar_pilla@mckinsey.com>

* modify release statement

Signed-off-by: ravi-kumar-pilla <ravi_kumar_pilla@mckinsey.com>

* add comments based on PR review

Signed-off-by: ravi-kumar-pilla <ravi_kumar_pilla@mckinsey.com>

---------

Signed-off-by: ravi-kumar-pilla <ravi_kumar_pilla@mckinsey.com>
ravi-kumar-pilla committed Oct 27, 2023
1 parent d87b8ae commit dca4581
Showing 6 changed files with 72 additions and 0 deletions.
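For context, the mechanism this change relies on: in Kedro 0.18.12 and later, a catalog entry such as {dataset_name}#csv is a dataset factory pattern, a template rather than a concrete dataset, and DataCatalog.exists() materialises a matching name into a real catalog entry as a side effect. A minimal sketch of that behaviour, assuming kedro and kedro-datasets are installed (names mirror the test fixtures in this diff):

from kedro.io import DataCatalog

catalog = DataCatalog(
    dataset_patterns={
        "{dataset_name}#csv": {
            "type": "pandas.CSVDataset",
            "filepath": "data/01_raw/{dataset_name}#csv.csv",
        },
    },
)

# The pattern is only a template: no concrete dataset is registered yet.
assert "model_inputs#csv" not in catalog.list()

# exists() resolves the name against the pattern and, as a side effect,
# registers the materialised dataset in the catalog.
catalog.exists("model_inputs#csv")
assert "model_inputs#csv" in catalog.list()

Before this fix, kedro-viz never triggered that resolution, so datasets defined only through factory patterns were invisible to Experiment Tracking.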
5 changes: 5 additions & 0 deletions RELEASE.md
@@ -5,6 +5,11 @@ Please follow the established format:
- Use present tense (e.g. 'Add new feature')
- Include the ID number for the related PR (or PRs) in parentheses
-->
# Upcoming Release

## Bug fixes and other changes

- Fix dataset factory patterns in Experiment Tracking. (#1588)

# Release 6.6.1

15 changes: 15 additions & 0 deletions package/kedro_viz/data_access/managers.py
@@ -69,6 +69,21 @@ def set_db_session(self, db_session_class: sessionmaker):
        """Set db session on repositories that need it."""
        self.runs.set_db_session(db_session_class)

    def resolve_dataset_factory_patterns(
        self, catalog: DataCatalog, pipelines: Dict[str, KedroPipeline]
    ):
        """Resolve dataset factory patterns in the data catalog by matching
        them against the datasets used in the pipelines.
        """
        for pipeline in pipelines.values():
            if hasattr(pipeline, "datasets"):
                datasets = pipeline.datasets()  # kedro 0.19.0 onwards
            else:
                datasets = pipeline.data_sets()

            for dataset_name in datasets:
                # exists() is called for its side effect only: a name that
                # matches a factory pattern is materialised and registered
                # in the catalog.
                catalog.exists(dataset_name)

    def add_catalog(self, catalog: DataCatalog):
        """Add a catalog to the CatalogRepository and relevant tracking datasets to
        TrackingDatasetRepository.
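Two notes on the method above. The hasattr check papers over Kedro 0.19's rename of Pipeline.data_sets() to Pipeline.datasets(); extracted as a standalone helper, the shim would look like this (a sketch, not part of this commit):

from kedro.pipeline import Pipeline

def pipeline_dataset_names(pipeline: Pipeline):
    """All dataset names used by the pipeline, across the kedro 0.19 rename."""
    if hasattr(pipeline, "datasets"):
        return pipeline.datasets()  # kedro >= 0.19.0
    return pipeline.data_sets()  # kedro < 0.19.0

And catalog.exists() is used rather than a direct dataset lookup because, in the Kedro versions this targets, it returns False for a name that matches neither a catalog entry nor a pattern instead of raising, so free in-memory pipeline datasets pass through the loop harmlessly.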
4 changes: 4 additions & 0 deletions package/kedro_viz/server.py
@@ -36,6 +36,10 @@ def populate_data(
    session_class = make_db_session_factory(session_store.location)
    data_access_manager.set_db_session(session_class)

    # resolve dataset factory patterns before the catalog is added
    data_access_manager.resolve_dataset_factory_patterns(catalog, pipelines)

    # add catalog and relevant tracking datasets
    data_access_manager.add_catalog(catalog)

    # add dataset stats before adding pipelines
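Ordering matters in populate_data: add_catalog registers the catalog and its tracking datasets, so a tracking dataset defined only through a factory pattern must be materialised first to be picked up, which is the Experiment Tracking bug this commit fixes. A throwaway mock-based check of that ordering (hypothetical, not part of the test suite):

from unittest import mock

manager = mock.MagicMock()

# Mimic populate_data's call sequence.
manager.resolve_dataset_factory_patterns("catalog", {"pipeline": None})
manager.add_catalog("catalog")

call_names = [name for name, _args, _kwargs in manager.method_calls]
assert call_names.index("resolve_dataset_factory_patterns") < call_names.index("add_catalog")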
22 changes: 22 additions & 0 deletions package/tests/conftest.py
@@ -110,6 +110,12 @@ def example_catalog():
            },
            "model_inputs": {"model_inputs"},
        },
        dataset_patterns={
            "{dataset_name}#csv": {
                "type": "pandas.CSVDataset",
                "filepath": "data/01_raw/{dataset_name}#csv.csv",
            },
        },
    )


@@ -290,3 +296,19 @@ def example_csv_dataset(tmp_path, example_data_frame):
    )
    new_csv_dataset.save(example_data_frame)
    yield new_csv_dataset


# Create a mock KedroPipeline exposing the datasets() method (kedro >= 0.19)
@pytest.fixture
def pipeline_with_datasets_mock():
    pipeline = mock.MagicMock()
    pipeline.datasets.return_value = ["model_inputs#csv"]
    return pipeline


# Create a mock KedroPipeline exposing only the legacy data_sets() method
@pytest.fixture
def pipeline_with_data_sets_mock():
    pipeline = mock.MagicMock()
    # A bare MagicMock auto-creates attributes, so hasattr(pipeline, "datasets")
    # would be True and the legacy branch would never run; deleting the
    # attribute forces the data_sets() code path.
    del pipeline.datasets
    pipeline.data_sets.return_value = ["model_inputs#csv"]
    return pipeline
23 changes: 23 additions & 0 deletions package/tests/test_data_access/test_managers.py
@@ -9,6 +9,7 @@

from kedro_viz.constants import DEFAULT_REGISTERED_PIPELINE_ID, ROOT_MODULAR_PIPELINE_ID
from kedro_viz.data_access.managers import DataAccessManager
from kedro_viz.data_access.repositories.catalog import CatalogRepository
from kedro_viz.models.flowchart import (
    DataNode,
    GraphEdge,
@@ -464,3 +465,25 @@ def test_add_pipelines_with_circular_modular_pipelines(
            digraph.add_edge(edge.source, edge.target)
        with pytest.raises(nx.NetworkXNoCycle):
            nx.find_cycle(digraph)


class TestResolveDatasetFactoryPatterns:
    def test_resolve_dataset_factory_patterns(
        self,
        example_catalog,
        pipeline_with_datasets_mock,
        pipeline_with_data_sets_mock,
        data_access_manager: DataAccessManager,
    ):
        pipelines = {
            "pipeline1": pipeline_with_datasets_mock,
            "pipeline2": pipeline_with_data_sets_mock,
        }
        new_catalog = CatalogRepository()
        new_catalog.set_catalog(example_catalog)

        assert "model_inputs#csv" not in new_catalog.as_dict().keys()

        data_access_manager.resolve_dataset_factory_patterns(example_catalog, pipelines)

        assert "model_inputs#csv" in new_catalog.as_dict().keys()
3 changes: 3 additions & 0 deletions package/tests/test_server.py
@@ -71,6 +71,9 @@ def test_run_server_from_project(
    ):
        run_server()
        # assert that when running the server, data is added correctly to the data access manager
        patched_data_access_manager.resolve_dataset_factory_patterns.assert_called_once_with(
            example_catalog, example_pipelines
        )
        patched_data_access_manager.add_catalog.assert_called_once_with(example_catalog)
        patched_data_access_manager.add_pipelines.assert_called_once_with(
            example_pipelines
