-
Notifications
You must be signed in to change notification settings - Fork 105
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Make Viz compatible with kedro-datasets
#1214
Changes from 8 commits
d0bfeb4
06aca3b
1520eba
3d6b8e0
4a50f59
c62b973
395ff70
5f12780
214b084
34ca924
bd6a529
cad2a26
b2d6811
577dedd
7107404
5ff07cd
8748b49
65f1ac9
22302bb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -51,13 +51,12 @@ class TrackingDatasetGroup(str, Enum): | |||||||||||||||||
JSON = "json" | ||||||||||||||||||
|
||||||||||||||||||
|
||||||||||||||||||
# pylint: disable=line-too-long | ||||||||||||||||||
TRACKING_DATASET_GROUPS = { | ||||||||||||||||||
rashidakanchwala marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||||
"kedro.extras.datasets.plotly.plotly_dataset.PlotlyDataSet": TrackingDatasetGroup.PLOT, | ||||||||||||||||||
"kedro.extras.datasets.plotly.json_dataset.JSONDataSet": TrackingDatasetGroup.PLOT, | ||||||||||||||||||
"kedro.extras.datasets.matplotlib.matplotlib_writer.MatplotlibWriter": TrackingDatasetGroup.PLOT, | ||||||||||||||||||
"kedro.extras.datasets.tracking.metrics_dataset.MetricsDataSet": TrackingDatasetGroup.METRIC, | ||||||||||||||||||
"kedro.extras.datasets.tracking.json_dataset.JSONDataSet": TrackingDatasetGroup.JSON, | ||||||||||||||||||
"plotly.plotly_dataset.PlotlyDataSet": TrackingDatasetGroup.PLOT, | ||||||||||||||||||
"plotly.json_dataset.JSONDataSet": TrackingDatasetGroup.PLOT, | ||||||||||||||||||
"matplotlib.matplotlib_writer.MatplotlibWriter": TrackingDatasetGroup.PLOT, | ||||||||||||||||||
"tracking.metrics_dataset.MetricsDataSet": TrackingDatasetGroup.METRIC, | ||||||||||||||||||
"tracking.json_dataset.JSONDataSet": TrackingDatasetGroup.JSON, | ||||||||||||||||||
} | ||||||||||||||||||
|
||||||||||||||||||
|
||||||||||||||||||
|
@@ -67,14 +66,15 @@ class TrackingDatasetModel: | |||||||||||||||||
|
||||||||||||||||||
dataset_name: str | ||||||||||||||||||
# dataset is the actual dataset instance, whereas dataset_type is a string. | ||||||||||||||||||
# e.g. "kedro.extras.datasets.tracking.metrics_dataset.MetricsDataSet" | ||||||||||||||||||
# e.g. "kedro_datasets.tracking.metrics_dataset.MetricsDataSet" | ||||||||||||||||||
dataset: AbstractVersionedDataSet | ||||||||||||||||||
dataset_type: str = field(init=False) | ||||||||||||||||||
# runs is a mapping from run_id to loaded data. | ||||||||||||||||||
runs: Dict[str, Any] = field(init=False, default_factory=dict) | ||||||||||||||||||
|
||||||||||||||||||
def __post_init__(self): | ||||||||||||||||||
self.dataset_type = get_dataset_type(self.dataset) | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this variable actually still used anywhere now in experiment tracking? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I tried removing this which caused errors. I think it's used for the schema in graphql. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suspect what's happening here is we're passing the full The full dataset type is used in the frontend in metadata panel in the flowchart view but not in experiment tracking (I think). So there's a good simplification we could make here if all we really need in experiment tracking is the full dataset type. Feel free to leave as it is for now, but should be an easy tidy up for someone to do in a follow up PR if I'm right. |
||||||||||||||||||
self.dataset_module_class = get_dataset_module_class(self.dataset) | ||||||||||||||||||
|
||||||||||||||||||
def load_tracking_data(self, run_id: str): | ||||||||||||||||||
# No need to reload data that has already been loaded. | ||||||||||||||||||
|
@@ -93,7 +93,10 @@ def load_tracking_data(self, run_id: str): | |||||||||||||||||
return | ||||||||||||||||||
|
||||||||||||||||||
try: | ||||||||||||||||||
if TRACKING_DATASET_GROUPS[self.dataset_type] is TrackingDatasetGroup.PLOT: | ||||||||||||||||||
if ( | ||||||||||||||||||
TRACKING_DATASET_GROUPS[self.dataset_module_class] | ||||||||||||||||||
is TrackingDatasetGroup.PLOT | ||||||||||||||||||
): | ||||||||||||||||||
self.runs[run_id] = {self.dataset._filepath.name: self.dataset.load()} | ||||||||||||||||||
else: | ||||||||||||||||||
self.runs[run_id] = self.dataset.load() | ||||||||||||||||||
|
@@ -111,3 +114,9 @@ def load_tracking_data(self, run_id: str): | |||||||||||||||||
|
||||||||||||||||||
def get_dataset_type(dataset: AbstractVersionedDataSet) -> str: | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Maybe? Sort of depends how There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we no longer have this. as get_dataset_module_class is the new 'get_dataset_type' e.g. pandas.csv_dataset.CSVDataset |
||||||||||||||||||
return f"{dataset.__class__.__module__}.{dataset.__class__.__qualname__}" | ||||||||||||||||||
|
||||||||||||||||||
|
||||||||||||||||||
def get_dataset_module_class(dataset: AbstractVersionedDataSet) -> str: | ||||||||||||||||||
class_name = f"{dataset.__class__.__qualname__}" | ||||||||||||||||||
_, dataset_type, dataset_file = f"{dataset.__class__.__module__}".rsplit(".", 2) | ||||||||||||||||||
return f"{dataset_type}.{dataset_file}.{class_name}" | ||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The naming is kind of confusing given we already call the "full path"
Suggested change
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -458,6 +458,10 @@ def __post_init__(self): | |
else None | ||
) | ||
|
||
self.dataset_module_class = ( | ||
self.get_dataset_module_class(self.kedro_obj) if self.kedro_obj else None | ||
) | ||
|
||
# the modular pipelines that a data node belongs to | ||
# are derived from its namespace, which in turn | ||
# is derived from the dataset's name. | ||
|
@@ -466,37 +470,39 @@ def __post_init__(self): | |
self._get_namespace(self.full_name) | ||
) | ||
|
||
@staticmethod | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd refactor this as per suggestions in experiment_tracking.py. |
||
def get_dataset_module_class(kedro_object) -> str: | ||
"""Get dataset class and the two last parts of the module part.""" | ||
class_name = f"{kedro_object.__class__.__qualname__}" | ||
_, dataset_type, dataset_file = f"{kedro_object.__class__.__module__}".rsplit( | ||
".", 2 | ||
) | ||
return f"{dataset_type}.{dataset_file}.{class_name}" | ||
|
||
# TODO: improve this scheme. | ||
def is_plot_node(self): | ||
"""Check if the current node is a plot node. | ||
Currently it only recognises one underlying dataset as a plot node. | ||
In the future, we might want to make this generic. | ||
""" | ||
return self.dataset_type in ( | ||
"kedro.extras.datasets.plotly.plotly_dataset.PlotlyDataSet", | ||
"kedro.extras.datasets.plotly.json_dataset.JSONDataSet", | ||
return self.dataset_module_class in ( | ||
"plotly.plotly_dataset.PlotlyDataSet", | ||
"plotly.json_dataset.JSONDataSet", | ||
) | ||
|
||
def is_image_node(self): | ||
"""Check if the current node is a matplotlib image node.""" | ||
return ( | ||
self.dataset_type | ||
== "kedro.extras.datasets.matplotlib.matplotlib_writer.MatplotlibWriter" | ||
self.dataset_module_class == "matplotlib.matplotlib_writer.MatplotlibWriter" | ||
) | ||
|
||
def is_metric_node(self): | ||
"""Check if the current node is a metrics node.""" | ||
return ( | ||
self.dataset_type | ||
== "kedro.extras.datasets.tracking.metrics_dataset.MetricsDataSet" | ||
) | ||
return self.dataset_module_class == "tracking.metrics_dataset.MetricsDataSet" | ||
|
||
def is_json_node(self): | ||
"""Check if the current node is a JSONDataSet node.""" | ||
return ( | ||
self.dataset_type | ||
== "kedro.extras.datasets.tracking.json_dataset.JSONDataSet" | ||
) | ||
return self.dataset_module_class == "tracking.json_dataset.JSONDataSet" | ||
|
||
def is_tracking_node(self): | ||
"""Checks if the current node is a tracking data node""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For some reason (I think just as an example of how to do it) Joel wanted to define this dataset inside the project itself and not use the kedro built-in one. So we should leave this as it was.