Fix issue with specifying format for SparkHiveDataSet (#1857)
Signed-off-by: jstammers <jimmy.stammers@cgastrategy.com>
Signed-off-by: Ahdra Merali <ahdra.merali@quantumblack.com>
jstammers authored and AhdraMeraliQB committed Oct 21, 2022
1 parent 6e52ba6 commit 4e3e7b4
Showing 3 changed files with 13 additions and 1 deletion.
1 change: 1 addition & 0 deletions RELEASE.md
@@ -48,6 +48,7 @@
* Relaxed `delta-spark` upper bound to allow compatibility with Spark 3.1.x and 3.2.x.
* Added `gdrive` to list of cloud protocols, enabling Google Drive paths for datasets.
* Added svg logo resource for ipython kernel.
* Fixed `format` in `save_args` for `SparkHiveDataSet`.

## Upcoming deprecations for Kedro 0.19.0
* The Kedro IPython extension will no longer be available as `%load_ext kedro.extras.extensions.ipython`; use `%load_ext kedro.ipython` instead.
2 changes: 1 addition & 1 deletion kedro/extras/datasets/spark/spark_hive_dataset.py
@@ -114,7 +114,7 @@ def __init__(
self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS)
if save_args is not None:
self._save_args.update(save_args)
-        self._format = self._save_args.get("format") or "hive"
+        self._format = self._save_args.pop("format", None) or "hive"
self._eager_checkpoint = self._save_args.pop("eager_checkpoint", None) or True

def _describe(self) -> Dict[str, Any]:
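The switch from `.get()` to `.pop()` matters because the remaining `save_args` are later splatted into the writer call alongside the explicit `format=` argument, as the test's `assert_called_with` shows. A minimal standalone sketch, using a hypothetical `write_table` stand-in for `DataFrameWriter.saveAsTable`, illustrates why `.get()` leads to a duplicate-keyword `TypeError` while `.pop()` removes the key first:

```python
def write_table(name, format, **kwargs):
    # Stand-in for DataFrameWriter.saveAsTable: accepts an explicit
    # format keyword plus any extra save arguments.
    return format


def resolve_with_get(save_args):
    # Buggy variant: "format" stays in save_args, so it is passed twice.
    args = dict(save_args)
    fmt = args.get("format") or "hive"
    return write_table("db.table", format=fmt, **args)


def resolve_with_pop(save_args):
    # Fixed variant: "format" is removed before the remaining args are splatted.
    args = dict(save_args)
    fmt = args.pop("format", None) or "hive"
    return write_table("db.table", format=fmt, **args)


print(resolve_with_pop({"format": "delta"}))  # delta
print(resolve_with_pop({}))                   # hive (the default)

try:
    resolve_with_get({"format": "delta"})
except TypeError as exc:
    # write_table() got multiple values for keyword argument 'format'
    print(exc)
```

The `or "hive"` fallback also means an explicit `save_args={"format": None}` still resolves to the default, matching the behavior before the fix.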
11 changes: 11 additions & 0 deletions tests/extras/datasets/spark/test_spark_hive_dataset.py
@@ -301,3 +301,14 @@ def test_read_from_non_existent_table(self):
r"table_doesnt_exist\], \[\], false\n",
):
dataset.load()

def test_save_delta_format(self, mocker):
dataset = SparkHiveDataSet(
database="default_1", table="delta_table", save_args={"format": "delta"}
)
mocked_save = mocker.patch("pyspark.sql.DataFrameWriter.saveAsTable")
dataset.save(_generate_spark_df_one())
mocked_save.assert_called_with(
"default_1.delta_table", mode="errorifexists", format="delta"
)
assert dataset._format == "delta"
