Deprecate as_pandas in favor of data(return_type="pandas") (#408)

## Description

Due to the new `data` method (#412), `as_pandas` is no longer necessary: `data` provides more flexible options for accessing archive data, and it can also return dataframes when passed `return_type="pandas"`. This PR thus deprecates `as_pandas`. Because this is a fairly popular method, I have kept the method but made it raise a RuntimeError whenever it is called; however, I anticipate removing the method entirely in the future.

I also considered keeping `as_pandas` as an alias to `data(return_type="pandas")`, but this would require changing the parameters of `as_pandas`, since `data` takes in `fields` rather than `include_solutions` and `include_metadata`. Removing the functionality of `as_pandas` entirely, rather than aliasing it, makes it clear that it has been deprecated.

## TODO

- [x] Introduce a `data()` method that returns the archive data in many forms
- [x] Remove `as_pandas()` since `data(return_type="pandas")` now fulfills this role
- [x] Replace as_pandas test with data test
- [x] Fix usage of as_pandas in tests
- [x] Remove as_pandas in tutorials

## Questions

## Status

- [x] I have read the guidelines in [CONTRIBUTING.md](https://github.com/icaros-usc/pyribs/blob/master/CONTRIBUTING.md)
- [x] I have formatted my code using `yapf`
- [x] I have tested my code by running `pytest`
- [x] I have linted my code with `pylint`
- [x] I have added a one-line description of my change to the changelog in `HISTORY.md`
- [x] This PR is ready to go
icaros-usc · Nov 10, 2023 · 05e4910 · 05e4910
1 parent 122378c
commit 05e4910
Show file tree

Hide file tree

Showing 23 changed files with 98 additions and 182 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -6,6 +6,8 @@
 
 #### API
 
+- **Backwards-incompatible:** Deprecate `as_pandas` in favor of
+  `data(return_type="pandas")` ({pr}`408`)
 - **Backwards-incompatible:** Replace ArchiveDataFrame batch methods with
   `get_field` ({pr}`413`)
 - Add field_list and data methods to archives ({pr}`412`)

diff --git a/examples/lunar_lander.py b/examples/lunar_lander.py
@@ -18,7 +18,7 @@
 the --outdir flag) with the following files:
 
     - archive.csv: The CSV representation of the final archive, obtained with
-      as_pandas().
+      data().
     - archive_ccdf.png: A plot showing the (unnormalized) complementary
       cumulative distribution function of objectives in the archive. For
       each objective p on the x-axis, this plot shows the number of
@@ -297,7 +297,7 @@ def save_ccdf(archive, filename):
     """
     fig, ax = plt.subplots()
     ax.hist(
-        archive.as_pandas(include_solutions=False)["objective"],
+        archive.data("objective"),
         50,  # Number of cells.
         histtype="step",
         density=False,
@@ -395,7 +395,7 @@ def lunar_lander_main(workers=4,
     metrics = run_search(client, scheduler, env_seed, iterations, log_freq)
 
     # Outputs.
-    scheduler.archive.as_pandas().to_csv(outdir / "archive.csv")
+    scheduler.archive.data(return_type="pandas").to_csv(outdir / "archive.csv")
     save_ccdf(scheduler.archive, str(outdir / "archive_ccdf.png"))
     save_heatmap(scheduler.archive, str(outdir / "heatmap.png"))
     save_metrics(outdir, metrics)

diff --git a/examples/sphere.py b/examples/sphere.py
@@ -835,7 +835,7 @@ def sphere_main(algorithm,
         final_itr = itr == itrs
         if itr % log_freq == 0 or final_itr:
             if final_itr:
-                result_archive.as_pandas(include_solutions=final_itr).to_csv(
+                result_archive.data(return_type="pandas").to_csv(
                     outdir / f"{name}_archive.csv")
 
             # Record and display metrics.

diff --git a/ribs/archives/_archive_base.py b/ribs/archives/_archive_base.py
@@ -728,49 +728,15 @@ def data(self, fields=None, return_type="dict"):
         return data
 
     def as_pandas(self, include_solutions=True, include_metadata=False):
-        """Converts the archive into an :class:`ArchiveDataFrame` (a child class
-        of :class:`pandas.DataFrame`).
-
-        The implementation of this method in :class:`ArchiveBase` creates a
-        dataframe consisting of:
-
-        - 1 column of integers (``np.int32``) for the index, named ``index``.
-          See :meth:`index_of` for more info.
-        - :attr:`measure_dim` columns for the measures, named ``measures_0,
-          measures_1, ...``
-        - 1 column for the objectives, named ``objective``
-        - :attr:`solution_dim` columns for the solution parameters, named
-          ``solution_0, solution_1, ...``
-        - 1 column for the metadata objects, named ``metadata``
-
-        In short, the dataframe looks like this:
-
-        +-------+------------+------+-----------+------------+-----+----------+
-        | index | measures_0 | ...  | objective | solution_0 | ... | metadata |
-        +=======+============+======+===========+============+=====+==========+
-        |       |            | ...  |           |            | ... |          |
-        +-------+------------+------+-----------+------------+-----+----------+
-
-        Compared to :class:`pandas.DataFrame`, the :class:`ArchiveDataFrame`
-        adds methods and attributes which make it easier to manipulate archive
-        data. For more information, refer to the :class:`ArchiveDataFrame`
-        documentation.
-
-        Args:
-            include_solutions (bool): Whether to include solution columns.
-            include_metadata (bool): Whether to include the metadata column.
-                Note that methods like :meth:`~pandas.DataFrame.to_csv` may not
-                properly save the dataframe since the metadata objects may not
-                be representable in a CSV.
-        Returns:
-            ArchiveDataFrame: See above.
-        """  # pylint: disable = line-too-long
-        fields = ["index", "measures", "objective"]
-        if include_solutions:
-            fields.append("solution")
-        if include_metadata:
-            fields.append("metadata")
-        return ArchiveDataFrame(self._store.data(fields, return_type="pandas"))
+        """DEPRECATED."""
+        # pylint: disable = unused-argument
+        raise RuntimeError(
+            "as_pandas has been deprecated. Please use "
+            "archive.data(..., return_type='pandas') instead. For more "
+            "info, please see the archive data tutorial: "
+            # pylint: disable = line-too-long
+            "https://docs.pyribs.org/en/stable/tutorials/features/archive_data.html"
+        )
 
     def cqd_score(self,
                   iterations,

diff --git a/ribs/archives/_cvt_archive.py b/ribs/archives/_cvt_archive.py
@@ -49,9 +49,9 @@ class CVTArchive(ArchiveBase):
     subsequent experiments.
 
     .. note:: The idea of archive thresholds was introduced in `Fontaine 2022
-        <https://arxiv.org/abs/2205.10752>`_. Refer to our `CMA-MAE tutorial
-        <../../tutorials/cma_mae.html>`_ for more info on thresholds, including
-        the ``learning_rate`` and ``threshold_min`` parameters.
+        <https://arxiv.org/abs/2205.10752>`_. For more info on thresholds,
+        including the ``learning_rate`` and ``threshold_min`` parameters, refer
+        to our tutorial :doc:`/tutorials/cma_mae`.
 
     .. note:: For more information on our choice of k-D tree implementation, see
         :pr:`38`.

diff --git a/ribs/archives/_grid_archive.py b/ribs/archives/_grid_archive.py
@@ -16,9 +16,9 @@ class GridArchive(ArchiveBase):
     cell.
 
     .. note:: The idea of archive thresholds was introduced in `Fontaine 2022
-        <https://arxiv.org/abs/2205.10752>`_. Refer to our `CMA-MAE tutorial
-        <../../tutorials/cma_mae.html>`_ for more info on thresholds, including
-        the ``learning_rate`` and ``threshold_min`` parameters.
+        <https://arxiv.org/abs/2205.10752>`_. For more info on thresholds,
+        including the ``learning_rate`` and ``threshold_min`` parameters, refer
+        to our tutorial :doc:`/tutorials/cma_mae`.
 
     Args:
         solution_dim (int): Dimension of the solution space.

diff --git a/ribs/visualize/_cvt_archive_3d_plot.py b/ribs/visualize/_cvt_archive_3d_plot.py
@@ -154,10 +154,11 @@ def cvt_archive_3d_plot(
         df (ribs.archives.ArchiveDataFrame): If provided, we will plot data from
             this argument instead of the data currently in the archive. This
             data can be obtained by, for instance, calling
-            :meth:`ribs.archives.ArchiveBase.as_pandas()` and modifying the
-            resulting :class:`ArchiveDataFrame`. Note that, at a minimum, the
-            data must contain columns for index, objective, and measures. To
-            display a custom metric, replace the "objective" column.
+            :meth:`ribs.archives.ArchiveBase.data` with ``return_type="pandas"``
+            and modifying the resulting :class:`ArchiveDataFrame`. Note that, at
+            a minimum, the data must contain columns for index, objective, and
+            measures. To display a custom metric, replace the "objective"
+            column.
         measure_order (array-like of int): Specifies the axes order for plotting
             the measures. By default, the first measure (measure 0) in the
             archive appears on the x-axis, the second (measure 1) on y-axis, and
@@ -217,9 +218,15 @@ def cvt_archive_3d_plot(
     cmap = retrieve_cmap(cmap)
 
     # Retrieve archive data.
-    df = archive.as_pandas() if df is None else validate_df(df)
-    objective_batch = df.get_field("objective")
-    measures_batch = df.get_field("measures")
+    if df is None:
+        objective_batch = archive.data("objective")
+        measures_batch = archive.data("measures")
+        index_batch = archive.data("index")
+    else:
+        df = validate_df(df)
+        objective_batch = df.get_field("objective")
+        measures_batch = df.get_field("measures")
+        index_batch = df.get_field("index")
     lower_bounds = archive.lower_bounds
     upper_bounds = archive.upper_bounds
     centroids = archive.centroids
@@ -297,7 +304,7 @@ def cvt_archive_3d_plot(
     objs = []  # Also record objective for each ridge so we can color it.
 
     # Map from centroid index to objective.
-    pt_to_obj = dict(zip(df.get_field("index"), objective_batch))
+    pt_to_obj = dict(zip(index_batch, objective_batch))
 
     # The points in the Voronoi diagram are indexed by their placement in the
     # input list. Above, when we called Voronoi, `centroids` were placed first,

diff --git a/ribs/visualize/_cvt_archive_heatmap.py b/ribs/visualize/_cvt_archive_heatmap.py
@@ -9,8 +9,8 @@
 from ribs.visualize._utils import (archive_heatmap_1d, retrieve_cmap, set_cbar,
                                    validate_df, validate_heatmap_visual_args)
 
-# Matplotlib functions tend to have a ton of args.
-# pylint: disable = too-many-arguments
+# Matplotlib functions tend to have a ton of args and statements.
+# pylint: disable = too-many-arguments, too-many-statements
 
 
 def cvt_archive_heatmap(archive,
@@ -102,10 +102,11 @@ def cvt_archive_heatmap(archive,
         df (ribs.archives.ArchiveDataFrame): If provided, we will plot data from
             this argument instead of the data currently in the archive. This
             data can be obtained by, for instance, calling
-            :meth:`ribs.archives.ArchiveBase.as_pandas()` and modifying the
-            resulting :class:`ArchiveDataFrame`. Note that, at a minimum, the
-            data must contain columns for index, objective, and measures. To
-            display a custom metric, replace the "objective" column.
+            :meth:`ribs.archives.ArchiveBase.data` with ``return_type="pandas"``
+            and modifying the resulting :class:`ArchiveDataFrame`. Note that, at
+            a minimum, the data must contain columns for index, objective, and
+            measures. To display a custom metric, replace the "objective"
+            column.
         transpose_measures (bool): By default, the first measure in the archive
             will appear along the x-axis, and the second will be along the
             y-axis. To switch this behavior (i.e. to transpose the axes), set
@@ -182,7 +183,13 @@ def cvt_archive_heatmap(archive,
     cmap = retrieve_cmap(cmap)
 
     # Retrieve archive data.
-    df = archive.as_pandas() if df is None else validate_df(df)
+    if df is None:
+        index_batch = archive.data("index")
+        objective_batch = archive.data("objective")
+    else:
+        df = validate_df(df)
+        index_batch = df["index"]
+        objective_batch = df["objective"]
 
     if archive.measure_dim == 1:
         # Read in pcm kwargs -- the linewidth and edgecolor are overwritten by
@@ -220,10 +227,10 @@ def cvt_archive_heatmap(archive,
             inv_idx[x] = i
 
         # We only want inverse indexes that are actually used in the archive.
-        selected_inv_idx = inv_idx[df.get_field("index")]
+        selected_inv_idx = inv_idx[index_batch]
 
         cell_objectives = np.full(archive.cells, np.nan)
-        cell_objectives[selected_inv_idx] = df.get_field("objective")
+        cell_objectives[selected_inv_idx] = objective_batch
 
         ax = archive_heatmap_1d(archive, cell_boundaries, cell_objectives, ax,
                                 cmap, aspect, vmin, vmax, cbar, cbar_kwargs,
@@ -288,7 +295,7 @@ def cvt_archive_heatmap(archive,
         # the region index of each point.
         region_obj = [None] * len(vor.regions)
         min_obj, max_obj = np.inf, -np.inf
-        pt_to_obj = dict(zip(df.get_field("index"), df.get_field("objective")))
+        pt_to_obj = dict(zip(index_batch, objective_batch))
         for pt_idx, region_idx in enumerate(
                 vor.point_region[:-4]):  # Exclude faraway_pts.
             if region_idx != -1 and pt_idx in pt_to_obj:

diff --git a/ribs/visualize/_grid_archive_heatmap.py b/ribs/visualize/_grid_archive_heatmap.py
@@ -90,10 +90,11 @@ def grid_archive_heatmap(archive,
         df (ribs.archives.ArchiveDataFrame): If provided, we will plot data from
             this argument instead of the data currently in the archive. This
             data can be obtained by, for instance, calling
-            :meth:`ribs.archives.ArchiveBase.as_pandas()` and modifying the
-            resulting :class:`ArchiveDataFrame`. Note that, at a minimum, the
-            data must contain columns for index, objective, and measures. To
-            display a custom metric, replace the "objective" column.
+            :meth:`ribs.archives.ArchiveBase.data` with ``return_type="pandas"``
+            and modifying the resulting :class:`ArchiveDataFrame`. Note that, at
+            a minimum, the data must contain columns for index, objective, and
+            measures. To display a custom metric, replace the "objective"
+            column.
         transpose_measures (bool): By default, the first measure in the archive
             will appear along the x-axis, and the second will be along the
             y-axis. To switch this behavior (i.e. to transpose the axes), set
@@ -147,12 +148,18 @@ def grid_archive_heatmap(archive,
     cmap = retrieve_cmap(cmap)
 
     # Retrieve archive data.
-    df = archive.as_pandas() if df is None else validate_df(df)
+    if df is None:
+        index_batch = archive.data("index")
+        objective_batch = archive.data("objective")
+    else:
+        df = validate_df(df)
+        index_batch = df["index"]
+        objective_batch = df["objective"]
 
     if archive.measure_dim == 1:
         cell_objectives = np.full(archive.cells, np.nan)
-        cell_idx = archive.int_to_grid_index(df.get_field("index")).squeeze()
-        cell_objectives[cell_idx] = df.get_field("objective")
+        cell_idx = archive.int_to_grid_index(index_batch).squeeze()
+        cell_objectives[cell_idx] = objective_batch
 
         archive_heatmap_1d(
             archive,
@@ -171,7 +178,6 @@ def grid_archive_heatmap(archive,
 
     elif archive.measure_dim == 2:
         # Retrieve data from archive.
-        objective_batch = df.get_field("objective")
         lower_bounds = archive.lower_bounds
         upper_bounds = archive.upper_bounds
         x_dim, y_dim = archive.dims
@@ -180,7 +186,7 @@ def grid_archive_heatmap(archive,
 
         # Color for each cell in the heatmap.
         colors = np.full((y_dim, x_dim), np.nan)
-        grid_index_batch = archive.int_to_grid_index(df.get_field("index"))
+        grid_index_batch = archive.int_to_grid_index(index_batch)
         colors[grid_index_batch[:, 1], grid_index_batch[:, 0]] = objective_batch
 
         if transpose_measures:

diff --git a/ribs/visualize/_parallel_axes_plot.py b/ribs/visualize/_parallel_axes_plot.py
@@ -82,10 +82,11 @@ def parallel_axes_plot(archive,
         df (ribs.archives.ArchiveDataFrame): If provided, we will plot data from
             this argument instead of the data currently in the archive. This
             data can be obtained by, for instance, calling
-            :meth:`ribs.archives.ArchiveBase.as_pandas()` and modifying the
-            resulting :class:`ArchiveDataFrame`. Note that, at a minimum, the
-            data must contain columns for index, objective, and measures. To
-            display a custom metric, replace the "objective" column.
+            :meth:`ribs.archives.ArchiveBase.data` with ``return_type="pandas"``
+            and modifying the resulting :class:`ArchiveDataFrame`. Note that, at
+            a minimum, the data must contain columns for index, objective, and
+            measures. To display a custom metric, replace the "objective"
+            column.
         measure_order (list of int or list of (int, str)): If this is a list
             of ints, it specifies the axes order for measures (e.g. ``[2, 0,
             1]``). If this is a list of tuples, each tuple takes the form
@@ -163,7 +164,7 @@ def parallel_axes_plot(archive,
         upper_bounds = archive.upper_bounds[cols]
 
     host_ax = plt.gca() if ax is None else ax  # Try to get current axis.
-    df = archive.as_pandas() if df is None else validate_df(df)
+    df = archive.data(return_type="pandas") if df is None else validate_df(df)
     vmin = df["objective"].min() if vmin is None else vmin
     vmax = df["objective"].max() if vmax is None else vmax
     norm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax, clip=True)

diff --git a/ribs/visualize/_sliding_boundaries_archive_heatmap.py b/ribs/visualize/_sliding_boundaries_archive_heatmap.py
@@ -68,10 +68,11 @@ def sliding_boundaries_archive_heatmap(archive,
         df (ribs.archives.ArchiveDataFrame): If provided, we will plot data from
             this argument instead of the data currently in the archive. This
             data can be obtained by, for instance, calling
-            :meth:`ribs.archives.ArchiveBase.as_pandas()` and modifying the
-            resulting :class:`ArchiveDataFrame`. Note that, at a minimum, the
-            data must contain columns for index, objective, and measures. To
-            display a custom metric, replace the "objective" column.
+            :meth:`ribs.archives.ArchiveBase.data` with ``return_type="pandas"``
+            and modifying the resulting :class:`ArchiveDataFrame`. Note that, at
+            a minimum, the data must contain columns for index, objective, and
+            measures. To display a custom metric, replace the "objective"
+            column.
         transpose_measures (bool): By default, the first measure in the archive
             will appear along the x-axis, and the second will be along the
             y-axis. To switch this behavior (i.e. to transpose the axes), set
@@ -119,8 +120,13 @@ def sliding_boundaries_archive_heatmap(archive,
     cmap = retrieve_cmap(cmap)
 
     # Retrieve archive data.
-    df = archive.as_pandas() if df is None else validate_df(df)
-    measures_batch = df.get_field("measures")
+    if df is None:
+        measures_batch = archive.data("measures")
+        objective_batch = archive.data("objective")
+    else:
+        df = validate_df(df)
+        measures_batch = df.get_field("measures")
+        objective_batch = df.get_field("objective")
     x = measures_batch[:, 0]
     y = measures_batch[:, 1]
     x_boundary = archive.boundaries[0]
@@ -144,7 +150,6 @@ def sliding_boundaries_archive_heatmap(archive,
     ax.set_aspect(aspect)
 
     # Create the plot.
-    objective_batch = df.get_field("objective")
     vmin = np.min(objective_batch) if vmin is None else vmin
     vmax = np.max(objective_batch) if vmax is None else vmax
     t = ax.scatter(x,

diff --git a/tests/archives/cvt_archive_benchmark.py b/tests/archives/cvt_archive_benchmark.py
@@ -38,26 +38,3 @@ def add_10k(archive):
         archive.add(solution_batch, objective_batch, measures_batch)
 
     benchmark.pedantic(add_10k, setup=setup, rounds=5, iterations=1)
-
-
-def benchmark_as_pandas_2000_items(benchmark):
-    cells = 2000
-    archive = CVTArchive(solution_dim=10,
-                         cells=cells,
-                         ranges=[(-1, 1), (-1, 1)],
-                         use_kd_tree=True,
-                         samples=50_000)
-
-    archive.add(
-        solution_batch=np.concatenate(
-            (archive.centroids, np.random.random((cells, 8))),
-            axis=1,
-        ),
-        objective_batch=np.ones(cells),
-        measures_batch=archive.centroids,
-    )
-
-    # Archive should be full.
-    assert len(archive) == cells
-
-    benchmark(archive.as_pandas)