Release 4.0 (#277)
WillAyd committed Mar 15, 2024
1 parent 1098a88 commit 60c6c55
Showing 8 changed files with 59 additions and 184 deletions.
39 changes: 16 additions & 23 deletions doc/source/api.rst
@@ -1,57 +1,50 @@
 API Reference
 =============

-.. py:function:: frame_to_hyper(df: pd.DataFrame, database: Union[str, pathlib.Path], *, table: Union[str, tableauhyperapi.Name, tableauhyperapi.TableName], table_mode: str = "w", hyper_process: Optional[HyperProcess]) -> None:
+.. py:function:: frame_to_hyper(df: pd.DataFrame, database: Union[str, pathlib.Path], *, table: Union[str, tableauhyperapi.Name, tableauhyperapi.TableName], table_mode: str = "w", not_null_columns: Optional[Iterable[str]] = None, json_columns: Optional[Iterable[str]] = None, geo_columns: Optional[Iterable[str]] = None) -> None:

    Convert a DataFrame to a .hyper extract.

    :param df: Data to be written out.
    :param database: Name / location of the Hyper file to write to.
-   :param table: Table to write to. Must be supplied as a keyword argument.
+   :param table: Table to write to.
    :param table_mode: The mode to open the table with. Default is "w" for write, which truncates the file before writing. Another option is "a", which will append data to the file if it already contains information.
-   :param hyper_process: A `HyperProcess` in case you want to spawn it by yourself. Optional. Must be supplied as a keyword argument.
-   :param use_parquet: Use a temporary parquet file to write into the Hyper database, which typically will yield better performance. Boolean, default False
+   :param not_null_columns: Columns which should be considered "NOT NULL" in the target Hyper database. By default, all columns are considered nullable
+   :param json_columns: Columns to be written as a JSON data type
+   :param geo_columns: Columns to be written as a GEOGRAPHY data type


-.. py:function:: frame_from_hyper(source: Union[str, pathlib.Path, tab_api.Connection], *, table: Union[str, tableauhyperapi.Name, tableauhyperapi.TableName], hyper_process: Optional[HyperProcess], use_float_na: bool = False) -> pd.DataFrame:
+.. py:function:: frame_from_hyper(source: Union[str, pathlib.Path, tab_api.Connection], *, table: Union[str, tableauhyperapi.Name, tableauhyperapi.TableName], return_type: Literal["pandas", "pyarrow", "polars"] = "pandas")

    Extracts a DataFrame from a .hyper extract.

    :param source: Name / location of the Hyper file to be read or Hyper-API connection.
-   :param table: Table to read. Must be supplied as a keyword argument.
-   :param hyper_process: A `HyperProcess` in case you want to spawn it by yourself. Optional. Must be supplied as a keyword argument.
-   :param use_float_na: Flag indicating whether to use the pandas `Float32`/`Float64` dtypes which support the new pandas missing value `pd.NA`, default False
-   :rtype: pd.DataFrame
+   :param table: Table to read.
+   :param return_type: The type of DataFrame to be returned


-.. py:function:: frames_to_hyper(dict_of_frames: Dict[Union[str, tableauhyperapi.Name, tableauhyperapi.TableName], pd.DataFrame], database: Union[str, pathlib.Path], table_mode: str = "w", *, hyper_process: Optional[HyperProcess]) -> None:
+.. py:function:: frames_to_hyper(dict_of_frames: Dict[Union[str, tableauhyperapi.Name, tableauhyperapi.TableName], pd.DataFrame], database: Union[str, pathlib.Path], *, table_mode: str = "w", not_null_columns: Optional[Iterable[str]] = None, json_columns: Optional[Iterable[str]] = None, geo_columns: Optional[Iterable[str]] = None,) -> None:

    Writes multiple DataFrames to a .hyper extract.

    :param dict_of_frames: A dictionary whose keys are valid table identifiers and values are dataframes
    :param database: Name / location of the Hyper file to write to.
    :param table_mode: The mode to open the table with. Default is "w" for write, which truncates the file before writing. Another option is "a", which will append data to the file if it already contains information.
-   :param hyper_process: A `HyperProcess` in case you want to spawn it by yourself. Optional. Must be supplied as a keyword argument.
-   :param use_parquet: Use a temporary parquet file to write into the Hyper database, which typically will yield better performance. Boolean, default False
+   :param not_null_columns: Columns which should be considered "NOT NULL" in the target Hyper database. By default, all columns are considered nullable
+   :param json_columns: Columns to be written as a JSON data type
+   :param geo_columns: Columns to be written as a GEOGRAPHY data type

-.. py:function:: frames_from_hyper(source: Union[str, pathlib.Path, tab_api.Connection], *, hyper_process: Optional[HyperProcess], use_float_na: bool = False) -> Dict[tableauhyperapi.TableName, pd.DataFrame]:
+.. py:function:: frames_from_hyper(source: Union[str, pathlib.Path, tab_api.Connection], *, return_type: Literal["pandas", "pyarrow", "polars"] = "pandas") -> dict:

    Extracts tables from a .hyper extract.

    :param source: Name / location of the Hyper file to be read or Hyper-API connection.
-   :param hyper_process: A `HyperProcess` in case you want to spawn it by yourself. Optional. Must be supplied as a keyword argument.
-   :param use_float_na: Flag indicating whether to use the pandas `Float32`/`Float64` dtypes which support the new pandas missing value `pd.NA`, default False
-   :rtype: Dict[tableauhyperapi.TableName, pd.DataFrame]
+   :param return_type: The type of DataFrame to be returned

-.. py:function:: frame_from_hyper_query(source: Union[str, pathlib.Path, tab_api.Connection], query: str, *, hyper_process: Optional[HyperProcess], use_float_na: bool = False) -> pd.DataFrame:
-
-   .. versionadded:: 2.0
+.. py:function:: frame_from_hyper_query(source: Union[str, pathlib.Path, tab_api.Connection], query: str, *, return_type: Literal["pandas", "polars", "pyarrow"] = "pandas",)

    Executes a SQL query and returns the result as a pandas dataframe

    :param source: Name / location of the Hyper file to be read or Hyper-API connection.
    :param query: SQL query to execute.
-   :param hyper_process: A `HyperProcess` in case you want to spawn it by yourself. Optional. Must be supplied as a keyword argument.
-   :param use_float_na: Flag indicating whether to use the pandas `Float32`/`Float64` dtypes which support the new pandas missing value `pd.NA`, default False
-   :rtype: Dict[tableauhyperapi.TableName, pd.DataFrame]
+   :param return_type: The type of DataFrame to be returned
92 changes: 0 additions & 92 deletions doc/source/caveats.rst

This file was deleted.

2 changes: 1 addition & 1 deletion doc/source/changelog.rst
@@ -147,7 +147,7 @@ Pantab 3.0.0 (2022-09-14)
 =========================

 - Implemented a new ``use_parquet`` keyword in ``frame_to_hyper`` which uses Parquet as an intermediate storage solution instead of pantab's own internal C library. This may provide a small performance boost at the cost of additional disk usage
-- Fixed issue where pantab was not compatabile with Hyper versions 0.0.14567 and above. See the :ref:`compatability` documentation.
+- Fixed issue where pantab was not compatabile with Hyper versions 0.0.14567 and above.


 Pantab 2.1.1 (2022-04-13)
2 changes: 1 addition & 1 deletion doc/source/conf.py
@@ -5,7 +5,7 @@
 project = "pantab"
 copyright = "2019-2024, Will Ayd, innobi, LLC"
 author = "Will Ayd, innobi, LLC"
-release = "4.0.0.rc2"
+release = "4.0.0"


 # -- General configuration ---------------------------------------------------
101 changes: 38 additions & 63 deletions doc/source/examples.rst
@@ -113,8 +113,6 @@ Please note that ``table_mode="a"`` will create the table(s) if they do not alre
 Issuing SQL queries
 -------------------

-.. versionadded:: 2.0
-
 With ``frame_from_hyper_query``, one can execute SQL queries against a Hyper file and retrieve the resulting data as a DataFrame. This can be used, e.g. to retrieve only a part of the data (using a ``WHERE`` clause) or to offload computations to Hyper.

 .. code-block:: python
@@ -150,78 +148,55 @@ With ``frame_from_hyper_query``, one can execute SQL queries against a Hyper fil
        print(df)

-Providing your own HyperProcess
--------------------------------
-
-.. versionadded:: 2.0
-
-For convenience, pantab's functions internally spawn a `HyperProcess <https://tableau.github.io/hyper-db/docs/hyper-api/hyper_process>`_. In case you prefer to spawn your own ``HyperProcess``, you can supply it to pantab through the ``hyper_process`` keyword argument.
-
-By using your own ``HyperProcess``, you have full control over all its startup paramters.
-In the following example we use that flexibility to:
-
-- enable telemetry, thereby making sure the Hyper team at Tableau knows about our use case and potential issues we might be facing
-- `disable log files <https://tableau.github.io/hyper-db/docs/hyper-api/hyper_process#log_config>`_, as we operate in some environment with really tight disk space
-- opt-in to the `new Hyper file format <https://tableau.github.io/hyper-db/docs/hyper-api/hyper_process#default_database_version>`_
-
-By reusing the same ``HyperProcess`` for multiple operations, we also save a few milliseconds. While not noteworthy in this simple example, this might be a good optimization in case you call ``frame_to_hyper`` repeatedly in a loop.
-
-.. code-block:: python
-
-   import pandas as pd
-   import pantab as pt
-   from tableauhyperapi import HyperProcess, Telemetry
-
-   df = pd.DataFrame([
-       ["dog", 4],
-       ["cat", 4],
-   ], columns=["animal", "num_of_legs"])
-
-   parameters = {"log_config": "", "default_database_version": "1"}
-   with HyperProcess(Telemetry.SEND_USAGE_DATA_TO_TABLEAU, parameters=parameters) as hyper:
-       # Insert some initial data
-       pt.frame_to_hyper(df, "example.hyper", table="animals", hyper_process=hyper)
-
-       # Append additional data to the same table using `table_mode="a"`
-       new_data = pd.DataFrame([["moose", 4]], columns=["animal", "num_of_legs"])
-       pt.frame_to_hyper(df, "example.hyper", table="animals", table_mode="a", hyper_process=hyper)
-
-Providing your own Hyper Connection
------------------------------------
-
-.. versionadded:: 2.0
-
-In order to interface with Hyper, pantab functions need a HyperAPI `Connection <https://tableau.github.io/hyper-db/docs/hyper-api/connection>`_ to interface with Hyper.
-For convenience, pantab creates those connections implicitly for you.
-However, establishing a connection is not for free, and by reusing the same ``Connection`` for multiple operations, we can save time.
-Hence, pantab also allows you to pass in a HyperAPI connection instead of the name / location of your Hyper file.
-
-.. code-block:: python
-
-   import pandas as pd
-   import pantab as pt
-   from tableauhyperapi import HyperProcess, Telemetry, Connection, CreateMode
-
-   df = pd.DataFrame([
-       ["dog", 4],
-       ["cat", 4],
-       ["centipede", 100],
-   ], columns=["animal", "num_of_legs"])
-
-   path = "example.hyper"
-
-   with HyperProcess(Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
-       pt.frames_to_hyper({"animals": df}, path, hyper_process=hyper)
-
-       with Connection(hyper.endpoint, path, CreateMode.NONE) as connection:
-           query = """
-           SELECT animal
-           FROM animals
-           WHERE num_of_legs > 4
-           """
-           many_legs_df = pt.frame_from_hyper_query(connection, query)
-           print(many_legs_df)
-
-           all_animals = pt.frame_from_hyper(connection, table="animals")
-           print(all_animals)
+Bring your own DataFrame
+------------------------
+
+.. versionadded:: 4.0
+
+When pantab was first created, pandas was the dominant DataFrame library. In the years since then, many competing libraries have cropped up which all provide different advantages. To give users the most flexibility, pantab provides first class support for exchanging `pandas <https://pandas.pydata.org/>`_, `polars <https://pola.rs/>`_ and `pyarrow <https://arrow.apache.org/docs/python/index.html>`_ DataFrames. To wit, all of the following code samples will produce an equivalent Hyper file:
+
+.. code-block:: python
+
+   import pandas as pd
+   import pantab as pt
+
+   df = pd.DataFrame({"col": [1, 2, 3]})
+   pt.frame_to_hyper(df, "example.hyper", table="test")
+
+.. code-block:: python
+
+   import pyarrow as pa
+   import pantab as pt
+
+   tbl = pa.Table.from_arrays([pa.array([1, 2, 3])], names=["col"])
+   pt.frame_to_hyper(tbl, "example.hyper", table="test")
+
+.. code-block:: python
+
+   import polars as pl
+   import pantab as pt
+
+   df = pl.DataFrame({"col": [1, 2, 3]})
+   pt.frame_to_hyper(df, "example.hyper", table="test")
+
+As far as reading is concerned, you can control the type of DataFrame you receive back via the ``return_type`` keyword. pandas remains the default:
+
+.. code-block:: python
+
+   >>> pt.frame_from_hyper("example.hyper", table="test")  # pandas by default
+      col
+   0    1
+   1    2
+   2    3
+
+   >>> pt.frame_from_hyper("example.hyper", table="test", return_type="pyarrow")
+   pyarrow.Table
+   col: int64
+   ----
+   col: [[1,2,3]]
+
+   >>> pt.frame_from_hyper("example.hyper", table="test", return_type="polars")
+   shape: (3, 1)
+   ┌─────┐
+   │ col │
+   │ --- │
+   │ i64 │
+   ╞═════╡
+   │ 1   │
+   │ 2   │
+   │ 3   │
+   └─────┘
+
+.. note::
+
+   Technically pantab is able to *write* any DataFrame library that implements the `Arrow PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_
3 changes: 1 addition & 2 deletions doc/source/index.rst
@@ -5,7 +5,6 @@ pantab
    :hidden:

    examples
-   caveats
    api
    changelog
    support
@@ -19,7 +18,7 @@ What is it?
 How do I get it?
 ----------------

-``pantab`` requires Python 3.6+ and can run on any Python-supported OS. Installation is as easy as:
+Installation is as easy as:

 .. code-block:: bash
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -8,7 +8,7 @@ build-backend = "scikit_build_core.build"

 [project]
 name = "pantab"
-version = "4.0.0rc2"
+version = "4.0.0"
 description = "Converts pandas DataFrames into Tableau Hyper Extracts and back"
 license = {file = "LICENSE.txt"}
 readme = "README.md"
2 changes: 1 addition & 1 deletion src/pantab/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "4.0.0rc2"
+__version__ = "4.0.0"


 from pantab._reader import frame_from_hyper, frame_from_hyper_query, frames_from_hyper
