TODO(Vlad, Samarth): Convert the notebook into unit tests (CmTask7331).

The notebook demonstrates current behavior of various parquet functions with respect to
different time units.

# Imports

In [5]:
# Pick up edits to imported modules without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import logging
from typing import List

import pandas as pd
import pyarrow

import helpers.hdbg as hdbg
import helpers.henv as henv
import helpers.hio as hio
import helpers.hparquet as hparque
import helpers.hprint as hprint

In [7]:
# Initialize logging at INFO verbosity and create the notebook-level logger
# used by every cell below.
hdbg.init_logger(verbosity=logging.INFO)
_LOG = logging.getLogger(__name__)
# Log the first element of the system signature so the saved output records
# the code state (branch, hash, last commits) the notebook was run against.
_LOG.info("%s", henv.get_system_signature()[0])
# NOTE(review): presumably applies the project's default notebook display
# settings -- confirm against `hprint.config_notebook`.
hprint.config_notebook()

[31m-----------------------------------------------------------------------------
This code is not in sync with the container:
code_version='1.13.0' != container_version='1.15.0'
-----------------------------------------------------------------------------
You need to:
- merge origin/master into your branch with `invoke git_merge_master`
- pull the latest container with `invoke docker_pull`[0m
INFO  # Git
  branch_name='CmampTask7292_DEV_TOOLS___Docker___Update_pyarrow_to_latest_version'
  hash='56b65f870'
  # Last commits:
    * 56b65f870 vlady    7292_second_iter                                                  (43 minutes ago) Thu Feb 29 15:09:48 2024  (HEAD -> CmampTask7292_DEV_TOOLS___Docker___Update_pyarrow_to_latest_version, origin/CmampTask7292_DEV_TOOLS___Docker___Update_pyarrow_to_latest_version)
    * 1dc509b73 vlady    7292_jyp_test                                                     (   2 hours ago) Thu Feb 29 13:59:03 2024           
    *   47978e4a9 vlady    Merge bra

In [8]:
# Display the installed pyarrow version the results below were produced with.
pyarrow.__version__

'15.0.0'

# Test data

In [9]:
# A single tz-aware timestamp with microsecond precision is reused for every
# row, so any loss of resolution is easy to spot in the outputs.
timestamp_us = pd.Timestamp("2022-01-01 00:00:00.123456", tz="America/New_York")
index = [timestamp_us] * 6
# `year` and `n_legs` are later used as partition columns.
initial_df = pd.DataFrame(
    dict(
        n_legs=[2, 2, 4, 4, 5, 100],
        animal=[
            "Flamingo",
            "Parrot",
            "Dog",
            "Horse",
            "Brittle stars",
            "Centipede",
        ],
        year=[2001, 2002, 2001, 2003, 2003, 2001],
    ),
    index=index,
)
initial_df

Unnamed: 0,n_legs,animal,year
2022-01-01 00:00:00.123456-05:00,2,Flamingo,2001
2022-01-01 00:00:00.123456-05:00,2,Parrot,2002
2022-01-01 00:00:00.123456-05:00,4,Dog,2001
2022-01-01 00:00:00.123456-05:00,4,Horse,2003
2022-01-01 00:00:00.123456-05:00,5,Brittle stars,2003
2022-01-01 00:00:00.123456-05:00,100,Centipede,2001


In [10]:
# Time resolution of the index before any unit conversion is applied.
initial_df.index.unit

'ns'

In [34]:
def test_write_and_read_partition_parquet_with_unit(
    initial_df: pd.DataFrame,
    partition_columns: List[str],
    dst_dir: str,
    unit: str,
    *,
    clean_up: bool = False,
) -> None:
    """
    Round-trip a dataframe through partitioned Parquet files and log the time
    unit of the index at each step.

    The index of a copy of `initial_df` is converted to `unit`, the copy is
    written under `dst_dir` partitioned by `partition_columns`, and the data
    is read back. The unit before writing and after reading is logged so the
    two can be compared by eye; nothing is asserted.

    :param initial_df: dataframe to write; only a copy of it is modified
    :param partition_columns: partition columns to write
    :param dst_dir: root folder to write partition parquet
    :param unit: time unit the index is converted to before writing
    :param clean_up: delete parquet folder at the end, also when the write
        or the read fails
    """
    current_df = initial_df.copy()
    _LOG.info("Initial DF unit: %s", current_df.index.unit)
    _LOG.info("Converting DF unit from ns to: %s", unit)
    current_df.index = current_df.index.as_unit(unit)
    _LOG.info(
        "DF Unit before writing to parquet files: %s", current_df.index.unit
    )
    try:
        # The `to_partitioned_parquet` saves the given dataframe as Parquet
        # files partitioned along the given columns.
        hparque.to_partitioned_parquet(current_df, partition_columns, dst_dir)
        # Generates the DF from parquet files in the `dst_dir`.
        df = hparque.from_parquet(dst_dir)
        print("\n")
        print("DF from parquet files")
        print(df)
        _LOG.info("DF Unit after reading from parquet files: %s", df.index.unit)
    finally:
        # Clean up even on failure so that a re-run does not read stale
        # partition files left behind by an aborted run.
        # NOTE(review): assumes `hio.delete_dir` tolerates a folder that was
        # never created -- confirm.
        if clean_up:
            hio.delete_dir(dst_dir)
    print("\n")

In [36]:
def test_write_and_read_parquet_file_with_unit(
    initial_df: pd.DataFrame, file_name: str, unit: str, *, clean_up: bool = False
) -> None:
    """
    Round-trip a dataframe through a single Parquet file and log the time unit
    of the index at each step.

    The index of a copy of `initial_df` is converted to `unit`, the copy is
    written to `file_name`, and the data is read back. The unit before writing
    and after reading is logged so the two can be compared by eye; nothing is
    asserted.

    :param initial_df: dataframe to write; only a copy of it is modified
    :param file_name: destination parquet file name
    :param unit: time unit the index is converted to before writing
    :param clean_up: delete parquet file at the end, also when the write or
        the read fails
    """
    current_df = initial_df.copy()
    _LOG.info("Initial DF unit: %s", current_df.index.unit)
    _LOG.info("Converting DF unit from ns to: %s", unit)
    current_df.index = current_df.index.as_unit(unit)
    _LOG.info(
        "Unit before writing to single parquet file: %s", current_df.index.unit
    )
    try:
        # The `to_parquet` function writes a DF to a single parquet file
        # without any partition.
        hparque.to_parquet(current_df, file_name)
        df = hparque.from_parquet(file_name)
        print("\n")
        print("DF from single parquet file")
        print(df)
        _LOG.info("DF Unit after reading from parquet file: %s", df.index.unit)
    finally:
        # Clean up even on failure so that an aborted run leaves no file
        # behind.
        # NOTE(review): assumes `hio.delete_file` tolerates a file that was
        # never created -- confirm.
        if clean_up:
            hio.delete_file(file_name)
    print("\n")

In [32]:
# Columns to partition on; passed to the partitioned write/read test.
partition_columns = ["year", "n_legs"]
# Time units to test; the index is converted to each of these before writing.
test_units = ["ms", "us", "ns"]

# Overview

The next three sections show how the parquet functions behave under the conditions described in each section.
The behavior differs depending on whether we write/read partitioned parquet files from a root directory or a single parquet file.

# Current behavior


This includes what we have in the current master

## `hparque.to_partitioned_parquet()`

amp/helpers/hparquet.py:885
```python
        pq.write_to_dataset(
            table,
            dst_dir,
            partition_cols=partition_columns,
            # partition_filename_cb=partition_filename,
            filesystem=filesystem,
        )
```

## `hparque.to_parquet()`

amp/helpers/hparquet.py:266

The dictionary introduced by GP

```python
        # This is needed to handle:
        # ```
        # pyarrow.lib.ArrowInvalid: Casting from timestamp[ns, tz=America/New_York]
        #   to timestamp[us] would lose data: 1663595160000000030
        # ```
        parquet_args = {
            "coerce_timestamps": "us",
            "allow_truncated_timestamps": True,
        }
        pq.write_table(table, file_name, filesystem=filesystem, **parquet_args)
```

## `hparque.from_parquet()`

amp/helpers/hparquet.py:172

The hacks we applied in the version 14 upgrade

```python
            # Convert timestamp columns to `ns` resolution to keep the old
            # behaviour with pyarrow=10.0.0 as opposed to pyarrow>=14.0.0
            # which preserves the returned resolution.
            # See CmTask7097 for details. https://github.com/cryptokaizen/cmamp/issues/7097
            df = table.to_pandas(coerce_temporal_nanoseconds=True)
            # Convert timestamp indices to `ns` resolution to keep the old
            # behaviour with pyarrow=10.0.0 as opposed to pyarrow>=14.0.0
            # which preserves the returned resolution.
            # See CmTask7097 for details. https://github.com/cryptokaizen/cmamp/issues/7097
            if isinstance(df.index, pd.DatetimeIndex):
                df.index = df.index.as_unit("ns")
```

In [38]:
dst_dir = "tmp.pyarrow_current"
# Pass the frame as a logging argument (lazy formatting), consistent with the
# other `_LOG` calls in this notebook; the emitted text is unchanged.
_LOG.info("\n%s", hprint.frame("Testing partition write and read"))
for unit in test_units:
    _LOG.info("\n%s", hprint.frame(f"Unit: {unit}"))
    # The case where the DF is partitioned into multiple PQ files.
    # Under the hood we are calling the `to_partitioned_parquet` function
    # as we use `partition_columns`.
    #
    # While writing to parquet, the unit is preserved.
    # While reading from the parquet, the unit is always `ns`.
    test_write_and_read_partition_parquet_with_unit(
        initial_df, partition_columns, dst_dir, unit, clean_up=True
    )

INFO  
################################################################################
Testing partition write and read
################################################################################
INFO  
################################################################################
Unit: ms
################################################################################
INFO  Initial DF unit: ns
INFO  Converting DF unit from ns to: ms
INFO  DF Unit before writing to parquet files: ms
INFO  pyarrow.Table
animal: string
__index_level_0__: timestamp[ms, tz=America/New_York]
year: int32
n_legs: int32
----
animal: [["Centipede"],["Flamingo"],...,["Horse"],["Brittle stars"]]
__index_level_0__: [[2022-01-01 05:00:00.123Z],[2022-01-01 05:00:00.123Z],...,[2022-01-01 05:00:00.123Z],[2022-01-01 05:00:00.123Z]]
year: [[2001],[2001],...,[2003],[2003]]
n_legs: [[100],[2],...,[4],[5]]


DF from parquet files
                                         animal  year  n_legs
2022-01-01 00:00:00.1230

  dataset = pq.ParquetDataset(
  dataset = pq.ParquetDataset(
  dataset = pq.ParquetDataset(


In [50]:
file_name = "tmp_current.parquet"
# Pass the frame as a logging argument (lazy formatting), consistent with the
# other `_LOG` calls in this notebook; the emitted text is unchanged.
_LOG.info("\n%s", hprint.frame("Testing file write and read"))
for unit in test_units:
    _LOG.info("\n%s", hprint.frame(f"Unit: {unit}"))
    # The case where a DF is converted to a single PQ file without any
    # partition. Under the hood, we call the `to_parquet` function which has
    # GP's dictionary.
    #
    # While writing to parquet, the unit is always `us` because of GP's dictionary.
    # While reading from parquet, the unit is always `ns`.
    test_write_and_read_parquet_file_with_unit(
        initial_df, file_name, unit, clean_up=True
    )

INFO  
################################################################################
Testing file write and read
################################################################################
INFO  
################################################################################
Unit: ms
################################################################################
INFO  Initial DF unit: ns
INFO  Converting DF unit from ns to: ms
INFO  Unit before writing to single parquet file: ms
INFO  pyarrow.Table
n_legs: int64
animal: string
year: int64
__index_level_0__: timestamp[us, tz=America/New_York]
----
n_legs: [[2,2,4,4,5,100]]
animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]]
year: [[2001,2002,2001,2003,2003,2001]]
__index_level_0__: [[2022-01-01 05:00:00.123000Z,2022-01-01 05:00:00.123000Z,2022-01-01 05:00:00.123000Z,2022-01-01 05:00:00.123000Z,2022-01-01 05:00:00.123000Z,2022-01-01 05:00:00.123000Z]]


DF from single parquet file
                         

  dataset = pq.ParquetDataset(
  dataset = pq.ParquetDataset(
  dataset = pq.ParquetDataset(



# Remove hacks from the `hparque.from_parquet()`

This removes the `ns` vs. `us` hacks from `from_parquet()` but keeps GP's dictionary in `to_parquet()`.

## `hparque.to_partitioned_parquet()`

amp/helpers/hparquet.py:885
```python
        pq.write_to_dataset(
            table,
            dst_dir,
            partition_cols=partition_columns,
            # partition_filename_cb=partition_filename,
            filesystem=filesystem,
        )
```

## `hparque.to_parquet()`

amp/helpers/hparquet.py:266
```python
        # This is needed to handle:
        # ```
        # pyarrow.lib.ArrowInvalid: Casting from timestamp[ns, tz=America/New_York]
        #   to timestamp[us] would lose data: 1663595160000000030
        # ```
        parquet_args = {
            "coerce_timestamps": "us",
            "allow_truncated_timestamps": True,
        }
        pq.write_table(table, file_name, filesystem=filesystem, **parquet_args)
```

## `hparque.from_parquet()`

amp/helpers/hparquet.py:172
```python
            table = dataset.read_pandas(columns=columns)
            # Convert timestamp columns to `ns` resolution to keep the old
            # behaviour with pyarrow=10.0.0 as opposed to pyarrow>=14.0.0
            # which preserves the returned resolution.
            # See CmTask7097 for details. https://github.com/cryptokaizen/cmamp/issues/7097
            # df = table.to_pandas(coerce_temporal_nanoseconds=True)
            df = table.to_pandas()
            # Convert timestamp indices to `ns` resolution to keep the old
            # behaviour with pyarrow=10.0.0 as opposed to pyarrow>=14.0.0
            # which preserves the returned resolution.
            # See CmTask7097 for details. https://github.com/cryptokaizen/cmamp/issues/7097
            # if isinstance(df.index, pd.DatetimeIndex):
                # df.index = df.index.as_unit("ns")
```

In [48]:
dst_dir = "tmp.pyarrow_current"
# Pass the frame as a logging argument (lazy formatting), consistent with the
# other `_LOG` calls in this notebook; the emitted text is unchanged.
_LOG.info("\n%s", hprint.frame("Testing partition write and read"))
for unit in test_units:
    _LOG.info("\n%s", hprint.frame(f"Unit: {unit}"))
    # The case where the DF is partitioned into multiple PQ files.
    # Under the hood we are calling the `to_partitioned_parquet` function
    # as we use `partition_columns`.
    #
    # While writing to parquet, the unit is preserved.
    # While reading from the parquet, the unit is preserved.
    test_write_and_read_partition_parquet_with_unit(
        initial_df, partition_columns, dst_dir, unit, clean_up=True
    )

INFO  
################################################################################
Testing partition write and read
################################################################################
INFO  
################################################################################
Unit: ms
################################################################################
INFO  Initial DF unit: ns
INFO  Converting DF unit from ns to: ms
INFO  DF Unit before writing to parquet files: ms
INFO  pyarrow.Table
animal: string
__index_level_0__: timestamp[ms, tz=America/New_York]
year: int32
n_legs: int32
----
animal: [["Centipede"],["Flamingo"],...,["Horse"],["Brittle stars"]]
__index_level_0__: [[2022-01-01 05:00:00.123Z],[2022-01-01 05:00:00.123Z],...,[2022-01-01 05:00:00.123Z],[2022-01-01 05:00:00.123Z]]
year: [[2001],[2001],...,[2003],[2003]]
n_legs: [[100],[2],...,[4],[5]]


DF from parquet files
                                         animal  year  n_legs
2022-01-01 00:00:00.1230

  dataset = pq.ParquetDataset(
  dataset = pq.ParquetDataset(
  dataset = pq.ParquetDataset(


In [49]:
file_name = "tmp_current.parquet"
# Pass the frame as a logging argument (lazy formatting), consistent with the
# other `_LOG` calls in this notebook; the emitted text is unchanged.
_LOG.info("\n%s", hprint.frame("Testing file write and read"))
for unit in test_units:
    _LOG.info("\n%s", hprint.frame(f"Unit: {unit}"))
    # The case where a DF is converted to a single PQ file without any
    # partition. Under the hood, we call the `to_parquet` function which has
    # GP's dictionary.
    #
    # While writing to parquet, the unit is always `us` because of GP's dictionary.
    # While reading from parquet, the unit is preserved. In this case it will be `us` only.
    test_write_and_read_parquet_file_with_unit(
        initial_df, file_name, unit, clean_up=True
    )

INFO  
################################################################################
Testing file write and read
################################################################################
INFO  
################################################################################
Unit: ms
################################################################################
INFO  Initial DF unit: ns
INFO  Converting DF unit from ns to: ms
INFO  Unit before writing to single parquet file: ms
INFO  pyarrow.Table
n_legs: int64
animal: string
year: int64
__index_level_0__: timestamp[us, tz=America/New_York]
----
n_legs: [[2,2,4,4,5,100]]
animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]]
year: [[2001,2002,2001,2003,2003,2001]]
__index_level_0__: [[2022-01-01 05:00:00.123000Z,2022-01-01 05:00:00.123000Z,2022-01-01 05:00:00.123000Z,2022-01-01 05:00:00.123000Z,2022-01-01 05:00:00.123000Z,2022-01-01 05:00:00.123000Z]]


DF from single parquet file
                         

  dataset = pq.ParquetDataset(
  dataset = pq.ParquetDataset(
  dataset = pq.ParquetDataset(


# Remove both hacks


This includes removing GP's dictionary and the `ns` vs. `us` hacks introduced in the version 14 upgrade.

## `hparque.to_partitioned_parquet()`

amp/helpers/hparquet.py:885
```python
        pq.write_to_dataset(
            table,
            dst_dir,
            partition_cols=partition_columns,
            # partition_filename_cb=partition_filename,
            filesystem=filesystem,
        )
```

## `hparque.to_parquet()`

amp/helpers/hparquet.py:266
```python
        table = pa.Table.from_pandas(df)
        # This is needed to handle:
        # ```
        # pyarrow.lib.ArrowInvalid: Casting from timestamp[ns, tz=America/New_York]
        #   to timestamp[us] would lose data: 1663595160000000030
        # ```
        # parquet_args = {
        #     "coerce_timestamps": "us",
        #     "allow_truncated_timestamps": True,
        # }
        pq.write_table(table, file_name, filesystem=filesystem)
        # pq.write_table(table, file_name, filesystem=filesystem, **parquet_args)

```

## `hparque.from_parquet()`

amp/helpers/hparquet.py:172
```python
            table = dataset.read_pandas(columns=columns)
            # Convert timestamp columns to `ns` resolution to keep the old
            # behaviour with pyarrow=10.0.0 as opposed to pyarrow>=14.0.0
            # which preserves the returned resolution.
            # See CmTask7097 for details. https://github.com/cryptokaizen/cmamp/issues/7097
            # df = table.to_pandas(coerce_temporal_nanoseconds=True)
            df = table.to_pandas()
            # Convert timestamp indices to `ns` resolution to keep the old
            # behaviour with pyarrow=10.0.0 as opposed to pyarrow>=14.0.0
            # which preserves the returned resolution.
            # See CmTask7097 for details. https://github.com/cryptokaizen/cmamp/issues/7097
            # if isinstance(df.index, pd.DatetimeIndex):
                # df.index = df.index.as_unit("ns")
```

In [51]:
dst_dir = "tmp.pyarrow_current"
# Pass the frame as a logging argument (lazy formatting), consistent with the
# other `_LOG` calls in this notebook; the emitted text is unchanged.
_LOG.info("\n%s", hprint.frame("Testing partition write and read"))
for unit in test_units:
    _LOG.info("\n%s", hprint.frame(f"Unit: {unit}"))
    # The case where the DF is partitioned into multiple PQ files.
    # Under the hood we are calling the `to_partitioned_parquet` function
    # as we use `partition_columns`.
    #
    # While writing to parquet, the unit is preserved.
    # While reading from the parquet, the unit is preserved.
    test_write_and_read_partition_parquet_with_unit(
        initial_df, partition_columns, dst_dir, unit, clean_up=True
    )

INFO  
################################################################################
Testing partition write and read
################################################################################
INFO  
################################################################################
Unit: ms
################################################################################
INFO  Initial DF unit: ns
INFO  Converting DF unit from ns to: ms
INFO  DF Unit before writing to parquet files: ms
INFO  pyarrow.Table
animal: string
__index_level_0__: timestamp[ms, tz=America/New_York]
year: int32
n_legs: int32
----
animal: [["Centipede"],["Flamingo"],...,["Horse"],["Brittle stars"]]
__index_level_0__: [[2022-01-01 05:00:00.123Z],[2022-01-01 05:00:00.123Z],...,[2022-01-01 05:00:00.123Z],[2022-01-01 05:00:00.123Z]]
year: [[2001],[2001],...,[2003],[2003]]
n_legs: [[100],[2],...,[4],[5]]


DF from parquet files
                                         animal  year  n_legs
2022-01-01 00:00:00.1230

  dataset = pq.ParquetDataset(
  dataset = pq.ParquetDataset(
  dataset = pq.ParquetDataset(


In [52]:
file_name = "tmp_current.parquet"
# Pass the frame as a logging argument (lazy formatting), consistent with the
# other `_LOG` calls in this notebook; the emitted text is unchanged.
_LOG.info("\n%s", hprint.frame("Testing file write and read"))
for unit in test_units:
    _LOG.info("\n%s", hprint.frame(f"Unit: {unit}"))
    # The case where a DF is converted to a single PQ file without any
    # partition. Under the hood, we call the `to_parquet` function which
    # does not have GP's dictionary.
    #
    # While writing to parquet, the unit is preserved.
    # While reading from the parquet, the unit is preserved.
    test_write_and_read_parquet_file_with_unit(
        initial_df, file_name, unit, clean_up=True
    )

INFO  
################################################################################
Testing file write and read
################################################################################
INFO  
################################################################################
Unit: ms
################################################################################
INFO  Initial DF unit: ns
INFO  Converting DF unit from ns to: ms
INFO  Unit before writing to single parquet file: ms
INFO  pyarrow.Table
n_legs: int64
animal: string
year: int64
__index_level_0__: timestamp[ms, tz=America/New_York]
----
n_legs: [[2,2,4,4,5,100]]
animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]]
year: [[2001,2002,2001,2003,2003,2001]]
__index_level_0__: [[2022-01-01 05:00:00.123Z,2022-01-01 05:00:00.123Z,2022-01-01 05:00:00.123Z,2022-01-01 05:00:00.123Z,2022-01-01 05:00:00.123Z,2022-01-01 05:00:00.123Z]]


DF from single parquet file
                                  n_legs   

  dataset = pq.ParquetDataset(
  dataset = pq.ParquetDataset(
  dataset = pq.ParquetDataset(
