<a href="https://colab.research.google.com/github/harrydevforlife/sandbox/blob/main/pyarrow_and_fastparquet_writer_benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyarrow fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading fastparquet-2024.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cramjam-2.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cramjam, fastparquet
Successfully installed cramjam-2.9.0 fastparquet-2024.11.0


In [6]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import fastparquet
import time
import os
import tempfile

def generate_sample_data(num_rows=10_000_000, num_cols=10, seed=None):
    """
    Generates a pandas DataFrame with random data.

    Every column holds ``num_rows`` standard-normal floats and columns are
    named ``col_0``, ``col_1``, ... in order.

    Args:
        num_rows (int): Number of rows.
        num_cols (int): Number of columns.
        seed (int, optional): Seed for a dedicated NumPy generator. When
            given, calls with the same arguments return identical data.
            When ``None`` (default) the values are drawn from NumPy's global
            random state, so ``np.random.seed`` still controls the result.

    Returns:
        pd.DataFrame: Generated DataFrame.
    """
    if seed is None:
        # Default path: keep using the global RNG so existing callers (and
        # any np.random.seed they set) see unchanged behaviour.
        draw = np.random.randn
    else:
        # One generator shared by all columns, so columns differ from each
        # other while the frame as a whole is reproducible.
        draw = np.random.default_rng(seed).standard_normal

    return pd.DataFrame({f'col_{i}': draw(num_rows) for i in range(num_cols)})

def write_parquet_pyarrow(df, file_path, compression='snappy', use_threads=True):
    """
    Writes DataFrame to Parquet using PyArrow.

    The DataFrame is first converted to an Arrow table, which is then written
    to ``file_path``.

    Args:
        df (pd.DataFrame): Data to write.
        file_path (str): Destination file path.
        compression (str): Compression algorithm.
        use_threads (bool): Whether the pandas-to-Arrow conversion may use
            multiple threads. ``True`` keeps PyArrow's default thread count;
            ``False`` restricts the conversion to a single thread.
    """
    # This flag used to be accepted but silently ignored. nthreads=None leaves
    # PyArrow's own default in place, so the default call behaves as before.
    nthreads = None if use_threads else 1
    table = pa.Table.from_pandas(df, nthreads=nthreads)
    pq.write_table(table, file_path, compression=compression)

def write_parquet_fastparquet(df, file_path, compression='SNAPPY', compression_level=None):
    """
    Saves a DataFrame as a Parquet file through FastParquet.

    Args:
        df (pd.DataFrame): Frame whose contents are written.
        file_path (str): Path of the Parquet file to create.
        compression (str): Name of the compression codec handed to FastParquet.
        compression_level (int, optional): Accepted as part of the signature
            but not forwarded to FastParquet; it has no effect on the output.
    """
    # Only the codec name is passed on; compression_level is intentionally
    # left out because this function has never forwarded it.
    write_options = {'compression': compression}
    fastparquet.write(file_path, df, **write_options)

def benchmark_write(func, *args, **kwargs):
    """
    Benchmarks the time taken by a function to execute.

    The function is called exactly once; its return value is discarded and
    any exception it raises propagates to the caller.

    Args:
        func (callable): Function to benchmark.
        *args: Positional arguments for the function.
        **kwargs: Keyword arguments for the function.

    Returns:
        float: Time taken in seconds.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock, which can be adjusted
    # (e.g. by NTP) in the middle of a measurement.
    start_time = time.perf_counter()
    func(*args, **kwargs)
    return time.perf_counter() - start_time

def run_benchmark(df, num_runs=3, compression='snappy'):
    """
    Runs the benchmark for both PyArrow and FastParquet.

    Each run writes the DataFrame once with each library (PyArrow first) into
    a fresh temporary directory, which is removed when the run ends, and
    prints the two timings.

    Args:
        df (pd.DataFrame): Data to write.
        num_runs (int): Number of times to run each benchmark; must be >= 1.
        compression (str or None): Compression algorithm to use. It is passed
            as given to the PyArrow writer and upper-cased for the
            FastParquet writer; ``None`` is forwarded unchanged to both.

    Returns:
        dict: Average write times for each library.

    Raises:
        ValueError: If ``num_runs`` is less than 1.
    """
    # Fail early: with zero runs the averages below would divide by zero, and
    # a negative count would silently report meaningless averages.
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    # Only strings can be upper-cased; anything else (notably None) is
    # handed to the FastParquet writer untouched.
    if isinstance(compression, str):
        fastparquet_compression = compression.upper()
    else:
        fastparquet_compression = compression

    results = {'pyarrow': [], 'fastparquet': []}

    for run in range(1, num_runs + 1):
        print(f"\nRun {run} of {num_runs}:")

        with tempfile.TemporaryDirectory() as tmpdirname:
            # Define file paths
            pyarrow_file = os.path.join(tmpdirname, 'data_pyarrow.parquet')
            fastparquet_file = os.path.join(tmpdirname, 'data_fastparquet.parquet')

            # Benchmark PyArrow
            time_pyarrow = benchmark_write(
                write_parquet_pyarrow,
                df,
                pyarrow_file,
                compression=compression,
                use_threads=True
            )
            results['pyarrow'].append(time_pyarrow)
            print(f"PyArrow write time: {time_pyarrow:.2f} seconds")

            # Benchmark FastParquet
            time_fastparquet = benchmark_write(
                write_parquet_fastparquet,
                df,
                fastparquet_file,
                compression=fastparquet_compression
            )
            results['fastparquet'].append(time_fastparquet)
            print(f"FastParquet write time: {time_fastparquet:.2f} seconds")

    # Calculate average times
    avg_pyarrow = sum(results['pyarrow']) / num_runs
    avg_fastparquet = sum(results['fastparquet']) / num_runs

    return {
        'PyArrow Average Time (s)': avg_pyarrow,
        'FastParquet Average Time (s)': avg_fastparquet
    }

def main():
    """
    Generates the sample DataFrame, runs the write benchmark and prints the
    average write time reported for each library.
    """
    # Parameters
    NUM_ROWS = 10_000_000  # 10 million rows
    NUM_COLS = 10          # 10 columns
    NUM_RUNS = 10          # Number of benchmark runs
    COMPRESSION = 'snappy' # Compression algorithm ('snappy', 'gzip', 'brotli', etc.)
    SEED = 42              # Fixed seed so every execution benchmarks the same data

    # The sample data is random; seeding NumPy's global generator first makes
    # the benchmark input reproducible across kernel restarts.
    np.random.seed(SEED)

    print("Generating sample data...")
    df = generate_sample_data(num_rows=NUM_ROWS, num_cols=NUM_COLS)
    print(f"DataFrame with {NUM_ROWS} rows and {NUM_COLS} columns generated.")

    print("\nStarting benchmark...")
    results = run_benchmark(df, num_runs=NUM_RUNS, compression=COMPRESSION)

    print("\nBenchmark Results:")
    for lib, avg_time in results.items():
        print(f"{lib}: {avg_time:.2f} seconds on average over {NUM_RUNS} runs")

if __name__ == "__main__":
    main()


Generating sample data...
DataFrame with 10000000 rows and 10 columns generated.

Starting benchmark...

Run 1 of 10:
PyArrow write time: 7.24 seconds
FastParquet write time: 13.07 seconds

Run 2 of 10:
PyArrow write time: 10.86 seconds
FastParquet write time: 8.50 seconds

Run 3 of 10:
PyArrow write time: 11.34 seconds
FastParquet write time: 8.52 seconds

Run 4 of 10:
PyArrow write time: 12.72 seconds
FastParquet write time: 15.45 seconds

Run 5 of 10:
PyArrow write time: 10.24 seconds
FastParquet write time: 11.96 seconds

Run 6 of 10:
PyArrow write time: 13.98 seconds
FastParquet write time: 5.38 seconds

Run 7 of 10:
PyArrow write time: 10.45 seconds
FastParquet write time: 9.73 seconds

Run 8 of 10:
PyArrow write time: 9.45 seconds
FastParquet write time: 12.32 seconds

Run 9 of 10:
PyArrow write time: 10.53 seconds
FastParquet write time: 11.04 seconds

Run 10 of 10:
PyArrow write time: 11.65 seconds
FastParquet write time: 9.36 seconds

Benchmark Results:
PyArrow Average Time (