In [19]:
import pandas as pd
import pyarrow as pa
import time
import pyarrow.parquet as pq

def write_parquet_in_chunks(data_iterator, file_path):
    """Stream DataFrame chunks into a single Snappy-compressed Parquet file.

    The Parquet writer is created lazily from the schema of the first chunk;
    later chunks are expected to convert to that same schema. If the iterator
    yields nothing, no writer is created and ``file_path`` is not touched.

    Args:
        data_iterator: Iterable yielding pandas DataFrames, one per chunk.
        file_path: Destination handed to ``pq.ParquetWriter``.

    Raises:
        Any exception raised by the iterator, ``pa.Table.from_pandas`` or
        ``writer.write_table`` propagates to the caller. The writer is closed
        before the exception leaves this function.
    """
    writer = None
    try:
        for chunk in data_iterator:
            table = pa.Table.from_pandas(chunk)
            if writer is None:
                # The schema is only known once the first chunk has been
                # converted, so the writer cannot be opened any earlier.
                writer = pq.ParquetWriter(file_path, table.schema, compression='SNAPPY')
            writer.write_table(table)
    finally:
        # Close even when a chunk fails mid-stream so the underlying file
        # handle is not leaked.
        if writer is not None:
            writer.close()

# Example data generator function
def data_generator(num_chunks, chunk_size):
    """Yield ``num_chunks`` identical two-column example DataFrames.

    Each frame has ``chunk_size`` rows: column 'A' counts from 0 up to
    ``chunk_size - 1`` and column 'B' continues from ``chunk_size`` up to
    ``2 * chunk_size - 1``. Frames are built lazily, one per iteration.
    """
    for _chunk_index in range(num_chunks):
        column_a = range(chunk_size)
        column_b = range(chunk_size, 2 * chunk_size)
        frame = pd.DataFrame({'A': column_a, 'B': column_b})
        yield frame

chunk_size = 100000
num_chunks = 100
file_path = 'large_data.parquet'
# data_generator is lazy: no DataFrame is built until the writer iterates it,
# so the timed region below includes chunk construction as well as writing.
data = data_generator(num_chunks, chunk_size)

# Calculate total items (rows) in the dataset
total_items = num_chunks * chunk_size

# Start timer. perf_counter() is a monotonic, high-resolution clock intended
# for measuring intervals; time.time() can jump if the system clock changes.
start_time = time.perf_counter()

# Write the data
write_parquet_in_chunks(data, file_path)

# End timer
end_time = time.perf_counter()
elapsed_time = end_time - start_time

# Calculate items per second; guard against a zero-length interval so a very
# fast run reports infinity instead of raising ZeroDivisionError.
items_per_second = total_items / elapsed_time if elapsed_time > 0 else float('inf')

print(f"Wrote {total_items} items to {file_path}")
print(f"Time elapsed: {elapsed_time:.2f} seconds")
print(f"Performance: {items_per_second:.2f} items per second")

Wrote 10000000 items to large_data.parquet
Time elapsed: 0.85 seconds
Performance: 11780005.66 items per second
