In [14]:
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

# Define the number of rows and columns
num_rows = 30000000
num_columns = 5
batch_size = 1000000

# Create Parquet schema
schema = pa.schema([
    (f'col{i}', pa.float64()) for i in range(num_columns)
])

# Create a Parquet file writer
parquet_file = 'data1.parquet'
with pq.ParquetWriter(parquet_file, schema) as writer:
    # Generate and write data in batches
    progress_bar = tqdm(total=num_rows, desc='Writing Parquet Data', position=0, leave=True)
    for i in range(0, num_rows, batch_size):
        # Generate batch of random data
        data = {
            f'col{i}': [np.random.rand() for _ in range(batch_size)] for i in range(num_columns)
        }
        table = pa.Table.from_pydict(data, schema=schema)
        writer.write_table(table)

        # Update progress bar
        progress_bar.update(batch_size)

progress_bar.close()
print('Data generation and writing completed.')


Writing Parquet Data: 100%|██████████| 30000000/30000000 [00:50<00:00, 592838.03it/s]

Data generation and writing completed.



