<a href="https://colab.research.google.com/github/hoangvypy/Volcano_duckbd_vs_dplyr/blob/main/gen_parquet_files.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# gen_parquet_files.py
# -------------------------
# Data generation adapted from Zach Wilson's newsletter email,
# "DuckDB benchmarked against Spark"

# For the very large datasets (27e7 and 5e8 rows, about 11 GB and 22 GB
# respectively), my personal laptop with 8 GB of RAM ran out of memory
# during processing, so I moved to Colab for its larger free RAM allowance.

import duckdb
import os

# Get the folder.
# A script run from disk defines __file__, so its own directory is used;
# a notebook cell (e.g. Colab) does not define it, so fall back to the
# current working directory instead of requiring a manual code edit.
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_DIR = os.getcwd()

# Create dummy_data folder inside project if it doesn't exist
OUTPUT_DIR = os.path.join(BASE_DIR, "dummy_data")
os.makedirs(OUTPUT_DIR, exist_ok=True)

def gen_dataset(rows):
    """Generate ``rows`` synthetic rows and write them to a Parquet file.

    The file is written to ``OUTPUT_DIR`` as ``ds_{rows}_rows.parquet`` and
    contains five columns:

    * ``row_id``   - sequential id from ``generate_series(1, rows)``
    * ``txn_key``  - a UUID cast to VARCHAR
    * ``rand_dt``  - 2020-01-01 plus a random number of days, scaled by the
      day span between 2020-01-01 and 2025-01-01
    * ``rand_val`` - ``random() * 100`` rounded to two decimals
    * ``rand_str`` - a single character taken from a fixed alphabet

    Args:
        rows: Number of rows to generate. The value is interpolated into
            both the SQL text and the output file name.
    """
    lower = "2020-01-01"
    upper = "2025-01-01"

    file_path = os.path.join(OUTPUT_DIR, f"ds_{rows}_rows.parquet")
    # The path is embedded in a single-quoted SQL literal, so double any
    # single quote it contains to keep the COPY statement well-formed.
    sql_path = file_path.replace("'", "''")

    # NOTE(review): rand_str uses an offset of random() * 25, so only the
    # first 26 characters (the lowercase letters) of the 62-character
    # alphabet can ever be selected - confirm whether that is intended.

    # Use the connection as a context manager so it is closed after the
    # export, including when the COPY statement raises.
    with duckdb.connect() as con:
        con.execute(f"""
        COPY (
          SELECT
            t.row_id,
            CAST(uuid() AS VARCHAR) AS txn_key,
            DATE '{lower}'
              + (random() * (date_diff('day', DATE '{lower}', DATE '{upper}')))::INT AS rand_dt,
            ROUND(random() * 100, 2) AS rand_val,
            SUBSTR('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789',
                   1 + (random() * 25)::INT, 1) AS rand_str
          FROM generate_series(1, {rows}) t(row_id)
        ) TO '{sql_path}' (FORMAT 'parquet');
    """)

# Row counts to generate. The commented-out presets are smaller alternatives
# for quick runs; swap them in when the large datasets are not needed.
#sizes = [500, 5000]
#sizes = [50000, 500000, 5000000, 50000000]
sizes = [270000000, 500000000]

# Only generate when executed directly (script or notebook cell), so that
# importing this module to reuse gen_dataset does not start writing the
# very large files as an import side effect.
if __name__ == "__main__":
    for n in sizes:
        print(f"Generating {n} rows...")
        gen_dataset(n)

    print("✅ All datasets generated!")