In [None]:
import os
import time
from pathlib import Path
import pandas as pd
import pyarrow.dataset as ds
import duckdb


# Root directory of the input parquet data; read_year_partition() reads the
# "year=<YEAR>" subdirectory underneath it.
INPUT_ROOT = Path("/d/hpc/projects/FRI/bigdata/students/in7357/cleaned_parquet/GREEN")
# Year whose partition is loaded and exported.
YEAR = 2024
# Directory that receives the exported files for YEAR (created by export_all()).
OUTDIR = Path(f"/d/hpc/projects/FRI/bigdata/students/in7357/exports/green_{YEAR}")

def human_bytes(n: int) -> str:
    """Format a byte count as a human-readable string using 1024-based units.

    Values whose magnitude is below 1024 are shown as a plain byte count
    (e.g. ``"512 B"``). Larger values are scaled to KB, MB, GB or TB and
    printed with one decimal place. TB is the largest unit, so anything
    bigger is still expressed in TB. Negative counts are scaled by magnitude
    and keep their sign.

    Args:
        n: Size in bytes.

    Returns:
        The formatted size, e.g. ``"79.8 MB"``.
    """
    if abs(n) < 1024:
        return f"{n} B"

    value = float(n)
    for unit in ("KB", "MB", "GB", "TB"):
        value /= 1024
        if abs(value) < 1024:
            break
    # If the loop runs to completion, ``unit`` is "TB" and ``value`` may
    # exceed 1024; that is the intended cap on the largest unit.
    return f"{value:.1f} {unit}"

def read_year_partition(input_root: Path, year: int) -> pd.DataFrame:
    """Load one ``year=<year>`` partition directory into a pandas DataFrame.

    Args:
        input_root: Directory that contains the ``year=<year>`` subdirectory.
        year: Year identifying the partition subdirectory to read.

    Returns:
        All parquet data found under the partition directory, materialised
        as a single DataFrame.
    """
    partition_path = str(input_root / f"year={year}")
    table = ds.dataset(partition_path, format="parquet").to_table()
    return table.to_pandas()

def export_all(df: pd.DataFrame, outdir: Path, year: int):
    """Write ``df`` to ``outdir`` as CSV, gzipped CSV, HDF5 and DuckDB files.

    The output directory is created if it does not exist. Files are named
    ``green_<year>.<ext>``. An existing DuckDB file is deleted first so the
    table is always created in a fresh database.

    Args:
        df: Data to export.
        outdir: Directory that receives the exported files.
        year: Year used in the file names and in the DuckDB table name
            (``green_<year>``).

    Returns:
        A dict mapping the format keys ``"csv"``, ``"csv.gz"``, ``"hdf5"``
        and ``"duckdb"`` to the path of the file written for that format.
    """
    outdir.mkdir(parents=True, exist_ok=True)
    outputs = {}

    # CSV
    csv_path = outdir / f"green_{year}.csv"
    df.to_csv(csv_path, index=False)
    outputs["csv"] = csv_path

    # CSV.GZ
    csv_gz_path = outdir / f"green_{year}.csv.gz"
    df.to_csv(csv_gz_path, index=False, compression="gzip")
    outputs["csv.gz"] = csv_gz_path

    # HDF5
    h5_path = outdir / f"green_{year}.h5"

    # Work on a copy so the caller's frame keeps its dtypes. String-dtype
    # columns are converted to plain object dtype before writing
    # (presumably because the HDF5 table writer does not accept the pandas
    # string extension dtype -- TODO confirm).
    df_hdf = df.copy()
    for col in df_hdf.select_dtypes(include="string"):
        df_hdf[col] = df_hdf[col].astype("object")

    df_hdf.to_hdf(
        h5_path,
        key="green",
        format="table",
        mode="w",
        complib="blosc",
        complevel=9,
    )
    outputs["hdf5"] = h5_path
    # Release the temporary copy before the DuckDB export to keep peak
    # memory down.
    del df_hdf

    # DuckDB
    duckdb_path = outdir / f"green_{year}.duckdb"
    if duckdb_path.exists():
        duckdb_path.unlink()
    con = duckdb.connect(str(duckdb_path))
    try:
        con.register("df", df)
        con.execute(f"CREATE TABLE green_{year} AS SELECT * FROM df")
        con.unregister("df")
    finally:
        # Always close the connection, even if the export fails, so the
        # database file is not left locked by a leaked handle.
        con.close()
    outputs["duckdb"] = duckdb_path

    return outputs

def file_size(path: Path) -> int:
    """Return the size in bytes of the file at ``path``."""
    info = path.stat()
    return info.st_size

def timed_read(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` twice and report the fastest run.

    Args:
        func: Callable to time.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, best_seconds)`` tuple: the value returned by the last
        timed call and the smallest wall-clock duration of the two calls,
        measured with ``time.perf_counter``.
    """
    repeats = 2
    best = float("inf")
    result = None
    for _ in range(repeats):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        best = min(best, time.perf_counter() - start)
    # The result of the last timed run is returned directly; invoking func an
    # extra, untimed time just to obtain it would repeat the whole read.
    return result, best

def measure_reads(paths):
    """Benchmark reading each exported file back into a DataFrame.

    Each format is read via ``timed_read``; the row count, on-disk size and
    best read time are recorded per format.

    Args:
        paths: Mapping with the keys ``"csv"``, ``"csv.gz"``, ``"hdf5"`` and
            ``"duckdb"`` pointing at the exported files. The DuckDB file name
            stem must end in ``_<year>``; the table ``green_<year>`` is read.

    Returns:
        A DataFrame with the columns ``format``, ``rows``, ``size_bytes``,
        ``read_s`` and ``size_human``, sorted by ``read_s`` ascending.
    """

    def read_duckdb(path):
        # The table name is derived from the trailing "_<year>" of the file
        # name stem.
        year = int(path.stem.split("_")[-1])
        con = duckdb.connect(str(path), read_only=True)
        try:
            return con.execute(f"SELECT * FROM green_{year}").df()
        finally:
            # Close the connection even when the query fails.
            con.close()

    # (label in the report, key into `paths`, reader, extra reader kwargs)
    readers = [
        ("csv", "csv", pd.read_csv, {}),
        ("csv.gz", "csv.gz", pd.read_csv, {}),
        ("hdf5(table)", "hdf5", pd.read_hdf, {"key": "green"}),
        ("duckdb", "duckdb", read_duckdb, {}),
    ]

    results = []
    for label, key, reader, reader_kwargs in readers:
        path = paths[key]
        frame, elapsed = timed_read(reader, path, **reader_kwargs)
        results.append({"format": label, "rows": len(frame),
                        "size_bytes": file_size(path),
                        "read_s": elapsed})
        # Only the row count is needed; drop the frame before the next read
        # so the loaded copies do not pile up in memory.
        del frame

    out = pd.DataFrame(results)
    out["size_human"] = out["size_bytes"].map(human_bytes)
    return out.sort_values("read_s").reset_index(drop=True)

# --- Run ---
# Load the YEAR partition, export it in every format, then benchmark reading
# each exported file back.
df = read_year_partition(INPUT_ROOT, YEAR)
print(f"Loaded: {len(df):,} rows × {len(df.columns)} columns")

# `paths` maps each format key to the file written for it.
paths = export_all(df, OUTDIR, YEAR)
summary = measure_reads(paths)

# Bare expression: as the last statement of the notebook cell it renders the
# summary table.
summary


Loaded: 659,543 rows × 37 columns


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,format,rows,size_bytes,read_s,size_human
0,csv,659543,83697918,1.639582,79.8 MB
1,csv.gz,659543,13108234,1.927728,12.5 MB
2,duckdb,659543,18886656,2.100156,18.0 MB
3,hdf5(table),659543,28618353,7.024108,27.3 MB


In [5]:
# Save the benchmark summary table as CSV, without the index column.
summary.to_csv('/d/hpc/projects/FRI/bigdata/students/in7357/read_size_TASK3.csv', index=False)