### Big data course project
<strong>T3: Dataset size in different file types</strong>

Jovana Videnovic & Haris Kupinic

In [None]:
# Print host details for the machine the measurements in this notebook are taken on.
!hostnamectl

In [None]:
import pandas as pd
import time
import os
import duckdb
from pathlib import Path

In [None]:
# Both the benchmark input and the export directory live under one student directory.
student_dir = Path("/d/hpc/projects/FRI/bigdata/students/jv8043")

# Base name shared by every exported file and by the DuckDB table.
table_name = "green_2024"

# Parquet file that is loaded as the benchmark input.
input_path = student_dir / "partitioned_data" / "green" / "2024" / "part-0.parquet"

# Directory that receives the exported files.
output_path = student_dir / "T3"

In [None]:
# Load the source Parquet file into memory; every export below is written from this frame.
df = pd.read_parquet(input_path)

In [None]:
# Cast the flag column to str before the frame is exported.
# NOTE(review): depending on the pandas version, astype(str) may turn missing values
# into literal text such as "nan"/"None" — confirm that is acceptable for this comparison.
df["store_and_fwd_flag"] = df["store_and_fwd_flag"].astype(str)

In [None]:
# Sanity check: rich preview of the first rows, then the dtype of every column.
display(df.head())
print("DF types", df.dtypes)

In [None]:
# Make sure the destination directory exists before any export runs.
output_path.mkdir(parents=True, exist_ok=True)

# One target file per format, all named after the table.
plain_csv_file = output_path / f"{table_name}.csv"
gzip_csv_file = output_path / f"{table_name}.csv.gz"
hdf5_file = output_path / f"{table_name}.h5"

# Write the same DataFrame to each target (row index is left out of the CSVs).
df.to_csv(plain_csv_file, index=False)
df.to_csv(gzip_csv_file, index=False, compression='gzip')
df.to_hdf(hdf5_file, key='data', mode='w')

In [None]:
# Start from a fresh database file so the size measured afterwards reflects only the
# current DataFrame. With `CREATE TABLE IF NOT EXISTS` on an existing file, a re-run
# silently kept the old table, unlike the CSV/HDF5 exports, which overwrite their targets.
duckdb_file = output_path / f"{table_name}.duckdb"
duckdb_file.unlink(missing_ok=True)

# Connect/create DuckDB file (passed as a plain string path).
con = duckdb.connect(str(duckdb_file))
try:
    # `df` inside the SQL refers to the pandas DataFrame defined in this notebook.
    con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM df")
finally:
    # Close connection even if the export fails, so the file is not left locked.
    con.close()

In [None]:
def read_time_and_size(path, read_type, table_name=None, compression=None):
    """Time a full read of a stored dataset and report its on-disk size.

    Parameters
    ----------
    path : str or os.PathLike
        File to read and to measure.
    read_type : {"csv", "hdf5", "duckdb"}
        Which reader to use for ``path``.
    table_name : str, optional
        Table selected from the DuckDB file. Required when ``read_type`` is
        ``"duckdb"``; ignored otherwise.
    compression : str, optional
        Forwarded to ``pandas.read_csv``; only used when ``read_type`` is ``"csv"``.

    Returns
    -------
    tuple
        ``(duration, size_mb, shape)``: elapsed read time in seconds, size of
        ``path`` in bytes divided by 1024 * 1024, and the shape of the loaded
        DataFrame.

    Raises
    ------
    ValueError
        If ``read_type`` is not one of the supported values, or if
        ``read_type`` is ``"duckdb"`` and no ``table_name`` is given.
    """
    supported_types = ("csv", "hdf5", "duckdb")
    # Fail before doing any work; previously an unknown type only surfaced as an
    # UnboundLocalError on the return statement.
    if read_type not in supported_types:
        raise ValueError(
            f"Unsupported read_type {read_type!r}; expected one of {supported_types}"
        )
    if read_type == "duckdb" and table_name is None:
        raise ValueError("table_name is required when read_type is 'duckdb'")

    # The connection is opened before the timer starts, so only the query and the
    # conversion to a DataFrame are timed for DuckDB.
    con = duckdb.connect(os.fspath(path)) if read_type == "duckdb" else None
    try:
        # perf_counter is a monotonic, high-resolution clock meant for measuring durations.
        start = time.perf_counter()
        if read_type == "csv":
            df_loaded = pd.read_csv(path, compression=compression)
        elif read_type == "hdf5":
            df_loaded = pd.read_hdf(path, key="data")
        else:
            df_loaded = con.execute(f"SELECT * FROM {table_name}").fetchdf()
        duration = time.perf_counter() - start
        size_mb = os.path.getsize(path) / (1024 * 1024)
    finally:
        # Release the database file even when the read fails.
        if con is not None:
            con.close()
    return duration, size_mb, df_loaded.shape

In [None]:
print("Reading and size comparison:")

# One entry per stored format, in print order:
# (label, file suffix, keyword arguments passed to read_time_and_size).
benchmark_plan = [
    ("CSV", ".csv", {"read_type": "csv"}),
    ("CSV GZ", ".csv.gz", {"read_type": "csv", "compression": "gzip"}),
    ("HDF5", ".h5", {"read_type": "hdf5", "table_name": table_name}),
    ("DuckDB", ".duckdb", {"read_type": "duckdb", "table_name": table_name}),
]

stats_by_label = {}
for label, suffix, reader_kwargs in benchmark_plan:
    stats = read_time_and_size(output_path / f"{table_name}{suffix}", **reader_kwargs)
    stats_by_label[label] = stats

    seconds, megabytes, shape = stats
    heading = f"{label}:"
    # Headings are left-aligned in a 10-character column so the rows line up.
    print(f"{heading:<10}Time={seconds:.3f}s, Size={megabytes:.2f} MB, Shape={shape}")

# Keep the per-format result tuples available under their individual names.
csv_stats = stats_by_label["CSV"]
csv_gz_stats = stats_by_label["CSV GZ"]
hdf5_stats = stats_by_label["HDF5"]
duckdb_stats = stats_by_label["DuckDB"]

<strong>Data source reading time and size comparison</strong>

| Format  | Read Time (s) | File Size |
|---------|---------------|-----------|
| CSV     | 0.936         | 89.83 MB  | 
| CSV GZ  | 1.175         | 15.31 MB  | 
| HDF5    | 0.176         | 123.16 MB | 
| DuckDB  | 0.207         | 39.51 MB  | 
