In [1]:
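# Install the Parquet dependencies once, using ONE package manager (not "pip conda install", which fails):
#   %pip install pyarrow fastparquet
#   conda install -c conda-forge pyarrow fastparquet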

ERROR: unknown command "conda"
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import time
import pandas as pd
import pyarrow.parquet as pq

In [3]:
# Source dataset on disk, plus the file name reserved for its Parquet counterpart.
csv_file_path, parquet_file_path = (
    "all_stocks_5yr.csv",
    "all_stocks_5yr.parquet",
)

# Load the whole CSV into memory; later cells benchmark against this frame.
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Preview the first five rows as the cell's output.
df.head(n=5)

Unnamed: 0,date,open,high,low,close,volume,name
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL
3,2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL


In [8]:
#read/write times benchmarking

def _timed(action):
    """Call ``action()`` once and return the elapsed wall-clock time in seconds."""
    # perf_counter is monotonic and high-resolution; time.time() can jump when
    # the system clock is adjusted and may be too coarse for short operations.
    started = time.perf_counter()
    action()
    return time.perf_counter() - started


def benchmark_storage(df, scale_factor, cleanup=False):
    """Benchmark CSV vs. Parquet read/write speed and on-disk size.

    The input frame is replicated ``scale_factor`` times, written to
    ``stocks_{scale_factor}x.csv`` and ``stocks_{scale_factor}x.parquet`` in the
    current working directory, and each file is read back once. Every
    operation is timed a single time (no repeats, no warm-up).

    Parameters
    ----------
    df : pandas.DataFrame
        Base dataset to replicate.
    scale_factor : int
        Number of copies of ``df`` to concatenate. Must be a positive integer;
        ``[df] * scale_factor`` is an empty list for values below 1, which
        ``pd.concat`` rejects.
    cleanup : bool, default False
        When True, delete the two generated files before returning (also on
        failure). The default keeps them on disk, as the function always did.

    Returns
    -------
    dict
        ``"Scale"``, the four ``"... Time (s)"`` entries (seconds, float) and
        the two ``"... Size (MB)"`` entries (bytes / 1024**2).

    Notes
    -----
    NOTE(review): each operation is measured once, so the first call in a
    fresh kernel presumably also pays one-time library initialisation costs
    (the recorded 1x Parquet read is slower than the 10x one) -- consider a
    warm-up run or repeated measurements before drawing conclusions.
    """
    print(f"\n🚀 Benchmarking for {scale_factor}x dataset...\n")

    # scale dataset
    df_scaled = pd.concat([df] * scale_factor, ignore_index=True)

    csv_scaled_path = f"stocks_{scale_factor}x.csv"
    parquet_scaled_path = f"stocks_{scale_factor}x.parquet"

    try:
        # Order matters: each file must be written before it can be read back.
        csv_write_time = _timed(
            lambda: df_scaled.to_csv(csv_scaled_path, index=False)
        )
        csv_read_time = _timed(lambda: pd.read_csv(csv_scaled_path))

        parquet_write_time = _timed(
            lambda: df_scaled.to_parquet(
                parquet_scaled_path, engine="pyarrow", compression="snappy"
            )
        )
        parquet_read_time = _timed(
            lambda: pd.read_parquet(parquet_scaled_path, engine="pyarrow")
        )

        # measure file sizes (bytes -> MB)
        bytes_per_mb = 1024 * 1024
        csv_size = os.path.getsize(csv_scaled_path) / bytes_per_mb
        parquet_size = os.path.getsize(parquet_scaled_path) / bytes_per_mb
    finally:
        if cleanup:
            # The scaled files can reach several GB; remove them on request.
            for generated_path in (csv_scaled_path, parquet_scaled_path):
                if os.path.exists(generated_path):
                    os.remove(generated_path)

    # store results
    return {
        "Scale": scale_factor,
        "CSV Read Time (s)": csv_read_time,
        "CSV Write Time (s)": csv_write_time,
        "CSV Size (MB)": csv_size,
        "Parquet Read Time (s)": parquet_read_time,
        "Parquet Write Time (s)": parquet_write_time,
        "Parquet Size (MB)": parquet_size,
    }

In [10]:
# Benchmark the dataset at its original size and at 10x / 100x replication,
# collecting one result dict per scale factor.
benchmark_results = list()
for scale in (1, 10, 100):
    scale_result = benchmark_storage(df, scale)
    benchmark_results.append(scale_result)


🚀 Benchmarking for 1x dataset...


🚀 Benchmarking for 10x dataset...


🚀 Benchmarking for 100x dataset...



In [18]:
# Tabulate the per-scale result dicts (one row per scale factor) and print the
# table beneath its heading.
df_results = pd.DataFrame(data=benchmark_results)
print("\n📊 Benchmarking Results:\n", df_results, sep="\n")


📊 Benchmarking Results:

   Scale  CSV Read Time (s)  CSV Write Time (s)  CSV Size (MB)  \
0      1           0.382704            2.383521      28.210210   
1     10           3.831713           22.232221     282.101781   
2    100          69.496476          230.343434    2821.017491   

   Parquet Read Time (s)  Parquet Write Time (s)  Parquet Size (MB)  
0               3.542507                0.273106          10.151486  
1               0.519474                2.087340          95.354862  
2              22.452015               38.603693         951.709558  


In [19]:
# Persist the results table to disk as CSV, leaving out the row index.
results_csv_name = "benchmark_results.csv"
df_results.to_csv(results_csv_name, index=False)
print("\n✅ Results saved as 'benchmark_results.csv'")


✅ Results saved as 'benchmark_results.csv'
