In [1]:
# One-time environment setup: uncomment and run the line below if any of these
# packages are missing, then restart the kernel. Prefer the `%pip` magic so the
# install targets the environment this kernel is running in.
# pip install pandas polars scikit-learn ta matplotlib seaborn pyarrow fastparquet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import time
import pandas as pd
import pyarrow.parquet as pq

In [3]:
# Location of the source dataset, plus a Parquet path for the same data
# (the Parquet path is not referenced by the cells visible in this notebook).
csv_file_path = "all_stocks_5yr.csv"
parquet_file_path = "all_stocks_5yr.parquet"

# Load the whole CSV into memory; this frame is the 1x baseline that the
# benchmark replicates.
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Show the first five rows as a quick sanity check of the parsed columns.
df.head(n=5)

Unnamed: 0,date,open,high,low,close,volume,name
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL
3,2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL


In [4]:
# read/write times benchmarking

_BYTES_PER_MB = 1024 * 1024


def _timed(action):
    """Call ``action()`` once and return the elapsed time in seconds."""
    # perf_counter is monotonic and high-resolution; time.time() follows the
    # system clock and can jump if that clock is adjusted mid-measurement.
    started = time.perf_counter()
    action()
    return time.perf_counter() - started


def benchmark_storage(df, scale_factor):
    """Benchmark CSV vs. Parquet write time, read time and on-disk size.

    The input frame is replicated ``scale_factor`` times, written to
    ``stocks_{scale_factor}x.csv`` and ``stocks_{scale_factor}x.parquet``
    (paths relative to the current working directory), and each file is read
    back once. Progress is printed as each step completes. The generated
    files are left on disk.

    Parameters
    ----------
    df : pandas.DataFrame
        Base dataset to replicate.
    scale_factor : int
        Number of copies of ``df`` to concatenate; must be at least 1.

    Returns
    -------
    dict
        Keys, in order: "Scale", "CSV Read Time (s)", "CSV Write Time (s)",
        "CSV Size (MB)", "Parquet Read Time (s)", "Parquet Write Time (s)",
        "Parquet Size (MB)". Times are seconds; sizes are bytes / 1024**2.

    Raises
    ------
    ValueError
        If ``scale_factor`` is less than 1.
    TypeError
        If ``scale_factor`` cannot be used to repeat a list (e.g. a float).
    """
    # Replicate first so a non-integer factor still fails with TypeError,
    # then reject empty replication with a message that names the cause
    # instead of pandas' generic "No objects to concatenate".
    replicas = [df] * scale_factor
    if not replicas:
        raise ValueError(
            f"scale_factor must be a positive integer, got {scale_factor!r}"
        )

    print(f"\n🚀 Benchmarking for {scale_factor}x dataset...\n")

    # scale dataset
    df_scaled = pd.concat(replicas, ignore_index=True)

    csv_scaled_path = f"stocks_{scale_factor}x.csv"
    parquet_scaled_path = f"stocks_{scale_factor}x.parquet"

    # --- CSV ---------------------------------------------------------------
    # chunksize is the number of rows handed to the writer at a time; the
    # same value is used at every scale.
    csv_write_s = _timed(
        lambda: df_scaled.to_csv(csv_scaled_path, index=False, chunksize=1000000)
    )
    print(f"CSV written: {csv_scaled_path} in {csv_write_s:.2f} seconds")

    # The frame read back is discarded; only the elapsed time matters.
    csv_read_s = _timed(lambda: pd.read_csv(csv_scaled_path))
    print(f"📖 CSV read completed in {csv_read_s:.2f} seconds")

    csv_size_mb = os.path.getsize(csv_scaled_path) / _BYTES_PER_MB
    print(f"📂 CSV file size: {csv_size_mb:.2f} MB")

    # --- Parquet -----------------------------------------------------------
    parquet_write_s = _timed(
        lambda: df_scaled.to_parquet(
            parquet_scaled_path, engine="pyarrow", compression="snappy"
        )
    )
    print(f" Parquet written: {parquet_scaled_path} in {parquet_write_s:.2f} seconds")

    parquet_read_s = _timed(
        lambda: pd.read_parquet(parquet_scaled_path, engine="pyarrow")
    )
    print(f"Parquet read completed in {parquet_read_s:.2f} seconds")

    parquet_size_mb = os.path.getsize(parquet_scaled_path) / _BYTES_PER_MB
    print(f" Parquet file size: {parquet_size_mb:.2f} MB")

    # Key order is part of the contract: it becomes the column order when the
    # result dicts are turned into a DataFrame.
    return {
        "Scale": scale_factor,
        "CSV Read Time (s)": csv_read_s,
        "CSV Write Time (s)": csv_write_s,
        "CSV Size (MB)": csv_size_mb,
        "Parquet Read Time (s)": parquet_read_s,
        "Parquet Write Time (s)": parquet_write_s,
        "Parquet Size (MB)": parquet_size_mb,
    }

In [5]:
# Run the storage benchmark at 1x, 10x and 100x of the base dataset; both CSV
# and Parquet are measured at every scale. Results are appended one at a time
# so that earlier measurements are kept if a later, larger run fails.
scale_factors = [1, 10, 100]
benchmark_results = []
for scale in scale_factors:
    scale_result = benchmark_storage(df, scale)
    benchmark_results.append(scale_result)


🚀 Benchmarking for 1x dataset...

CSV written: stocks_1x.csv in 2.38 seconds
📖 CSV read completed in 0.37 seconds
📂 CSV file size: 28.21 MB
 Parquet written: stocks_1x.parquet in 0.22 seconds
Parquet read completed in 0.17 seconds
 Parquet file size: 10.15 MB

🚀 Benchmarking for 10x dataset...

CSV written: stocks_10x.csv in 23.37 seconds
📖 CSV read completed in 3.61 seconds
📂 CSV file size: 282.10 MB
 Parquet written: stocks_10x.parquet in 1.96 seconds
Parquet read completed in 0.57 seconds
 Parquet file size: 95.35 MB

🚀 Benchmarking for 100x dataset...

CSV written: stocks_100x.csv in 240.81 seconds
📖 CSV read completed in 59.47 seconds
📂 CSV file size: 2821.02 MB
 Parquet written: stocks_100x.parquet in 26.11 seconds
Parquet read completed in 21.84 seconds
 Parquet file size: 951.71 MB


In [6]:
# Collect the per-scale result dicts into one table (one row per scale factor)
# and print it under a heading.
df_results = pd.DataFrame(data=benchmark_results)
print("\nBenchmarking Results:\n", df_results, sep="\n")


Benchmarking Results:

   Scale  CSV Read Time (s)  CSV Write Time (s)  CSV Size (MB)  \
0      1           0.371634            2.375720      28.210210   
1     10           3.612562           23.365336     282.101781   
2    100          59.466201          240.814782    2821.017491   

   Parquet Read Time (s)  Parquet Write Time (s)  Parquet Size (MB)  
0               0.172529                0.218260          10.151486  
1               0.571065                1.962999          95.354862  
2              21.841824               26.111475         951.709558  


In [7]:
# Save the benchmark summary to CSV so the measurements outlive the kernel
# session (re-running the 100x benchmark takes several minutes).
results_csv_path = "benchmark_results.csv"

df_results = pd.DataFrame(benchmark_results)
# index=False: the row index is just 0..n-1 and carries no information.
df_results.to_csv(results_csv_path, index=False)

print("\nBenchmarking Results:\n")
print(df_results)


Benchmarking Results:

   Scale  CSV Read Time (s)  CSV Write Time (s)  CSV Size (MB)  \
0      1           0.371634            2.375720      28.210210   
1     10           3.612562           23.365336     282.101781   
2    100          59.466201          240.814782    2821.017491   

   Parquet Read Time (s)  Parquet Write Time (s)  Parquet Size (MB)  
0               0.172529                0.218260          10.151486  
1               0.571065                1.962999          95.354862  
2              21.841824               26.111475         951.709558  
