In [38]:
import os
import pandas as pd
import polars as pl


In [None]:

# Build a single raw Parquet file from every CSV export in the source folder.
# Assumes all CSV files live directly in this one directory.
folder_path = './raw_data/binance-eth-usd'

# Sort the listing: os.listdir returns entries in arbitrary order, and a
# fixed order makes the row order of the combined file reproducible.
files = sorted(os.listdir(folder_path))

# One polars DataFrame per CSV file
dfs = []

for file in files:
    if file.endswith('.csv'):  # Adjust file format if needed
        file_path = os.path.join(folder_path, file)
        # header=None: no row is treated as a header, so pandas labels the
        # columns with integers (0, 1, ...).
        df = pd.read_csv(file_path, header=None)
        dfs.append(pl.from_pandas(df))  # Convert pandas DataFrame to polars DataFrame

# Fail with an explicit message rather than handing an empty list to pl.concat
if not dfs:
    raise ValueError("No .csv files found in {}".format(folder_path))

# Stack the per-file frames into one DataFrame
combined_df = pl.concat(dfs)

# Make sure the output directory exists before writing the Parquet file
os.makedirs('./data', exist_ok=True)
combined_df.write_parquet('./data/raw-usd-eth.parquet')


In [39]:
# Build the cleaned dataset: keep columns '0' and '1' of the raw Parquet file
# as 'timestamp' and 'price', convert the epoch-millisecond timestamps to
# datetimes, sort chronologically, and persist the result.
parquet_file_path = './data/raw-usd-eth.parquet'
df = (
    pl.scan_parquet(parquet_file_path)
    .select([
        pl.col('0').alias('timestamp'),  # Rename column '0' to 'timestamp'
        pl.col('1').alias('price')       # Rename column '1' to 'price'
    ])
    .with_columns(
        pl.from_epoch("timestamp", time_unit="ms")
    )
    # Collect only after the projection and conversion are part of the lazy
    # plan, so the scan does not have to materialize columns that are dropped.
    .collect(streaming=True)
)

# Sort eagerly on the materialized frame
df = df.sort('timestamp')

# Persist the cleaned, sorted dataset for the following cells
df.write_parquet('./data/usd-eth.parquet')

# Now you can work with the eager DataFrame
print(df)

shape: (11_664_000, 2)
┌─────────────────────┬─────────┐
│ timestamp           ┆ price   │
│ ---                 ┆ ---     │
│ datetime[ms]        ┆ f64     │
╞═════════════════════╪═════════╡
│ 2023-12-31 00:00:00 ┆ 2291.17 │
│ 2023-12-31 00:00:01 ┆ 2291.78 │
│ 2023-12-31 00:00:02 ┆ 2291.78 │
│ 2023-12-31 00:00:03 ┆ 2291.78 │
│ 2023-12-31 00:00:04 ┆ 2291.78 │
│ …                   ┆ …       │
│ 2024-05-13 23:59:55 ┆ 2950.56 │
│ 2024-05-13 23:59:56 ┆ 2950.56 │
│ 2024-05-13 23:59:57 ┆ 2950.56 │
│ 2024-05-13 23:59:58 ┆ 2950.56 │
│ 2024-05-13 23:59:59 ┆ 2950.56 │
└─────────────────────┴─────────┘


In [41]:
# Reload the cleaned dataset from disk into an eager polars DataFrame
parquet_file_path = './data/usd-eth.parquet'
df = pl.scan_parquet(parquet_file_path).collect(streaming=True)

# Row count of the dataset (one price per row)
print("There are {} prices in our dataset.".format(df.height))

# Extremes of the timestamp and price columns
print("Minimum Timestamp:", df.get_column('timestamp').min())
print("Maximum Timestamp:", df.get_column('timestamp').max())
print("Minimum Price:", df.get_column('price').min())
print("Maximum Price:", df.get_column('price').max())

# Print the DataFrame itself
print(df)

There are 11664000 prices in our dataset.
Minimum Timestamp: 2023-12-31 00:00:00
Maximum Timestamp: 2024-05-13 23:59:59
Minimum Price: 2108.49
Maximum Price: 4094.01
shape: (11_664_000, 2)
┌─────────────────────┬─────────┐
│ timestamp           ┆ price   │
│ ---                 ┆ ---     │
│ datetime[ms]        ┆ f64     │
╞═════════════════════╪═════════╡
│ 2023-12-31 00:00:00 ┆ 2291.17 │
│ 2023-12-31 00:00:01 ┆ 2291.78 │
│ 2023-12-31 00:00:02 ┆ 2291.78 │
│ 2023-12-31 00:00:03 ┆ 2291.78 │
│ 2023-12-31 00:00:04 ┆ 2291.78 │
│ …                   ┆ …       │
│ 2024-05-13 23:59:55 ┆ 2950.56 │
│ 2024-05-13 23:59:56 ┆ 2950.56 │
│ 2024-05-13 23:59:57 ┆ 2950.56 │
│ 2024-05-13 23:59:58 ┆ 2950.56 │
│ 2024-05-13 23:59:59 ┆ 2950.56 │
└─────────────────────┴─────────┘
