In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from datetime import datetime

load_dotenv()

RAW_PATH = os.getenv("DATA_DIR_RAW")
PROCESSED_PATH = os.getenv("DATA_DIR_PROCESSED")

In [2]:
df = pd.DataFrame({
    'date': pd.date_range('2023-01-01', periods=5),
    'price': [100, 101.5, 102.3, 99.8, 98.6],
    'volume': [200, 210, 190, 230, 220]
})

In [3]:
timestamp = datetime.now().strftime("%Y%m%d-%H%M")
csv_path = f"{RAW_PATH}/sample_{timestamp}.csv"
parquet_path = f"{PROCESSED_PATH}/sample_{timestamp}.parquet"

df.to_csv(csv_path, index=False)
df.to_parquet(parquet_path, index=False)

print("CSV saved to:", csv_path)
print("Parquet saved to:", parquet_path)


CSV saved to: data/raw/sample_20250817-1319.csv
Parquet saved to: data/processed/sample_20250817-1319.parquet


In [7]:
df_csv = pd.read_csv(csv_path)
df_parquet = pd.read_parquet(parquet_path)

print("Shape CSV:", df_csv.shape)
print("Shape Parquet:", df_parquet.shape)
print("Dtypes CSV:\n", df_csv.dtypes)
print("Dtypes Parquet:\n", df_parquet.dtypes)

def validate_df(df1, df2):
    return df1.shape == df2.shape and all(df1.columns == df2.columns)

print("Validation result:", validate_df(df_csv, df_parquet))


Shape CSV: (5, 3)
Shape Parquet: (5, 3)
Dtypes CSV:
 date       object
price     float64
volume      int64
dtype: object
Dtypes Parquet:
 date      datetime64[ns]
price            float64
volume             int64
dtype: object
Validation result: True


In [8]:
def write_df(df, path):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    ext = path.split(".")[-1]
    if ext == "csv":
        df.to_csv(path, index=False)
    elif ext == "parquet":
        try:
            df.to_parquet(path, index=False)
        except ImportError:
            print("Install pyarrow or fastparquet to use Parquet.")
    else:
        raise ValueError("Unsupported file type.")

def read_df(path):
    ext = path.split(".")[-1]
    if ext == "csv":
        return pd.read_csv(path)
    elif ext == "parquet":
        try:
            return pd.read_parquet(path)
        except ImportError:
            print("Install pyarrow or fastparquet to read Parquet.")
    else:
        raise ValueError("Unsupported file type.")


In [9]:
write_df(df, f"{RAW_PATH}/test.csv")
write_df(df, f"{PROCESSED_PATH}/test.parquet")

df1 = read_df(f"{RAW_PATH}/test.csv")
df2 = read_df(f"{PROCESSED_PATH}/test.parquet")

print("Utility read/write test passed:", validate_df(df1, df2))


Utility read/write test passed: True
