In [1]:
from pathlib import Path 
import polars as pl #polars rather than pandas for more efficient memory

# Turn on polars' global string cache before any data is read.
# NOTE(review): presumably needed so the Categorical columns parsed from
# separate CSV files share one encoding and can be concatenated later - confirm.
pl.enable_string_cache(True)

In [2]:
# Input folders holding the raw CSV files. Every path is resolved relative to
# the parent of the current working directory.
WEATHER_LOAD_DIR = Path.cwd().parent / "data.nosync" / "raw_data" / "weather"
LOAD_LOAD_DIR = Path.cwd().parent / "data.nosync" / "raw_data" / "load"

# Output locations. Despite the *_DIR suffix these are parquet *file* paths,
# not directories.
WEATHER_SAVE_DIR = Path.cwd().parent.joinpath(
    "data.nosync", "combined_data", "combined_weather.parquet"
)
LOAD_SAVE_DIR = Path.cwd().parent.joinpath(
    "data.nosync", "combined_data", "combined_load.parquet"
)

# Weather

In [3]:
# Column -> dtype overrides for the weather CSV files, grouped by dtype.
# Date columns are read as plain strings here; they are parsed to dates after
# the files have been combined.
weather_schema = {
    **{column: str for column in ("Forecast Date", "Vintage Date")},
    **{column: pl.Categorical for column in ("Vintage", "Station ID")},
    **{
        column: pl.Int16
        for column in ("Max Temp", "Min Temp", "Max Wet Bulb", "Min Wet Bulb")
    },
}

In [4]:
# Combine every raw weather CSV into one frame.
#
# The glob result is sorted so the concatenation order (and therefore the row
# order of the combined frame) is reproducible; directory listing order is not
# guaranteed.
weather_files = sorted(WEATHER_LOAD_DIR.glob("*.csv"))
if not weather_files:
    # Fail with a clear message instead of an IndexError on an empty folder.
    raise FileNotFoundError(f"No weather CSV files found in {WEATHER_LOAD_DIR}")

# Read every file with the same options (previously the first file alone was
# read without ignore_errors) and concatenate once rather than re-copying the
# accumulated frame for each additional file.
weather_frames = [
    pl.read_csv(file, dtypes=weather_schema, ignore_errors=True)
    for file in weather_files
]

# "diagonal" takes the union of columns across files, filling gaps with nulls.
df_weather = pl.concat(weather_frames, how="diagonal").with_columns(
    [
        # Both date columns arrive as month/day/year strings.
        pl.col("Forecast Date").str.to_date(r"%m/%d/%Y"),
        pl.col("Vintage Date").str.to_date(r"%m/%d/%Y"),
    ]
)

In [5]:
# Persist the combined weather frame. The output folder is created first so a
# fresh checkout (no combined_data directory yet) does not fail on write.
WEATHER_SAVE_DIR.parent.mkdir(parents=True, exist_ok=True)
df_weather.write_parquet(WEATHER_SAVE_DIR)

# Load

In [29]:
# Column -> dtype overrides for the load CSV files.
load_schema = {
    # Read as a string here; parsed to a datetime after the files are combined.
    "Time Stamp": str,
    "Time Zone": pl.Categorical,
    "Name": pl.Categorical,
    # Int32 rather than Int16: PTIDs are five-digit identifiers (e.g. 61757)
    # that exceed the Int16 maximum of 32767, so a 16-bit column would either
    # fail to parse or silently turn into nulls under ignore_errors=True.
    # NOTE(review): verify against the raw files.
    "PTID": pl.Int32,
    "Load": pl.Float32,
}

In [32]:
# Combine every raw load CSV into one frame.
#
# The glob result is sorted so the concatenation order (and therefore the row
# order of the combined frame) is reproducible; directory listing order is not
# guaranteed.
load_files = sorted(LOAD_LOAD_DIR.glob("*.csv"))
if not load_files:
    # Fail with a clear message instead of an IndexError on an empty folder.
    raise FileNotFoundError(f"No load CSV files found in {LOAD_LOAD_DIR}")

# Read every file with the same options (previously the first file alone was
# read without ignore_errors) and concatenate once rather than re-copying the
# accumulated frame for each additional file.
load_frames = [
    pl.read_csv(file, dtypes=load_schema, ignore_errors=True)
    for file in load_files
]

# "diagonal" takes the union of columns across files, filling gaps with nulls.
df_load = pl.concat(load_frames, how="diagonal").with_columns(
    # Timestamps arrive as "month/day/year hour:minute:second" strings.
    pl.col("Time Stamp").str.to_datetime(r"%m/%d/%Y %H:%M:%S")
)

In [33]:
# Persist the combined load frame. The output folder is created first so a
# fresh checkout (no combined_data directory yet) does not fail on write.
LOAD_SAVE_DIR.parent.mkdir(parents=True, exist_ok=True)
df_load.write_parquet(LOAD_SAVE_DIR)