In [1]:
import datetime
import os
import re

import polars as pl

In [22]:
def read_partition(path: str) -> pl.DataFrame:
    """Load every ``.xlsx`` file found under ``path`` into a single DataFrame.

    Directories named ``key=value`` (for example ``Device=01``) are treated as
    partitions: every spreadsheet found beneath such a directory gets an extra
    literal column ``key`` holding ``value``. Entries that are neither
    directories nor ``.xlsx`` files are ignored.

    Args:
        path: Root directory to walk recursively, or a single ``.xlsx`` file.

    Returns:
        The concatenation of all spreadsheets found (in sorted path order), or
        an empty ``pl.DataFrame`` when no spreadsheet is found.

    Raises:
        KeyError: A spreadsheet is not located beneath any ``Device=<id>``
            directory, so its device cannot be validated.
        Exception: A spreadsheet's path does not contain ``device<id>`` for the
            ``Device`` partition it sits in.
    """

    def _read_partition(path: str, partitions: dict[str, str]) -> list[pl.DataFrame]:
        """Return one DataFrame per spreadsheet at or below ``path``.

        ``partitions`` holds the ``key=value`` pairs collected from the
        directory names on the way down to ``path``.
        """
        if path.lower().endswith(".xlsx"):
            if "Device" not in partitions:
                # Same exception type as the bare lookup would raise, but with
                # a message that names the offending file.
                raise KeyError(f"File {path} is not inside a Device=<id> directory")
            device = partitions["Device"]
            # Sanity check: the path (in practice the file name) must mention
            # the device whose partition directory it was filed under.
            if f"device{device}" not in path:
                raise Exception(f"File {path} in wrong Device directory")
            df = pl.read_excel(path)
            return [df.with_columns([pl.lit(v).alias(k) for k, v in partitions.items()])]

        if os.path.isdir(path):
            frames: list[pl.DataFrame] = []
            # os.listdir returns entries in arbitrary order; sort so the row
            # order of the result is reproducible from run to run.
            for entry in sorted(os.listdir(path)):
                child = os.path.join(path, entry)
                child_partitions = partitions
                # Only directory names define partitions; a *file* whose name
                # happens to contain "=" must not add a bogus column.
                if "=" in entry and os.path.isdir(child):
                    key, value = entry.split("=", 1)
                    child_partitions = partitions | {key: value}
                frames.extend(_read_partition(child, child_partitions))
            return frames

        return []

    dfs = _read_partition(path, {})
    return pl.concat(dfs) if dfs else pl.DataFrame()


def parse_tamtop_datetime(input: str) -> datetime.datetime:
    """Parse a TamTop timestamp string into a naive ``datetime``.

    Accepts strings such as ``"2024-08-18 17:00:00(+08:00)"``. Any
    parenthesised ``(+HH:MM)`` / ``(-HH:MM)`` offset is discarded, not applied,
    and surrounding whitespace is ignored.

    Args:
        input: Timestamp text, with or without a parenthesised UTC offset.

    Returns:
        The parsed date and time; no offset from a parenthesised suffix is
        attached or applied.

    Raises:
        ValueError: The remaining text is not an ISO 8601 date-time.
    """
    # Drop time zone and just treat everything as local time (PT) as TamTop fails to account for DST
    without_offset = re.sub(r"\s*\([+-]\d{2}:\d{2}\)", "", input).strip()
    # fromisoformat wants "T" (not a space) between the date and the time.
    return datetime.datetime.fromisoformat(without_offset.replace(" ", "T"))

In [23]:
# Load every spreadsheet under data/, discard the "NO." column, and give the
# remaining measurement columns explicit dtypes.
df = read_partition("data").drop(pl.col("NO."))
df = df.with_columns(
    # Timestamps go through the Python-level parser, one element at a time.
    pl.col("Time").map_elements(parse_tamtop_datetime, return_dtype=pl.Datetime),
    pl.col("PM2.5(ug/m³)", "Temperature(℉)").cast(pl.Float64),
    pl.col("AQI", "CO₂(ppm)", "TVOC", "Humidity(%RH)").cast(pl.UInt16),
)

# Per-device overview: number of readings, first and last timestamp, and how
# many distinct Floor / Location values each device appears with.
display(
    df.group_by("Device")
    .agg(
        Readings=pl.col("Time").len(),
        Earliest=pl.col("Time").min(),
        Latest=pl.col("Time").max(),
        Floors=pl.col("Floor").n_unique(),
        Locations=pl.col("Location").n_unique(),
    )
    .sort("Device")
)

Device,Readings,Earliest,Latest,Floors,Locations
str,u32,datetime[μs],datetime[μs],u32,u32
"""01""",135,2024-08-18 17:00:00,2024-08-24 14:00:00,1,2
"""02""",136,2024-08-18 17:00:00,2024-08-24 14:00:00,1,3
"""03""",489,2024-08-17 20:00:00,2024-09-07 14:00:00,1,3


In [24]:
# Only readings strictly after this moment are plotted.
plot_start = datetime.datetime.fromisoformat("2024-08-24T04:00:00")

# Label of the form "L<Floor> <Location>", used as the pivoted column name.
floor_and_location = (
    pl.lit("L") + pl.col("Floor") + pl.lit(" ") + pl.col("Location")
).alias("Floor & Location")

# One row per timestamp, one AQI column per floor/location label; then draw
# the AQI line for a single label.
(
    df.with_columns(floor_and_location)
    .filter(pl.col("Time") > plot_start)
    .pivot("Floor & Location", index="Time", values="AQI")
    .plot.line(x="Time", y="L2 Unit 210")
)