In [11]:
# Read configs (required!)

import tomllib
# Global settings. Later cells read config["names"]["index_column"],
# config["names"]["timestamp_column"], config["names"]["temperature_column"]
# (substrings used to recognise the spreadsheet columns) and
# config["formats"]["time_format"] (format string passed to pd.to_datetime).
with open("settings.toml", "rb") as f:
    config = tomllib.load(f)

# Sensor registry keyed by sensor name (e.g. "FGV_01"); every entry is expected
# to provide a "location" value, which is copied into the "Standort" column.
with open("sensors.toml", "rb") as f:
    sensors = tomllib.load(f)

In [12]:
def transformSensorFile(df_dict: dict, sensor_name: str, datetime_col=False) -> dict:
    """Reshape one raw sensor table into the common eight-column layout.

    The frame stored in ``df_dict["df"]`` is modified IN PLACE: the index column is
    removed, the timestamp column is split into ``Datum``, ``Jahr``, ``Monat``, ``Tag``
    and ``Uhrzeit``, the columns ``Sensor`` and ``Standort`` are added, and the
    temperature column is renamed to ``Temperatur``.

    Args:
        df_dict: Mapping with the keys ``"df"`` (DataFrame), ``"timecol"`` (name of a
            datetime64 column), ``"tmpcol"`` (name of the temperature column) and
            ``"idxcol"`` (name of the running-index column, or None / absent if the
            file has no such column).
        sensor_name: Key into the module-level ``sensors`` mapping; also written to
            the ``Sensor`` column.
        datetime_col: If True, ``Datum`` keeps the full timestamp; if False it only
            holds the calendar date.

    Returns:
        The same ``df_dict`` object, whose ``"df"`` entry now has the new layout.

    Raises:
        KeyError: If ``sensor_name`` is not present in ``sensors`` or a named column
            does not exist in the frame.
    """
    df = df_dict["df"]
    timecol = df_dict["timecol"]

    # Not every export necessarily carries an index column; dropping ``None`` would
    # raise a KeyError, so the drop only happens when a column was actually found.
    idxcol = df_dict.get("idxcol")
    if idxcol is not None:
        df.drop(idxcol, axis=1, inplace=True)

    timestamps = df[timecol]
    df["Datum"] = timestamps if datetime_col else timestamps.dt.date
    df["Jahr"] = timestamps.dt.year
    df["Monat"] = timestamps.dt.month
    df["Tag"] = timestamps.dt.day
    df["Uhrzeit"] = timestamps.dt.time
    df["Sensor"] = sensor_name
    df["Standort"] = sensors[sensor_name]["location"]

    # The raw timestamp column is redundant once it has been split up.
    df.drop(timecol, axis=1, inplace=True)
    df.rename(columns={df_dict["tmpcol"]: "Temperatur"}, inplace=True)
    return df_dict

In [None]:
# Read excel files
import pandas as pd
import glob
import re
import time

# Folder that is searched for the FGV_*.xlsx sensor exports.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
path_to_files = r"C:\Users\kevin\Documents\Code\angelverein\Daten FGV"
# CSV file the combined sensor data is written to.
save_path = "all_data.csv"

def concat_sensor_files(path_to_files: str, save_path=None) -> None | pd.DataFrame :
    """Read all ``FGV_*.xlsx`` sensor exports of a folder and stack them into one table.

    Every workbook is read with pandas, its index / timestamp / temperature columns are
    located via the substrings configured in ``config["names"]``, the timestamps are
    parsed with ``config["formats"]["time_format"]``, incomplete rows are dropped and the
    frame is reshaped by ``transformSensorFile``. Per sensor, the files are ordered by
    their newest timestamp (newest first) before everything is concatenated.

    Args:
        path_to_files: Folder containing the ``FGV_*.xlsx`` files.
        save_path: If given, the combined table is written to this path as CSV and
            ordering / timing information is printed; nothing is returned in that case.

    Returns:
        The combined DataFrame when ``save_path`` is None, otherwise None.

    Raises:
        ValueError: If a file name contains no ``FGV_<digits>`` sensor name, or if no
            sensor data could be read at all.
        NameError: If a file belongs to a sensor that is not defined in sensors.toml.
        IndexError: If a workbook contains an unknown column or lacks the timestamp or
            temperature column.
    """
    # The pattern contains no "**", so only the folder itself is searched.
    data_paths = glob.glob(path_to_files+"\\FGV_*.xlsx", recursive=True)
    print("Found", len(data_paths), "files")

    sensors_chunks = {key: [] for key in sensors.keys()}

    if save_path is not None: stime = time.perf_counter()

    for file_no, file in enumerate(data_paths, start=1):
        name_match = re.search(r"FGV_\d+", file)
        if name_match is None:
            raise ValueError(f"Cannot derive a sensor name from file: {file}")
        sensor_name = name_match.group()
        if sensor_name not in sensors:
            raise NameError(f"Trying to read sensor {sensor_name} which is not defined in sensors.toml")
        print("Reading sensor", sensor_name, f"({file_no}/{len(data_paths)})")
        df = pd.read_excel(file)

        # Column headers vary between exports, so they are matched by configured substrings.
        idxcol, timecol, tmpcol = None, None, None
        for col in df.columns:
            if config["names"]["index_column"] in col:
                idxcol = col
            elif config["names"]["timestamp_column"] in col:
                timecol = col
            elif config["names"]["temperature_column"] in col:
                tmpcol = col
            else:
                raise IndexError(f"Found unknown column: {col}")
        if timecol is None or tmpcol is None:
            raise IndexError(f"Missing timestamp or temperature column in file: {file}")

        df[timecol] = pd.to_datetime(df[timecol], format=config["formats"]["time_format"])
        # dropna() returns a new frame unless inplace=True; the bare call was a no-op.
        df.dropna(inplace=True)
        df.sort_values(timecol, ascending=False, inplace=True)
        if df.empty:
            # Nothing usable in this file; an empty chunk would break the ordering below.
            print("Skipping", file, "(no complete rows)")
            continue

        df = transformSensorFile({"df": df, "idxcol": idxcol, "timecol": timecol, "tmpcol": tmpcol}, sensor_name, datetime_col=True)["df"]
        sensors_chunks[sensor_name].append(df)

    def newest_entry(chunk):
        """Return the topmost (= newest, chunks are sorted descending) timestamp of a chunk."""
        return chunk["Datum"].iloc[0]

    per_sensor = []
    for key, chunks in sensors_chunks.items():
        if not chunks:
            # Sensor is defined in sensors.toml but has no data; pd.concat([]) would raise.
            continue
        chunks.sort(key=newest_entry, reverse=True) # Newest at top
        if save_path is not None:
            for pos, chunk in enumerate(chunks):
                print(f"{pos}: {newest_entry(chunk)}", end=" ")
            print(f"(Sensor {key})")
        per_sensor.append(pd.concat(chunks))

    if not per_sensor:
        raise ValueError(f"No sensor data found in {path_to_files}")

    # All files of all sensors, grouped by sensor, newest data at the top of each group.
    all_sensors_chunks = pd.concat(per_sensor)

    if save_path is not None:
        # newline="" prevents the text layer from translating pandas' line terminator
        # a second time (which yields "\r\r\n" on Windows).
        with open(save_path, "w", newline="") as f:
            all_sensors_chunks.to_csv(f, index=False)

        ttime = time.perf_counter() - stime
        print(f"Finished in {ttime:.2f}s")

    else: return all_sensors_chunks

In [4]:
# Build the combined CSV. Because save_path is passed, the result is written to disk
# and the call itself returns None.
concat_sensor_files(path_to_files, save_path)

Found 16 files
Reading sensor FGV_01 (1/16)
Reading sensor FGV_02 (2/16)
Reading sensor FGV_02 (3/16)
Reading sensor FGV_03 (4/16)
Reading sensor FGV_03 (5/16)
Reading sensor FGV_04 (6/16)
Reading sensor FGV_05 (7/16)
Reading sensor FGV_06 (8/16)
Reading sensor FGV_07 (9/16)
Reading sensor FGV_08 (10/16)
Reading sensor FGV_08 (11/16)
Reading sensor FGV_09 (12/16)
Reading sensor FGV_09 (13/16)
Reading sensor FGV_10 (14/16)
Reading sensor FGV_10 (15/16)
Reading sensor FGV_11 (16/16)
0: 2025-02-12 15:50:00 (Sensor FGV_01)
0: 2025-02-12 16:00:00 1: 2024-07-19 20:20:00 (Sensor FGV_02)
0: 2025-02-12 16:10:00 1: 2024-07-19 20:30:00 (Sensor FGV_03)
0: 2024-07-19 20:40:00 (Sensor FGV_04)
0: 2024-07-19 20:50:00 (Sensor FGV_05)
0: 2024-07-24 15:10:00 (Sensor FGV_06)
0: 2024-07-24 12:40:00 (Sensor FGV_07)
0: 2025-02-13 13:30:00 1: 2024-07-07 17:50:00 (Sensor FGV_08)
0: 2025-02-13 13:30:00 1: 2024-07-07 18:10:00 (Sensor FGV_09)
0: 2024-09-23 10:30:00 1: 2024-07-07 16:40:00 (Sensor FGV_10)
0: 2024-1

In [4]:
def append_sensor_files(path_to_files: str, old_file: str, save_path: str):
    """Combine an existing CSV with freshly read sensor files and save the result.

    The sensor files found via ``path_to_files`` are read with ``concat_sensor_files``
    and placed on top of the rows loaded from ``old_file``. Progress and cumulative
    timings are printed along the way.

    NOTE(review): rows that occur in both sources are kept twice; no de-duplication
    is performed here.

    Args:
        path_to_files: Passed through to ``concat_sensor_files``.
        old_file: Path of the existing CSV file to extend.
        save_path: Path the combined CSV is written to.
    """
    stime = time.perf_counter()
    print("Reading old file...")
    base = pd.read_csv(old_file)
    print(f"Done ({time.perf_counter()-stime:.2f}s). Concatenating new files...")
    new = concat_sensor_files(path_to_files=path_to_files)
    print(f"Done ({time.perf_counter()-stime:.2f}s). Combining...")
    combined = pd.concat([new, base])
    print(f"Done ({time.perf_counter()-stime:.2f}s). Saving...")
    # newline="" prevents the text layer from translating pandas' line terminator
    # a second time (which yields "\r\r\n" on Windows).
    with open(save_path, "w", newline="") as f:
        combined.to_csv(f, index=False)

    print(f"Done combining files, took {time.perf_counter()-stime:.2f}s")

# Re-read the sensor files, stack them on top of the rows already stored in save_path
# and write the combination to a separate file.
append_sensor_files(path_to_files, save_path, "all_duplicate.csv")

Reading old file...
Done (0.67s). Concatenating new files...
Found 16 files
Reading sensor FGV_01 (1/16)
Reading sensor FGV_02 (2/16)
Reading sensor FGV_02 (3/16)
Reading sensor FGV_03 (4/16)
Reading sensor FGV_03 (5/16)
Reading sensor FGV_04 (6/16)
Reading sensor FGV_05 (7/16)
Reading sensor FGV_06 (8/16)
Reading sensor FGV_07 (9/16)
Reading sensor FGV_08 (10/16)
Reading sensor FGV_08 (11/16)
Reading sensor FGV_09 (12/16)
Reading sensor FGV_09 (13/16)
Reading sensor FGV_10 (14/16)
Reading sensor FGV_10 (15/16)
Reading sensor FGV_11 (16/16)
Done (16.64s). Combining...
Done (16.93s). Saving...
Done combining files, took 21.51s


In [18]:
# Concatenate all files in given list

import pandas as pd
import glob
import re
import time

# path_to_files = r"C:\Users\kevin\Documents\Code\angelverein\Daten FGV"

def concat_sensor_files_testfunc(path_to_files: str |list[str], save_path=None) -> None | pd.DataFrame :
    """Read sensor exports (a folder or an explicit list of files) into one table.

    Every workbook is read with pandas, its index / timestamp / temperature columns are
    located via the substrings configured in ``config["names"]``, the timestamps are
    parsed with ``config["formats"]["time_format"]``, incomplete rows are dropped and the
    frame is reshaped by ``transformSensorFile``. Per sensor, the files are ordered by
    their newest timestamp (newest first) before everything is concatenated.

    Args:
        path_to_files: Either a list of workbook paths, or a folder that is searched
            for ``FGV_*.xlsx`` files.
        save_path: If given, the combined table is written to this path as CSV and
            ordering / timing information is printed; nothing is returned in that case.

    Returns:
        The combined DataFrame when ``save_path`` is None, otherwise None.

    Raises:
        ValueError: If a file name contains no ``FGV_<digits>`` sensor name, or if no
            sensor data could be read at all.
        NameError: If a file belongs to a sensor that is not defined in sensors.toml.
        IndexError: If a workbook contains an unknown column or lacks the timestamp or
            temperature column.
    """
    if isinstance(path_to_files, list):
        print(f"Selected files: {path_to_files}")
        data_paths = path_to_files
    else:
        # The pattern contains no "**", so only the folder itself is searched.
        data_paths = glob.glob(path_to_files+"\\FGV_*.xlsx", recursive=True)
        print("Found", len(data_paths), "files")

    # Filled lazily, so only sensors that actually have files get an entry.
    sensors_chunks = {}

    if save_path is not None: stime = time.perf_counter()

    for file_no, file in enumerate(data_paths, start=1):
        name_match = re.search(r"FGV_\d+", file)
        if name_match is None:
            raise ValueError(f"Cannot derive a sensor name from file: {file}")
        sensor_name = name_match.group()
        if sensor_name not in sensors:
            raise NameError(f"Trying to read sensor {sensor_name} which is not defined in sensors.toml")
        print("Reading sensor", sensor_name, f"({file_no}/{len(data_paths)})")
        df = pd.read_excel(file)

        # Column headers vary between exports, so they are matched by configured substrings.
        idxcol, timecol, tmpcol = None, None, None
        for col in df.columns:
            if config["names"]["index_column"] in col:
                idxcol = col
            elif config["names"]["timestamp_column"] in col:
                timecol = col
            elif config["names"]["temperature_column"] in col:
                tmpcol = col
            else:
                raise IndexError(f"Found unknown column: {col}")
        if timecol is None or tmpcol is None:
            raise IndexError(f"Missing timestamp or temperature column in file: {file}")

        df[timecol] = pd.to_datetime(df[timecol], format=config["formats"]["time_format"])
        # dropna() returns a new frame unless inplace=True; the bare call was a no-op.
        df.dropna(inplace=True)
        df.sort_values(timecol, ascending=False, inplace=True)
        if df.empty:
            # Nothing usable in this file; an empty chunk would break the ordering below.
            print("Skipping", file, "(no complete rows)")
            continue

        df = transformSensorFile({"df": df, "idxcol": idxcol, "timecol": timecol, "tmpcol": tmpcol}, sensor_name, datetime_col=True)["df"]
        sensors_chunks.setdefault(sensor_name, []).append(df)

    def newest_entry(chunk):
        """Return the topmost (= newest, chunks are sorted descending) timestamp of a chunk."""
        return chunk["Datum"].iloc[0]

    per_sensor = []
    for key, chunks in sensors_chunks.items():
        chunks.sort(key=newest_entry, reverse=True) # Newest at top
        if save_path is not None:
            for pos, chunk in enumerate(chunks):
                print(f"{pos}: {newest_entry(chunk)}", end=" ")
            print(f"(Sensor {key})")
        per_sensor.append(pd.concat(chunks))

    if not per_sensor:
        # pd.concat([]) would raise a less helpful "No objects to concatenate".
        raise ValueError(f"No sensor data found in {path_to_files}")

    # All files of all sensors, grouped by sensor, newest data at the top of each group.
    all_sensors_chunks = pd.concat(per_sensor)

    if save_path is not None:
        # newline="" prevents the text layer from translating pandas' line terminator
        # a second time (which yields "\r\r\n" on Windows).
        with open(save_path, "w", newline="") as f:
            all_sensors_chunks.to_csv(f, index=False)

        ttime = time.perf_counter() - stime
        print(f"Finished in {ttime:.2f}s")

    else: return all_sensors_chunks

In [19]:
# Run the list-based variant on two hand-picked exports.
# NOTE(review): this rebinds the notebook-level names path_to_files (a folder string in
# the earlier cell, a list of files from here on) and save_path - cells executed
# afterwards see these new values.
path_to_files = [r"C:\Users\kevin\Documents\Code\angelverein\Daten FGV\FGV_03 2024-07-19 20_34_15 CEST (Data CEST).xlsx", r"C:\Users\kevin\Documents\Code\angelverein\Daten FGV\FGV_08 2025-02-13 13_28_34 CET (Data CET).xlsx"]
save_path = "all_data_und.csv"
concat_sensor_files_testfunc(path_to_files, save_path)

Selected files: ['C:\\Users\\kevin\\Documents\\Code\\angelverein\\Daten FGV\\FGV_03 2024-07-19 20_34_15 CEST (Data CEST).xlsx', 'C:\\Users\\kevin\\Documents\\Code\\angelverein\\Daten FGV\\FGV_08 2025-02-13 13_28_34 CET (Data CET).xlsx']
Reading sensor FGV_03 (1/2)
Reading sensor FGV_08 (2/2)
0: 2024-07-19 20:30:00 (Sensor FGV_03)
0: 2025-02-13 13:30:00 (Sensor FGV_08)
Finished in 3.12s
