In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

from matplotlib.dates import DateFormatter

# Use a serif default font for all figures (DejaVu Serif is the face actually configured here)
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'DejaVu Serif',
})

In [3]:
# Selection of the experiment whose simulation output is preprocessed
experiment_series = "2024_04_03"

# one of PMMoV, COVID
substance = "COVID"
# one of KeinRegen, Nieselregen, MittelstarkerRegen
rain_scenario = "KeinRegen"
# one of "no_decay", "linear_decay_dynamics", "constant_decay_dynamics"
degradation_setting = "no_decay"

In [4]:
# Input folder: raw simulation output of the selected scenario
simulation_results_path = (
    f"../simulation_output/wastewater_model/{experiment_series}/"
    f"{substance}/{rain_scenario}/{degradation_setting}/"
)
# Output folder and file name for the preprocessed tables
preprocessed_data_path = (
    f"../preprocessing/preprocessed_data/wastewater_model/{experiment_series}"
)
preprocessed_file_name = (
    f"{substance}_{rain_scenario}_{degradation_setting}.csv"
)

In [5]:
# make sure both output sub-folders exist before anything is written to them
for output_subfolder in ("concentrations", "flow_rates"):
    os.makedirs(os.path.join(preprocessed_data_path, output_subfolder), exist_ok=True)

# Preprocessing of wastewater data

In [7]:
"""
For a specific systems simulation scenario, get an overview over: 
    - measurment points, 
    - memilio ids (i.e. iterations)
    - concentration files (for further preprocessing of the output of the simulations)
"""
def get_overview_of_systems_output(simulation_results_path):
    flow_rate_files = [el for el in os.listdir(simulation_results_path) if ("flow_rate" in el)]
    concentration_files = [el for el in os.listdir(simulation_results_path) if ("INSIDe_substances_results" in el) and not ("Sampling" in el)]
    measurement_points = np.unique([el.split(".")[0] for el in [el.split("_")[-1] for el in concentration_files]])
    memilio_ids = np.unique([int(el.split("results_")[1].split("_")[0]) for el in concentration_files])

    # measurement points
    # assert len(np.unique([el.split("pipe_")[1].split("_")[0] for el in flow_rate_files])) == 90 # does this hold for all experiments?
    # memilio_ids
    assert set(np.unique([int(el.split("results_")[1].split("_")[0]) for el in flow_rate_files])) == set(memilio_ids)

    flow_rate_files = [el for el in flow_rate_files if el.split("pipe_")[1].split("_")[0] in measurement_points]
    flow_rate_measurement_points = np.unique([el.split("pipe_")[1].split("_")[0] for el in flow_rate_files if el.split("pipe_")[1].split("_")[0] in measurement_points])
    assert len(flow_rate_measurement_points) == 15
    
    return flow_rate_files, concentration_files, measurement_points, memilio_ids

# collect file lists and identifiers for the selected scenario
flow_rate_files, concentration_files, measurement_points, memilio_ids = get_overview_of_systems_output(
    simulation_results_path
)

print(f"Measurement Points: {measurement_points} \n")
# print(f"Memilio Iterations: {memilio_ids}")
print(f"N Iterations: {len(memilio_ids)}")  # expected: 250

# sanity checks on what was found in the results folder
assert sum(1 for point in measurement_points if "MW" in point) == 11
assert sum(1 for point in measurement_points if "RW" in point) == 7
assert len(memilio_ids) == 250

Measurement Points: ['MW017' 'MW022' 'MW023' 'MW043' 'MW046' 'MW048' 'MW052' 'MW054' 'MW059'
 'MW061' 'MW064' 'RW126' 'RW141' 'RW143' 'RW155' 'RW156' 'RW157' 'RW211'] 

N Iterations: 250


### Concentration (core simulation result, necessary for all hydraulic settings)

In [8]:
"""
For a specific systems simulation scenario, get the results of the INSIDe substances simulations in a tidy data format
"""
def tidy_INSIDe_substances_results(simulation_results_path, file_name):
    print(file_name)
    df = pd.read_csv(os.path.join(simulation_results_path, file_name), sep="\t", decimal=",")
    df.rename(columns={"time[min]": "minutes", "COV19(COV19) concentration [copies/l]":"copies/l"}, inplace=True)
    df_2_index = df.loc[df.minutes=="time[min]", :].index
    df_1 = df.iloc[:df_2_index[0],:].copy()
    df_2 = df.iloc[df_2_index[0]+1:,:].copy()
    df_2.rename(columns={"copies/l": "temperature"}, inplace=True)
    df = df_1.merge(df_2, on="minutes", how="outer")
    df["sampling_point"] = file_name.split("_")[-1].split(".")[0]
    df["memilio_id"] = int(file_name.split("results_")[1].split("_")[0])
    df["minutes"] = df["minutes"].astype(int)
    df["copies/l"] = df["copies/l"].astype(float)
    df["temperature"] = df["temperature"].astype(float)
    df["time"] = pd.to_datetime(df["minutes"], unit="m", origin=pd.Timestamp('2024-01-01'))
    return df

In [9]:
# run the substances tidy-up on every concentration file and stack the tables into one frame
df_res = pd.concat(
    [
        tidy_INSIDe_substances_results(simulation_results_path, file_name)
        for file_name in concentration_files
    ]
)

INSIDe_substances_results_112_output_manhole_RW141.txt
INSIDe_substances_results_30_output_manhole_RW126.txt
INSIDe_substances_results_38_output_manhole_RW126.txt
INSIDe_substances_results_186_output_manhole_RW143.txt
INSIDe_substances_results_163_output_manhole_RW155.txt
INSIDe_substances_results_131_output_manhole_RW143.txt
INSIDe_substances_results_165_output_manhole_RW126.txt
INSIDe_substances_results_5_output_manhole_RW141.txt
INSIDe_substances_results_111_output_manhole_MW064.txt
INSIDe_substances_results_135_output_manhole_MW064.txt
INSIDe_substances_results_78_output_manhole_MW017.txt
INSIDe_substances_results_198_output_manhole_MW061.txt
INSIDe_substances_results_185_output_manhole_MW054.txt
INSIDe_substances_results_130_output_manhole_MW022.txt
INSIDe_substances_results_203_output_manhole_RW126.txt
INSIDe_substances_results_103_output_manhole_RW126.txt
INSIDe_substances_results_27_output_manhole_RW141.txt
INSIDe_substances_results_141_output_manhole_RW141.txt
INSIDe_substance

In [10]:
# store the tidy concentration table for this scenario (row index is not written)
df_res.to_csv(
    path_or_buf=os.path.join(preprocessed_data_path, "concentrations", preprocessed_file_name),
    index=False,
)

### Flow rates - optional (necessary for flow rate normalization or basic checks and visualizations)

In [19]:
"""
For a specific systems simulation scenario, get the results of the INSIDe flow simulations in a tidy data format
"""
def tidy_INSIDe_flow_results(simulation_results_path, file_name):
    print(file_name)
    df = pd.read_csv(os.path.join(simulation_results_path, file_name), sep="\t", decimal=".")
    df.rename(columns={"t [min] ": "minutes", " flow rate [l/s]":"l/s"}, inplace=True)
    assert np.array_equal(df.minutes, df.minutes.astype(int)) # check whether minutes are all integer
    df["minutes"] = df["minutes"].astype(int)
    df["sampling_point"] = file_name.split("pipe_")[-1].split("_")[0]
    df["memilio_id"] = int(file_name.split("results_")[1].split("_")[0])
    df["time"] = pd.to_datetime(df["minutes"], unit="m", origin=pd.Timestamp('2024-01-01'))
    return df


In [20]:
# run the flow tidy-up on every flow rate file and stack the tables into one frame
df_flow = pd.concat(
    [
        tidy_INSIDe_flow_results(simulation_results_path, file_name)
        for file_name in flow_rate_files
    ]
)

INSIDe_hydraulic_results_188_output_pipe_MW052_flow_rate.txt
INSIDe_hydraulic_results_3_output_pipe_MW046_flow_rate.txt
INSIDe_hydraulic_results_114_output_pipe_MW046_flow_rate.txt
INSIDe_hydraulic_results_236_output_pipe_RW141_flow_rate.txt
INSIDe_hydraulic_results_49_output_pipe_MW022_flow_rate.txt
INSIDe_hydraulic_results_247_output_pipe_MW043_flow_rate.txt
INSIDe_hydraulic_results_203_output_pipe_MW059_flow_rate.txt
INSIDe_hydraulic_results_125_output_pipe_RW143_flow_rate.txt
INSIDe_hydraulic_results_203_output_pipe_RW141_flow_rate.txt
INSIDe_hydraulic_results_180_output_pipe_MW054_flow_rate.txt
INSIDe_hydraulic_results_51_output_pipe_MW048_flow_rate.txt
INSIDe_hydraulic_results_184_output_pipe_RW141_flow_rate.txt
INSIDe_hydraulic_results_193_output_pipe_MW023_flow_rate.txt
INSIDe_hydraulic_results_217_output_pipe_MW059_flow_rate.txt
INSIDe_hydraulic_results_20_output_pipe_RW156_flow_rate.txt
INSIDe_hydraulic_results_155_output_pipe_MW043_flow_rate.txt
INSIDe_hydraulic_results_139_

In [23]:
# store the tidy flow rate table for this scenario (row index is not written)
df_flow.to_csv(
    path_or_buf=os.path.join(preprocessed_data_path, "flow_rates", preprocessed_file_name),
    index=False,
)