In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os

from matplotlib.dates import DateFormatter

# Update rcParams to set the default font to Times New Roman
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'DejaVu Serif'

In [2]:
# define experiment to be preprocessed
experiment_series = "2024_09_17"

### Tidy memilio data part 2: time since transmission & location per agent and timepoint

In [3]:
infection_model_output_path = f"../preprocessing/preprocessed_data/infection_model/{experiment_series}"
os.makedirs(os.path.join(infection_model_output_path), exist_ok=True)
os.makedirs(os.path.join(infection_model_output_path, "agent_trajectories"), exist_ok=True)

In [4]:
def get_systems_location(df_locations, memilio_location):
    locs = df_locations.loc[df_locations.memilio_locations.apply(lambda locations: memilio_location in locations), "systems_location"]
    if len(locs) > 1:
        print(f"Warning: Multiple locations found for {memilio_location}: {locs}")
    return locs.iloc[0]

In [5]:
def load_infection_path_information(simulation_id, experiment_series):
    """
    This function loads the infection path information from the MEmilio output and returns a tidied version of it.

    Parameters
    ----------
    simulation_id : int
        identifier of memilio simulation
    experiment_series : str
        name of the experiment series in which the simulation was run

    Returns
    -------
    pd.DataFrame
        with columns agent_id, t_exposed (time span of exposure), t_infected (time span of infection), hospitalized (whether or not the agent was hospitalized)
    """
    # read in infection paths
    infection_paths = pd.read_csv(f"../simulation_output/infection_model/{experiment_series}/{simulation_id}_infection_paths.txt", header=0, index_col=None, sep=" ").reset_index()
    new_cols = infection_paths.columns[1:].to_list()
    new_cols.append("bla")
    infection_paths.columns = new_cols
    infection_paths = infection_paths.drop(columns="bla")

    # infection duration
    infection_paths["t_infected"] = infection_paths["E"] + infection_paths["I_ns"] + infection_paths["I_sy"] + infection_paths["I_sev"] + infection_paths["I_cri"]
    infection_paths["hospitalized"] = infection_paths["I_sev"] + infection_paths["I_cri"] > 0

    # For the shedding curve, we only need the agent id, duration of infection, the duration of exposure, and whether the agent was hospitalized
    infection_paths = infection_paths[["Agent_id", "E", "I_ns", "t_infected", "hospitalized"]]
    infection_paths.columns = ["agent_id", "t_exposed", "t_nonsymptomatic", "t_infected", "hospitalized"]
    
    # change time scale to days
    infection_paths[["t_exposed", "t_nonsymptomatic", "t_infected"]] = infection_paths[["t_exposed", "t_nonsymptomatic", "t_infected"]] / 24

    return infection_paths

In [6]:
def tidy_memilio_output_file(simulation_id, experiment_series):
    """
    This function loads the output.txt information from the MEmilio simulation result and returns a tidied version of it.

    Parameters
    ----------
    simulation_id : int
        identifier of memilio simulation
    experiment_series : str
        name of the experiment series in which the simulation was run

    Returns
    -------
    pd.DataFrame
        with columns
        - systems_location: location of the agent
        - agent_id: identifier of the agent
        - time_in_hours: simulation time (hours)
        - time_in_days simulation time (days)
        - time_since_transmission: time since transmission of the agent in hours
        - simulation_id: identifier of the memilio simulation
    """
        
    # Load location mapping information and transform it to tidy data
    df_locations = pd.read_csv(f"../simulation_output/infection_model/{experiment_series}/{simulation_id}_location_mapping.txt", index_col=None, header=None)
    df_locations["systems_location"] = df_locations.apply(lambda row: row.iloc[0].split(" ")[0].zfill(5), axis=1)
    df_locations["memilio_locations"] = df_locations.apply(lambda row: row.iloc[0].split(" ")[1:], axis=1)
    df_locations["memilio_locations"] = df_locations.memilio_locations.apply(lambda x: [i for i in x if i != ""])
    df_locations = df_locations[['systems_location', 'memilio_locations']]

    # preprocessing of output.txt
    df = pd.read_csv(f"../simulation_output/infection_model/{experiment_series}/{simulation_id}_output.txt", header=None)
    # add location information
    df["memilio_location"] = df.apply(lambda row: row.iloc[0].split(" ")[0], axis=1)
    df.loc[1:,"systems_location"] = df["memilio_location"].iloc[1:].apply(lambda x: get_systems_location(df_locations, x))
    df.loc[df.memilio_location=="1000", "systems_location"] = "graveyard"


    # create new dataframe that contains information about the agent location and time since transmission for each timepoint
    # df_tidy = pd.DataFrame()

    def get_df_sub_from_row(row_output):
        df_sub = pd.DataFrame()
        n_timesteps = int(row_output[0].split(" ")[1])
        output = row_output[0].split(" ")[2:]
        systems_location = row_output["systems_location"]

        # transform
        # current timestep, number of agent, agent_id_1, time since transmission for a1, ...
        # to df with location, agent_id, t, time_since_transmission information
        for i in range(0, n_timesteps):
            # evaluate one timestep
            current_timestep = float(output[0])
            current_n_agents = int(output[1])
            n_entries = current_n_agents*2
            timestep_output = output[2:2+n_entries]
            timestep_output = [float(i) for i in timestep_output]
            times_since_transmission = timestep_output[1::2]

            df_sub = pd.concat((df_sub, pd.DataFrame({"systems_location": [systems_location]*current_n_agents, "agent_id": [int(el) for el in timestep_output[::2]], "time_in_hours": [current_timestep] * current_n_agents, "time_since_transmission": times_since_transmission})))
            output = output[2+n_entries:]
        return df_sub

    df_tidy = pd.concat(df.apply(lambda row: get_df_sub_from_row(row), axis=1).to_list())

    df_tidy["time_in_days"] = df_tidy["time_in_hours"]/24
    df_tidy["time_since_transmission"] = df_tidy["time_since_transmission"]/24
    df_tidy["simulation_id"] = simulation_id
    return df_tidy

In [7]:
# Attention! running this for every simulation id takes quite some time
[tidy_memilio_output_file(simulation_id, experiment_series).to_csv(f"{infection_model_output_path}/agent_trajectories/agent_trajectories_{simulation_id}.csv") for simulation_id in range(1, 251)]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,