# Converting Tests into YAML Files

In [9]:
import logging
from pathlib import Path
from typing import Optional

import pandas as pd
import yaml
from _gettsim_tests import TEST_DATA_DIR

In [10]:
# Column headers that are collected as free-text notes of a test case.
note_columns = ["note", "Note", "comment", "Comment"]
# Column headers that are collected as the source reference of a test case.
source_columns = ["source", "Source"]

# Column roles per test name: "in_assumed" columns are written under the
# "assumed" inputs, "out" columns are written as outputs. Every other column
# (apart from note/source columns) counts as a provided input.
roles = {
    "arbeitsl_geld": {
        "in_assumed": [
            "anwartschaftszeit",
            "arbeitssuchend",
            "m_durchg_alg1_bezug",
            "soz_vers_pflicht_5j",
        ],
        "out": ["arbeitsl_geld_m"],
    },
}

In [11]:
def list_csv_files() -> list[Path]:
    """Return all ``*.csv`` files located directly inside ``TEST_DATA_DIR``."""
    return [*TEST_DATA_DIR.glob("*.csv")]

def read_file(file_name: str) -> pd.DataFrame:
    """Load a CSV file from ``TEST_DATA_DIR`` into a DataFrame.

    The first line is used as the header. The first column is read as the index
    and afterwards moved back into a regular column by ``reset_index``.

    Args:
        file_name: File name (or path) joined onto ``TEST_DATA_DIR``.

    Returns:
        The file contents with a fresh default index.
    """
    raw = pd.read_csv(TEST_DATA_DIR / file_name, header=0, index_col=0)
    # With a single remaining data column this yields a Series; ``reset_index``
    # turns the result back into a DataFrame either way.
    squeezed = raw.squeeze("columns")
    return squeezed.reset_index()

def unique_years(df: pd.DataFrame, column_name: str = "jahr") -> list[int]:
    """Return the distinct values of ``column_name`` in ascending order."""
    distinct_values = df[column_name].unique()
    return sorted(distinct_values)

def grouped_by_year(df: pd.DataFrame, column_name: str = "jahr") -> dict[int, pd.DataFrame]:
    """Split ``df`` into one sub-frame per distinct value of ``column_name``.

    Returns:
        A dict keyed by year (in ascending order) whose values hold the rows of
        ``df`` belonging to that year.
    """
    frames_by_year: dict[int, pd.DataFrame] = {}
    for year in unique_years(df, column_name):
        is_year = df[column_name] == year
        frames_by_year[year] = df[is_year]
    return frames_by_year

def columns_by_role(df: pd.DataFrame, name: str) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
    """Sort the columns of ``df`` into the roles configured for test ``name``.

    Output and assumed-input columns come from the module-level ``roles``
    mapping (empty lists when ``name`` or the role key is not configured).
    Note and source columns are those of ``df`` listed in ``note_columns`` and
    ``source_columns``. All remaining columns of ``df`` are provided inputs.

    Returns:
        ``(provided inputs, assumed inputs, outputs, note columns,
        source columns)``.
    """
    role_spec = roles.get(name, {})
    out_cols = role_spec.get("out", [])
    in_cols_assumed = role_spec.get("in_assumed", [])

    # Everything that has a dedicated role is not a provided input.
    not_provided = [*out_cols, *in_cols_assumed, *note_columns, *source_columns]
    in_cols_provided = [column for column in df if column not in not_provided]

    note_cols = [column for column in df if column in note_columns]
    source_cols = [column for column in df if column in source_columns]

    return in_cols_provided, in_cols_assumed, out_cols, note_cols, source_cols

def _build_test_case(
    df: pd.DataFrame,
    in_cols_provided: list[str],
    in_cols_assumed: list[str],
    out_cols: list[str],
    note_cols: list[str],
    source_cols: list[str],
) -> dict:
    """Assemble one test-case entry (inputs, outputs, info) from the rows of ``df``."""
    # Only the first row's text of each note/source column is used; the texts
    # of several columns are separated by a blank line.
    source = "\n\n".join(str(df[column].iloc[0]) for column in source_cols)
    note = "\n\n".join(str(df[column].iloc[0]) for column in note_cols)

    return {
        "inputs": {
            "provided": df[in_cols_provided].to_dict("list"),
            "assumed": df[in_cols_assumed].to_dict("list"),
        },
        "outputs": df[out_cols].to_dict("list"),
        "info": {"note": note, "source": source},
    }


def create_yaml(df: pd.DataFrame, name: str) -> list[dict]:
    """Convert the test data in ``df`` into a list of YAML-ready test cases.

    When ``df`` has an ``hh_id`` column, one test case is produced per distinct
    ``hh_id`` value (in order of first appearance); otherwise the whole frame
    becomes a single test case.

    Args:
        df: Test data, one row per observation.
        name: Test name used to look up the column roles.

    Returns:
        A list of dicts with the keys ``inputs`` (``provided``/``assumed``),
        ``outputs`` and ``info`` (``note``/``source``).
    """
    column_roles = columns_by_role(df, name)

    if "hh_id" not in df:
        return [_build_test_case(df, *column_roles)]

    return [
        _build_test_case(df.loc[df["hh_id"] == hh_id], *column_roles)
        for hh_id in df["hh_id"].unique()
    ]

def write_yaml_to_file(out: list[dict], name: str, year: Optional[int] = None) -> None:
    """Dump ``out`` as YAML into the test data directory.

    The target is ``TEST_DATA_DIR/<name>/<name>.yaml`` or, when ``year`` is
    given, ``TEST_DATA_DIR/<name>/<year>/<name>.yaml``. Missing parent
    directories are created and an existing file is overwritten.

    Args:
        out: Test cases to serialize.
        name: Test name, used for both the directory and the file name.
        year: Optional year sub-directory.
    """
    text = yaml.dump(out, sort_keys=False, allow_unicode=True, indent=2, width=88)

    target_dir = TEST_DATA_DIR / name
    if year is not None:
        target_dir = target_dir / str(year)
    path = target_dir / f"{name}.yaml"

    path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Writing to {path}")

    # ``allow_unicode=True`` leaves non-ASCII characters unescaped, so the file
    # is written as UTF-8 explicitly rather than in the platform default
    # encoding, which may be unable to represent them.
    with open(path, "w", encoding="utf-8") as text_file:
        text_file.write(text)

def convert_test_data() -> None:
    """Convert every CSV test file into YAML test files.

    Files with a ``jahr`` column are split by year and written to one YAML file
    per year; all other files are written to a single YAML file. The test name
    is the CSV file name without its suffix.
    """
    for csv_path in list_csv_files():
        frame = read_file(csv_path)
        test_name = csv_path.stem

        if "jahr" in frame:
            for year, year_frame in grouped_by_year(frame).items():
                test_cases = create_yaml(year_frame, test_name)
                write_yaml_to_file(test_cases, test_name, year)
        else:
            test_cases = create_yaml(frame, test_name)
            write_yaml_to_file(test_cases, test_name)

In [12]:
# Run the conversion for all CSV files found in TEST_DATA_DIR.
convert_test_data()
