# Converting Tests into Yaml-Files 

In [84]:
from pathlib import Path
from typing import Any, Optional

import pandas as pd
import yaml
from _gettsim_tests import TEST_DATA_DIR

In [85]:
note_columns = [
    "note",
    "Note",
    "notes",
    "comment",
    "Comment",
    "Notes on Entgeltpunkte",
    "Notes on Regelaltersgrenze",
]
source_columns = ["source", "Source", "Quelle Arbeitgeber"]

roles = {
    "arbeitsl_geld": {
        "in_assumed": [
            "anwartschaftszeit",
            "arbeitssuchend",
            "m_durchg_alg1_bezug",
            "soz_vers_pflicht_5j",
        ],
        "out": ["arbeitsl_geld_m"],
    },
    "arbeitsl_geld_2": {
        "out": [
            "arbeitsl_geld_2_eink_m",
            "arbeitsl_geld_2_regelsatz_m_hh",
            "arbeitsl_geld_2_kost_unterk_m_hh",
            "arbeitsl_geld_2_m_hh",
        ]
    },
    "benefit_checks": {
        "out": ["kinderzuschl_m_hh", "wohngeld_m_hh", "arbeitsl_geld_2_m_hh"]
    },
    "eink_st": {
        "out": [
            "eink_st_ohne_kinderfreib_tu",
            "eink_st_mit_kinderfreib_tu",
            "abgelt_st_tu",
            "soli_st_tu",
        ]
    },
    "elterngeld": {
        "out": [
            "elterngeld_m",
            "elterngeld_geschw_bonus_anspruch",
            "_elterngeld_anz_mehrlinge_anspruch",
            "elternzeit_anspruch",
        ]
    },
    "favorability_check": {"out": ["eink_st_tu", "zu_verst_eink_tu"]},
    "full_taxes_and_transfers": {
        "out": [
            # TODO: what are the targets?
            # "eink_st_tu",
            # "soli_st_tu",
            # "abgelt_st_tu",
            # "ges_rentenv_beitr_m",
            # "arbeitsl_v_beitr_m",
            # "ges_krankenv_beitr_m",
            # "ges_pflegev_beitr_m",
            # "arbeitsl_geld_m",
            # "kindergeld_m_tu",
            # "arbeitsl_geld_2_m_hh",
            # "kinderzuschl_m_hh",
            # "wohngeld_m_hh",
            # "unterhaltsvors_m_hh",
        ]
    },
    "grundrente": {
        "out": [
            "grundr_zuschlag_bonus_entgeltp",
            "grundr_zuschlag_vor_eink_anr_m",
            "grundr_zuschlag_m",
            "ges_rente_m",
        ]
    },
    "grundrente_proxy_rente": {
        "out": [
            "rente_vorj_vor_grundr_proxy_m",
        ]
    },
    "grunds_im_alter": {
        "out": [
            "grunds_im_alter_m_hh",
        ]
    },
    "kindergeld": {
        "out": [
            "kindergeld_m_tu",
            "kinderbonus_m_tu",
            "kindergeld_m_hh",
            "kinderbonus_m_hh",
            "kinderbonus_m_tu",
        ]
    },
    "kinderzuschl": {
        "out": [
            "_kinderzuschl_vor_vermög_check_m_tu",
            "_kinderzuschl_nach_vermög_check_m_tu",
        ]
    },
    "renten_alter": {
        "out": [
            # TODO: what are the targets?
            # "ges_rente_regelaltersgrenze",
            # "ges_rente_frauen_altersgrenze",
            # "_ges_rente_langj_altersgrenze",
            # "_ges_rente_besond_langj_altersgrenze",
        ]
    },
    "renten_anspr": {
        "out": [
            "entgeltp_update",
            "entgeltp_update_lohn",
            "_ges_rente_altersgrenze_abschlagsfrei",
        ]
    },
    "soli_st": {"out": ["soli_st_tu"]},
    "sozialv_beitr": {
        "out": [
            "sozialv_beitr_m",
            "sozialv_beitr_arbeitg_m",
            "_sozialv_beitr_arbeitn_arbeitg_m",
            "ges_rentenv_beitr_m",
            "arbeitsl_v_beitr_m",
            "ges_krankenv_beitr_m",
            "ges_pflegev_beitr_m",
        ]
    },
    "unterhalt": {"out": ["kind_unterh_zahlbetr_m"]},
    "unterhaltsvors": {"out": ["unterhaltsvors_m"]},
    "vorsorgeaufw": {"out": ["vorsorgeaufw_tu"]},
    "wohngeld": {
        "out": ["wohngeld_vor_vermög_check_m_hh", "wohngeld_nach_vermög_check_m_hh"]
    },
    "zu_verst_eink": {
        "out": [
            "_zu_verst_eink_ohne_kinderfreib_tu",
            "zu_verst_eink_mit_kinderfreib_tu",
            "eink_st_kinderfreib_tu",
            "eink_st_altersfreib",
            "alleinerz_freib_tu",
            "sum_eink",
            "_eink_st_behinderungsgrad_pauschbetrag",
        ]
    },
}

In [86]:
def list_csv_files() -> list[Path]:
    return list(TEST_DATA_DIR.glob("*.csv"))


def read_file(file_name: str) -> pd.DataFrame:
    return (
        pd.read_csv(TEST_DATA_DIR / file_name, header=0, index_col=0, encoding="utf-8")
        .squeeze("columns")
        .reset_index()
    )


def unique_years(df: pd.DataFrame, column_name: str = "jahr") -> list[int]:
    return sorted(df[column_name].unique())


def grouped_by_year(
    df: pd.DataFrame, column_name: str = "jahr"
) -> dict[int, pd.DataFrame]:
    return {year: df[df[column_name] == year] for year in unique_years(df, column_name)}


def columns_by_role(
    df: pd.DataFrame, name: str
) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
    out_cols = roles[name]["out"] if name in roles and "out" in roles[name] else []
    in_cols_assumed = (
        roles[name]["in_assumed"]
        if name in roles and "in_assumed" in roles[name]
        else []
    )

    in_cols = [c for c in df if c not in out_cols and not c.startswith("Unnamed:")]
    in_cols_provided = [
        col
        for col in in_cols
        if col not in [*in_cols_assumed, *note_columns, *source_columns]
    ]

    note_cols = [col for col in df if col in note_columns]
    source_cols = [col for col in df if col in source_columns]

    return in_cols_provided, in_cols_assumed, out_cols, note_cols, source_cols


def create_yaml(df: pd.DataFrame, name: str) -> list[dict]:
    (
        in_cols_provided,
        in_cols_assumed,
        out_cols,
        note_cols,
        source_cols,
    ) = columns_by_role(df, name)

    out = []

    if "hh_id" in df:
        for hh_id in df["hh_id"].unique():
            df_hh = df.loc[df["hh_id"] == hh_id]

            source = "\n\n".join(
                value_to_string(df_hh[source_column].iloc[0])
                for source_column in source_cols
                if value_to_string(df_hh[source_column].iloc[0]) != ""
            )
            note = "\n\n".join(
                value_to_string(df_hh[note_column].iloc[0])
                for note_column in note_cols
                if value_to_string(df_hh[note_column].iloc[0]) != ""
            )
            specs = {"note": note, "source": source}

            inputs = {
                "provided": df_hh[in_cols_provided].to_dict("list"),
                "assumed": df_hh[in_cols_assumed].to_dict("list"),
            }
            outputs = df_hh[out_cols].to_dict("list")
            out.append({"info": specs, "inputs": inputs, "outputs": outputs})
    else:
        source = "\n\n".join(
            value_to_string(df[source_column].iloc[0])
            for source_column in source_cols
            if value_to_string(df[source_column].iloc[0]) != ""
        )
        note = "\n\n".join(
            value_to_string(df[note_column].iloc[0])
            for note_column in note_cols
            if value_to_string(df[note_column].iloc[0]) != ""
        )
        specs = {"note": note, "source": source}

        inputs = {
            "provided": df[in_cols_provided].to_dict("list"),
            "assumed": df[in_cols_assumed].to_dict("list"),
        }
        outputs = df[out_cols].to_dict("list")
        out.append({"info": specs, "inputs": inputs, "outputs": outputs, })

    return out


def value_to_string(value: Any) -> str:
    if pd.isnull(value):
        return ""
    else:
        return str(value)


def write_yaml_to_file(out: list[dict], name: str, year: Optional[int] = None) -> None:
    text = yaml.dump(out, sort_keys=False, allow_unicode=True, indent=2, width=88)
    if year is None:
        path = TEST_DATA_DIR / name / f"{name}.yaml"
    else:
        path = TEST_DATA_DIR / name / f"{year}.yaml"

    path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Writing to {path}")

    with open(path, "w", encoding="utf-8") as text_file:
        text_file.write(text)


def convert_test_data() -> None:
    for path in list_csv_files():
        df = read_file(path)
        name = path.stem

        if "jahr" not in df:
            yaml_out = create_yaml(df, name)
            write_yaml_to_file(yaml_out, name)
        else:
            for year, year_df in grouped_by_year(df).items():
                yaml_out = create_yaml(year_df, name)
                write_yaml_to_file(yaml_out, name, year)

In [87]:
for file in list_csv_files():
    print(f'"{file.stem}": {"{}"},')

"arbeitsl_geld": {},
"arbeitsl_geld_2": {},
"benefit_checks": {},
"eink_st": {},
"elterngeld": {},
"favorability_check": {},
"full_taxes_and_transfers": {},
"grundrente": {},
"grundrente_proxy_rente": {},
"grunds_im_alter": {},
"kindergeld": {},
"kinderzuschl": {},
"renten_alter": {},
"renten_anspr": {},
"soli_st": {},
"sozialv_beitr": {},
"unterhalt": {},
"unterhaltsvors": {},
"vorsorgeaufw": {},
"wohngeld": {},
"zu_verst_eink": {},


In [88]:
convert_test_data()


Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\arbeitsl_geld\2010.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\arbeitsl_geld\2011.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\arbeitsl_geld\2015.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\arbeitsl_geld\2019.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\arbeitsl_geld_2\2005.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\arbeitsl_geld_2\2006.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\arbeitsl_geld_2\2009.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\arbeitsl_geld_2\2013.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\arbeitsl_geld_2\2018.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src

  outputs = df_hh[out_cols].to_dict("list")
  outputs = df_hh[out_cols].to_dict("list")
  outputs = df_hh[out_cols].to_dict("list")
  outputs = df_hh[out_cols].to_dict("list")
  outputs = df_hh[out_cols].to_dict("list")
  outputs = df_hh[out_cols].to_dict("list")
  outputs = df_hh[out_cols].to_dict("list")
  outputs = df_hh[out_cols].to_dict("list")
  outputs = df_hh[out_cols].to_dict("list")
  outputs = df_hh[out_cols].to_dict("list")
  outputs = df_hh[out_cols].to_dict("list")
  outputs = df_hh[out_cols].to_dict("list")


Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\kinderzuschl\2006.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\kinderzuschl\2009.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\kinderzuschl\2013.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\kinderzuschl\2016.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\kinderzuschl\2017.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\kinderzuschl\2019.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\kinderzuschl\2020.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\kinderzuschl\2021.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_tests\test_data\renten_alter\renten_alter.yaml
Writing to C:\Users\Lars\Repositories\work\gettsim\src\_gettsim_t