# Converting Tests into YAML Files

In [None]:
from pathlib import Path
from typing import Any, Optional

import numpy as np
import pandas as pd
import yaml
from _gettsim_tests import TEST_DATA_DIR

In [None]:
# Column headers whose content is treated as a free-text note. `columns_by_role`
# matches DataFrame columns against this list by exact name, and `create_yaml`
# joins the first-row values of the matching columns into the "note" field.
note_columns = [
    "note",
    "Note",
    "notes",
    "comment",
    "Comment",
    "Notes on Entgeltpunkte",
    "Notes on Regelaltersgrenze",
]
# Column headers whose content is treated as a source reference; handled like
# `note_columns`, but written to the "source" field.
source_columns = ["source", "Source", "Quelle Arbeitgeber"]

# Role of each column, per test-data set. The top-level key is the name that
# `columns_by_role` looks up (the stem of the CSV file being converted). Each
# entry may define:
#   "in_provided": columns written to the YAML under inputs -> provided
#   "in_assumed":  columns written to the YAML under inputs -> assumed (optional)
#   "out":         columns written to the YAML under outputs
# Missing keys are treated as empty lists by `columns_by_role`. Column names must
# be unique within a list, because the lists are used to select DataFrame columns.
roles = {
    "arbeitsl_geld": {
        "in_provided": [
            "p_id",
            "hh_id",
            "tu_id",
            "bruttolohn_vorj_m",
            "wohnort_ost",
            "kind",
            "arbeitsstunden_w",
            "alter",
            "geburtsjahr",
            "jahr",
        ],
        "in_assumed": [
            "anwartschaftszeit",
            "arbeitssuchend",
            "m_durchg_alg1_bezug",
            "sozialv_pflicht_5j",
        ],
        "out": ["arbeitsl_geld_m"],
    },
    "arbeitsl_geld_2": {
        "in_provided": [
            "p_id",
            "hh_id",
            "tu_id",
            "kind",
            "alter",
            "bruttokaltmiete_m_hh",
            "heizkosten_m_hh",
            "wohnfläche_hh",
            "bewohnt_eigentum_hh",
            "alleinerz",
            "bruttolohn_m",
            "sum_ges_rente_priv_rente_m",
            "kapitaleink_brutto_m",
            "arbeitsl_geld_m",
            "sonstig_eink_m",
            "eink_selbst_m",
            "eink_vermietung_m",
            "eink_st_tu",
            "soli_st_tu",
            "sozialv_beitr_m",
            "kindergeld_m_hh",
            "kind_unterh_erhalt_m",
            "unterhaltsvors_m",
            "elterngeld_m",
            "jahr",
            "wohngeld_vor_vermög_check_m_hh",
            "vermögen_bedürft_hh",
            "geburtsjahr",
            "rentner",
            "in_ausbildung",
            "arbeitsstunden_w",
            "bürgerg_bezug_vorj",
        ],
        "out": [
            "arbeitsl_geld_2_eink_anr_frei_m",
            "arbeitsl_geld_2_eink_m",
            # "_arbeitsl_geld_2_alleinerz_mehrbedarf_m_hh",
            "arbeitsl_geld_2_regelsatz_m_hh",
            "arbeitsl_geld_2_kost_unterk_m_hh",
            # "unterhaltsvors_m_hh",
            # "arbeitsl_geld_2_vor_vorrang_m_hh",
            "arbeitsl_geld_2_m_hh",
        ],
    },
    "benefit_checks": {
        "in_provided": [
            "p_id",
            "hh_id",
            "tu_id",
            "kind",
            "rentner",
            "alter",
            "vermögen_bedürft_hh",
            "_kinderzuschl_vor_vermög_check_m_tu",
            "wohngeld_vor_vermög_check_m_hh",
            "arbeitsl_geld_2_regelbedarf_m_hh",
            "kindergeld_m_hh",
            "kind_unterh_erhalt_m_hh",
            "unterhaltsvors_m_hh",
            "arbeitsl_geld_2_eink_m_hh",
            "geburtsjahr",
            "jahr",
        ],
        "out": ["kinderzuschl_m_hh", "wohngeld_m_hh", "arbeitsl_geld_2_m_hh"],
    },
    "eink_st": {
        "in_provided": [
            "p_id",
            "hh_id",
            "tu_id",
            "kind",
            "zu_verst_eink_kein_kinderfreib",
            "zu_verst_eink_kinderfreib",
            "kapitaleink_brutto",
        ],
        "out": [
            "eink_st_ohne_kinderfreib_tu",
            "eink_st_mit_kinderfreib_tu",
            "abgelt_st_tu",
            "soli_st_tu",
        ],
    },
    "elterngeld": {
        "in_provided": [
            "hh_id",
            "tu_id",
            "p_id",
            "kind",
            "bruttolohn_m",
            "bruttolohn_vorj_m",
            "wohnort_ost",
            "eink_st_m",
            "soli_st_m",
            "sozialv_beitr_m",
            "geburtsjahr",
            "geburtsmonat",
            "geburtstag",
            "m_elterngeld_mut_hh",
            "m_elterngeld_vat_hh",
            "m_elterngeld",
            "jahr",
        ],
        "out": [
            "elterngeld_m",
            "elterngeld_geschw_bonus_anspruch",
            "_elterngeld_anz_mehrlinge_anspruch",
            "elternzeit_anspruch",
        ],
    },
    "favorability_check": {
        "in_provided": [
            "hh_id",
            "tu_id",
            "p_id",
            "kind",
            "eink_st_ohne_kinderfreib_tu",
            "eink_st_mit_kinderfreib_tu",
            "zu_verst_eink_mit_kinderfreib_tu",
            "_zu_verst_eink_ohne_kinderfreib_tu",
            "abgelt_st_tu",
            "kindergeld_m",
            "jahr",
        ],
        "out": ["eink_st_tu", "zu_verst_eink_tu"],
    },
    "full_taxes_and_transfers": {
        "in_provided": [],
        "out": [
            # TODO: what are the targets?
            # "eink_st_tu",
            # "soli_st_tu",
            # "abgelt_st_tu",
            # "ges_rentenv_beitr_m",
            # "arbeitsl_v_beitr_m",
            # "ges_krankenv_beitr_m",
            # "ges_pflegev_beitr_m",
            # "arbeitsl_geld_m",
            # "kindergeld_m_tu",
            # "arbeitsl_geld_2_m_hh",
            # "kinderzuschl_m_hh",
            # "wohngeld_m_hh",
            # "unterhaltsvors_m_hh",
        ],
    },
    "grundrente": {
        "in_provided": [
            "p_id",
            "tu_id",
            "hh_id",
            "grundr_zeiten",
            "grundr_bew_zeiten",
            "wohnort_ost",
            "rente_vorj_vor_grundr_proxy_m",
            "bruttolohn_vorj_m",
            "eink_selbst",
            "eink_vermietung",
            "kapitaleink",
            "alter",
            "alleinstehend",
            "geburtsjahr",
            "bruttolohn_m",
            "entgeltp",
            "ges_rente_zugangsfaktor",
            "rentner",
            "grundr_entgeltp",
            "kind",
        ],
        "out": [
            "grundr_zuschlag_bonus_entgeltp",
            "grundr_zuschlag_vor_eink_anr_m",
            "grundr_zuschlag_m",
            "ges_rente_m",
        ],
    },
    "grundrente_proxy_rente": {
        "in_provided": [
            "p_id",
            "tu_id",
            "hh_id",
            "alter",
            "priv_rente_m",
            "entgeltp",
            "geburtsjahr",
            "geburtsmonat",
            "rentner",
            "jahr_renteneintr",
            "wohnort_ost",
            "bruttolohn_m",
            "weiblich",
            "y_pflichtbeitr_ab_40",
            "m_pflichtbeitrag",
            "m_freiw_beitrag",
            "m_ersatzzeit",
            "m_schul_ausbild",
            "m_kind_berücks_zeit",
            "m_pfleg_berücks_zeit",
            "m_arbeitsunfähig",
            "m_krank_ab_16_bis_24",
            "m_mutterschutz",
            "m_arbeitslos",
            "m_ausbild_suche",
            "m_alg1_übergang",
            "m_geringf_beschäft",
        ],
        "out": [
            "rente_vorj_vor_grundr_proxy_m",
        ],
    },
    "grunds_im_alter": {
        "in_provided": [
            "p_id",
            "tu_id",
            "hh_id",
            "jahr",
            "kind",
            "alter",
            "bruttokaltmiete_m_hh",
            "heizkosten_m_hh",
            "wohnfläche_hh",
            "bruttolohn_m",
            "kapitaleink_brutto_m",
            "grundr_zeiten",
            "rentner",
            "schwerbeh_g",
            "vermögen_bedürft_hh",
            "alleinerz",
            "bewohnt_eigentum_hh",
            "arbeitsl_geld_m",
            "sonstig_eink_m",
            "eink_selbst_m",
            "eink_vermietung_m",
            "eink_st_tu",
            "soli_st_tu",
            "sozialv_beitr_m",
            "kindergeld_m_hh",
            "kind_unterh_erhalt_m",
            "unterhaltsvors_m",
            "elterngeld_m",
            "priv_rente_m",
            "ges_rente_m",
            "geburtstag",
            "geburtsmonat",
            "geburtsjahr",
        ],
        "out": [
            "grunds_im_alter_m_hh",
        ],
    },
    "kindergeld": {
        "in_provided": [
            "hh_id",
            "tu_id",
            "p_id",
            "alter",
            "kind",
            "arbeitsstunden_w",
            "in_ausbildung",
            "bruttolohn_m",
            "_zu_verst_eink_ohne_kinderfreib_tu",
        ],
        "out": [
            "kindergeld_m_tu",
            "kinderbonus_m_tu",
            "kindergeld_m_hh",
            "kinderbonus_m_hh",
        ],
    },
    "kinderzuschl": {
        "in_provided": [
            "p_id",
            "hh_id",
            "tu_id",
            "kind",
            "alter",
            "arbeitsstunden_w",
            "bruttolohn_m",
            "in_ausbildung",
            "bruttokaltmiete_m_hh",
            "heizkosten_m_hh",
            "alleinerz",
            "kindergeld_anspruch",
            "_arbeitsl_geld_2_alleinerz_mehrbedarf_m_hh",
            "kinderzuschl_bruttoeink_eltern_m",
            "kinderzuschl_eink_eltern_m",
            "kindergeld_m_hh",
            "kind_unterh_erhalt_m",
            "unterhaltsvors_m",
            "jahr",
            "geburtsjahr",
            "vermögen_bedürft_hh",
            "bürgerg_bezug_vorj",
        ],
        "out": [
            "_kinderzuschl_vor_vermög_check_m_tu",
            "_kinderzuschl_nach_vermög_check_m_tu",
        ],
    },
    "renten_alter": {
        "in_provided": [
            # TODO: What are the inputs?
            # "p_id",
            # "hh_id",
            # "tu_id",
            # "alter",
            # "jahr",
            # "geburtsjahr",
            # "geburtsmonat",
            # "m_arbeitsunfähig",
            # "m_krank_ab_16_bis_24",
            # "m_mutterschutz",
            # "m_arbeitslos",
            # "m_ausbild_suche",
            # "m_schul_ausbild",
            # "m_alg1_übergang",
            # "m_geringf_beschäft",
            # "weiblich",
            # "y_pflichtbeitr_ab_40",
            # "m_pflichtbeitrag",
            # "m_freiw_beitrag",
            # "m_ersatzzeit",
            # "m_kind_berücks_zeit",
            # "m_pfleg_berücks_zeit",
        ],
        "out": [
            # TODO: what are the targets?
            # "ges_rente_regelaltersgrenze",
            # "ges_rente_frauen_altersgrenze",
            # "_ges_rente_langj_altersgrenze",
            # "_ges_rente_besond_langj_altersgrenze",
        ],
    },
    "renten_anspr": {
        "in_provided": [
            "p_id",
            "hh_id",
            "tu_id",
            "bruttolohn_m",
            "wohnort_ost",
            "alter",
            "jahr",
            "geburtsjahr",
            "entgeltp",
            "geburtsmonat",
            "m_arbeitsunfähig",
            "m_krank_ab_16_bis_24",
            "m_mutterschutz",
            "m_arbeitslos",
            "m_ausbild_suche",
            "m_schul_ausbild",
            "m_alg1_übergang",
            "m_geringf_beschäft",
            "weiblich",
            "y_pflichtbeitr_ab_40",
            "m_pflichtbeitrag",
            "m_freiw_beitrag",
            "m_ersatzzeit",
            "m_kind_berücks_zeit",
            "m_pfleg_berücks_zeit",
        ],
        "out": [
            "entgeltp_update",
            "entgeltp_update_lohn",
            "_ges_rente_altersgrenze_abschlagsfrei",
        ],
    },
    "soli_st": {
        "in_provided": [
            "p_id",
            "hh_id",
            "tu_id",
            "kind",
            "eink_st_mit_kinderfreib_tu",
            "abgelt_st_tu",
        ],
        "out": ["soli_st_tu"],
    },
    "sozialv_beitr": {
        "in_provided": [
            "p_id",
            "hh_id",
            "tu_id",
            "bruttolohn_m",
            "wohnort_ost",
            "alter",
            "selbstständig",
            "hat_kinder",
            "eink_selbst_m",
            "sum_ges_rente_priv_rente_m",
            "in_priv_krankenv",
            "jahr",
        ],
        "out": [
            "sozialv_beitr_m",
            "sozialv_beitr_arbeitg_m",
            "_sozialv_beitr_arbeitn_arbeitg_m",
            "ges_rentenv_beitr_m",
            "arbeitsl_v_beitr_m",
            "ges_krankenv_beitr_m",
            "ges_pflegev_beitr_m",
        ],
    },
    "unterhalt": {
        "in_provided": [
            "p_id",
            "hh_id",
            "tu_id",
            "kind_unterh_anspr_m",
            "kindergeld_m",
            "jahr",
            "kind",
        ],
        "out": ["kind_unterh_zahlbetr_m"],
    },
    "unterhaltsvors": {
        "in_provided": [
            "p_id",
            "hh_id",
            "tu_id",
            "alleinerz",
            "alter",
            "bruttolohn_m",
            "sonstig_eink_m",
            "kapitaleink_brutto_m",
            "eink_vermietung_m",
            "eink_selbst_m",
            "arbeitsl_geld_m",
            "sum_ges_rente_priv_rente_m",
            "jahr",
            "monat",
            "kind_unterh_erhalt_m",
        ],
        "out": ["unterhaltsvors_m"],
    },
    "vorsorgeaufw": {
        "in_provided": [
            "p_id",
            "tu_id",
            "hh_id",
            "bruttolohn_m",
            "kind",
            "priv_rentenv_beitr_m",
            "ges_rentenv_beitr_m",
            "arbeitsl_v_beitr_m",
            "ges_pflegev_beitr_m",
            "jahr",
            "ges_krankenv_beitr_m",
        ],
        "out": ["vorsorgeaufw_tu"],
    },
    "wohngeld": {
        "in_provided": [
            "p_id",
            "hh_id",
            "tu_id",
            "kind",
            "bruttokaltmiete_m_hh",
            "alleinerz",
            "alter",
            "immobilie_baujahr_hh",
            "kindergeld_anspruch",
            "mietstufe",
            "bruttolohn_m",
            "sum_ges_rente_priv_rente_m",
            "rente_ertragsanteil",
            "elterngeld_m",
            "arbeitsl_geld_m",
            "sonstig_eink_m",
            "kind_unterh_erhalt_m",
            "unterhaltsvors_m",
            "eink_selbst_m",
            "eink_abhängig_beschäftigt",
            "kapitaleink_brutto",
            "eink_vermietung_m",
            "ges_rentenv_beitr_m",
            "ges_krankenv_beitr_m",
            "behinderungsgrad",
            "jahr",
            "eink_st_tu",
            "vermögen_bedürft_hh",
            "haushaltsgröße_hh",
            "geburtstag",
            "geburtsmonat",
            "geburtsjahr",
        ],
        "out": ["wohngeld_vor_vermög_check_m_hh", "wohngeld_nach_vermög_check_m_hh"],
    },
    "zu_verst_eink": {
        "in_provided": [
            "p_id",
            "hh_id",
            "tu_id",
            "bruttolohn_m",
            "betreuungskost_m",
            "eink_selbst_m",
            "kapitaleink_brutto_m",
            "eink_vermietung_m",
            "jahr_renteneintr",
            "sum_ges_rente_priv_rente_m",
            "arbeitsstunden_w",
            "in_ausbildung",
            "kind",
            "behinderungsgrad",
            "priv_rentenv_beitr_m",
            "alleinerz",
            "alter",
            "jahr",
            "wohnort_ost",
            "selbstständig",
            "hat_kinder",
            "in_priv_krankenv",
            "geburtsjahr",
            "vorsorgeaufw_tu",
        ],
        "out": [
            "_zu_verst_eink_ohne_kinderfreib_tu",
            "zu_verst_eink_mit_kinderfreib_tu",
            "eink_st_kinderfreib_tu",
            "eink_st_altersfreib",
            "alleinerz_freib_tu",
            "sum_eink",
            "_eink_st_behinderungsgrad_pauschbetrag",
        ],
    },
}

In [None]:
def list_csv_files() -> list[Path]:
    """Return all ``*.csv`` files located directly in ``TEST_DATA_DIR``.

    The paths are sorted so that callers process (and print) the files in a
    reproducible order; ``glob`` itself yields entries in arbitrary order.
    """
    return sorted(TEST_DATA_DIR.glob("*.csv"))


def read_file(file_name: str) -> pd.DataFrame:
    """Load a CSV file from ``TEST_DATA_DIR`` into a DataFrame.

    The first row is used as the header and the first column is read as the
    index; after squeezing along the columns axis the index is turned back into
    a regular column via ``reset_index``.
    """
    csv_path = TEST_DATA_DIR / file_name
    raw = pd.read_csv(csv_path, header=0, index_col=0, encoding="utf-8")
    squeezed = raw.squeeze("columns")
    return squeezed.reset_index()


def unique_years(df: pd.DataFrame, column_name: str = "jahr") -> list[int]:
    """Return the distinct values of ``df[column_name]`` in ascending order."""
    distinct_values = list(df[column_name].unique())
    distinct_values.sort()
    return distinct_values


def grouped_by_year(
    df: pd.DataFrame, column_name: str = "jahr"
) -> dict[int, pd.DataFrame]:
    """Split ``df`` into one sub-frame per distinct value of ``column_name``.

    The returned dict maps each value (in ascending order, as produced by
    ``unique_years``) to the rows of ``df`` whose ``column_name`` equals it.
    """
    frames_by_year = {}
    for year in unique_years(df, column_name):
        rows_of_year = df[column_name] == year
        frames_by_year[year] = df[rows_of_year]
    return frames_by_year


def columns_by_role(
    df: pd.DataFrame, name: str
) -> tuple[list[str], list[str], list[str], list[str], list[str]]:
    """Look up which columns act as inputs, outputs, notes and sources.

    Input and output columns come from ``roles[name]``; a missing entry or a
    missing key yields an empty list. Note and source columns are those columns
    of ``df`` whose name appears in ``note_columns`` / ``source_columns``.

    Returns:
        ``(in_provided, in_assumed, out, note, source)`` column-name lists.
    """
    role_spec = roles.get(name, {})
    provided_inputs = role_spec.get("in_provided", [])
    assumed_inputs = role_spec.get("in_assumed", [])
    outputs = role_spec.get("out", [])

    notes = []
    sources = []
    for column in df:
        if column in note_columns:
            notes.append(column)
        if column in source_columns:
            sources.append(column)

    return provided_inputs, assumed_inputs, outputs, notes, sources


def create_yaml(df: pd.DataFrame, name: str) -> dict[str, dict]:
    """Build the nested dict that is later dumped as YAML for one test data set.

    Args:
        df: Test data; the caller's frame is not modified.
        name: Key used by ``columns_by_role`` to find the column roles.

    Returns:
        A dict with one entry per household, keyed ``"hh_id_<id>"`` in ascending
        order of ``hh_id`` (or a single ``"hh_id_unknown"`` entry if ``df`` has
        no ``hh_id`` column). Each entry has the keys ``"info"`` (note and
        source text), ``"inputs"`` (``"provided"`` and ``"assumed"`` columns as
        lists) and ``"outputs"`` (output columns as lists).
    """
    (
        in_cols_provided,
        in_cols_assumed,
        out_cols,
        note_cols,
        source_cols,
    ) = columns_by_role(df, name)

    # Replace NaN by None on a new frame instead of in place, so the DataFrame
    # passed in by the caller (possibly a slice of a larger frame) stays untouched.
    df = df.replace(to_replace=np.nan, value=None)

    def first_row_text(frame: pd.DataFrame, columns: list[str]) -> str:
        # Only the first row of each column is used; empty values are skipped.
        texts = (value_to_string(frame[column].iloc[0]) for column in columns)
        return "\n\n".join(text for text in texts if text != "")

    def df_to_dict(frame: pd.DataFrame) -> dict:
        specs = {
            "note": first_row_text(frame, note_cols),
            "source": first_row_text(frame, source_cols),
        }
        inputs = {
            "provided": frame[in_cols_provided].to_dict("list"),
            "assumed": frame[in_cols_assumed].to_dict("list"),
        }
        outputs = frame[out_cols].to_dict("list")
        return {"info": specs, "inputs": inputs, "outputs": outputs}

    out = {}

    if "hh_id" in df:
        for hh_id in sorted(df["hh_id"].unique()):
            df_hh = df.loc[df["hh_id"] == hh_id]
            out[f"hh_id_{hh_id}"] = df_to_dict(df_hh)
    else:
        out["hh_id_unknown"] = df_to_dict(df)

    return out


def value_to_string(value: Any) -> str:
    """Return ``str(value)``, or an empty string if pandas considers it null."""
    return "" if pd.isnull(value) else str(value)


def write_yaml_to_file(
    out: dict[str, dict], name: str, year: Optional[int] = None
) -> None:
    """Dump ``out`` as YAML into the ``name`` sub-directory of ``TEST_DATA_DIR``.

    The file is called ``<year>.yaml`` if ``year`` is given and ``<name>.yaml``
    otherwise. Missing parent directories are created, and the target path is
    printed before writing.
    """
    text = yaml.dump(out, sort_keys=False, allow_unicode=True, indent=2, width=88)

    file_name = f"{name}.yaml" if year is None else f"{year}.yaml"
    path = TEST_DATA_DIR / name / file_name
    path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Writing to {path}")

    path.write_text(text, encoding="utf-8")


def convert_test_data() -> None:
    """Convert every CSV file found by ``list_csv_files`` into YAML files.

    Data with a ``jahr`` column is split by year and written to one file per
    year; data without it is written to a single file named after the CSV stem.
    """
    for csv_path in list_csv_files():
        data = read_file(csv_path)
        test_name = csv_path.stem

        if "jahr" in data:
            for year, data_of_year in grouped_by_year(data).items():
                converted = create_yaml(data_of_year, test_name)
                write_yaml_to_file(converted, test_name, year)
            continue

        converted = create_yaml(data, test_name)
        write_yaml_to_file(converted, test_name)

In [None]:
# Print one `"<csv stem>": {},` line per CSV file found by `list_csv_files`.
# NOTE(review): presumably meant as a copy-and-paste skeleton for new entries in
# `roles` — confirm.
for file in list_csv_files():
    print(f'"{file.stem}": {"{}"},')

In [None]:
# Run the conversion: writes the YAML files for all CSV files in TEST_DATA_DIR.
convert_test_data()