**Runs through with SOEP v37.**

In [1]:
conda develop .

path exists, skipping /Users/paulina/Documents/IZA/Arbeit/gettsim
completed operation for: /Users/paulina/Documents/IZA/Arbeit/gettsim

Note: you may need to restart the kernel to use updated packages.


# Imports

In [2]:
%matplotlib inline
%load_ext lab_black

%load_ext autoreload
%autoreload 2

import sys, os
import json
import pickle, re, itertools
import csv

import pandas as pd
import numpy as np

from pathlib import Path
from functools import reduce

from matplotlib import pyplot as plt
import seaborn as sns

# from patsy import dmatrix

from gettsim import set_up_policy_environment
from gettsim import compute_taxes_and_transfers
from gettsim.config import TYPES_INPUT_VARIABLES

from pandas.api.types import is_bool_dtype
from pandas.api.types import is_datetime64_any_dtype
from pandas.api.types import is_float_dtype
from pandas.api.types import is_integer_dtype

# from gettsim-application.src import validate_soep_gettsim

ModuleNotFoundError: No module named 'dags'

# Load files

In [None]:
# All locations are resolved relative to the directory the notebook is started from;
# the sibling projects are expected next to it (one level up).
CWD = Path.cwd()
IN_REF_VAL = CWD.parent.joinpath("gettsim-application", "src")
IN_VAL = IN_REF_VAL.joinpath("validation_data")
OUT_DATA_SOEP = CWD.parent.joinpath("soep-data", "out", "data")
OUT_DATA_SOEP

In [None]:
# OUT_DATA_SOEP = CWD.parent / ".." / ".." / "soep-data" / "out" / "data"

In [None]:
# Read the pickled data set `gettsim_data_soep.pickle` from the soep-data output folder.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
gs_data = pd.read_pickle(OUT_DATA_SOEP / "gettsim_data_soep.pickle")

# Prepare SOEP data set

In [None]:
def set_expected_types(data, types=None):
    """Cast input columns of ``data`` to the dtypes expected for them.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing (at least) every column named in ``types``.
    types : dict, optional
        Mapping from column name to the expected Python type (``int``,
        ``float`` or ``bool``). Defaults to ``TYPES_INPUT_VARIABLES``.
        Columns mapped to any other type are left untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the converted columns. The frame passed in is
        not modified, which also avoids ``SettingWithCopyWarning`` when
        ``data`` is a slice of another frame.

    Raises
    ------
    KeyError
        If a column named in ``types`` is missing from ``data``.

    Notes
    -----
    A message is printed for every converted column.
    ``astype(bool)`` turns missing values (NaN) into ``True``; fill missing
    values beforehand if that is not wanted.
    """
    if types is None:
        types = TYPES_INPUT_VARIABLES

    data = data.copy()
    for var, exp_type in types.items():
        if exp_type == int:
            print(f"{var} converted to integer")
            try:
                data[var] = data[var].astype(int)
            except (ValueError, TypeError):
                # A plain integer column cannot hold missing or non-finite
                # values; fall back to pandas' nullable integer dtype.
                # Only conversion errors are caught, so interrupts and
                # unrelated failures still surface.
                data[var] = data[var].round().astype("Int64")
        elif exp_type == float:
            print(f"{var} converted to float")
            data[var] = data[var].astype("float")
        elif exp_type == bool:
            print(f"{var} converted to boolean")
            data[var] = data[var].astype(bool)
    return data

In [None]:
# Keep only the 2019 observations and drop three household-size columns
# (`anz_kinder_hh` is requested as a GETTSIM target further below).
data_2019 = (
    gs_data.query("jahr == 2019")
    .reset_index()
    .drop(columns=["anz_kinder_hh", "anz_minderj_hh", "hhsize_tu"])
)

In [None]:
# Additional columns that are kept alongside the GETTSIM input variables.
extra_columns = [
    "p_gewicht",
    "hh_gewicht",
    "weiblich",
    "hh_size",
    "erwerbstätig",
    "nicht_erwerbstätig",
    "bildungsstand_isced_cat",
    "bildungsstand_casmin_cat",
    "ledig",
    "geschieden",
    "verwitwet",
    "verheiratet",
    "arbeitslos",
    "beamte",
    "vollzeit",
    "stundenlohn_netto",
    "nettolohn_m",
    "stundenlohn",
    "geringfügig_erwb",
    "teilzeit",
    "werkstatt",
    "pensions_pub_m",
    "unempl_benefit2_hh",
    "unempl_benefit",
    "housing_benefit_hh",
    "kinderzuschlag_hh",
    "kindergeld_hh",
    "einkommenssteuer_hh",
    "kapital_eink_hh",
]

# The dtypes are set once before and once after missing values are replaced by 0.
# NOTE(review): `astype(bool)` maps NaN to True, so boolean inputs that are missing
# before the fill end up as True rather than False — confirm this is intended.
data_2019 = (
    data_2019[[*TYPES_INPUT_VARIABLES, *extra_columns]]
    .pipe(set_expected_types)
    .fillna(0)
    .pipe(set_expected_types)
)

In [None]:
data_2019.dtypes

In [None]:
# Columns that still contain missing values, with the number of missing entries.
na_counts = data_2019.isna().sum()
na_counts[na_counts > 0]

# Run GETTSIM

In [None]:
# First run: only the targets related to health insurance contributions.
policy_params, policy_functions = set_up_policy_environment(2019)
targets = [
    "geringfügig_beschäftigt",
    "ges_krankenv_beitr_rente_m",
    "ges_krankenv_beitr_selbst_m",
    "in_gleitzone",
    "_ges_krankenv_midi_job_arbeitn_m",
    "_ges_krankenv_beitr_reg_beschäftigt",
    "ges_krankenv_beitr_m",
]
# Compute the targets and attach the input columns to the computed ones.
result = compute_taxes_and_transfers(
    data=data_2019,
    params=policy_params,
    targets=targets,
    functions=policy_functions,
).join(data_2019)

In [None]:
policy_params["soz_vers_beitr"]["geringfügige_eink_grenzen_m"]

In [None]:
result.loc[result["ges_krankenv_beitr_m"].isna()][
    targets
    + [
        "eink_selbst_m",
        "selbstständig",
        "priv_rentenv_beitr_m",
        "arbeitsstunden_w",
        "bruttolohn_m",
    ]
]

In [None]:
result[
    targets
    + [
        "eink_selbst_m",
        "selbstständig",
        "priv_rentenv_beitr_m",
        "arbeitsstunden_w",
        "bruttolohn_m",
    ]
].sample(10)

In [None]:
# Second run: all targets used in the validation plus intermediate results for debugging.
# Every target is listed exactly once ("anz_kinder_hh" and
# "arbeitsl_geld_2_vor_vorrang_m_hh" used to appear twice).
policy_params, policy_functions = set_up_policy_environment(2019)
targets = [
    "ges_krankenv_beitr_m",
    "arbeitsl_v_beitr_m",
    "ges_rentenv_beitr_m",
    "ges_pflegev_beitr_m",
    "sozialv_beitr_gesamt_m",
    "arbeitsl_geld_m",
    # "rente_anspr_m",
    "abgelt_st_tu",
    "soli_st_tu",
    "kindergeld_m",
    "kindergeld_m_hh",
    "eink_st_tu",
    "unterhaltsvors_m",
    "arbeitsl_geld_2_regelsatz_m_hh",
    "arbeitsl_geld_2_kost_unterk_m_hh",
    "unterhaltsvors_m_hh",
    "kinderzuschl_m_hh",
    "wohngeld_m_hh",
    "arbeitsl_geld_2_m_hh",
    "grunds_im_alter_m_hh",
    "anz_kinder_hh",
    "geringfügig_beschäftigt",
    "in_gleitzone",
    # for debugging
    "ges_krankenv_beitr_rente_m",
    "sum_ges_rente_priv_rente_m",
    "_ges_krankenv_beitr_bemess_grenze_m",
    "ges_rente_m",
    "ges_krankenv_beitr_selbst_m",
    "_ges_krankenv_bemessungsgrundlage_eink_selbst",
    "ges_krankenv_beitr_satz",
    "_ges_krankenv_beitr_satz_arbeitg",
    "kindergeld_m_ab_1997",
    "kindergeld_basis_m",
    "kinderfreib_günstiger_tu",
    "arbeitsl_geld_2_vor_vorrang_m_hh",
    "wohngeld_vorrang_hh",
    "kinderzuschl_vorrang_hh",
    "wohngeld_kinderzuschl_vorrang_hh",
    "erwachsene_alle_rentner_hh",
    "wohngeld_nach_vermög_check_m_hh",
    "wohngeld_vor_vermög_check_m_hh",
    "haushaltsgröße_hh",
    "arbeitsl_geld_2_regelbedarf_m_hh",
    "arbeitsl_geld_2_eink_m_hh",
    "kapitaleink_brutto_tu",
    "zu_verst_kapitaleink_tu",
    "anz_erwachsene_tu",
    "anz_erwachsene_hh",
]
result = compute_taxes_and_transfers(
    data=data_2019,
    params=policy_params,
    targets=targets,
    functions=policy_functions,
)
# Attach the input columns to the computed targets.
result = result.join(data_2019)

In [None]:
result

In [None]:
(result.groupby("hh_id")["wohngeld_m_hh"].max() > 0).mean()

In [None]:
result.loc[result["ges_krankenv_beitr_m"] < 0]

# Compare Results to observed measures

In [None]:
# Make the helper modules in the `src` folder of gettsim-application importable.
# IN_REF_VAL already points to that folder relative to the working directory, so no
# user-specific absolute path is needed. `sys` is imported in the imports cell.
if str(IN_REF_VAL) not in sys.path:
    sys.path.append(str(IN_REF_VAL))
from load_validation_data import validation_data

In [None]:
val_data = validation_data(IN_VAL=IN_VAL)
val_data

In [None]:
from validate_soep_gettsim import compare_data_moments

In [None]:
from IPython.display import display

In [None]:
# Make the `arbeitslos` column available under the name `erwerbslos` as well
# (presumably the name the validation comparison expects — TODO confirm).
result["erwerbslos"] = result["arbeitslos"]

In [None]:
out_1, out_2 = compare_data_moments(
    year=2019, val_data=val_data, result=result, IN_REF_VAL=IN_REF_VAL
)
# Show both comparison tables in full. The row limit is lifted for this output only,
# so the display setting does not leak into the following cells.
with pd.option_context("display.max_rows", None):
    display(out_1, out_2)

# Debugging

In [None]:
pd.set_option("display.max_rows", 10)

## Private pension

In [None]:
# Rows with gross wage of at most 450 that still carry positive social insurance
# contributions although they are neither flagged `rentner` nor `selbstständig`.
low_wage = result["bruttolohn_m"].le(450)
pays_contributions = result["sozialv_beitr_gesamt_m"].gt(0)
not_retired = result["rentner"].eq(False)
not_self_employed = result["selbstständig"].eq(False)

result_priv_pension = result.loc[
    low_wage & pays_contributions & not_retired & not_self_employed
].copy()

In [None]:
result_priv_pension[
    [
        "jahr",
        "p_id",
        "hh_id",
        "erwerbstätig",
        "in_ausbildung",
        "stundenlohn",
        "vermögen_bedürft_hh",
        "bruttolohn_m",
        "alter",
        "ges_krankenv_beitr_m",
        "arbeitsl_v_beitr_m",
        "ges_rentenv_beitr_m",
        "ges_pflegev_beitr_m",
        "sozialv_beitr_gesamt_m",
        "arbeitsl_geld_m",
        "anz_kinder_hh",
        "vollzeit",
        "geringfügig_erwb",
        "teilzeit",
        "rentner",
        "geringfügig_beschäftigt",
        "selbstständig",
        "in_gleitzone",
        "ges_krankenv_beitr_rente_m",
        "sum_ges_rente_priv_rente_m",
        "_ges_krankenv_beitr_bemess_grenze_m",
        "ges_rente_m",
        "priv_rente_m",
        "pensions_pub_m",
    ]
]

### with pensions_pub

In [None]:
result_priv_pension.loc[result_priv_pension["pensions_pub_m"] > 0][
    [
        "jahr",
        "p_id",
        "hh_id",
        "erwerbstätig",
        "in_ausbildung",
        "stundenlohn",
        "vermögen_bedürft_hh",
        "bruttolohn_m",
        "alter",
        "ges_krankenv_beitr_m",
        "arbeitsl_v_beitr_m",
        "ges_rentenv_beitr_m",
        "ges_pflegev_beitr_m",
        "sozialv_beitr_gesamt_m",
        "arbeitsl_geld_m",
        "anz_kinder_hh",
        "vollzeit",
        "geringfügig_erwb",
        "teilzeit",
        "rentner",
        "geringfügig_beschäftigt",
        "selbstständig",
        "in_gleitzone",
        "ges_krankenv_beitr_rente_m",
        "sum_ges_rente_priv_rente_m",
        "_ges_krankenv_beitr_bemess_grenze_m",
        "ges_rente_m",
        "priv_rente_m",
        "pensions_pub_m",
    ]
]

### without pensions_pub

In [None]:
result_priv_pension.loc[result_priv_pension["pensions_pub_m"] == 0][
    [
        "jahr",
        "p_id",
        "hh_id",
        "erwerbstätig",
        "in_ausbildung",
        "stundenlohn",
        "vermögen_bedürft_hh",
        "bruttolohn_m",
        "alter",
        "ges_krankenv_beitr_m",
        "arbeitsl_v_beitr_m",
        "ges_rentenv_beitr_m",
        "ges_pflegev_beitr_m",
        "sozialv_beitr_gesamt_m",
        "arbeitsl_geld_m",
        "anz_kinder_hh",
        "vollzeit",
        "geringfügig_erwb",
        "teilzeit",
        "rentner",
        "geringfügig_beschäftigt",
        "selbstständig",
        "in_gleitzone",
        "ges_krankenv_beitr_rente_m",
        "sum_ges_rente_priv_rente_m",
        "_ges_krankenv_beitr_bemess_grenze_m",
        "ges_rente_m",
        "priv_rente_m",
        "pensions_pub_m",
    ]
]

## self-employed

In [None]:
# Rows flagged `selbstständig` with gross wage of at most 450 and positive social
# insurance contributions that are not flagged `rentner`.
low_wage = result["bruttolohn_m"].le(450)
pays_contributions = result["sozialv_beitr_gesamt_m"].gt(0)
not_retired = result["rentner"].eq(False)
self_employed = result["selbstständig"].eq(True)

result_selbstst = result.loc[
    low_wage & pays_contributions & not_retired & self_employed
].copy()

In [None]:
result_selbstst[
    [
        "jahr",
        "p_id",
        "hh_id",
        "stundenlohn",
        "vermögen_bedürft_hh",
        "bruttolohn_m",
        "alter",
        "ges_krankenv_beitr_m",
        "arbeitsl_v_beitr_m",
        "ges_rentenv_beitr_m",
        "ges_pflegev_beitr_m",
        "geringfügig_beschäftigt",
        "selbstständig",
        "in_gleitzone",
        "ges_krankenv_beitr_rente_m",
        "sum_ges_rente_priv_rente_m",
        "_ges_krankenv_beitr_bemess_grenze_m",
        "ges_rente_m",
        "priv_rente_m",
        "pensions_pub_m",
        "ges_krankenv_beitr_selbst_m",
        "_ges_krankenv_bemessungsgrundlage_eink_selbst",
        "ges_krankenv_beitr_satz",
        "_ges_krankenv_beitr_satz_arbeitg",
        "eink_selbst_m",
    ]
]

In [None]:
pd.set_option("display.max_rows", 30)
result.loc[result["p_id"] == 181401][
    [
        "jahr",
        "p_id",
        "hh_id",
        "stundenlohn",
        "vermögen_bedürft_hh",
        "bruttolohn_m",
        "alter",
        "ges_krankenv_beitr_m",
        "arbeitsl_v_beitr_m",
        "ges_rentenv_beitr_m",
        "ges_pflegev_beitr_m",
        "geringfügig_beschäftigt",
        "selbstständig",
        "in_gleitzone",
        "ges_krankenv_beitr_rente_m",
        "sum_ges_rente_priv_rente_m",
        "_ges_krankenv_beitr_bemess_grenze_m",
        "ges_rente_m",
        "priv_rente_m",
        "pensions_pub_m",
        "ges_krankenv_beitr_selbst_m",
        "_ges_krankenv_bemessungsgrundlage_eink_selbst",
        "ges_krankenv_beitr_satz",
        "_ges_krankenv_beitr_satz_arbeitg",
        "eink_selbst_m",
    ]
].transpose()

## Niedriglohn Wohnort Ost

In [None]:
pd.set_option("display.max_rows", 10)

# Rows flagged `beschäftigt` and `wohnort_ost` with an hourly wage of at most 11.5.
low_wage_east = (
    result["beschäftigt"] & (result["stundenlohn"] <= 11.5) & result["wohnort_ost"]
)
result.loc[
    low_wage_east,
    [
        "jahr",
        "p_id",
        "hh_id",
        "p_gewicht",
        "erwerbstätig",
        "vollzeit",
        "stundenlohn",
        "geringfügig_erwb",
        "teilzeit",
        "rentner",
        "vermögen_bedürft_hh",
        "bruttolohn_m",
        "alter",
        "ges_krankenv_beitr_m",
        "arbeitsl_v_beitr_m",
        "ges_rentenv_beitr_m",
        "ges_pflegev_beitr_m",
        "sozialv_beitr_gesamt_m",
        "arbeitsl_geld_m",
        "anz_kinder_hh",
    ],
]

## Minderjährige

In [None]:
# Persons younger than 18 with positive total social insurance contributions.
minors_with_contributions = result["alter"].lt(18) & result[
    "sozialv_beitr_gesamt_m"
].gt(0)
result.loc[
    minors_with_contributions,
    [
        "jahr",
        "p_id",
        "hh_id",
        "erwerbstätig",
        "in_ausbildung",
        "stundenlohn",
        "vermögen_bedürft_hh",
        "bruttolohn_m",
        "alter",
        "ges_krankenv_beitr_m",
        "arbeitsl_v_beitr_m",
        "ges_rentenv_beitr_m",
        "ges_pflegev_beitr_m",
        "sozialv_beitr_gesamt_m",
        "arbeitsl_geld_m",
        "anz_kinder_hh",
        "vollzeit",
        "geringfügig_erwb",
        "teilzeit",
        "rentner",
    ],
]

## Kindergeld

In [None]:
# Rows with positive `kindergeld_hh` in the data but a computed `kindergeld_m_hh`
# of zero, restricted to rows flagged `hat_kinder`.
kindergeld_in_data = result["kindergeld_hh"].gt(0)
no_kindergeld_computed = result["kindergeld_m_hh"].eq(0)

result_kindergeld = result.loc[
    kindergeld_in_data & no_kindergeld_computed & result["hat_kinder"]
].copy()

In [None]:
# Key columns of the rows selected for the child benefit check.
kindergeld_cols = [
    "jahr",
    "p_id",
    "hh_id",
    "alter",
    "kindergeld_hh",
    "kindergeld_m_hh",
    "hat_kinder",
    "anz_kinder_hh",
    "bruttolohn_m",
]
result_kindergeld.loc[:, kindergeld_cols]

### Kinderfreibetrag

In [None]:
# All members of the household with hh_id 79880.
result.loc[
    result["hh_id"].eq(79880),
    [
        "jahr",
        "p_id",
        "hh_id",
        "alter",
        "kindergeld_hh",
        "kindergeld_m_hh",
        "kindergeld_m",
        "hat_kinder",
        "anz_kinder_hh",
        "bruttolohn_m",
        "kindergeld_m_ab_1997",
        "kindergeld_basis_m",
        "kinderfreib_günstiger_tu",
    ],
]

In [None]:
# Sum of person weights (divided by 1000) over rows flagged `kind` with positive
# `kindergeld_hh` for which `kinderfreib_günstiger_tu` is set.
kinderfreib_mask = (
    (result["kindergeld_hh"] > 0)
    & result["kind"].astype(bool)
    & result["kinderfreib_günstiger_tu"].astype(bool)
)
(kinderfreib_mask.astype(float) * result["p_gewicht"]).sum() / 1000

### children are over 18

In [None]:
# Households among the selected rows for which `kinderfreib_günstiger_tu` is False.
# The mask is built from `result_kindergeld` itself rather than from the full `result`
# frame, so the selection does not depend on index alignment between the two frames.
result_kindergeld.loc[
    result_kindergeld["kinderfreib_günstiger_tu"] == False, ["hh_id"]
]

In [None]:
# All members of the household with hh_id 66036.
result.loc[
    result["hh_id"].eq(66036),
    [
        "jahr",
        "p_id",
        "hh_id",
        "alter",
        "kindergeld_hh",
        "kindergeld_m_hh",
        "kindergeld_m",
        "hat_kinder",
        "anz_kinder_hh",
        "bruttolohn_m",
        "kindergeld_m_ab_1997",
        "kindergeld_basis_m",
        "kinderfreib_günstiger_tu",
        "in_ausbildung",
    ],
]

## ALG II

In [None]:
# Rows with positive `unempl_benefit2_hh` in the data but a computed
# `arbeitsl_geld_2_m_hh` of zero.
alg2_in_data = result["unempl_benefit2_hh"].gt(0)
no_alg2_computed = result["arbeitsl_geld_2_m_hh"].eq(0)

result_algII = result.loc[alg2_in_data & no_alg2_computed].copy()

In [None]:
# Columns related to the priority checks between the transfers.
algII_vorrang_cols = [
    "jahr",
    "p_id",
    "hh_id",
    "alter",
    "bruttolohn_m",
    "in_ausbildung",
    "unempl_benefit2_hh",
    "arbeitsl_geld_2_m_hh",
    "rentner",
    "arbeitsl_geld_2_vor_vorrang_m_hh",
    "wohngeld_vorrang_hh",
    "kinderzuschl_vorrang_hh",
    "wohngeld_kinderzuschl_vorrang_hh",
    "erwachsene_alle_rentner_hh",
]
result_algII.loc[:, algII_vorrang_cols]

### Rentner

In [None]:
# Weighted number of households (in thousands) with positive `unempl_benefit2_hh`
# that contain at least one person flagged `rentner`.
#
# The person-level flag is aggregated to household level before it is multiplied with
# the household weights, so both factors are indexed by `hh_id`. Multiplying the
# person-level flags directly with the grouped weights would match row labels against
# household ids and produce a meaningless sum.
# NOTE(review): assumes a household-level count is what is wanted here — confirm.
alg2_and_rentner = (result["unempl_benefit2_hh"] > 0) & result["rentner"].astype(bool)
hh_has_alg2_rentner = alg2_and_rentner.groupby(result["hh_id"]).any()
hh_weight = result.groupby("hh_id")["hh_gewicht"].max()

(hh_has_alg2_rentner * hh_weight).sum() / 1000

### income or wealth too high

In [None]:
# Columns related to income, wealth and need of the selected households.
algII_income_cols = [
    "jahr",
    "p_id",
    "hh_id",
    "alter",
    "bruttolohn_m",
    "unempl_benefit2_hh",
    "wohngeld_nach_vermög_check_m_hh",
    "arbeitsl_geld_2_vor_vorrang_m_hh",
    "wohngeld_vor_vermög_check_m_hh",
    "vermögen_bedürft_hh",
    "haushaltsgröße_hh",
    "arbeitsl_geld_2_regelbedarf_m_hh",
    "kindergeld_m_hh",
    "arbeitsl_geld_2_eink_m_hh",
]
result_algII.loc[:, algII_income_cols]

## Alg

In [None]:
# Rows for which both `arbeitsl_geld_m` and `arbeitsl_geld_2_m_hh` are positive.
gets_alg = result["arbeitsl_geld_m"].gt(0)
gets_alg2 = result["arbeitsl_geld_2_m_hh"].gt(0)

result_alg = result.loc[gets_alg & gets_alg2].copy()

In [None]:
# Computed and observed unemployment benefits next to household characteristics.
alg_cols = [
    "jahr",
    "p_id",
    "hh_id",
    "alter",
    "bruttolohn_m",
    "arbeitsl_geld_m",
    "unempl_benefit",
    "arbeitsl_geld_2_m_hh",
    "unempl_benefit2_hh",
    "haushaltsgröße_hh",
    "anz_kinder_hh",
    "in_ausbildung",
]
result_alg.loc[:, alg_cols]

In [None]:
gs_data.loc[gs_data["hh_id"] == 141054]

# Abgeltungssteuer

In [None]:
result_abgelt_st = result.loc[result["abgelt_st_tu"] > 0].copy()

In [None]:
# Capital income columns of the rows with positive `abgelt_st_tu`.
abgelt_st_cols = [
    "jahr",
    "p_id",
    "hh_id",
    "tu_id",
    "alter",
    "anz_erwachsene_tu",
    "anz_erwachsene_hh",
    "bruttolohn_m",
    "abgelt_st_tu",
    "kapitaleink_brutto_tu",
    "zu_verst_kapitaleink_tu",
    "kapitaleink_brutto_m",
    "kapital_eink_hh",
]
result_abgelt_st.loc[:, abgelt_st_cols]

In [None]:
result_abgelt_st["kapitaleink_brutto_tu"].describe()

# Tax units

In [None]:
# Number of distinct tax units within each household and year, repeated on every row
# of that household.
tu_ids_by_household = result.groupby(["jahr", "hh_id"])["tu_id"]
result["anzahl_tu_in_hh"] = tu_ids_by_household.transform("nunique")

In [None]:
result_abgelt_st = result.loc[result["abgelt_st_tu"] > 0].copy()

In [None]:
result_abgelt_st["anzahl_tu_in_hh"].value_counts()

In [None]:
result_eink_st = result.loc[result["eink_st_tu"] > 0].copy()

In [None]:
result_eink_st["anzahl_tu_in_hh"].value_counts()

In [None]:
result_soli_st = result.loc[result["soli_st_tu"] > 0].copy()

In [None]:
result_soli_st["anzahl_tu_in_hh"].value_counts()

In [None]:
result_st = result.copy()

In [None]:
# First non-missing `einkommenssteuer_hh` and `hh_gewicht` per household; their product
# times 12 is summed over all households and reported in units of 1e9.
hh_first = result.groupby("hh_id")[["einkommenssteuer_hh", "hh_gewicht"]].first()
(hh_first["einkommenssteuer_hh"] * hh_first["hh_gewicht"] * 12).sum() / 1_000_000_000