In [1]:
import sys
sys.path.append("src")

import dataclasses
import numpy as np
import itertools
from matplotlib import pyplot as plt

import statfin
import verolysis

In [2]:
# Open the Verohallinto (Finnish Tax Administration) database via statfin's
# PxWebAPI and select the table "tulot_101.px" under "Vero".
db = statfin.PxWebAPI.Verohallinto()
tbl = db.table("Vero", "tulot_101.px")

In [3]:
# Item codes passed as the "Erä" selection of the table query.
# BITS[0] is the count; the remaining entries are income items.
BITS = [
    "HVT_TULOT_10",  # Count
    "HVT_TULOT_70",  # Earned income
    "HVT_TULOT_80",  # Wage income
    "HVT_TULOT_280", # Pension income
]

In [4]:
# Query the items listed in BITS for tax year ("Verovuosi") 2022.
# "*" is presumably a wildcard selecting every value of that dimension —
# TODO confirm against the statfin query API.
df = tbl.query({
    "Verovuosi": 2022,
    "Erä": BITS,
    "Tulonsaajaryhmä": "*",
    "Tuloluokka": "*",
    "Tunnusluvut": "*",
})

In [5]:
def get_cell_keys_gc(df):
    """
    Cell keys by income-earner group and income class.

    A cell key is a pair (group, class). Rows whose group code is "Y" or
    whose class code is "SS" are dropped first (presumably the aggregate
    rows — confirm against the table metadata). The codes that remain are
    converted to integers, and every (group, class) combination is returned
    as a list ordered by group and then by class.
    """
    # De Morgan form of: group != "Y" and class != "SS"
    is_excluded = (df["Tulonsaajaryhmä"] == "Y") | (df["Tuloluokka"] == "SS")
    cells = df[~is_excluded]

    group_ids = sorted(cells["Tulonsaajaryhmä"].unique().astype(int))
    class_ids = sorted(cells["Tuloluokka"].unique().astype(int))
    return [(group_id, class_id) for group_id in group_ids for class_id in class_ids]

In [6]:
def get_cell_row(df, grp, cls, bit):
    """Pick the single row matching the cell key (``grp``, ``cls``) and item ``bit``.

    ``grp`` and ``cls`` are compared as strings against the "Tulonsaajaryhmä"
    and "Tuloluokka" columns. Raises ``AssertionError`` (carrying the number
    of matches) unless exactly one row matches.
    """
    selector = (
        (df["Tulonsaajaryhmä"] == str(grp))
        & (df["Tuloluokka"] == str(cls))
        & (df["Erä"] == bit)
    )
    matches = df[selector]
    n_matches = len(matches)
    assert n_matches == 1, n_matches
    return matches.iloc[0]

In [7]:
def get_cell_size(df, grp, cls):
    """Number of persons in the cell: the ``N`` value of its "HVT_TULOT_10" row."""
    count_row = get_cell_row(df, grp, cls, "HVT_TULOT_10")
    return count_row.N

In [8]:
def get_cell_density(df, grp, cls, bit):
    """Density function of item ``bit`` in the cell (``grp``, ``cls``).

    Returns ``None`` when the row has no mean, i.e. ``Mean`` is ``None`` or
    NaN; otherwise returns the result of
    ``verolysis.income_brackets.row_to_density`` for the row.

    Raises ``AssertionError`` (from ``get_cell_row``) unless exactly one row
    matches the cell and item.
    """
    row = get_cell_row(df, grp, cls, bit)
    mean = row.Mean
    # Missing statistics show up as float NaN in this table, and NaN is never
    # `is None`, so both representations of "missing" are checked explicitly.
    mean_is_nan = isinstance(mean, (float, np.floating)) and np.isnan(mean)
    if mean is None or mean_is_nan:
        return None
    return verolysis.income_brackets.row_to_density(row)

In [9]:
def sample_cell(df, grp, cls, bit, k):
    """Draw ``k`` sample values of item ``bit`` from the cell (``grp``, ``cls``).

    When the cell has no density for the item, ``k`` zeros are returned.
    Otherwise the sample comes from the density's ``uniform_sample``, called
    with ``leftpad`` set to the cell size and ``left=0``.
    """
    cell_size = get_cell_size(df, grp, cls)
    density = get_cell_density(df, grp, cls, bit)
    if density is not None:
        return density.uniform_sample(k, leftpad=cell_size, left=0)
    return np.zeros(k)

In [10]:
# Spot check: 256 samples of "HVT_TULOT_70" from the cell (group 1, class 1).
sample_cell(df, 1, 1, "HVT_TULOT_70", 256)

array([   0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.        ,
          0.        ,    0.        ,    0.        ,    0.     

In [18]:
# Show the size of cell (1, 1) followed by its "HVT_TULOT_70" row.
print(get_cell_size(df, 1, 1))
get_cell_row(df, 1, 1, "HVT_TULOT_70")

259030.0


Verovuosi                  2022
Erä                HVT_TULOT_70
Tulonsaajaryhmä               1
Tuloluokka                    1
Sum                  12243448.0
N                       12253.0
Mean                      999.0
Q1                        195.0
Q3                       1320.0
P10                        80.0
P20                       135.0
P30                       281.0
P40                       513.0
P50                       895.0
P60                      1320.0
P70                      1320.0
P80                      1320.0
P90                      2203.0
Sum_muutos                  5.0
N_muutos                    9.9
Name: 393, dtype: object

In [11]:
# Try to build a density for every item except BITS[0] in every cell, and
# print the offending row plus the error wherever that does not succeed.
for bit in BITS[1:]:
    for grp, cls in get_cell_keys_gc(df):
        try:
            d = get_cell_density(df, grp, cls, bit)
            # A missing density (None) is reported the same way as an exception.
            # The AssertionError carries no message, so print(e) emits an empty line.
            assert d is not None
        except Exception as e:
            print(get_cell_row(df, grp, cls, bit))
            print(e)

Verovuosi                  2022
Erä                HVT_TULOT_70
Tulonsaajaryhmä               1
Tuloluokka                   15
Sum                         NaN
N                           NaN
Mean                        NaN
Q1                          NaN
Q3                          NaN
P10                         NaN
P20                         NaN
P30                         NaN
P40                         NaN
P50                         NaN
P60                         NaN
P70                         NaN
P80                         NaN
P90                         NaN
Sum_muutos                  NaN
N_muutos                    NaN
Name: 407, dtype: object

Verovuosi                  2022
Erä                HVT_TULOT_70
Tulonsaajaryhmä               1
Tuloluokka                   17
Sum                         NaN
N                           NaN
Mean                        NaN
Q1                          NaN
Q3                          NaN
P10                         NaN
P20           

In [None]:
def extract_piecewise(df, bit, entries_per_bin = 8):
    """Concatenate the per-cell values of item ``bit`` into one flat array.

    Every (group, class) cell gets a slot of ``entries_per_bin * 12`` values,
    filled with ``extract_from_row(row, entries_per_bin)`` for the cell's first
    row of ``bit``. Slots are laid out group by group, and class by class
    within a group. Group code "Y" and class code "SS" are left out.

    NOTE(review): ``extract_from_row`` is not defined in the visible part of
    the notebook — confirm it exists before running this cell.

    Raises ``IndexError`` when a cell has no row for ``bit``.
    """
    n_bins = 12
    slot_width = entries_per_bin * n_bins

    # The group and class lists come from the full frame, before narrowing to `bit`.
    group_codes = df[df["Tulonsaajaryhmä"] != "Y"]["Tulonsaajaryhmä"]
    class_codes = df[df["Tuloluokka"] != "SS"]["Tuloluokka"]
    group_ids = sorted(group_codes.unique().astype(int))
    class_ids = sorted(class_codes.unique().astype(int))

    values = np.zeros(slot_width * (len(group_ids) * len(class_ids)))
    bit_rows = df[df["Erä"] == bit]

    start = 0
    for group_id in group_ids:
        group_rows = bit_rows[bit_rows["Tulonsaajaryhmä"] == str(group_id)]
        for class_id in class_ids:
            cell_rows = group_rows[group_rows["Tuloluokka"] == str(class_id)]
            stop = start + slot_width
            values[start:stop] = extract_from_row(cell_rows.iloc[0], entries_per_bin)
            start = stop
    return values

In [None]:
# Flatten the "HVT_TULOT_70" values of all cells into one array.
x = extract_piecewise(df, "HVT_TULOT_70")

In [None]:
# Plot the first 6000 entries of the flattened array `x`.
# `plt` is already imported in the notebook's import cell, so it is not re-imported here.
plt.figure(figsize=(16, 4))
plt.plot(x)
plt.xlim(0, 6000);  # trailing ';' keeps the (0.0, 6000.0) repr out of the cell output

In [None]:
def between(x, a, b=1e12):
    """Return how much of ``x`` lies above ``a``, counted at most up to ``b``.

    Computed as ``clip(x, a, b) - a``: 0 when ``x <= a``, ``x - a`` inside the
    band, and ``b - a`` when ``x >= b``. Works element-wise on arrays.
    """
    return np.clip(x, a, b) - a


@dataclasses.dataclass
class Ansiotulovähennys:
    """Earned income deduction as a callable with tunable parameters.

    The deduction accrues at rate ``t1`` on wage income between ``r1`` and
    ``r2`` and at rate ``t2`` on wage income above ``r2``, up to at most
    ``cap``. It is then reduced by ``t3`` times the net earned income above
    ``r3`` and never drops below zero.

    NOTE(review): the defaults look like the tax year 2022 parameters —
    confirm against the Income Tax Act before relying on them.
    """
    cap: float = 3_570        # maximum amount of the deduction
    r1: float = 2_500         # wage income above which the deduction starts to accrue
    r2: float = 7_230         # wage income above which the accrual rate changes from t1 to t2
    r3: float = 14_000        # net earned income above which the deduction is reduced
    t1: float = (51 / 100)    # accrual rate between r1 and r2
    t2: float = (28 / 100)    # accrual rate above r2
    t3: float = (4.5 / 100)   # reduction rate above r3

    def __call__(self, palkkatulot, puhtaat_ansiotulot):
        """Return the deduction for the given wage income and net earned income.

        Both arguments may be scalars or arrays (element-wise, via numpy).
        """
        up1 = self.t1 * between(palkkatulot, self.r1, self.r2)
        up2 = self.t2 * between(palkkatulot, self.r2)
        # `cap` is an upper limit: take the smaller of the accrued amount and
        # the cap. Using the larger would grant the full cap even at zero income.
        up = np.minimum(up1 + up2, self.cap)
        dn = self.t3 * between(puhtaat_ansiotulot, self.r3)
        return np.maximum(up - dn, 0)


@dataclasses.dataclass
class Perusvähennys:
    """Basic deduction as a callable with tunable parameters.

    The deduction equals the income itself up to ``cap``. For income above
    ``cap`` it shrinks by ``rate`` times the excess and never drops below zero.
    """
    cap: float = 3_740         # largest possible deduction
    rate: float = (18 / 100)   # reduction per unit of income above the cap

    def __call__(self, tulot):
        """Return the deduction for income ``tulot`` (scalar or array)."""
        full_amount = np.minimum(tulot, self.cap)
        phase_out = between(tulot, self.cap) * self.rate
        return np.maximum(full_amount - phase_out, 0)

class Eläketulovähennys:
    """Pension income deduction — placeholder only, not implemented yet (TODO)."""

In [None]:
# Instantiate both deductions with their default parameters and try them
# out on a couple of example incomes.
ATV = Ansiotulovähennys()
PV = Perusvähennys()

print(ATV(10_000, 15_000))
print(PV(4_000))

In [None]:
r_tyel = (7.15 / 100)
r_tvm = (1.50 / 100)

pt = 10_000
et = 10_000
vv = 1_000

tyel = r_tyel * pt
tvm = r_tvm * pt
at1 = pt + et - tyel - tvm
pat = at0 - min(pat0, 750)
pat1 = pat vv
pat2 -= ATV(pt, pat)
pat3 -= PV(pat2)
vt = 