# DAIOE SSYK2012

Here I Import and process the Raw DAIOE dataset, specifically the DAIOE SSYK 2012


## Setup and data sources
Set up local paths and define the DAIOE CSV and SCB Parquet sources used throughout the notebook.


In [None]:
import polars as pl
from pathlib import Path


ROOT = Path.cwd().resolve()
DATA_DIR = ROOT / "data"
DATA_DIR.mkdir(parents=True, exist_ok=True)

DAIOE_SOURCE: str = (
    "https://raw.githubusercontent.com/joseph-data/07_translate_ssyk/main/"
    "03_translated_files/daioe_ssyk2012_translated.csv"
)

SCB_SOURCE: str = (
    "https://raw.githubusercontent.com/joseph-data/daioe-explorer-years/development/"
    "data/processed/ssyk12_aggregated_ssyk4_to_ssyk1.parquet"
)

## Load data lazily
Load both datasets as Polars LazyFrames so the pipeline stays memory-efficient.


In [2]:
daioe_lazy_lf = pl.scan_csv(
    DAIOE_SOURCE
)

scb_lazy_lf = pl.scan_parquet(
    SCB_SOURCE
)

## Quick sanity checks
Preview a few rows from each source to confirm the schemas and data look as expected.


In [3]:
print(daioe_lazy_lf.head(5).collect())

shape: (5, 27)
┌────────────┬──────┬────────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ ssyk2012_4 ┆ year ┆ daioe_alla ┆ daioe_stra ┆ … ┆ pctl_rank_ ┆ ssyk2012_ ┆ ssyk2012_ ┆ ssyk2012_ │
│ ---        ┆ ---  ┆ pps        ┆ tgames     ┆   ┆ genai      ┆ 1         ┆ 2         ┆ 3         │
│ str        ┆ i64  ┆ ---        ┆ ---        ┆   ┆ ---        ┆ ---       ┆ ---       ┆ ---       │
│            ┆      ┆ f64        ┆ f64        ┆   ┆ f64        ┆ str       ┆ str       ┆ str       │
╞════════════╪══════╪════════════╪════════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
│ 0110 Commi ┆ 2010 ┆ null       ┆ null       ┆ … ┆ null       ┆ 0 Armed   ┆ 01        ┆ 011 Commi │
│ ssioned    ┆      ┆            ┆            ┆   ┆            ┆ forces    ┆ Officers  ┆ ssioned   │
│ armed      ┆      ┆            ┆            ┆   ┆            ┆ occupatio ┆           ┆ armed     │
│ forces…    ┆      ┆            ┆            ┆   ┆            ┆ ns        ┆

In [4]:
print(scb_lazy_lf.head(5).collect())

shape: (5, 7)
┌───────┬───────────┬───────┬───────┬──────┬───────┬─────────────────────────────────┐
│ level ┆ ssyk_code ┆ age   ┆ sex   ┆ year ┆ count ┆ occupation                      │
│ ---   ┆ ---       ┆ ---   ┆ ---   ┆ ---  ┆ ---   ┆ ---                             │
│ str   ┆ str       ┆ str   ┆ str   ┆ i64  ┆ i64   ┆ str                             │
╞═══════╪═══════════╪═══════╪═══════╪══════╪═══════╪═════════════════════════════════╡
│ SSYK4 ┆ 3122      ┆ 60-64 ┆ women ┆ 2014 ┆ 66    ┆ Manufacturing supervisors       │
│ SSYK4 ┆ 1491      ┆ 16-24 ┆ women ┆ 2015 ┆ 3     ┆ Department managers in educati… │
│ SSYK4 ┆ 1711      ┆ 60-64 ┆ men   ┆ 2024 ┆ 10    ┆ Hotel and conference managers,… │
│ SSYK4 ┆ 3117      ┆ 45-49 ┆ men   ┆ 2018 ┆ 118   ┆ GIS and cartographic engineers  │
│ SSYK4 ┆ 5141      ┆ 25-29 ┆ men   ┆ 2017 ┆ 144   ┆ Hairdressers                    │
└───────┴───────────┴───────┴───────┴──────┴───────┴─────────────────────────────────┘


In [5]:
# daioe_lazy_lf.collect_schema()

In [6]:
#scb_lazy_lf.collect().collect_schema()

## Derive SSYK levels and align years
Split `ssyk2012_4` into 1-4 digit codes, drop the original SSYK column, and keep the SSYK12 era (2014+).
If SCB has later years, extend the DAIOE series by carrying the latest year forward.


In [7]:
daioe_lazy_lf_ssyk12 = (
    daioe_lazy_lf\
    .with_columns([
    pl.col("ssyk2012_4").str.slice(0, 1).alias("code_1"),
    pl.col("ssyk2012_4").str.slice(0, 2).alias("code_2"),
    pl.col("ssyk2012_4").str.slice(0, 3).alias("code_3"),
    pl.col("ssyk2012_4").str.slice(0, 4).alias("code_4")
])\
    .drop(pl.col("^ssyk2012.*$"))\
        .filter(pl.col("year") >= 2014) ## The Year stretch from the first SSYK12 publication
)

In [8]:
base = daioe_lazy_lf_ssyk12

daioe_max = base.select(pl.max("year")).collect().item()
scb_max   = scb_lazy_lf.select(pl.max("year")).collect().item()

missing = list(range(daioe_max + 1, scb_max + 1))

daioe_lazy_lf_extended = (
    base
    if not missing
    else pl.concat(
        [
            base,
            base
            .filter(pl.col("year") == daioe_max)
            .drop("year")
            .join(pl.LazyFrame({"year": missing}), how="cross")
            .select(base.collect_schema().names()),  # ensure same column order/schema
        ],
        how="vertical",
    )
)



In [9]:
def inspect_lazy(lf: pl.LazyFrame) -> None:
    """
    Print the shape of a Polars LazyFrame in a memory-efficient manner.

    This function computes the number of rows using a lazy row-count
    aggregation (`pl.len()`) and retrieves the number of columns from
    the resolved schema without materializing the full dataset.

    Parameters
    ----------
    lf : pl.LazyFrame
        The LazyFrame to inspect.

    Notes
    -----
    - The row count triggers execution of the lazy query plan,
      but avoids collecting all columns into memory.
    - The column count is obtained from the schema metadata and
      does not require data materialization.
    - Intended for debugging and validation of large lazy pipelines.
    """
    n_rows = lf.select(pl.len()).collect().item()
    n_cols = len(lf.collect_schema())
    print(f"Rows: {n_rows:,}")
    print(f"Columns: {n_cols}")


In [10]:
inspect_lazy(daioe_lazy_lf_extended)


Rows: 4,719
Columns: 27


## Build SCB SSYK4 counts
Aggregate SCB to 4-digit SSYK by year to create employment counts used as weights.


In [11]:
scb_lazy_lf_level4 = (
    scb_lazy_lf
        .filter(pl.col("ssyk_code").str.len_chars() == 4)
        .group_by(["year", "ssyk_code"])
        .agg(pl.col("count").sum().alias("total_count"))
)



In [12]:
inspect_lazy(scb_lazy_lf_level4)

Rows: 4,719
Columns: 3


## Merge and filter
Join DAIOE rows to SCB counts by year and SSYK4, inspect unmatched codes, and remove the military/army group (code_1 == '0').


In [13]:
daioe_lazy_lf_extended.head(5).collect()

year,daioe_allapps,daioe_stratgames,daioe_videogames,daioe_imgrec,daioe_imgcompr,daioe_imggen,daioe_readcompr,daioe_lngmod,daioe_translat,daioe_speechrec,daioe_genai,pctl_rank_allapps,pctl_rank_stratgames,pctl_rank_videogames,pctl_rank_imgrec,pctl_rank_imgcompr,pctl_rank_imggen,pctl_rank_readcompr,pctl_rank_lngmod,pctl_rank_translat,pctl_rank_speechrec,pctl_rank_genai,code_1,code_2,code_3,code_4
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,str
2014,,,,,,,,,,,,,,,,,,,,,,,"""0""","""01""","""011""","""0110"""
2015,,,,,,,,,,,,,,,,,,,,,,,"""0""","""01""","""011""","""0110"""
2016,,,,,,,,,,,,,,,,,,,,,,,"""0""","""01""","""011""","""0110"""
2017,,,,,,,,,,,,,,,,,,,,,,,"""0""","""01""","""011""","""0110"""
2018,,,,,,,,,,,,,,,,,,,,,,,"""0""","""01""","""011""","""0110"""


In [14]:
scb_lazy_lf_level4.head(5).collect()

year,ssyk_code,total_count
i64,str,i64
2014,"""2225""",3992
2024,"""8114""",2254
2020,"""5323""",38229
2014,"""7112""",7901
2020,"""2423""",21282


In [15]:
daioe_scb_years = daioe_lazy_lf_extended\
    .join(
        scb_lazy_lf_level4,
        left_on=["year", "code_4"],
        right_on=["year", "ssyk_code"],
        how="left"
    )

In [16]:
inspect_lazy(daioe_scb_years)

Rows: 4,719
Columns: 28


In [17]:
# DAIOE codes with no SCB match
daioe_scb_years_unmatched = daioe_lazy_lf_extended\
    .join(
        scb_lazy_lf_level4,
        left_on=["year", "code_4"],
        right_on=["year", "ssyk_code"],
        how="anti"
    )



In [18]:
inspect_lazy(daioe_scb_years_unmatched)

Rows: 0
Columns: 27


In [19]:
daioe_scb_years.collect_schema()

Schema([('year', Int64),
        ('daioe_allapps', Float64),
        ('daioe_stratgames', Float64),
        ('daioe_videogames', Float64),
        ('daioe_imgrec', Float64),
        ('daioe_imgcompr', Float64),
        ('daioe_imggen', Float64),
        ('daioe_readcompr', Float64),
        ('daioe_lngmod', Float64),
        ('daioe_translat', Float64),
        ('daioe_speechrec', Float64),
        ('daioe_genai', Float64),
        ('pctl_rank_allapps', Float64),
        ('pctl_rank_stratgames', Float64),
        ('pctl_rank_videogames', Float64),
        ('pctl_rank_imgrec', Float64),
        ('pctl_rank_imgcompr', Float64),
        ('pctl_rank_imggen', Float64),
        ('pctl_rank_readcompr', Float64),
        ('pctl_rank_lngmod', Float64),
        ('pctl_rank_translat', Float64),
        ('pctl_rank_speechrec', Float64),
        ('pctl_rank_genai', Float64),
        ('code_1', String),
        ('code_2', String),
        ('code_3', String),
        ('code_4', String),
        ('tot

In [20]:
daioe_scb_years.collect_schema().names()

['year',
 'daioe_allapps',
 'daioe_stratgames',
 'daioe_videogames',
 'daioe_imgrec',
 'daioe_imgcompr',
 'daioe_imggen',
 'daioe_readcompr',
 'daioe_lngmod',
 'daioe_translat',
 'daioe_speechrec',
 'daioe_genai',
 'pctl_rank_allapps',
 'pctl_rank_stratgames',
 'pctl_rank_videogames',
 'pctl_rank_imgrec',
 'pctl_rank_imgcompr',
 'pctl_rank_imggen',
 'pctl_rank_readcompr',
 'pctl_rank_lngmod',
 'pctl_rank_translat',
 'pctl_rank_speechrec',
 'pctl_rank_genai',
 'code_1',
 'code_2',
 'code_3',
 'code_4',
 'total_count']

In [21]:
## Here I ommitted the Army and Military from the data 

daioe_scb_filtered = daioe_scb_years\
    .filter(pl.col("code_1") != "0")

In [22]:
inspect_lazy(daioe_scb_filtered)

Rows: 4,686
Columns: 28


## Identify DAIOE measure columns
Collect all DAIOE indicator columns and define the weight column used for averaging.


In [23]:
daioe_cols = [
    c for c in daioe_scb_filtered.collect_schema().names()
    if c.startswith("daioe_")
]

w = pl.col("total_count")


## Aggregate to SSYK3/2/1
Compute simple and employment-weighted averages for each higher SSYK level.


In [24]:

daioe_scb_lv3 = (
    daioe_scb_filtered
    .select(["code_3", "year", "total_count", *daioe_cols])
    .group_by(["year", "code_3"])
    .agg(
        [
            w.sum().alias("weight_sum"),

            # simple averages
            *[pl.col(c).mean().alias(f"{c}_avg") for c in daioe_cols],

            # employment-weighted averages
            *[
                pl.when(w.sum() > 0)
                  .then((pl.col(c) * w).sum() / w.sum())
                  .otherwise(None)
                  .alias(f"{c}_wavg")
                for c in daioe_cols
            ],
        ]
    )
    .with_columns(pl.lit("SSYK3").alias("level"))
    .rename({"code_3": "ssyk_code"})
)

# preview
daioe_scb_lv3.limit(10).collect()


year,ssyk_code,weight_sum,daioe_allapps_avg,daioe_stratgames_avg,daioe_videogames_avg,daioe_imgrec_avg,daioe_imgcompr_avg,daioe_imggen_avg,daioe_readcompr_avg,daioe_lngmod_avg,daioe_translat_avg,daioe_speechrec_avg,daioe_genai_avg,daioe_allapps_wavg,daioe_stratgames_wavg,daioe_videogames_wavg,daioe_imgrec_wavg,daioe_imgcompr_wavg,daioe_imggen_wavg,daioe_readcompr_wavg,daioe_lngmod_wavg,daioe_translat_wavg,daioe_speechrec_wavg,daioe_genai_wavg,level
i64,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
2016,"""832""",18638,12.423868,0.213674,3.5309,0.175093,0.039101,0.248279,0.066891,0.018544,0.004826,0.156095,0.368311,12.419802,0.213713,3.527426,0.17513,0.039122,0.248628,0.066943,0.018548,0.004824,0.156029,0.368741,"""SSYK3"""
2020,"""231""",36297,27.456768,0.273098,3.589121,0.380133,0.102511,0.606002,0.290223,0.480877,0.061234,0.570221,1.901919,27.456768,0.273098,3.589121,0.380133,0.102511,0.606002,0.290223,0.480877,0.061234,0.570221,1.901919,"""SSYK3"""
2022,"""171""",2013,22.68055,0.206465,3.337697,0.328055,0.129356,0.567701,0.182055,0.578177,0.030391,0.338739,1.954531,22.68055,0.206465,3.337697,0.328055,0.129356,0.567701,0.182055,0.578177,0.030391,0.338739,1.954531,"""SSYK3"""
2024,"""723""",65688,27.345164,0.245856,5.528633,0.372573,0.176021,0.543646,0.217712,0.521277,0.021963,0.457824,1.755637,27.008165,0.242882,5.706464,0.368693,0.172292,0.530962,0.199836,0.480903,0.020475,0.439712,1.667703,"""SSYK3"""
2015,"""811""",7306,8.663304,0.182641,3.571892,0.153134,,0.017178,0.002217,0.009415,0.001361,0.115963,0.043585,8.727735,0.182384,3.625122,0.153468,0.0,0.016806,0.002149,0.009169,0.001343,0.116047,0.042557,"""SSYK3"""
2022,"""262""",11166,35.633672,0.308747,4.444371,0.565492,0.224683,0.953352,0.335521,1.033119,0.052742,0.532053,3.381895,34.991365,0.306025,4.593648,0.567752,0.220453,0.935537,0.311429,0.967479,0.048493,0.496264,3.240957,"""SSYK3"""
2022,"""932""",9621,24.173985,0.228463,5.978074,0.348283,0.119977,0.431461,0.109968,0.34798,0.018579,0.23698,1.32459,24.173985,0.228463,5.978074,0.348283,0.119977,0.431461,0.109968,0.34798,0.018579,0.23698,1.32459,"""SSYK3"""
2018,"""722""",52175,17.110885,0.274485,4.942647,0.200896,0.053529,0.321309,0.081638,0.051161,0.00973,0.179082,0.560432,16.51367,0.264111,4.879271,0.189576,0.050338,0.297286,0.07548,0.047301,0.009196,0.172479,0.518438,"""SSYK3"""
2016,"""831""",5635,12.705631,0.227971,3.642626,0.177302,0.039355,0.250646,0.066782,0.018148,0.004773,0.159116,0.369653,12.833703,0.228358,3.802299,0.17937,0.039279,0.243286,0.062525,0.016961,0.004542,0.153648,0.356315,"""SSYK3"""
2024,"""834""",44577,26.642166,0.228816,5.817086,0.405152,0.18409,0.507423,0.174228,0.433481,0.019366,0.43327,1.544496,26.237362,0.226667,5.976657,0.403434,0.180058,0.500262,0.159986,0.392753,0.017549,0.401179,1.465329,"""SSYK3"""


In [25]:
inspect_lazy(daioe_scb_lv3)

Rows: 1,595
Columns: 26


In [26]:

daioe_scb_lv2 = (
    daioe_scb_filtered
    .select(["code_2", "year", "total_count", *daioe_cols])
    .group_by(["year", "code_2"])
    .agg(
        [
            w.sum().alias("weight_sum"),

            # simple averages
            *[pl.col(c).mean().alias(f"{c}_avg") for c in daioe_cols],

            # employment-weighted averages
            *[
                pl.when(w.sum() > 0)
                  .then((pl.col(c) * w).sum() / w.sum())
                  .otherwise(None)
                  .alias(f"{c}_wavg")
                for c in daioe_cols
            ],
        ]
    )
    .with_columns(pl.lit("SSYK2").alias("level"))
    .rename({"code_2": "ssyk_code"})
)

# preview
daioe_scb_lv2.limit(10).collect()

year,ssyk_code,weight_sum,daioe_allapps_avg,daioe_stratgames_avg,daioe_videogames_avg,daioe_imgrec_avg,daioe_imgcompr_avg,daioe_imggen_avg,daioe_readcompr_avg,daioe_lngmod_avg,daioe_translat_avg,daioe_speechrec_avg,daioe_genai_avg,daioe_allapps_wavg,daioe_stratgames_wavg,daioe_videogames_wavg,daioe_imgrec_wavg,daioe_imgcompr_wavg,daioe_imggen_wavg,daioe_readcompr_wavg,daioe_lngmod_wavg,daioe_translat_wavg,daioe_speechrec_wavg,daioe_genai_wavg,level
i64,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
2024,"""44""",20557,32.748844,0.267681,5.163647,0.445566,0.225883,0.61168,0.347111,0.877114,0.036118,0.709806,2.436989,30.783729,0.253436,5.292304,0.41625,0.207466,0.549604,0.302721,0.764974,0.031822,0.638184,2.153917,"""SSYK2"""
2016,"""15""",43162,13.366554,0.257214,2.459884,0.182375,0.049122,0.338245,0.1503,0.041013,0.010132,0.278512,0.555941,13.789247,0.267515,2.559913,0.189307,0.050574,0.347861,0.154835,0.041869,0.01025,0.28053,0.570732,"""SSYK2"""
2019,"""82""",53238,21.824818,0.258475,6.020729,0.263553,0.058305,0.342861,0.112963,0.07613,0.019932,0.22338,0.663219,22.068356,0.26221,6.083159,0.267151,0.059155,0.347357,0.114704,0.077109,0.020086,0.224581,0.671897,"""SSYK2"""
2023,"""16""",5734,34.320328,0.286123,3.904811,0.442811,0.248901,0.757716,0.467716,1.155599,0.046648,0.825958,3.129512,34.320328,0.286123,3.904811,0.442811,0.248901,0.757716,0.467716,1.155599,0.046648,0.825958,3.129512,"""SSYK2"""
2021,"""42""",65916,29.789505,0.278408,4.405919,0.434382,0.16218,0.680686,0.233805,0.471838,0.051732,0.585794,2.058443,30.931395,0.292254,4.480422,0.461514,0.1711,0.748933,0.247959,0.491225,0.052886,0.576176,2.211147,"""SSYK2"""
2021,"""41""",163884,33.945281,0.332096,4.852562,0.520224,0.192996,0.795924,0.290884,0.559767,0.058254,0.601808,2.422947,34.678456,0.343703,4.993289,0.534621,0.197297,0.808103,0.295017,0.568462,0.05895,0.608249,2.460206,"""SSYK2"""
2022,"""11""",23212,28.844533,0.2615,3.690636,0.413343,0.171817,0.755392,0.270054,0.831965,0.043216,0.454404,2.705924,28.188419,0.255752,3.610949,0.397621,0.166733,0.733552,0.265042,0.816299,0.042598,0.448423,2.642005,"""SSYK2"""
2018,"""44""",21375,17.466701,0.26748,4.061334,0.22567,0.064045,0.337739,0.124049,0.081581,0.015722,0.271567,0.661785,16.279254,0.245404,4.251308,0.202752,0.055518,0.283717,0.096943,0.064102,0.012347,0.221452,0.545119,"""SSYK2"""
2020,"""13""",83385,25.807071,0.288818,4.328535,0.380536,0.091171,0.578108,0.215352,0.349539,0.039389,0.39604,1.605689,25.98506,0.292555,4.342929,0.384587,0.091905,0.582825,0.216852,0.351557,0.03946,0.397064,1.617091,"""SSYK2"""
2019,"""75""",12991,21.009748,0.25093,6.084496,0.250291,0.054662,0.328329,0.097421,0.067125,0.016926,0.199549,0.618327,21.271339,0.259372,6.227363,0.248831,0.053554,0.333508,0.09571,0.065031,0.016299,0.199739,0.621018,"""SSYK2"""


In [27]:
inspect_lazy(daioe_scb_lv2)

Rows: 473
Columns: 26


In [28]:

daioe_scb_lv1 = (
    daioe_scb_filtered
    .select(["code_1", "year", "total_count", *daioe_cols])
    .group_by(["year", "code_1"])
    .agg(
        [
            w.sum().alias("weight_sum"),

            # simple averages
            *[pl.col(c).mean().alias(f"{c}_avg") for c in daioe_cols],

            # employment-weighted averages
            *[
                pl.when(w.sum() > 0)
                  .then((pl.col(c) * w).sum() / w.sum())
                  .otherwise(None)
                  .alias(f"{c}_wavg")
                for c in daioe_cols
            ],
        ]
    )
    .with_columns(pl.lit("SSYK1").alias("level"))
    .rename({"code_1": "ssyk_code"})
)

# preview
daioe_scb_lv1.limit(10).collect()

year,ssyk_code,weight_sum,daioe_allapps_avg,daioe_stratgames_avg,daioe_videogames_avg,daioe_imgrec_avg,daioe_imgcompr_avg,daioe_imggen_avg,daioe_readcompr_avg,daioe_lngmod_avg,daioe_translat_avg,daioe_speechrec_avg,daioe_genai_avg,daioe_allapps_wavg,daioe_stratgames_wavg,daioe_videogames_wavg,daioe_imgrec_wavg,daioe_imgcompr_wavg,daioe_imggen_wavg,daioe_readcompr_wavg,daioe_lngmod_wavg,daioe_translat_wavg,daioe_speechrec_wavg,daioe_genai_wavg,level
i64,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
2017,"""3""",587743,15.330511,0.284989,3.294012,0.221659,0.068901,0.342863,0.136208,0.03857,0.012569,0.264466,0.54813,15.430883,0.288345,3.187177,0.225241,0.070892,0.349034,0.143976,0.04095,0.0134,0.277785,0.563433,"""SSYK1"""
2020,"""9""",235351,21.161469,0.221319,5.206414,0.29203,0.063406,0.357419,0.106934,0.184102,0.022102,0.257368,0.925713,20.77265,0.218106,5.372838,0.28779,0.061061,0.334038,0.096607,0.166074,0.019314,0.230467,0.854366,"""SSYK1"""
2014,"""4""",336579,5.672183,0.220959,2.190429,0.160599,,,0.003519,0.014595,,0.055095,0.014595,5.907313,0.232484,2.302639,0.167467,0.0,0.0,0.003611,0.014823,0.0,0.053938,0.014823,"""SSYK1"""
2016,"""9""",235399,12.368514,0.21964,3.56791,0.154175,0.035831,0.225512,0.072541,0.020552,0.005171,0.167438,0.347305,12.211664,0.21614,3.689088,0.151661,0.034402,0.209998,0.065042,0.018443,0.004502,0.149821,0.32139,"""SSYK1"""
2017,"""4""",358550,16.391392,0.295741,3.260772,0.23118,0.07469,0.344118,0.159099,0.047022,0.016163,0.346562,0.575913,16.897651,0.310795,3.415979,0.241077,0.077208,0.355023,0.163081,0.04765,0.01608,0.337032,0.591638,"""SSYK1"""
2021,"""5""",989413,23.70467,0.227595,4.472701,0.346074,0.122038,0.520232,0.151855,0.297935,0.031302,0.356879,1.445203,23.255248,0.225627,4.367118,0.340014,0.119959,0.517246,0.149866,0.293592,0.030149,0.342511,1.433008,"""SSYK1"""
2021,"""4""",363256,30.671484,0.296127,4.732154,0.464938,0.169565,0.701705,0.241132,0.472345,0.04986,0.540041,2.093716,31.338661,0.30945,4.962001,0.48237,0.173968,0.718724,0.244077,0.472191,0.048845,0.517055,2.121689,"""SSYK1"""
2024,"""8""",333977,27.739882,0.247078,5.853991,0.390965,0.181807,0.519702,0.205744,0.502361,0.020999,0.45489,1.683537,27.93045,0.243212,5.75924,0.405849,0.188934,0.537958,0.209679,0.511816,0.021712,0.462791,1.728448,"""SSYK1"""
2014,"""7""",365712,5.497783,0.18516,2.645227,0.129784,,,0.001618,0.006395,,0.024655,0.006395,5.365312,0.182691,2.568061,0.128419,0.0,0.0,0.001598,0.006275,0.0,0.02432,0.006275,"""SSYK1"""
2021,"""7""",399245,24.166788,0.248149,5.71469,0.375726,0.119025,0.534775,0.11084,0.206973,0.020461,0.241668,1.272799,23.674488,0.244359,5.529758,0.371823,0.117563,0.536624,0.10955,0.20315,0.020285,0.238143,1.267686,"""SSYK1"""


In [29]:
inspect_lazy(daioe_scb_lv1)

Rows: 99
Columns: 26


## Generalized aggregation + percentiles
Use a reusable function to aggregate all SSYK levels and add within-year percentiles for each DAIOE metric.


In [30]:
import polars as pl

def aggregate_daioe_level(
    lf: pl.LazyFrame,
    code_col: str,
    level_label: str,
    weight_col: str = "total_count",
    prefix: str = "daioe_",
    add_percentiles: bool = True,
    pct_scale: int = 100,
    descending: bool = False,
) -> pl.LazyFrame:

    daioe_cols = [c for c in lf.collect_schema().names() if c.startswith(prefix)]
    w = pl.col(weight_col)

    out = (
        lf
        .group_by(["year", code_col])
        .agg(
            w.sum().alias("weight_sum"),
            pl.col(daioe_cols).mean().name.suffix("_avg"),
            ((pl.col(daioe_cols) * w).sum() / w.sum()).name.suffix("_wavg"),
        )
        .with_columns(pl.lit(level_label).alias("level"))
        .rename({code_col: "ssyk_code"})
    )

    if not add_percentiles:
        return out

    group_keys = ["year", "level"]

    rank_expr = (
        pl.col(f"^{prefix}.*_(avg|wavg)$")
        .rank(method="average", descending=descending)
        .over(group_keys)
    )

    n_expr = pl.len().over(group_keys)

    return out.with_columns(
        (
            pl.when(n_expr > 1)
            .then((rank_expr - 1) / (n_expr - 1))
            .otherwise(0.0)
            * pct_scale
        ).name.prefix("pctl_")
    )


In [31]:
levels = {
    "code_4": "SSYK4",
    "code_3": "SSYK3",
    "code_2": "SSYK2",
    "code_1": "SSYK1",
}

aggregated = [
    aggregate_daioe_level(daioe_scb_filtered, col, label)
    for col, label in levels.items()
]

daioe_all_levels = (
    pl.concat(aggregated)
    .sort(["level", "year", "ssyk_code"])
)


In [32]:
inspect_lazy(daioe_all_levels)

Rows: 6,853
Columns: 48


In [33]:
print(
    daioe_all_levels
    .group_by("level")
    .len()
    .collect()
)


shape: (4, 2)
┌───────┬──────┐
│ level ┆ len  │
│ ---   ┆ ---  │
│ str   ┆ u32  │
╞═══════╪══════╡
│ SSYK4 ┆ 4686 │
│ SSYK1 ┆ 99   │
│ SSYK2 ┆ 473  │
│ SSYK3 ┆ 1595 │
└───────┴──────┘


## Export
Write the combined dataset to Parquet for downstream use.


In [34]:
output_path = DATA_DIR / "daioe_scb_years_all_levels.parquet"

daioe_all_levels.sink_parquet(output_path)