# DAIOE SSYK2012 Workflow

Organized end-to-end flow for preparing DAIOE + SCB SSYK data.


## 1) Setup Paths and Data Sources

Define imports, workspace paths, and source locations.


In [1]:
import polars as pl
from pathlib import Path


ROOT = Path.cwd().resolve()
DATA_DIR = ROOT / "data"
DATA_DIR.mkdir(parents=True, exist_ok=True)

DAIOE_SOURCE: str = (
    "https://raw.githubusercontent.com/joseph-data/07_translate_ssyk/main/"
    "03_translated_files/daioe_ssyk2012_translated.csv"
)

SCB_SOURCE: str = (
        "https://raw.githubusercontent.com/joseph-data/AI_Econ_daioe_years/daioe_pull/"
        "data/processed/ssyk12_aggregated_ssyk4_to_ssyk1.parquet"
)

## 2) Load DAIOE and SCB Lazily

Read source files as `LazyFrame` objects for efficient transformations.


In [2]:
daioe_lazy_lf = pl.scan_csv(
    DAIOE_SOURCE
)

scb_lazy_lf = pl.scan_parquet(
    SCB_SOURCE
)

## 3) Define Utility Helpers

Create helper functions used for lightweight pipeline inspection.


In [3]:
def inspect_lazy(lf: pl.LazyFrame) -> None:
    """
    Print the shape of a Polars LazyFrame in a memory-efficient manner.

    This function computes the number of rows using a lazy row-count
    aggregation (`pl.len()`) and retrieves the number of columns from
    the resolved schema without materializing the full dataset.

    Parameters
    ----------
    lf : pl.LazyFrame
        The LazyFrame to inspect.

    Notes
    -----
    - The row count triggers execution of the lazy query plan,
      but avoids collecting all columns into memory.
    - The column count is obtained from the schema metadata and
      does not require data materialization.
    - Intended for debugging and validation of large lazy pipelines.
    """
    n_rows = lf.select(pl.len()).collect().item()
    n_cols = len(lf.collect_schema())
    print(f"Rows: {n_rows:,}")
    print(f"Columns: {n_cols}")


## 4) Quick Sanity Checks and Early Military Removal

Preview both sources and remove code-0 military rows early from DAIOE and SCB.


In [4]:
print(daioe_lazy_lf.head(5).collect())

shape: (5, 27)
┌────────────┬──────┬────────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ ssyk2012_4 ┆ year ┆ daioe_alla ┆ daioe_stra ┆ … ┆ pctl_rank_ ┆ ssyk2012_ ┆ ssyk2012_ ┆ ssyk2012_ │
│ ---        ┆ ---  ┆ pps        ┆ tgames     ┆   ┆ genai      ┆ 1         ┆ 2         ┆ 3         │
│ str        ┆ i64  ┆ ---        ┆ ---        ┆   ┆ ---        ┆ ---       ┆ ---       ┆ ---       │
│            ┆      ┆ f64        ┆ f64        ┆   ┆ f64        ┆ str       ┆ str       ┆ str       │
╞════════════╪══════╪════════════╪════════════╪═══╪════════════╪═══════════╪═══════════╪═══════════╡
│ 0110 Commi ┆ 2010 ┆ null       ┆ null       ┆ … ┆ null       ┆ 0 Armed   ┆ 01        ┆ 011 Commi │
│ ssioned    ┆      ┆            ┆            ┆   ┆            ┆ forces    ┆ Officers  ┆ ssioned   │
│ armed      ┆      ┆            ┆            ┆   ┆            ┆ occupatio ┆           ┆ armed     │
│ forces…    ┆      ┆            ┆            ┆   ┆            ┆ ns        ┆

In [5]:
print(scb_lazy_lf.head(5).collect())

shape: (5, 7)
┌───────┬───────────┬───────┬───────┬──────┬───────┬─────────────────────────────────┐
│ level ┆ ssyk_code ┆ age   ┆ sex   ┆ year ┆ count ┆ occupation                      │
│ ---   ┆ ---       ┆ ---   ┆ ---   ┆ ---  ┆ ---   ┆ ---                             │
│ str   ┆ str       ┆ str   ┆ str   ┆ i64  ┆ i64   ┆ str                             │
╞═══════╪═══════════╪═══════╪═══════╪══════╪═══════╪═════════════════════════════════╡
│ SSYK4 ┆ 2132      ┆ 45-49 ┆ women ┆ 2024 ┆ 80    ┆ Plant and animal biologists     │
│ SSYK4 ┆ 8163      ┆ 55-59 ┆ women ┆ 2017 ┆ 184   ┆ Machine operators, baked-goods… │
│ SSYK4 ┆ 2413      ┆ 35-39 ┆ men   ┆ 2016 ┆ 556   ┆ Financial and investment advis… │
│ SSYK4 ┆ 1222      ┆ 16-24 ┆ men   ┆ 2016 ┆ 7     ┆ Human resource managers, level… │
│ SSYK4 ┆ 8199      ┆ 25-29 ┆ women ┆ 2019 ┆ 62    ┆ Process control technicians no… │
└───────┴───────────┴───────┴───────┴──────┴───────┴─────────────────────────────────┘


In [6]:
# daioe_lazy_lf.collect_schema()

In [7]:
## Removed Military Personnel

scb_lazy_lf = scb_lazy_lf.filter(
    pl.col("ssyk_code").str.starts_with("0").not_()
)

daioe_lazy_lf = daioe_lazy_lf.filter(
    pl.col("ssyk2012_4").str.starts_with("0").not_()
)

In [8]:
#scb_lazy_lf.collect().collect_schema()

## 5) Derive SSYK Levels in DAIOE

Split DAIOE SSYK4 into SSYK1-4 and keep SSYK2012-era years.


In [9]:
daioe_lazy_lf_ssyk12 = (
    daioe_lazy_lf\
    .with_columns([
    pl.col("ssyk2012_4").str.slice(0, 1).alias("code_1"),
    pl.col("ssyk2012_4").str.slice(0, 2).alias("code_2"),
    pl.col("ssyk2012_4").str.slice(0, 3).alias("code_3"),
    pl.col("ssyk2012_4").str.slice(0, 4).alias("code_4")
])\
    .drop(pl.col("^ssyk2012.*$"))\
        .filter(pl.col("year") >= 2014) ## The Year stretch from the first SSYK12 publication
)

## 6) Align DAIOE Years to SCB Coverage

Extend DAIOE series forward when SCB has later years.


In [10]:
## Here I extend the years to Latest according to the pulled SCB data (2024, yearly)

base = daioe_lazy_lf_ssyk12

daioe_max = base.select(pl.max("year")).collect().item()
scb_max   = scb_lazy_lf.select(pl.max("year")).collect().item()

missing = list(range(daioe_max + 1, scb_max + 1))

daioe_lazy_lf_extended = (
    base
    if not missing
    else pl.concat(
        [
            base,
            base
            .filter(pl.col("year") == daioe_max)
            .drop("year")
            .join(pl.LazyFrame({"year": missing}), how="cross")
            .select(base.collect_schema().names()),  # ensure same column order/schema
        ],
        how="vertical",
    )
)



In [11]:
inspect_lazy(daioe_lazy_lf_extended)


Rows: 4,686
Columns: 27


## 7) Build SCB SSYK4 Employment Counts

Aggregate SCB counts at year + 4-digit SSYK level.


In [12]:
scb_lazy_lf_level4 = (
    scb_lazy_lf
        .filter(pl.col("ssyk_code").str.len_chars() == 4)
        .group_by(["year", "ssyk_code"])
        .agg(pl.col("count").sum().alias("total_count"))
)



In [13]:
inspect_lazy(scb_lazy_lf_level4)

Rows: 4,686
Columns: 3


## 8) Merge DAIOE with SCB SSYK4 Counts

Join by `year` and 4-digit code, then inspect merged coverage.


In [14]:
daioe_lazy_lf_extended.head(5).collect()

year,daioe_allapps,daioe_stratgames,daioe_videogames,daioe_imgrec,daioe_imgcompr,daioe_imggen,daioe_readcompr,daioe_lngmod,daioe_translat,daioe_speechrec,daioe_genai,pctl_rank_allapps,pctl_rank_stratgames,pctl_rank_videogames,pctl_rank_imgrec,pctl_rank_imgcompr,pctl_rank_imggen,pctl_rank_readcompr,pctl_rank_lngmod,pctl_rank_translat,pctl_rank_speechrec,pctl_rank_genai,code_1,code_2,code_3,code_4
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,str
2014,,,,,,,,,,,,,,,,,,,,,,,"""1""","""11""","""111""","""1111"""
2015,,,,,,,,,,,,,,,,,,,,,,,"""1""","""11""","""111""","""1111"""
2016,,,,,,,,,,,,,,,,,,,,,,,"""1""","""11""","""111""","""1111"""
2017,,,,,,,,,,,,,,,,,,,,,,,"""1""","""11""","""111""","""1111"""
2018,,,,,,,,,,,,,,,,,,,,,,,"""1""","""11""","""111""","""1111"""


In [15]:
scb_lazy_lf_level4.head(5).collect()

year,ssyk_code,total_count
i64,str,i64
2024,"""8341""",7704
2018,"""7124""",2749
2024,"""7221""",2796
2023,"""2250""",2994
2019,"""8113""",505


In [16]:
daioe_scb_years = daioe_lazy_lf_extended\
    .join(
        scb_lazy_lf_level4,
        left_on=["year", "code_4"],
        right_on=["year", "ssyk_code"],
        how="left"
    )

In [17]:
inspect_lazy(daioe_scb_years)

Rows: 4,686
Columns: 28


## 9) Inspect Unmatched DAIOE Codes

Identify DAIOE rows that have no SCB SSYK4 match.


In [18]:
# DAIOE codes with no SCB match
daioe_scb_years_unmatched = daioe_lazy_lf_extended\
    .join(
        scb_lazy_lf_level4,
        left_on=["year", "code_4"],
        right_on=["year", "ssyk_code"],
        how="anti"
    )



In [19]:
inspect_lazy(daioe_scb_years_unmatched)

Rows: 0
Columns: 27


## 10) Post-Merge Validation

Run quick schema and shape checks on the filtered joined frame before aggregation.


In [20]:
daioe_scb_years.collect_schema()

Schema([('year', Int64),
        ('daioe_allapps', Float64),
        ('daioe_stratgames', Float64),
        ('daioe_videogames', Float64),
        ('daioe_imgrec', Float64),
        ('daioe_imgcompr', Float64),
        ('daioe_imggen', Float64),
        ('daioe_readcompr', Float64),
        ('daioe_lngmod', Float64),
        ('daioe_translat', Float64),
        ('daioe_speechrec', Float64),
        ('daioe_genai', Float64),
        ('pctl_rank_allapps', Float64),
        ('pctl_rank_stratgames', Float64),
        ('pctl_rank_videogames', Float64),
        ('pctl_rank_imgrec', Float64),
        ('pctl_rank_imgcompr', Float64),
        ('pctl_rank_imggen', Float64),
        ('pctl_rank_readcompr', Float64),
        ('pctl_rank_lngmod', Float64),
        ('pctl_rank_translat', Float64),
        ('pctl_rank_speechrec', Float64),
        ('pctl_rank_genai', Float64),
        ('code_1', String),
        ('code_2', String),
        ('code_3', String),
        ('code_4', String),
        ('tot

In [21]:
daioe_scb_years.collect_schema().names()

['year',
 'daioe_allapps',
 'daioe_stratgames',
 'daioe_videogames',
 'daioe_imgrec',
 'daioe_imgcompr',
 'daioe_imggen',
 'daioe_readcompr',
 'daioe_lngmod',
 'daioe_translat',
 'daioe_speechrec',
 'daioe_genai',
 'pctl_rank_allapps',
 'pctl_rank_stratgames',
 'pctl_rank_videogames',
 'pctl_rank_imgrec',
 'pctl_rank_imgcompr',
 'pctl_rank_imggen',
 'pctl_rank_readcompr',
 'pctl_rank_lngmod',
 'pctl_rank_translat',
 'pctl_rank_speechrec',
 'pctl_rank_genai',
 'code_1',
 'code_2',
 'code_3',
 'code_4',
 'total_count']

In [22]:
inspect_lazy(daioe_scb_years)

Rows: 4,686
Columns: 28


## 11) Identify DAIOE Measure Columns

Collect DAIOE metric columns and define the weight expression.


In [23]:
daioe_cols = [
    c for c in daioe_scb_years.collect_schema().names()
    if c.startswith("daioe_")
]

w = pl.col("total_count")


## 12) Manual Aggregation Checks (Legacy)

Compute SSYK3 and SSYK2 weighted/simple aggregates as intermediate checks.


In [24]:

daioe_scb_lv3 = (
    daioe_scb_years
    .select(["code_3", "year", "total_count", *daioe_cols])
    .group_by(["year", "code_3"])
    .agg(
        [
            w.sum().alias("weight_sum"),

            # simple averages
            *[pl.col(c).mean().alias(f"{c}_avg") for c in daioe_cols],

            # employment-weighted averages
            *[
                pl.when(w.sum() > 0)
                  .then((pl.col(c) * w).sum() / w.sum())
                  .otherwise(None)
                  .alias(f"{c}_wavg")
                for c in daioe_cols
            ],
        ]
    )
    .with_columns(pl.lit("SSYK3").alias("level"))
    .rename({"code_3": "ssyk_code"})
)

# preview
daioe_scb_lv3.limit(10).collect()


year,ssyk_code,weight_sum,daioe_allapps_avg,daioe_stratgames_avg,daioe_videogames_avg,daioe_imgrec_avg,daioe_imgcompr_avg,daioe_imggen_avg,daioe_readcompr_avg,daioe_lngmod_avg,daioe_translat_avg,daioe_speechrec_avg,daioe_genai_avg,daioe_allapps_wavg,daioe_stratgames_wavg,daioe_videogames_wavg,daioe_imgrec_wavg,daioe_imgcompr_wavg,daioe_imggen_wavg,daioe_readcompr_wavg,daioe_lngmod_wavg,daioe_translat_wavg,daioe_speechrec_wavg,daioe_genai_wavg,level
i64,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
2023,"""242""",148272,40.009033,0.328854,4.225665,0.512985,0.288534,0.917541,0.584167,1.418209,0.055578,0.976789,3.816099,40.707543,0.336852,4.324831,0.524727,0.294267,0.941778,0.592293,1.434638,0.055891,0.980734,3.884599,"""SSYK3"""
2015,"""835""",1078,8.32951,0.166351,3.375607,0.177775,,0.018123,0.001719,0.00743,0.001167,0.10923,0.04108,8.32951,0.166351,3.375607,0.177775,0.0,0.018123,0.001719,0.00743,0.001167,0.10923,0.04108,"""SSYK3"""
2018,"""933""",10034,14.686696,0.222104,3.974772,0.187095,0.050261,0.277962,0.076749,0.049995,0.009859,0.182192,0.499617,14.775669,0.22347,3.979362,0.187703,0.050628,0.277989,0.078508,0.051226,0.010076,0.185196,0.502869,"""SSYK3"""
2018,"""233""",30313,17.704664,0.284355,3.064515,0.234204,0.072161,0.449363,0.167077,0.109855,0.020734,0.351347,0.885201,17.704664,0.284355,3.064515,0.234204,0.072161,0.449363,0.167077,0.109855,0.020734,0.351347,0.885201,"""SSYK3"""
2024,"""334""",19071,39.189495,0.297017,4.775579,0.493606,0.269038,0.754567,0.499259,1.263146,0.05719,1.084983,3.283525,38.949624,0.293155,4.80012,0.48145,0.262864,0.743036,0.487657,1.2464,0.05754,1.100435,3.236825,"""SSYK3"""
2020,"""311""",111758,27.797647,0.314147,5.40091,0.426252,0.095965,0.613699,0.191731,0.308647,0.033925,0.352147,1.577816,27.038649,0.308704,5.456667,0.412808,0.091648,0.573863,0.180004,0.28786,0.031602,0.330032,1.474867,"""SSYK3"""
2023,"""523""",14404,32.47451,0.263215,4.792392,0.389165,0.214052,0.566573,0.366435,0.961842,0.041329,0.821583,2.486526,32.47451,0.263215,4.792392,0.389165,0.214052,0.566573,0.366435,0.961842,0.041329,0.821583,2.486526,"""SSYK3"""
2017,"""222""",84165,12.630204,0.246082,2.873926,0.16359,0.051424,0.245914,0.113322,0.03155,0.010045,0.209715,0.405731,12.58977,0.245739,2.867824,0.163011,0.051149,0.245105,0.112628,0.031382,0.009979,0.208845,0.404201,"""SSYK3"""
2014,"""149""",795,4.561,0.187747,1.697522,0.130604,,,0.003297,0.013309,,0.045317,0.013309,4.561,0.187747,1.697522,0.130604,0.0,0.0,0.003297,0.013309,0.0,0.045317,0.013309,"""SSYK3"""
2023,"""831""",5941,27.047561,0.229787,5.340136,0.412921,0.191605,0.536836,0.20179,0.495842,0.021923,0.476364,1.701006,26.995867,0.230238,5.610793,0.418488,0.191177,0.518608,0.186918,0.458336,0.020696,0.457437,1.608573,"""SSYK3"""


In [25]:
inspect_lazy(daioe_scb_lv3)

Rows: 1,595
Columns: 26


In [26]:

daioe_scb_lv2 = (
    daioe_scb_years
    .select(["code_2", "year", "total_count", *daioe_cols])
    .group_by(["year", "code_2"])
    .agg(
        [
            w.sum().alias("weight_sum"),

            # simple averages
            *[pl.col(c).mean().alias(f"{c}_avg") for c in daioe_cols],

            # employment-weighted averages
            *[
                pl.when(w.sum() > 0)
                  .then((pl.col(c) * w).sum() / w.sum())
                  .otherwise(None)
                  .alias(f"{c}_wavg")
                for c in daioe_cols
            ],
        ]
    )
    .with_columns(pl.lit("SSYK2").alias("level"))
    .rename({"code_2": "ssyk_code"})
)

# preview
daioe_scb_lv2.limit(10).collect()

year,ssyk_code,weight_sum,daioe_allapps_avg,daioe_stratgames_avg,daioe_videogames_avg,daioe_imgrec_avg,daioe_imgcompr_avg,daioe_imggen_avg,daioe_readcompr_avg,daioe_lngmod_avg,daioe_translat_avg,daioe_speechrec_avg,daioe_genai_avg,daioe_allapps_wavg,daioe_stratgames_wavg,daioe_videogames_wavg,daioe_imgrec_wavg,daioe_imgcompr_wavg,daioe_imggen_wavg,daioe_readcompr_wavg,daioe_lngmod_wavg,daioe_translat_wavg,daioe_speechrec_wavg,daioe_genai_wavg,level
i64,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
2022,"""52""",254464,26.957355,0.229525,4.358095,0.370307,0.150158,0.592623,0.199564,0.664285,0.037989,0.446924,2.134175,26.546281,0.231456,4.464385,0.384306,0.14969,0.611394,0.182438,0.609613,0.032886,0.39579,2.076815,"""SSYK2"""
2018,"""33""",305706,18.720561,0.295981,3.504844,0.250675,0.076123,0.428642,0.171897,0.110233,0.021533,0.350793,0.856736,18.491067,0.292573,3.398758,0.246551,0.075509,0.429049,0.171066,0.111157,0.021652,0.352955,0.859828,"""SSYK2"""
2016,"""76""",8314,14.079237,0.263675,4.020607,0.172656,0.040065,0.254151,0.085476,0.023544,0.005613,0.1859,0.393299,14.164117,0.263324,4.052239,0.174323,0.040473,0.274339,0.083331,0.022991,0.00539,0.17892,0.416418,"""SSYK2"""
2023,"""35""",53818,39.128611,0.340064,5.50415,0.526508,0.272065,0.87722,0.458285,1.105388,0.042678,0.811326,3.260162,38.874436,0.33463,5.401192,0.513797,0.26952,0.857818,0.46641,1.124958,0.043695,0.810927,3.258542,"""SSYK2"""
2015,"""92""",2905,8.226237,0.168583,3.430168,0.146715,,0.01617,0.002062,0.00889,0.001255,0.105222,0.041075,8.226237,0.168583,3.430168,0.146715,0.0,0.01617,0.002062,0.00889,0.001255,0.105222,0.041075,"""SSYK2"""
2024,"""16""",5851,34.320328,0.286123,3.904811,0.442811,0.248901,0.757716,0.467716,1.155599,0.046648,0.825958,3.129512,34.320328,0.286123,3.904811,0.442811,0.248901,0.757716,0.467716,1.155599,0.046648,0.825958,3.129512,"""SSYK2"""
2015,"""44""",21852,9.561741,0.213908,3.269881,0.187193,,0.02205,0.004387,0.019314,0.002775,0.209342,0.068859,9.180589,0.195566,3.4305,0.167448,0.0,0.018382,0.003385,0.014995,0.002151,0.168828,0.055537,"""SSYK2"""
2018,"""51""",136827,15.752235,0.234639,3.74278,0.18885,0.055159,0.315926,0.105379,0.070417,0.013781,0.248743,0.604146,15.441917,0.233119,3.982602,0.183076,0.051635,0.287572,0.092194,0.061207,0.011654,0.215375,0.542988,"""SSYK2"""
2019,"""34""",63056,20.309432,0.238193,4.260053,0.251496,0.061703,0.391372,0.158049,0.110897,0.028652,0.306066,0.816398,19.331278,0.229209,3.943759,0.238662,0.059235,0.364604,0.156862,0.109883,0.028782,0.30326,0.776716,"""SSYK2"""
2021,"""71""",172356,21.938945,0.220423,5.241192,0.350904,0.109823,0.490611,0.096356,0.18053,0.018039,0.213728,1.146811,22.616546,0.229185,5.112826,0.366316,0.116457,0.533854,0.108448,0.202466,0.020065,0.232052,1.261656,"""SSYK2"""


In [27]:
inspect_lazy(daioe_scb_lv2)

Rows: 473
Columns: 26


## 12b) Optional SSYK1 Check

This inspection assumes `daioe_scb_lv1` exists in the session.


In [28]:
if "daioe_scb_lv1" in globals():
    inspect_lazy(daioe_scb_lv1)
else:
    print("`daioe_scb_lv1` is not defined in this notebook flow; skipping this optional check.")


`daioe_scb_lv1` is not defined in this notebook flow; skipping this optional check.


## 13) Generalized Aggregation + Percentiles

Aggregate SSYK4-1 with a reusable function and add within-year percentiles.


In [29]:
def aggregate_daioe_level(
    lf: pl.LazyFrame,
    code_col: str,
    level_label: str,
    weight_col: str = "total_count",
    prefix: str = "daioe_",
    add_percentiles: bool = True,
    pct_scale: int = 100,
    descending: bool = False,
) -> pl.LazyFrame:

    daioe_cols = [c for c in lf.collect_schema().names() if c.startswith(prefix)]
    w = pl.col(weight_col)

    out = (
        lf
        .group_by(["year", code_col])
        .agg(
            w.sum().alias("weight_sum"),
            pl.col(daioe_cols).mean().name.suffix("_avg"),
            ((pl.col(daioe_cols) * w).sum() / w.sum()).name.suffix("_wavg"),
        )
        .with_columns(pl.lit(level_label).alias("level"))
        .rename({code_col: "ssyk_code"})
    )

    if not add_percentiles:
        return out

    group_keys = ["year", "level"]

    rank_expr = (
        pl.col(f"^{prefix}.*_(avg|wavg)$")
        .rank(method="average", descending=descending)
        .over(group_keys)
    )

    n_expr = pl.len().over(group_keys)

    return out.with_columns(
        (
            pl.when(n_expr > 1)
            .then((rank_expr - 1) / (n_expr - 1))
            .otherwise(0.0)
            * pct_scale
        ).name.prefix("pctl_")
    )


In [30]:
levels = {
    "code_4": "SSYK4",
    "code_3": "SSYK3",
    "code_2": "SSYK2",
    "code_1": "SSYK1",
}

aggregated = [
    aggregate_daioe_level(daioe_scb_years, col, label)
    for col, label in levels.items()
]

daioe_all_levels = (
    pl.concat(aggregated)
    .sort(["level", "year", "ssyk_code"])
)


In [31]:
inspect_lazy(daioe_all_levels)

Rows: 6,853
Columns: 48


In [32]:
print(
    daioe_all_levels
    .group_by("level")
    .len()
    .collect()
)


shape: (4, 2)
┌───────┬──────┐
│ level ┆ len  │
│ ---   ┆ ---  │
│ str   ┆ u32  │
╞═══════╪══════╡
│ SSYK4 ┆ 4686 │
│ SSYK3 ┆ 1595 │
│ SSYK1 ┆ 99   │
│ SSYK2 ┆ 473  │
└───────┴──────┘


## 14) Build 1-5 Level Exposure Columns

Create `daioe_<index>_Level_Exposure` columns from weighted percentile ranks (`pctl_daioe_*_wavg`) using quintile-style bins.


In [33]:
# Convert weighted percentile ranks (0..100) into 1-5 exposure levels
pct_cols = [
    c
    for c in daioe_all_levels.collect_schema().names()
    if c.startswith("pctl_daioe_") and c.endswith("_wavg")
]

exposure_exprs = []
for col_name in pct_cols:
    metric = col_name[len("pctl_daioe_"):-len("_wavg")]
    out_col = f"daioe_{metric}_Level_Exposure"
    p = pl.col(col_name)

    exposure_exprs.append(
        pl.when(p.is_null())
        .then(None)
        .when(p <= 20)
        .then(1)
        .when(p <= 40)
        .then(2)
        .when(p <= 60)
        .then(3)
        .when(p <= 80)
        .then(4)
        .otherwise(5)
        .cast(pl.Int8)
        .alias(out_col)
    )

daioe_all_levels = daioe_all_levels.with_columns(exposure_exprs)

print([c for c in daioe_all_levels.collect_schema().names() if c.endswith("_Level_Exposure")])


['daioe_allapps_Level_Exposure', 'daioe_stratgames_Level_Exposure', 'daioe_videogames_Level_Exposure', 'daioe_imgrec_Level_Exposure', 'daioe_imgcompr_Level_Exposure', 'daioe_imggen_Level_Exposure', 'daioe_readcompr_Level_Exposure', 'daioe_lngmod_Level_Exposure', 'daioe_translat_Level_Exposure', 'daioe_speechrec_Level_Exposure', 'daioe_genai_Level_Exposure']


## 15) Pre-Merge Diagnostics

Inspect schemas and frame shapes before final integration.


In [34]:
daioe_all_levels.collect_schema()

Schema([('year', Int64),
        ('ssyk_code', String),
        ('weight_sum', Int64),
        ('daioe_allapps_avg', Float64),
        ('daioe_stratgames_avg', Float64),
        ('daioe_videogames_avg', Float64),
        ('daioe_imgrec_avg', Float64),
        ('daioe_imgcompr_avg', Float64),
        ('daioe_imggen_avg', Float64),
        ('daioe_readcompr_avg', Float64),
        ('daioe_lngmod_avg', Float64),
        ('daioe_translat_avg', Float64),
        ('daioe_speechrec_avg', Float64),
        ('daioe_genai_avg', Float64),
        ('daioe_allapps_wavg', Float64),
        ('daioe_stratgames_wavg', Float64),
        ('daioe_videogames_wavg', Float64),
        ('daioe_imgrec_wavg', Float64),
        ('daioe_imgcompr_wavg', Float64),
        ('daioe_imggen_wavg', Float64),
        ('daioe_readcompr_wavg', Float64),
        ('daioe_lngmod_wavg', Float64),
        ('daioe_translat_wavg', Float64),
        ('daioe_speechrec_wavg', Float64),
        ('daioe_genai_wavg', Float64),
        

In [35]:
scb_lazy_lf.collect_schema()

Schema([('level', String),
        ('ssyk_code', String),
        ('age', String),
        ('sex', String),
        ('year', Int64),
        ('count', Int64),
        ('occupation', String)])

In [36]:
inspect_lazy(scb_lazy_lf)

Rows: 123,354
Columns: 7


In [37]:
inspect_lazy(daioe_all_levels)

Rows: 6,853
Columns: 59


## 16) Final Merge

Attach DAIOE aggregates back to the SCB base table.


In [38]:
final_merge = scb_lazy_lf\
    .join(
        daioe_all_levels,
        left_on=["year", "ssyk_code"],
        right_on=["year", "ssyk_code"],
        how="left"
    )

In [39]:
inspect_lazy(final_merge)

Rows: 123,354
Columns: 64


In [None]:
#dd = final_merge.limit(30).collect()

## 17) Export Final Dataset

Write the merged output to parquet for downstream use.


In [133]:
output_path = DATA_DIR / "daioe_scb_years_all_levels.parquet"

final_merge.sink_parquet(output_path)