# SCB Data Pull & Weight Computation

In [15]:
import pandas as pd 
from pyscbwrapper import SCB
from pathlib import Path

# Script-style alternative, kept for reference; presumably unusable here because a
# notebook kernel does not define __file__ — TODO confirm.
#ROOT = Path(__file__).resolve().parents[1]
# Project root = two directory levels above the current working directory, so the
# result depends on where the kernel was started.
ROOT = Path.cwd().resolve().parents[1]

In [16]:
# Taxonomy to pull; combined with the "_tab" suffix it selects an entry of TABLES.
TAX_ID = "ssyk2012"

# Leading SCB(...) arguments shared by every table listed below.
_SCB_COMMON = ("en", "AM", "AM0208", "AM0208E")

# Positional arguments passed to SCB(...) for each supported taxonomy table.
TABLES = {
    "ssyk2012_tab": (*_SCB_COMMON, "YREG51BAS"),
    "ssyk96_tab": (*_SCB_COMMON, "YREG33"),
}




In [17]:
# Look up the SCB(...) arguments registered for the active taxonomy and open that table.
table_args = TABLES[f"{TAX_ID}_tab"]
scb = SCB(*table_args)

In [18]:
# Display the table's metadata: its title and, per variable, the code, text and values.
scb.info()

{'title': 'Employees, number by occupation (SSYK 2012), level of education SUN 2020, age, sex, observations and year',
 'variables': [{'code': 'Yrke2012',
   'text': 'occupation (SSYK 2012)',
   'values': ['0110',
    '0210',
    '0310',
    '1111',
    '1112',
    '1113',
    '1120',
    '1211',
    '1212',
    '1221',
    '1222',
    '1230',
    '1241',
    '1242',
    '1251',
    '1252',
    '1291',
    '1292',
    '1311',
    '1312',
    '1321',
    '1322',
    '1331',
    '1332',
    '1341',
    '1342',
    '1351',
    '1352',
    '1361',
    '1362',
    '1371',
    '1372',
    '1380',
    '1411',
    '1412',
    '1421',
    '1422',
    '1491',
    '1492',
    '1511',
    '1512',
    '1521',
    '1522',
    '1531',
    '1532',
    '1540',
    '1591',
    '1592',
    '1611',
    '1612',
    '1711',
    '1712',
    '1721',
    '1722',
    '1731',
    '1732',
    '1741',
    '1742',
    '1791',
    '1792',
    '2111',
    '2112',
    '2113',
    '2114',
    '2121',
    '2122',
    '2

In [19]:
# Variables of the selected table. Later cells use this as a mapping from a
# variable's name to its list of values (e.g. var_["year"]).
var_ = scb.get_variables()

In [20]:
# Treat the first variable of the table as the occupation dimension
# (assumes the mapping lists occupation first — TODO confirm).
occupations_key = next(iter(var_))
occupations = var_[occupations_key]

In [21]:
# Variable name with every space removed; it is used as a keyword name for
# scb.set_query further down.
clean_key = "".join(occupations_key.split(" "))

In [22]:
# Years can be strings; coerce to int safely and pick max
def coerce_year(y):
    """Convert a year label to ``int``, or return ``None`` if it is not an integer.

    Parameters
    ----------
    y : object
        Candidate year, typically a string such as ``"2023"``.

    Returns
    -------
    int or None
        ``int(y)`` when the conversion succeeds; ``None`` when ``y`` has the
        wrong type (e.g. ``None``), is not integer text (e.g. ``"2023M01"``),
        or is a non-finite float.
    """
    try:
        return int(y)
    # Only the failures int() raises for unusable input are treated as "not a
    # year"; any other exception indicates a real problem and is left to surface.
    except (TypeError, ValueError, OverflowError):
        return None


# Keep only the year labels that parse as integers.
years = [coerce_year(y) for y in var_["year"]]
years = [y for y in years if y is not None]

# Fail with an explicit message instead of the opaque error max() raises on an
# empty list (same exception type, ValueError).
if not years:
    raise ValueError("No integer year values found in var_['year']; cannot pick the latest year.")

# The query is given the year as a string, so convert the maximum back.
latest_year = str(max(years))

# ========= Build query (match exact variable names) ========= #

# Select every occupation for the most recent year only.
query_selection = {
    clean_key: occupations,
    "year": [latest_year],
}
scb.set_query(**query_selection)

In [23]:
# Run the query configured above and keep the full response.
scb_data = scb.get_data()
# The observations live under the "data" key; the next cells iterate over them
# and read each entry's "key" and "values" fields.
scb_fetch = scb_data["data"]

In [24]:
# Codes selected for the first variable of the query (presumably the occupation
# variable, as it was the first one passed to set_query — TODO confirm).
occupation_selection = scb.get_query()["query"][0]["selection"]
codes = occupation_selection["values"]

# Pair each code with the occupation name found at the same position.
occ_dict = {code: label for code, label in zip(codes, occupations)}

In [25]:
# CREATING CLEANED DATAFRAME
def _as_record(row):
    """Flatten one SCB observation into a dict keyed by output column name."""
    code, year = row["key"][:2]  # occupation code, year
    return {
        "code_4": code,
        "occupation": occ_dict.get(code, code),  # fall back to the code when unnamed
        "year": year,
        "value": row["values"][0],  # raw string
    }


records = [_as_record(row) for row in scb_fetch]

df = pd.DataFrame(records)
# Drop code "0002" (the unidentified group) and renumber the rows.
df = df.loc[df["code_4"].ne("0002")].reset_index(drop=True)



In [26]:
# Code formatting: left-pad codes to four characters, then derive the 3-, 2- and
# 1-character prefixes as separate columns.
padded_codes = df["code_4"].astype(str).str.zfill(4)
df["code_4"] = padded_codes
for width in (3, 2, 1):
    df[f"code_{width}"] = padded_codes.str[:width]

# Values arrive as raw strings; convert them to integers.
df["value"] = df["value"].astype(int)



In [27]:
# Mapping: level → column name
level_map = {4: "code_4", 3: "code_3", 2: "code_2", 1: "code_1"}


def _sum_by_level(frame, level, column):
    """Sum `value` per (year, code in `column`) and tag the rows with `level`."""
    totals = frame.groupby(["year", column], as_index=False)["value"].sum()
    return totals.rename(columns={column: "code"}).assign(level=level)


# One aggregated frame per level (employment summed within each parent group).
level_frames = [_sum_by_level(df, level, column) for level, column in level_map.items()]

# Combine all levels 1–4 into unified taxonomy format
stacked = pd.concat(level_frames, ignore_index=True)
stacked["taxonomy"] = TAX_ID
df = (
    stacked.loc[:, ["taxonomy", "year", "level", "code", "value"]]
    .sort_values(["year", "level", "code"])
    .reset_index(drop=True)
)

In [28]:
# Make sure the output directory exists before writing into it.
out_dir = ROOT / "data" / "02_scb_data"
out_dir.mkdir(parents=True, exist_ok=True)

# The file name records the taxonomy and the year that was pulled.
out_path = out_dir / f"{TAX_ID}_weights_en_{latest_year}.csv"
df.to_csv(out_path, index=False)
print(f"Wrote: {out_path.resolve()}")

Wrote: /media/jojo/DataDisk/2_Work/1_RA_Work/data/02_scb_data/ssyk2012_weights_en_2023.csv
