CELL 0 — Imports



In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
import os
import datetime
from tqdm.auto import tqdm

CELL 1 — Spark session (reuse if already running)

# Reuse the notebook's existing `spark` session if the name is already bound.
# Evaluating the bare name raises NameError when it is not, which routes
# execution to the builder below.
try:
    spark  # noqa: F821
    spark.sparkContext.setLogLevel("ERROR")
    print("Re-using existing Spark session.")
except NameError:
    # No session in scope: configure executor/driver resources, adaptive query
    # execution, Kryo serialisation, Arrow-backed pandas conversion and
    # snappy-compressed parquet output, then create the session.
    auto_config = (
        SparkSession.builder
        .appName("OECD_Data_Intensity_Pipeline")
        .config("spark.executor.memory", "13g")
        .config("spark.driver.memory", "3g")
        .config("spark.executor.cores", "4")
        .config("spark.driver.cores", "4")
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
        .config("spark.sql.adaptive.skewJoin.enabled", "true")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")
        .config("spark.sql.parquet.compression.codec", "snappy")
    )
    spark = auto_config.getOrCreate()
    # Keep notebook output readable: only errors are logged.
    spark.sparkContext.setLogLevel("ERROR")
    print("Spark session created.")

CELL 2 — Global parameters (NO loose vars later)

In [None]:
# ---------------------------
# Pipeline parameters
# ---------------------------
# Year selection, interpreted by parse_year_input().
YEARS = "2020-2025"   # "ALL" or "2020-2025" or "2025" or [2020,2022]

# Sampling
# SAMPLE_FRACTION is applied per year when the data is loaded; values >= 1.0
# (or None) disable sampling.
SAMPLE_FRACTION = 0.01       # 1% dev mode; set to 1.0 for full run
SAMPLE_SEED = 42             # seed for both the main sample and the debug sample
# Only consulted when loading from CSV; the unified-parquet branch always
# samples uniformly.
STRATIFIED_BY_SOC = False   # True = more representative; False = fastest

# NLP thresholds (OECD-style)
# A noun chunk counts as data-related when its similarity to "data" is >= SIM_THRESHOLD.
SIM_THRESHOLD = 0.45         # filter for "data-related" semantic similarity
# A job is flagged data-intensive when n_chunks_data >= DATA_THRESHOLD.
DATA_THRESHOLD = 3           # ≥ 3 data-related chunks => data-intensive job

# How much to store per job
TOP_K = 10                   # store top K most "data-like" chunks

# Re-run behaviour
# When False, a step is skipped for a year if a _LATEST_POINTER already exists
# for its output; changing the parameters above does not invalidate old outputs.
FORCE_RECOMPUTE = False      # if True, overwrite outputs even if exist
WRITE_DEBUG_CHUNKS = False   # only for tiny tests
DEBUG_SAMPLE_FRACTION = 0.0001  # fraction of the (already sampled) year used for debug chunks

print("Parameters set.")

CELL 3 — Paths (S3A root + per-year output folders)


In [None]:
# Root directory: keep everything inside OECD_DATA
# Every input/output path built by get_paths_for_year() is checked against this
# root by validate_oecd_path().
BASE_S3A_PATH = "s3a://onscdp-prd-data01-d4946922/dapsen/workspace_zone/online_job_ads/OECD_DATA"

# If you ever want to read from a single parquet instead of CSV partitions
# A truthy PARQUET_PATH makes the load step read that file and split it by year.
PARQUET_PATH = None  # e.g. f"{BASE_S3A_PATH}/somefile.parquet"
# Optional override for the CSV root; the year is appended as a sub-folder.
CSV_PATH = None      # if None, uses BASE_S3A_PATH/csv_data/<year>

def get_available_years():
    """Return the years the pipeline supports: 2020 through 2025 inclusive."""
    first_year, last_year = 2020, 2025
    return [*range(first_year, last_year + 1)]

def parse_year_input(year_input):
    """Normalise a year selection into a sorted list of distinct supported years.

    Accepted forms:
      - "ALL" (case-insensitive): every year from get_available_years()
      - "2020-2025": inclusive range
      - "2025" or 2025: a single year
      - [2020, 2022] or (2020, 2022): explicit years

    Args:
        year_input: str, int, list or tuple as described above.

    Returns:
        Sorted list of unique ints.

    Raises:
        ValueError: if the input is malformed, a range is reversed, no year is
            selected, or any requested year is not available.
    """
    available = get_available_years()
    if isinstance(year_input, str):
        y = year_input.strip().upper()
        if y == "ALL":
            years = available
        elif "-" in y:
            bounds = y.split("-")
            if len(bounds) != 2:
                raise ValueError(
                    f"Invalid year range {year_input!r}. Expected 'START-END'."
                )
            s, e = (int(b) for b in bounds)
            # A reversed range would otherwise produce an empty selection that
            # only fails much later, far from the misconfigured parameter.
            if s > e:
                raise ValueError(
                    f"Invalid year range {year_input!r}: start {s} is after end {e}."
                )
            years = list(range(s, e + 1))
        else:
            years = [int(y)]
    elif isinstance(year_input, (list, tuple)):
        years = [int(x) for x in year_input]
    else:
        years = [int(year_input)]

    if not years:
        raise ValueError(f"No years selected. Available: {available}")

    bad = [yy for yy in years if yy not in available]
    if bad:
        raise ValueError(f"Invalid years {bad}. Available: {available}")

    # De-duplicate so a repeated year is not processed (and written) twice.
    return sorted(set(years))

# Resolve the YEARS setting once; the later cells iterate over this list.
years_to_process = parse_year_input(YEARS)
print("Years to process:", years_to_process)

def validate_oecd_path(path: str) -> bool:
    """Check that ``path`` is the OECD_DATA root or lies underneath it.

    The match is made on a whole path segment, so a sibling location that
    merely shares the root as a string prefix (e.g. ``.../OECD_DATA_old``) is
    rejected, as is any path containing a ``..`` segment.

    Args:
        path: location to check.

    Returns:
        True when the path is inside the root.

    Raises:
        ValueError: if the path is outside the root.
    """
    root = BASE_S3A_PATH.rstrip("/")
    inside_root = path == root or path.startswith(root + "/")
    if not inside_root or ".." in path.split("/"):
        raise ValueError(f"Path {path} is outside OECD_DATA root")
    return True

def get_paths_for_year(year: int):
    """Return the input and output locations used for one year.

    Args:
        year: calendar year; it becomes the final path segment of the input
            folder and the folder under ``processed_data`` for outputs.

    Returns:
        dict mapping a logical name ("input_csv", "job_features", ...) to a
        path string. The input lives under ``CSV_PATH`` when that is set,
        otherwise under ``BASE_S3A_PATH/csv_data``.
    """
    # Object-store URIs always use "/" separators; os.path.join follows the
    # host OS and would emit backslashes on Windows.
    import posixpath

    year_str = str(year)
    base_year_path = posixpath.join(BASE_S3A_PATH, "processed_data", year_str)
    csv_root = posixpath.join(BASE_S3A_PATH, "csv_data") if CSV_PATH is None else CSV_PATH
    input_csv = posixpath.join(csv_root, year_str)

    return {
        # input
        "input_csv": input_csv,

        # outputs (job-level, efficient)
        "job_features": posixpath.join(base_year_path, "job_features"),
        "debug_chunks": posixpath.join(base_year_path, "debug_chunks"),

        # classification outputs
        "job_categories": posixpath.join(base_year_path, "job_categories"),
        "occupation_summary": posixpath.join(base_year_path, "occupation_summary"),

        # synthetic industry outputs
        "job_categories_sic": posixpath.join(base_year_path, "job_categories_sic"),
        "sector_summary_sic": posixpath.join(base_year_path, "sector_summary_sic"),
    }

# Per-year path lookup: ALL_PATHS[year][logical_name] -> path string.
ALL_PATHS = {yr: get_paths_for_year(yr) for yr in years_to_process}

# Validate
# validate_oecd_path raises ValueError on the first path outside the root, so
# reaching the print below means every configured path passed.
for yr, paths in ALL_PATHS.items():
    for k, p in paths.items():
        validate_oecd_path(p)

print("Path validation complete.")
# Show the first year's paths as a sanity check.
print("Example year paths:", years_to_process[0], ALL_PATHS[years_to_process[0]])

CELL 4 — Safe writing utilities (S3A-safe, versioned, restartable)

In [None]:
def safe_write_parquet_s3a(df, output_base_path, mode="overwrite", create_version=True, verify_read=True):
    """Write ``df`` as snappy parquet, optionally into a timestamped version folder.

    Args:
        df: Spark DataFrame to persist.
        output_base_path: base location of the dataset.
        mode: Spark save mode used for the parquet write.
        create_version: when True, write to ``output_base_path/_v=YYYYMMDD_HHMMSS``
            and refresh ``output_base_path/_LATEST_POINTER`` afterwards; when
            False, write directly to ``output_base_path``.
        verify_read: when True, read one row back from the written location so
            that an unreadable output raises immediately.

    Returns:
        The path the parquet data was written to.

    Marker and pointer writes are best-effort: a failure is printed as a
    warning and does not abort the call.
    """
    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    if create_version:
        version_path = f"{output_base_path}/_v={ts}"
    else:
        version_path = output_base_path

    print(f"[WRITE] parquet -> {version_path}")
    parquet_writer = df.write.mode(mode).option("compression", "snappy")
    parquet_writer.parquet(version_path)

    if verify_read:
        # The count itself is irrelevant; the read-back is the check.
        spark.read.parquet(version_path).limit(1).count()

    # Optional marker recording when this version was written.
    marker_path = f"{version_path}/_SUCCESS_MARKER"
    try:
        marker_rows = spark.createDataFrame([(ts,)], ["written_at"])
        marker_rows.coalesce(1).write.mode("overwrite").json(marker_path)
    except Exception as e:
        print(f"Warning: marker write failed: {e}")

    if not create_version:
        return version_path

    # Point readers at the version that was just written.
    pointer_path = f"{output_base_path}/_LATEST_POINTER"
    try:
        pointer_rows = spark.createDataFrame([(version_path, ts)], ["latest_path", "timestamp"])
        pointer_rows.coalesce(1).write.mode("overwrite").json(pointer_path)
    except Exception as e:
        print(f"Warning: pointer update failed: {e}")

    return version_path

def read_latest_version(output_base_path):
    """Load the parquet version referenced by ``output_base_path/_LATEST_POINTER``.

    Returns:
        Tuple ``(dataframe, path)`` for the pointer row with the greatest
        ``timestamp``.

    Any failure to read the pointer or the parquet data propagates to the
    caller; callers in this notebook use that as a "not computed yet" signal.
    """
    pointer_rows = spark.read.json(f"{output_base_path}/_LATEST_POINTER")
    newest_first = pointer_rows.orderBy(F.col("timestamp").desc())
    latest = newest_first.limit(1).collect()[0]["latest_path"]
    return spark.read.parquet(latest), latest

# Confirmation message so the notebook output shows the helper cell has run.
print("Utilities ready: safe_write_parquet_s3a(), read_latest_version()")

CELL 5 — Step 1: Load data per year (with sampling parameter)
SAMPLE_FRACTION (set in Cell 2) is applied here, so you can switch between e.g. 1% / 2% / 10% samples without touching downstream logic.

In [None]:
# Per-year input DataFrames (lazy; only the counts below trigger execution)
# and a running total of loaded rows across all years.
all_data = {}
total_jobs = 0

if PARQUET_PATH:
    # Single unified parquet: read once, then slice by calendar year.
    print(f"[LOAD] unified parquet from {PARQUET_PATH}")
    full_df = spark.read.parquet(PARQUET_PATH).withColumn("date", F.to_date("date"))

    for yr in tqdm(years_to_process, desc="Reading data (Parquet)"):
        df = full_df.filter(F.year("date") == yr)

        # Uniform sampling only; STRATIFIED_BY_SOC is not consulted in this branch.
        if SAMPLE_FRACTION is not None and float(SAMPLE_FRACTION) < 1.0:
            frac = float(SAMPLE_FRACTION)
            df = df.sample(False, frac, seed=SAMPLE_SEED)
            print(f"[LOAD] Year {yr}: sampled {frac*100:.2f}%")

        # count() runs the read + sample; the DataFrame is not cached, so later
        # actions on it recompute from source.
        cnt = df.count()
        all_data[yr] = df
        total_jobs += cnt
        print(f"[LOAD] Year {yr}: {cnt:,} rows")

else:
    # One CSV folder per year.
    for yr in tqdm(years_to_process, desc="Reading data (CSV)"):
        print(f"\n[LOAD] Year {yr} from {ALL_PATHS[yr]['input_csv']} ...")

        # Keep only the columns the pipeline uses, drop adverts without text,
        # and parse the date column as yyyy-MM-dd.
        df = (
            spark.read
            .option("header", True)
            .option("multiline", True)
            .csv(ALL_PATHS[yr]["input_csv"])
            .select(
                "date",
                "job_id",
                "soc_2020",
                "job_title",
                F.col("full_text").cast("string")
            )
            .filter(F.col("full_text").isNotNull() & (F.length("full_text") > 0))
            .withColumn("date", F.to_date("date", "yyyy-MM-dd"))
        )

        # Sampling
        if SAMPLE_FRACTION is not None and float(SAMPLE_FRACTION) < 1.0:
            frac = float(SAMPLE_FRACTION)

            if STRATIFIED_BY_SOC:
                # approximate stratified sample
                # Same fraction for every non-null SOC code (at most 50,000
                # distinct codes are collected). Null SOC codes get no entry in
                # `fractions`; per the sampleBy documentation, unlisted strata
                # are sampled at fraction zero, so those rows are dropped.
                soc_vals = [r["soc_2020"] for r in df.select("soc_2020").distinct().limit(50000).collect()]
                fractions = {s: frac for s in soc_vals if s is not None}
                df = df.sampleBy("soc_2020", fractions=fractions, seed=SAMPLE_SEED)
                print(f"[LOAD] Year {yr}: stratified sample {frac*100:.2f}% by SOC")
            else:
                df = df.sample(False, frac, seed=SAMPLE_SEED)
                print(f"[LOAD] Year {yr}: uniform sample {frac*100:.2f}%")

        # count() runs the read + sample; the DataFrame is not cached, so later
        # actions on it recompute from source.
        cnt = df.count()
        all_data[yr] = df
        total_jobs += cnt
        print(f"[LOAD] Year {yr}: {cnt:,} rows")

print(f"\n[LOAD] Total jobs loaded: {total_jobs:,}")

CELL 6 — Step 2: NLP job-level feature extraction (efficient storage)


OECD alignment (adapted)
	•	The OECD approach extracts noun chunks from “pure data” occupations, then applies that list of chunks across all jobs.
	•	This pipeline's adaptation: it still extracts noun chunks and scores their similarity to “data”, but stores only job-level summaries and the top-K chunks per job. This keeps the pipeline scalable while preserving interpretability.

In [None]:
# Schema: one row per job advert
# Output schema of extract_job_features(). Key columns, including the date,
# are carried as strings; the counts and similarity summary are numeric.
job_features_schema = StructType([
    StructField("date", StringType()),
    StructField("job_id", StringType()),
    StructField("soc_2020", StringType()),
    StructField("n_chunks_total", IntegerType()),       # all noun chunks in the advert
    StructField("n_chunks_data", IntegerType()),        # chunks with similarity >= SIM_THRESHOLD
    StructField("avg_sim_data", DoubleType()),          # mean similarity of data chunks; null if none
    StructField("top_chunks", ArrayType(StringType())), # up to TOP_K chunk texts, most similar first
    StructField("top_sims", ArrayType(DoubleType())),   # similarities aligned with top_chunks
])

# Optional debug schema: chunk-level (ONLY for tiny sample)
# Output schema of extract_debug_chunks(): one row per noun chunk.
debug_schema = StructType([
    StructField("date", StringType()),
    StructField("job_id", StringType()),
    StructField("soc_2020", StringType()),
    StructField("noun_chunk", StringType()),
    StructField("sim_data", DoubleType()),
])

def extract_job_features(iterator):
    """Build one summary row per job advert from its noun chunks (mapInPandas worker).

    Every spaCy noun chunk of an advert is compared with the word "data".
    Chunks whose similarity is at least SIM_THRESHOLD count as data-related;
    the TOP_K most similar ones are kept together with their scores.

    Args:
        iterator: iterable of pandas DataFrames holding the columns
            ``full_text``, ``job_id``, ``soc_2020`` and ``date``.

    Yields:
        One pandas DataFrame per input batch with the columns of
        ``job_features_schema``. Missing ``job_id`` / ``soc_2020`` / ``date``
        values stay null instead of becoming the strings "None" / "nan", so
        they do not show up downstream as a bogus SOC group.
    """
    import spacy
    import pandas as pd

    try:
        nlp = spacy.load("en_core_web_lg", exclude=["lemmatizer", "ner"])
    except OSError:
        # NOTE(review): the small model ships without static word vectors, so
        # similarities computed with it are not comparable to en_core_web_lg.
        # Confirm this fallback is acceptable outside of smoke tests.
        nlp = spacy.load("en_core_web_sm")

    target = nlp("data")

    out_columns = [
        "date", "job_id", "soc_2020",
        "n_chunks_total", "n_chunks_data", "avg_sim_data",
        "top_chunks", "top_sims",
    ]

    def _str_or_none(series):
        # Stringify present values only; astype(str) would turn nulls into text.
        return [None if pd.isna(v) else str(v) for v in series.tolist()]

    for pdf in iterator:
        texts = pdf["full_text"].fillna("").astype(str).tolist()
        jobids = _str_or_none(pdf["job_id"])
        socs = _str_or_none(pdf["soc_2020"])
        dates = _str_or_none(pdf["date"])

        out_rows = []

        # nlp.pipe yields documents in input order, so index i lines up with
        # the key lists built above.
        for i, doc in enumerate(nlp.pipe(texts, batch_size=50, n_process=1)):
            total_chunks = 0
            data_chunks = []

            for chunk in doc.noun_chunks:
                total_chunks += 1
                if chunk.has_vector:
                    sim = float(chunk.similarity(target))
                    if sim >= SIM_THRESHOLD:
                        # Strip digits, then normalise whitespace and case.
                        cleaned = "".join(c for c in chunk.text if not c.isdigit()).strip().lower()
                        if cleaned:
                            data_chunks.append((cleaned, sim))

            n_data = len(data_chunks)
            if n_data > 0:
                data_chunks.sort(key=lambda x: x[1], reverse=True)
                top = data_chunks[:TOP_K]
                top_chunks = [t[0] for t in top]
                top_sims = [float(t[1]) for t in top]
                avg_sim = float(sum(t[1] for t in data_chunks) / n_data)
            else:
                top_chunks, top_sims, avg_sim = [], [], None

            out_rows.append({
                "date": dates[i],
                "job_id": jobids[i],
                "soc_2020": socs[i],
                "n_chunks_total": int(total_chunks),
                "n_chunks_data": int(n_data),
                "avg_sim_data": avg_sim,
                "top_chunks": top_chunks,
                "top_sims": top_sims,
            })

        # Explicit columns keep the frame well-formed even for an empty batch.
        yield pd.DataFrame(out_rows, columns=out_columns)

def extract_debug_chunks(iterator):
    """Emit one row per noun chunk with its similarity to "data" (mapInPandas worker).

    Unlike extract_job_features, no similarity threshold is applied: every
    chunk that has a vector and is non-empty after cleaning is emitted.
    Intended for tiny samples only.

    Args:
        iterator: iterable of pandas DataFrames holding the columns
            ``full_text``, ``job_id``, ``soc_2020`` and ``date``.

    Yields:
        One pandas DataFrame per input batch with the columns of
        ``debug_schema``. Missing key values stay null instead of becoming
        the strings "None" / "nan".
    """
    import spacy
    import pandas as pd

    try:
        nlp = spacy.load("en_core_web_lg", exclude=["lemmatizer", "ner"])
    except OSError:
        # NOTE(review): the small model ships without static word vectors, so
        # similarities computed with it are not comparable to en_core_web_lg.
        nlp = spacy.load("en_core_web_sm")

    target = nlp("data")

    out_columns = ["date", "job_id", "soc_2020", "noun_chunk", "sim_data"]

    def _str_or_none(series):
        # Stringify present values only; astype(str) would turn nulls into text.
        return [None if pd.isna(v) else str(v) for v in series.tolist()]

    for pdf in iterator:
        texts = pdf["full_text"].fillna("").astype(str).tolist()
        jobids = _str_or_none(pdf["job_id"])
        socs = _str_or_none(pdf["soc_2020"])
        dates = _str_or_none(pdf["date"])

        rows = []

        # nlp.pipe yields documents in input order, so index i lines up with
        # the key lists built above.
        for i, doc in enumerate(nlp.pipe(texts, batch_size=50, n_process=1)):
            for chunk in doc.noun_chunks:
                if chunk.has_vector:
                    sim = float(chunk.similarity(target))
                    # Strip digits, then normalise whitespace and case.
                    cleaned = "".join(c for c in chunk.text if not c.isdigit()).strip().lower()
                    if cleaned:
                        rows.append({
                            "date": dates[i],
                            "job_id": jobids[i],
                            "soc_2020": socs[i],
                            "noun_chunk": cleaned,
                            "sim_data": sim,
                        })

        # Explicit columns cover both the populated and the empty case.
        yield pd.DataFrame(rows, columns=out_columns)

# Per-year driver: compute job-level NLP features and persist them as a new
# version, unless a previous version exists and FORCE_RECOMPUTE is off.
for yr in tqdm(years_to_process, desc="NLP job feature extraction"):
    out_base = ALL_PATHS[yr]["job_features"]

    # Skip if already computed
    if not FORCE_RECOMPUTE:
        try:
            _df, latest_path = read_latest_version(out_base)
        except Exception as exc:
            # The read doubles as an existence check, so a failure usually means
            # "nothing written yet". Report the reason rather than swallowing
            # it, so an access or network problem is visible before an
            # expensive recompute starts.
            print(f"\n[NLP] Year {yr}: no readable job_features at {out_base} "
                  f"({type(exc).__name__}); computing.")
        else:
            print(f"\n[SKIP] Year {yr} job_features already exist at: {latest_path}")
            continue

    print(f"\n[NLP] Year {yr} starting…")
    df = all_data[yr]

    # Partition tuning (important)
    # Aim for ~128 MiB partitions assuming ~1000 bytes per row, never fewer than 16.
    total_rows = df.count()
    estimated_row_size = 1000
    target_partition_bytes = 128 * 1024 * 1024
    num_partitions = max(16, int((total_rows * estimated_row_size) / target_partition_bytes))
    print(f"[NLP] Year {yr} rows={total_rows:,} partitions={num_partitions}")

    df_part = df.repartition(num_partitions)

    job_feat = df_part.mapInPandas(extract_job_features, schema=job_features_schema)
    # `date` comes back as a string; derive year/month columns from it.
    job_feat = job_feat.withColumn("year", F.year(F.to_date("date"))) \
                       .withColumn("month", F.month(F.to_date("date")))

    latest_written = safe_write_parquet_s3a(
        job_feat,
        out_base,
        mode="overwrite",
        create_version=True,
        verify_read=True
    )

    print(f"[NLP] Year {yr} job_features written to: {latest_written}")

    if WRITE_DEBUG_CHUNKS:
        # Chunk-level output for a further sub-sample of this year's rows.
        dbg_base = ALL_PATHS[yr]["debug_chunks"]
        dbg_df = df_part.sample(False, DEBUG_SAMPLE_FRACTION, seed=SAMPLE_SEED)
        dbg_chunks = dbg_df.mapInPandas(extract_debug_chunks, schema=debug_schema) \
                           .withColumn("year", F.year(F.to_date("date"))) \
                           .withColumn("month", F.month(F.to_date("date")))
        dbg_written = safe_write_parquet_s3a(dbg_chunks, dbg_base, create_version=True)
        print(f"[NLP-DEBUG] Year {yr} debug chunks written to: {dbg_written}")

CELL 7 — Step 3: Job classification + Occupation summary (SOC4)
Digit level: this step produces occupation estimates at the SOC 4-digit level (soc4 = first four characters of soc_2020).

In [None]:
# Landmark SOC4 groups (the current "OECD categories via SOC" approach)
# A job falls into a category only if its SOC4 code is in the set AND it is
# data-intensive (n_chunks_data >= DATA_THRESHOLD).
data_entry_soc = {"4111","4112","4113","4114","4121","4131","4132","4150"}
database_soc = {"2423","2136"}
data_analytics_soc = {"2421","2424","2133","2135"}

for yr in tqdm(years_to_process, desc="Job classification + occ summary"):
    print(f"\n[CLASSIFY] Year {yr}")

    # Raises if job_features has not been written for this year.
    job_feat_df, job_feat_path = read_latest_version(ALL_PATHS[yr]["job_features"])
    print(f"[CLASSIFY] Using job_features from: {job_feat_path}")

    # Keep rows dated within this year (rows whose date does not parse are
    # dropped) and derive the 4-character SOC prefix.
    job_feat_df = job_feat_df.filter(F.year(F.to_date("date")) == yr)
    job_feat_df = job_feat_df.withColumn("soc4", F.substring(F.col("soc_2020"), 1, 4))

    is_data_intensive = (F.col("n_chunks_data") >= F.lit(DATA_THRESHOLD))

    # 0/1 indicator per category plus an overall data-intensive flag.
    job_categories = (
        job_feat_df.select("job_id","soc_2020","soc4","date","n_chunks_data","avg_sim_data","top_chunks","top_sims")
        .withColumn("data_entry", (F.col("soc4").isin(list(data_entry_soc)) & is_data_intensive).cast("int"))
        .withColumn("database", (F.col("soc4").isin(list(database_soc)) & is_data_intensive).cast("int"))
        .withColumn("data_analytics", (F.col("soc4").isin(list(data_analytics_soc)) & is_data_intensive).cast("int"))
        .withColumn("any_data_intensive", is_data_intensive.cast("int"))
        .withColumn("year", F.year(F.to_date("date")))
        .withColumn("month", F.month(F.to_date("date")))
    )

    # Save job-level categories
    # Without FORCE_RECOMPUTE an existing version is reused; any failure to
    # read it is treated as "missing" and triggers a write.
    jc_base = ALL_PATHS[yr]["job_categories"]
    if FORCE_RECOMPUTE:
        jc_written = safe_write_parquet_s3a(job_categories, jc_base, create_version=True)
    else:
        try:
            _df, latest = read_latest_version(jc_base)
            print(f"[SKIP] job_categories exists: {latest}")
            jc_written = latest
        except Exception:
            jc_written = safe_write_parquet_s3a(job_categories, jc_base, create_version=True)

    # Occupation summary (SOC4)
    # Built from the in-memory job_categories above, not from the stored
    # version; shares are percentages of all jobs in the SOC4 group.
    occ = (
        job_categories.groupBy("soc4")
        .agg(
            F.count("*").alias("total_jobs"),
            F.sum("data_entry").alias("data_entry_jobs"),
            F.sum("database").alias("database_jobs"),
            F.sum("data_analytics").alias("data_analytics_jobs"),
            F.sum("any_data_intensive").alias("any_data_intensive_jobs"),
        )
        .withColumn("data_entry_share", 100 * F.col("data_entry_jobs") / F.col("total_jobs"))
        .withColumn("database_share", 100 * F.col("database_jobs") / F.col("total_jobs"))
        .withColumn("data_analytics_share", 100 * F.col("data_analytics_jobs") / F.col("total_jobs"))
        .withColumn("total_data_share", 100 * F.col("any_data_intensive_jobs") / F.col("total_jobs"))
        .withColumn("year", F.lit(int(yr)))
    )

    # Same reuse-or-write rule as for job_categories.
    os_base = ALL_PATHS[yr]["occupation_summary"]
    if FORCE_RECOMPUTE:
        os_written = safe_write_parquet_s3a(occ, os_base, create_version=True)
    else:
        try:
            _df, latest = read_latest_version(os_base)
            print(f"[SKIP] occupation_summary exists: {latest}")
            os_written = latest
        except Exception:
            os_written = safe_write_parquet_s3a(occ, os_base, create_version=True)

    print(f"[CLASSIFY] Saved year {yr}: job_categories={jc_written} | occupation_summary={os_written}")

CELL 8 — Step 4: Synthetic SIC Section (Option 1 rule-based SOC→SIC)
This produces industry at SIC “Section” level (A–U style) — same level OECD often reports when rolling up to broad industries.

In [None]:
# Synthetic SIC mapping (DEMO ONLY)
# Lookup keyed by the SOC major group (first character of the SOC code);
# the value is the proxy SIC Section letter.
SOC_MAJOR_TO_SIC = {
    "1": "M",  # managers -> professional/technical proxy
    "2": "M",
    "3": "J",  # associate prof -> info/comm proxy
    "4": "N",  # admin -> admin/support proxy
    "5": "F",  # skilled trades -> construction proxy
    "6": "Q",  # caring -> health/social proxy
    "7": "G",  # sales -> wholesale/retail proxy
    "8": "C",  # process/plant -> manufacturing proxy
    "9": "N",  # elementary -> admin/support proxy
}


def soc_to_sic_section(soc):
    """Map a SOC code to its synthetic SIC Section letter.

    The code is stringified and stripped, and its first character is looked up
    in SOC_MAJOR_TO_SIC. Returns None for a missing or blank code, or when the
    first character has no mapping.
    """
    code = "" if soc is None else str(soc).strip()
    if not code:
        return None
    major_group = code[0]
    return SOC_MAJOR_TO_SIC.get(major_group)

# Spark UDF wrapper around soc_to_sic_section (returns a string column).
sic_udf = F.udf(soc_to_sic_section, StringType())

for yr in tqdm(years_to_process, desc="Synthetic SIC + sector summary"):
    print(f"\n[SIC-SYNTH] Year {yr}")

    # Raises if job_categories has not been written for this year.
    jc_df, jc_path = read_latest_version(ALL_PATHS[yr]["job_categories"])
    jc_df = jc_df.filter(F.year(F.to_date("date")) == yr)
    print(f"[SIC-SYNTH] Using job_categories from: {jc_path}")

    # Attach the synthetic section; null where the SOC code has no mapping.
    jc_sic = jc_df.withColumn("SICSection_synth", sic_udf(F.col("soc_2020")))

    # Save job-level with SIC
    # Without FORCE_RECOMPUTE an existing version is reused; any failure to
    # read it is treated as "missing" and triggers a write.
    jc_sic_base = ALL_PATHS[yr]["job_categories_sic"]
    if FORCE_RECOMPUTE:
        jc_sic_written = safe_write_parquet_s3a(jc_sic, jc_sic_base, create_version=True)
    else:
        try:
            _df, latest = read_latest_version(jc_sic_base)
            print(f"[SKIP] job_categories_sic exists: {latest}")
            jc_sic_written = latest
        except Exception:
            jc_sic_written = safe_write_parquet_s3a(jc_sic, jc_sic_base, create_version=True)

    # Sector summary by synthetic SIC section
    # Jobs without a section are excluded; shares are percentages of the
    # remaining jobs in each section.
    sec = (
        jc_sic.filter(F.col("SICSection_synth").isNotNull())
        .groupBy("SICSection_synth")
        .agg(
            F.count("*").alias("total_jobs"),
            F.sum("data_entry").alias("data_entry_jobs"),
            F.sum("database").alias("database_jobs"),
            F.sum("data_analytics").alias("data_analytics_jobs"),
            F.sum("any_data_intensive").alias("any_data_intensive_jobs"),
        )
        .withColumn("data_entry_share", 100 * F.col("data_entry_jobs") / F.col("total_jobs"))
        .withColumn("database_share", 100 * F.col("database_jobs") / F.col("total_jobs"))
        .withColumn("data_analytics_share", 100 * F.col("data_analytics_jobs") / F.col("total_jobs"))
        .withColumn("total_data_share", 100 * F.col("any_data_intensive_jobs") / F.col("total_jobs"))
        .withColumn("year", F.lit(int(yr)))
    )

    # Same reuse-or-write rule as for job_categories_sic.
    sec_base = ALL_PATHS[yr]["sector_summary_sic"]
    if FORCE_RECOMPUTE:
        sec_written = safe_write_parquet_s3a(sec, sec_base, create_version=True)
    else:
        try:
            _df, latest = read_latest_version(sec_base)
            print(f"[SKIP] sector_summary_sic exists: {latest}")
            sec_written = latest
        except Exception:
            sec_written = safe_write_parquet_s3a(sec, sec_base, create_version=True)

    print(f"[SIC-SYNTH] Saved year {yr}: job_categories_sic={jc_sic_written} | sector_summary={sec_written}")

CELL 9 — Step 5: Load outputs into pandas for visualisations

In [None]:
import pandas as pd
import plotly.express as px

# Collect the latest per-year summaries on the driver as pandas frames.
occ_frames = []
sec_frames = []

for yr in years_to_process:
    # Raises if either summary has not been written for this year.
    occ_df, occ_path = read_latest_version(ALL_PATHS[yr]["occupation_summary"])
    sec_df, sec_path = read_latest_version(ALL_PATHS[yr]["sector_summary_sic"])

    occ_frames.append(occ_df.toPandas())
    sec_frames.append(sec_df.toPandas())

# Stack all years into one frame per summary type.
occupation_df = pd.concat(occ_frames, ignore_index=True)
sector_df = pd.concat(sec_frames, ignore_index=True)

# Normalise the year column to int for grouping and plotting.
occupation_df["year"] = occupation_df["year"].astype(int)
sector_df["year"] = sector_df["year"].astype(int)

print("Loaded occupation_df:", occupation_df.shape)
print("Loaded sector_df:", sector_df.shape)

CELL 10 — Step 6: Core plots (multi-year)


A) overall trend

In [None]:
# Aggregate job counts across all SOC4 groups per year, then recompute the
# share from the totals (rather than averaging the per-occupation shares).
yearly = (
    occupation_df.groupby("year")[["total_jobs","any_data_intensive_jobs"]]
    .sum()
    .reset_index()
)
yearly["data_intensive_share"] = 100 * yearly["any_data_intensive_jobs"] / yearly["total_jobs"]

# Line chart of the yearly data-intensive share.
fig = px.line(
    yearly.sort_values("year"),
    x="year",
    y="data_intensive_share",
    markers=True,
    title="Share of data-intensive job adverts over time",
    labels={"year":"Year", "data_intensive_share":"Data-intensive jobs (%)"}
)
fig.show()

B) SOC4 top occupations per year (facets)

In [None]:
# Number of occupations shown per year.
top_n = 20
# Rank SOC4 groups by total_data_share within each year and keep the top_n.
top_each_year = (
    occupation_df.sort_values(["year","total_data_share"], ascending=[True,False])
    .groupby("year")
    .head(top_n)
)

# Long format: one row per (year, soc4, category) for stacked plotting.
# NOTE(review): the ranking uses total_data_share, but the stacked values are
# the three landmark-category shares, which are non-zero only for SOC4 codes
# in the landmark sets. Bars for other occupations will be empty.
melted = top_each_year.melt(
    id_vars=["year","soc4"],
    value_vars=["data_entry_share","database_share","data_analytics_share"],
    var_name="category",
    value_name="share"
)

# Human-readable legend labels for the share columns.
label_map = {
    "data_entry_share":"Data entry",
    "database_share":"Database",
    "data_analytics_share":"Data analytics"
}
melted["category_label"] = melted["category"].map(label_map)

# One facet per year, two facets per row.
fig = px.bar(
    melted,
    x="soc4",
    y="share",
    color="category_label",
    facet_col="year",
    facet_col_wrap=2,
    title=f"Top {top_n} occupations by data intensity — each year (stacked components)",
    labels={"soc4":"SOC (4-digit)", "share":"Share (%)", "category_label":"Layer"},
    height=900
)
fig.update_layout(barmode="stack")
fig.show()

C) Synthetic SIC sector view (multi-year)

In [None]:
# One line per synthetic SIC section showing its data-intensive share by year.
fig = px.line(
    sector_df.sort_values(["year","SICSection_synth"]),
    x="year",
    y="total_data_share",
    color="SICSection_synth",
    markers=True,
    title="Synthetic SIC Section — data-intensive share over time",
    labels={"year":"Year","total_data_share":"Data-intensive share (%)","SICSection_synth":"Synthetic SIC Section"}
)
fig.show()

CELL 11 — Step 7 (placeholder): SUT merge later (OECD “complete” step)

In [None]:
# Placeholder only: prints the inputs needed and the planned steps for the
# SUT merge; nothing is read or computed here.
print("""
SUT merge placeholder ready.

When you provide:
- SUT dataset path (s3a/hdfs)
- Columns for: Year, SICSection (or Activity mapped to SICSection), GVA, COMP_EMP (or equivalents)

We will:
1) Read SUT
2) Clean keys to SICSection + Year
3) Merge with sector_df (synthetic SIC section)
4) Compute OECD-style valuation metrics:
   - data_entry_invest = data_entry_share * COMP_EMP * alpha
   - database_invest = ...
   - data_science_invest = ...
   - total_data_invest
   - total_data_invest / GVA (data-as-asset share proxy)
""")