In [1]:
# ==========================================
# CELL 1: PHASE 1 - HEAVY NLP EXTRACTION (ARRAY PACKED)
# ==========================================
import os
import gc
import spacy
import pandas as pd
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# --- CONFIGURATION ---
# Root data directory; get_nlp_path() places all derived output beneath it.
BASE_PATH = "/Users/saurabhkumar/Desktop/OECD_PYSPARK_LOCAL/data/parquet_OECD"
# Source parquet part-file read by the extraction loop.
_SOURCE_PART_FILE = "part-00000-6f2787d8-9f9c-4b9b-9903-fc9d83e3d0c0-c000.snappy.parquet"
PARQUET_SOURCE = os.path.join(BASE_PATH, _SOURCE_PART_FILE)
# Calendar years processed by the pipeline (2020 through 2025 inclusive).
YEARS = list(range(2020, 2026))
# When True, the "already extracted" check is bypassed and every year is re-extracted.
FORCE_RECOMPUTE_NLP = True

def get_nlp_path(year):
    """Return the directory that holds the packed noun-chunk parquet for ``year``.

    The result has the form ``<BASE_PATH>/processed_data/<year>/noun_chunks_packed``.
    """
    year_dir = str(year)
    return os.path.join(BASE_PATH, "processed_data", year_dir, "noun_chunks_packed")

# --- SPARK SETUP (Optimized for 16GB RAM constraints) ---
# Session settings applied one by one to the builder below.
_spark_settings = [
    ("spark.executor.memory", "12g"),
    ("spark.driver.memory", "4g"),
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.sql.execution.arrow.maxRecordsPerBatch", "5000"),  # Prevents OOM crashes
]

_session_builder = SparkSession.builder.appName("OECD_Phase1_NLP_Packed")
for _conf_key, _conf_value in _spark_settings:
    _session_builder = _session_builder.config(_conf_key, _conf_value)

# Reuses an existing session if one is already active in this kernel.
spark = _session_builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
print("✓ Spark Session Created.")

# --- NLP UDF (Outputs Arrays) ---
# Output schema of extract_noun_chunks_packed: one row per document, with the
# noun chunks and their similarity scores stored as two parallel arrays.
noun_schema_packed = (
    StructType()
    .add("doc_JobID", StringType())
    .add("doc_BGTOcc", StringType())
    .add("doc_year", IntegerType())
    .add("doc_month", IntegerType())
    .add("noun_chunks", ArrayType(StringType()))
    .add("sim_scores", ArrayType(DoubleType()))
)

def extract_noun_chunks_packed(iterator):
    """Extract noun chunks and their similarity to the word "data" for each document.

    Intended for ``DataFrame.mapInPandas``: ``iterator`` yields pandas DataFrames
    that must contain the columns ``full_text``, ``job_id``, ``soc_2020``,
    ``doc_year`` and ``doc_month``. For every input batch one DataFrame matching
    ``noun_schema_packed`` is yielded, holding one row per input document with
    two parallel lists:

    * ``noun_chunks`` - chunk text with digits removed, stripped and lower-cased
      (chunks that become empty are dropped);
    * ``sim_scores``  - ``chunk.similarity(nlp("data"))`` for the same chunk.

    The spaCy model is loaded once per call (i.e. once per iterator), not per batch.
    """
    import warnings

    disabled_pipes = ["lemmatizer", "ner"]
    try:
        nlp = spacy.load("en_core_web_lg", disable=disabled_pipes)
    except Exception as load_err:
        # Keep the best-effort fallback, but make it visible: the small model
        # ships without static word vectors, so the similarity scores computed
        # below are not comparable with those of the large model.
        warnings.warn(
            "en_core_web_lg could not be loaded "
            f"({load_err}); falling back to en_core_web_sm - similarity scores will be unreliable."
        )
        nlp = spacy.load("en_core_web_sm", disable=disabled_pipes)
    target = nlp("data")

    out_columns = noun_schema_packed.fieldNames()

    for pdf in iterator:
        rows = []
        texts = pdf["full_text"].fillna("").astype(str).tolist()

        # Meta: JobID, SOC, Year, Month (same order as the rows of `pdf`)
        meta = zip(
            pdf["job_id"].astype(str),
            pdf["soc_2020"].astype(str),
            pdf["doc_year"],
            pdf["doc_month"],
        )

        # nlp.pipe preserves input order, so metadata and docs can be zipped.
        for (job_id, soc, doc_year, doc_month), doc in zip(meta, nlp.pipe(texts, batch_size=50)):
            chunks_list = []
            sims_list = []

            for chunk in doc.noun_chunks:
                if not chunk.has_vector:
                    continue
                cleaned = "".join(c for c in chunk.text if not c.isdigit()).strip()
                if cleaned:
                    chunks_list.append(cleaned.lower())
                    sims_list.append(float(chunk.similarity(target)))

            rows.append({
                'doc_JobID': job_id,
                'doc_BGTOcc': soc,
                'doc_year': doc_year,
                'doc_month': doc_month,
                'noun_chunks': chunks_list,
                'sim_scores': sims_list
            })

        # Fixing the column list keeps the frame aligned with the declared
        # schema for both empty and non-empty batches.
        yield pd.DataFrame(rows, columns=out_columns)

# --- EXECUTION ---
# Run the NLP extraction year by year; a failure in one year is reported and
# does not stop the remaining years.
for year in YEARS:
    out_path = get_nlp_path(year)
    if not FORCE_RECOMPUTE_NLP:
        try:
            already_extracted = spark.read.parquet(out_path).limit(1).count() > 0
        except Exception:
            # Missing or unreadable output: treat as "not extracted yet".
            already_extracted = False
        if already_extracted:
            print(f"[SKIP] Data already extracted for {year}.")
            continue

    try:
        print(f"--- EXTRACTING TEXT FOR {year} ---")
        df_raw = spark.read.parquet(PARQUET_SOURCE) \
            .withColumn("date", F.to_date("date")) \
            .filter(F.year("date") == year) \
            .withColumn("doc_year", F.year("date")) \
            .withColumn("doc_month", F.month("date"))

        # Count once and reuse it for both the emptiness check and the
        # partition sizing (previously rdd.isEmpty() + count() scanned twice).
        n_rows = df_raw.count()
        if n_rows == 0:
            print(f"[SKIP] No source rows found for {year}.")
            continue

        # Roughly 10,000 rows per partition, with a floor of 8 partitions.
        df_raw = df_raw.repartition(max(8, n_rows // 10000))
        chunks = df_raw.mapInPandas(extract_noun_chunks_packed, schema=noun_schema_packed)

        # Output is laid out as one sub-directory per month.
        chunks.write.mode("overwrite").partitionBy("doc_month").parquet(out_path)
        print(f"[DONE] Extracted NLP features for {year}.")
    except Exception as e:
        print(f"Skipping {year} NLP extraction due to error: {e}")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/20 01:41:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/02/20 01:41:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
26/02/20 01:41:53 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


✓ Spark Session Created.
--- EXTRACTING TEXT FOR 2020 ---


                                                                                

[DONE] Extracted NLP features for 2020.
--- EXTRACTING TEXT FOR 2021 ---


                                                                                

[DONE] Extracted NLP features for 2021.
--- EXTRACTING TEXT FOR 2022 ---


                                                                                

[DONE] Extracted NLP features for 2022.
--- EXTRACTING TEXT FOR 2023 ---


                                                                                

[DONE] Extracted NLP features for 2023.
--- EXTRACTING TEXT FOR 2024 ---


                                                                                

[DONE] Extracted NLP features for 2024.
--- EXTRACTING TEXT FOR 2025 ---


[Stage 47:>                                                         (0 + 8) / 8]

[DONE] Extracted NLP features for 2025.


                                                                                

In [2]:
# ==========================================
# CELL 2: PHASE 2 - OECD DICTIONARY (VALID-UNIVERSE & MAX-RF)
# ==========================================
import os
import pyspark.sql.functions as F

# --- FILE PATHS & CONFIGURATION ---
# NOTE(review): BASE_PATH and YEARS repeat the values assigned in the Phase 1 cell.
BASE_PATH = "/Users/saurabhkumar/Desktop/OECD_PYSPARK_LOCAL/data/parquet_OECD"
# CSV inputs read by the sector-mapping and valuation cells.
CENSUS_CSV = "/Users/saurabhkumar/Desktop/OECD_PYSPARK_LOCAL/data/Census.csv"
SUT_CSV = "/Users/saurabhkumar/Desktop/OECD_PYSPARK_LOCAL/data/SUT_TABLE.csv"
# Calendar years processed by the pipeline (2020 through 2025 inclusive).
YEARS = list(range(2020, 2026))

def get_nlp_path(year):
    """Return the directory that holds the packed noun-chunk parquet for ``year``.

    The result has the form ``<BASE_PATH>/processed_data/<year>/noun_chunks_packed``.

    NOTE(review): this repeats the definition in the Phase 1 cell; keep the two
    in sync, or drop one once the notebook is always run top to bottom.
    """
    year_dir = str(year)
    return os.path.join(BASE_PATH, "processed_data", year_dir, "noun_chunks_packed")

# STRICT OECD PARAMETERS:
# Minimum max_relative_share a term needs to enter the dictionary.
REL_SHARE_THRESHOLD = 10.0
# Minimum similarity score for a noun chunk to be kept.
SIM_GROUNDING = 0.50
# Minimum number of distinct dictionary terms for a job to be flagged data-intensive.
DATA_THRESHOLD = 3

# Year selected from the SUT table in the valuation cell.
SUT_YEAR = 2023
ALPHA_LOW = 1.58
# Fallback alpha for a sector code that is absent from ALPHA_MAP.
ALPHA_ECONOMY_AVG = 3.62
ALPHA_MAP = {
    "A": 3.62,
    "B-E": 6.45,
    "F": 6.64,
    "G-I": 2.95,
    "J": 2.97,
    "K": 3.91,
    "L": 3.62,
    "M-N": 2.79,
    "O-Q": 2.07,
    "R-T": 3.06,
    "U": 3.62,
}

# Anchor occupation groups, keyed by group name, as 4-digit SOC code strings.
SOC_GROUPS = {
    "data_entry": "4111 4112 4113 4114 4121 4131 4132 4150".split(),
    "database": "2423 2136".split(),
    "analytics": "2421 2424 2133 2135".split(),
}

# Flat list of every anchor SOC code, in SOC_GROUPS order.
ALL_ANCHOR_SOCS = []
for _group_codes in SOC_GROUPS.values():
    ALL_ANCHOR_SOCS.extend(_group_codes)

print("Building OECD Dictionary (Valid-Universe Denominators & Max-RF)...")


def _anchor_group_shares(chunks, soc_codes, suffix):
    """Return ``(n_jobs, freq_df)`` for the anchor group given by ``soc_codes``.

    ``n_jobs`` is the number of distinct ``doc_JobID`` values in ``chunks`` whose
    ``soc4`` is in ``soc_codes``. ``freq_df`` has one row per noun chunk with
    ``count_<suffix>`` (distinct jobs of the group containing the chunk) and
    ``share_<suffix>`` (that count divided by ``n_jobs``, or by 1 when the
    group is empty).
    """
    group_chunks = chunks.filter(F.col("soc4").isin(soc_codes))
    n_jobs = group_chunks.select("doc_JobID").distinct().count()
    freq_df = group_chunks.groupBy("noun_chunk") \
        .agg(F.countDistinct("doc_JobID").alias(f"count_{suffix}")) \
        .withColumn(f"share_{suffix}", F.col(f"count_{suffix}") / (n_jobs if n_jobs > 0 else 1))
    return n_jobs, freq_df


# Defined before the try so the finally-clause can always release the cache.
valid_chunks = None
try:
    packed_chunks = None
    for year in YEARS:
        year_path = get_nlp_path(year)
        try:
            df_year = spark.read.parquet(year_path)
            if packed_chunks is None:
                packed_chunks = df_year
            else:
                packed_chunks = packed_chunks.unionByName(df_year, allowMissingColumns=True)
        except Exception as read_err:
            # Still best-effort, but report which year is missing from the dictionary.
            print(f"  [Skipping {year}] - Could not read data: {read_err}")

    # METHODOLOGY NOTE: total_jobs_economy is the TRUE size of the economy (all jobs).
    # Used for coverage logging. Also doubles as the emptiness check.
    total_jobs_economy = packed_chunks.count() if packed_chunks is not None else 0
    if total_jobs_economy == 0:
        raise ValueError("No parquet data could be loaded.")

    # REGEX ARMOR: Log jobs with completely broken SOC codes
    missing_soc4 = packed_chunks.filter(F.regexp_extract("doc_BGTOcc", r"(\d{4})", 1) == "").count()

    # EXPLODE, FILTER SIMILARITY, FILTER BROKEN SOCS, AND CACHE
    valid_chunks = packed_chunks.select(
        "doc_JobID", "doc_BGTOcc",
        F.explode(F.arrays_zip("noun_chunks", "sim_scores")).alias("zipped")
    ).select(
        "doc_JobID", "doc_BGTOcc",
        F.regexp_extract("doc_BGTOcc", r"(\d{4})", 1).alias("soc4"),
        F.col("zipped.noun_chunks").alias("noun_chunk"),
        F.col("zipped.sim_scores").alias("sim_data")
    ).filter(
        (F.col("sim_data") >= SIM_GROUNDING) &
        (F.col("soc4") != "")  # Regex Armor
    ).cache()

    # METHODOLOGY NOTE: valid_job_universe is strictly jobs containing NLP signals.
    # Used ONLY for Dictionary RF calculations to prevent anchor dilution.
    valid_job_universe = valid_chunks.select("doc_JobID").distinct().count()

    valid_anchor_jobs = valid_chunks.filter(F.col("soc4").isin(ALL_ANCHOR_SOCS)).select("doc_JobID").distinct().count()

    print(f"\n--- DATA QUALITY & COVERAGE LOG ---")
    print(f"Jobs dropped due to missing/unparseable SOC: {missing_soc4}")
    print(f"Total Jobs in Economy: {total_jobs_economy}")
    print(f"Total Economy Jobs with >= 1 valid data chunk: {valid_job_universe}")
    print(f"Total Anchor Jobs with >= 1 valid data chunk: {valid_anchor_jobs}")
    print(f"NLP Coverage Rate: {round((valid_job_universe/(total_jobs_economy if total_jobs_economy > 0 else 1))*100, 2)}%\n")

    # GLOBAL DOCUMENT FREQUENCY
    global_freq = valid_chunks.groupBy("noun_chunk") \
        .agg(F.countDistinct("doc_JobID").alias("global_count"), F.avg("sim_data").alias("avg_sim")) \
        .withColumn("share_economy", F.col("global_count") / (valid_job_universe if valid_job_universe > 0 else 1))

    # CALCULATE RF PER GROUP
    valid_jobs_entry, entry_freq = _anchor_group_shares(valid_chunks, SOC_GROUPS["data_entry"], "entry")
    valid_jobs_db, db_freq = _anchor_group_shares(valid_chunks, SOC_GROUPS["database"], "db")
    valid_jobs_ana, ana_freq = _anchor_group_shares(valid_chunks, SOC_GROUPS["analytics"], "ana")

    # JOIN AND TAKE MAXIMUM RELATIVE SHARE
    # Terms absent from a group get count/share 0 through fillna.
    dictionary_df = global_freq \
        .join(entry_freq, "noun_chunk", "left").fillna(0) \
        .join(db_freq, "noun_chunk", "left").fillna(0) \
        .join(ana_freq, "noun_chunk", "left").fillna(0)

    dictionary_df = dictionary_df \
        .withColumn("rf_entry", F.col("share_entry") / F.col("share_economy")) \
        .withColumn("rf_db", F.col("share_db") / F.col("share_economy")) \
        .withColumn("rf_ana", F.col("share_ana") / F.col("share_economy")) \
        .withColumn("max_relative_share", F.greatest("rf_entry", "rf_db", "rf_ana"))

    # FINAL FILTER
    oecd_vocabulary = dictionary_df.filter(
        (F.col("max_relative_share") >= REL_SHARE_THRESHOLD) &
        (F.col("global_count") >= 5) &
        (F.col("avg_sim") >= SIM_GROUNDING)
    ).select("noun_chunk", F.col("max_relative_share").alias("relative_share"), "avg_sim", "global_count").cache()

    # count() materialises the cached vocabulary before valid_chunks is released.
    dict_size = oecd_vocabulary.count()
    print(f"✓ OECD Dictionary Built. Identified {dict_size} stable data-work terms.")
    if dict_size == 0:
        print("⚠ Dictionary is empty: no job can be classified as data-intensive downstream. "
              "Check the coverage log above.")

except Exception as e:
    print(f"❌ ERROR building dictionary: {e}")
    oecd_vocabulary = None
finally:
    # Release the cached intermediate on the failure path as well.
    if valid_chunks is not None:
        valid_chunks.unpersist()

Building OECD Dictionary (Valid-Universe Denominators & Max-RF)...


                                                                                


--- DATA QUALITY & COVERAGE LOG ---
Jobs dropped due to missing/unparseable SOC: 0
Total Jobs in Economy: 10000
Total Economy Jobs with >= 1 valid data chunk: 1
Total Anchor Jobs with >= 1 valid data chunk: 1
NLP Coverage Rate: 0.01%



                                                                                

✓ OECD Dictionary Built. Identified 0 stable data-work terms.


In [3]:
# ==========================================
# CELL 3: PHASE 3 - JOB CLASSIFICATION
# ==========================================
print("--- Phase 3: Classifying Occupations ---")

def run_classification_for_year(year):
    """Summarise data-intensive job counts and shares per 4-digit SOC for ``year``.

    A job is flagged data-intensive when it contains at least ``DATA_THRESHOLD``
    distinct noun chunks that are present in ``oecd_vocabulary`` with a
    similarity score of at least ``SIM_GROUNDING``. The three anchor flags
    additionally require the job's ``soc4`` to be in the matching ``SOC_GROUPS``
    entry.

    Returns:
        A Spark DataFrame with one row per ``soc4`` holding ``total_jobs``, the
        four ``*_jobs`` counts, ``year`` and the four ``*_share`` percentages,
        or ``None`` when the vocabulary is unavailable or the year's parquet
        cannot be read (a message is printed in both cases).
    """
    # Check the vocabulary first: without it there is nothing to classify, so
    # the parquet read is skipped and the reason is reported instead of a silent None.
    vocabulary = globals().get("oecd_vocabulary")
    if vocabulary is None:
        print(f"  [Skipping {year}] - OECD vocabulary is unavailable (dictionary build failed or was not run).")
        return None

    in_path = get_nlp_path(year)
    try:
        packed_chunks = spark.read.parquet(in_path)
    except Exception as e:
        print(f"  [Skipping {year}] - Could not read data: {e}")
        return None

    # Explode and filter
    year_chunks = packed_chunks.select(
        "doc_JobID", "doc_BGTOcc",
        F.explode(F.arrays_zip("noun_chunks", "sim_scores")).alias("zipped")
    ).select(
        "doc_JobID", "doc_BGTOcc",
        F.col("zipped.noun_chunks").alias("noun_chunk"),
        F.col("zipped.sim_scores").alias("sim_data")
    ).filter(F.col("sim_data") >= SIM_GROUNDING)

    # Tag jobs based on dictionary presence
    tagged_chunks = year_chunks.join(F.broadcast(vocabulary), "noun_chunk", "inner")

    job_scores = tagged_chunks.groupBy("doc_JobID", "doc_BGTOcc") \
        .agg(F.countDistinct("noun_chunk").alias("unique_data_terms")) \
        .withColumnRenamed("doc_BGTOcc", "soc_2020")

    # Regex Armor Applied Here
    job_scores = job_scores.withColumn("soc4", F.regexp_extract("soc_2020", r"(\d{4})", 1)).filter(F.col("soc4") != "")
    is_intensive = (F.col("unique_data_terms") >= DATA_THRESHOLD)

    classified = job_scores \
        .withColumn("data_entry", (F.col("soc4").isin(SOC_GROUPS["data_entry"]) & is_intensive).cast("int")) \
        .withColumn("database", (F.col("soc4").isin(SOC_GROUPS["database"]) & is_intensive).cast("int")) \
        .withColumn("data_analytics", (F.col("soc4").isin(SOC_GROUPS["analytics"]) & is_intensive).cast("int")) \
        .withColumn("any_data_intensive", is_intensive.cast("int"))

    # METHODOLOGY NOTE: We use ALL jobs here (including 0 valid chunks) because
    # the Phase 4 Sector mapping requires the entire economy to match the SUT Tables.
    all_jobs_year = packed_chunks.select("doc_JobID", F.col("doc_BGTOcc").alias("soc_2020")).distinct() \
        .withColumn("soc4", F.regexp_extract("soc_2020", r"(\d{4})", 1)).filter(F.col("soc4") != "")

    # Jobs without any dictionary hit have no row in `classified`; fillna turns
    # their flags into 0 so they count towards total_jobs only.
    merged = all_jobs_year.join(classified, ["doc_JobID", "soc4", "soc_2020"], "left").fillna(0)

    occ_sum = merged.groupBy("soc4").agg(
        F.count("*").alias("total_jobs"),
        F.sum("data_entry").alias("data_entry_jobs"),
        F.sum("database").alias("database_jobs"),
        F.sum("data_analytics").alias("data_analytics_jobs"),
        F.sum("any_data_intensive").alias("any_data_intensive_jobs")
    ).withColumn("year", F.lit(year))

    occ_sum = occ_sum \
        .withColumn("total_data_share", 100 * F.col("any_data_intensive_jobs") / F.col("total_jobs")) \
        .withColumn("data_entry_share", 100 * F.col("data_entry_jobs") / F.col("total_jobs")) \
        .withColumn("database_share", 100 * F.col("database_jobs") / F.col("total_jobs")) \
        .withColumn("data_analytics_share", 100 * F.col("data_analytics_jobs") / F.col("total_jobs"))

    return occ_sum

# One summary per year; the value is None for a year that could not be classified.
occupation_summaries = {y: run_classification_for_year(y) for y in YEARS}

# Report skipped years explicitly, and only claim success when at least one
# year actually produced a summary.
skipped_years = [y for y, summary in occupation_summaries.items() if summary is None]
if skipped_years:
    print(f"⚠ No classification produced for: {skipped_years}")
if len(skipped_years) < len(occupation_summaries):
    print("✓ Occupations Classified.")

--- Phase 3: Classifying Occupations ---
✓ Occupations Classified.


In [4]:
# ==========================================
# CELL 4: PHASE 4 - SECTOR MAPPING
# ==========================================
print("--- Phase 4: Applying Census Weights ---")


def _quote_ident(name):
    """Backtick-quote a column name for Spark SQL, doubling embedded backticks."""
    return "`" + name.replace("`", "``") + "`"


def _quote_literal(text):
    """Single-quote a string for Spark SQL, escaping backslashes and single quotes."""
    return "'" + text.replace("\\", "\\\\").replace("'", "\\'") + "'"


df_census = spark.read.option("header", True).csv(CENSUS_CSV)
# The first CSV column is expected to begin with the 4-digit SOC code.
desc_col = df_census.columns[0]
# Quote the header so a name containing dots or spaces resolves as one column.
# Rows without a leading SOC code are dropped rather than pooled under "".
df_census = df_census \
    .withColumn("soc4", F.regexp_extract(F.col(_quote_ident(desc_col)), r"^(\d{4})", 1)) \
    .filter(F.col("soc4") != "")

sic_cols = [c for c in df_census.columns if c != desc_col and c != "soc4"]
if not sic_cols:
    raise ValueError("Census CSV has no industry columns to unpivot.")

# Unpivot wide -> long with stack(); header names are escaped because they are
# spliced into SQL both as string literals and as identifiers.
stack_expr = f"stack({len(sic_cols)}, " \
    + ", ".join(f"{_quote_literal(c)}, {_quote_ident(c)}" for c in sic_cols) \
    + ") as (sic_col, count_raw)"
long_df = df_census.select("soc4", F.expr(stack_expr))

# sic2 = leading two digits of the column header; n = count with thousands commas removed.
long_df = long_df.withColumn("sic2", F.regexp_extract("sic_col", r"^(\d{2})", 1).cast("int")) \
                 .withColumn("n", F.regexp_replace("count_raw", ",", "").cast("long")).filter(F.col("n") > 0)

# Map the 2-digit code onto the sector letters used as ALPHA_MAP keys;
# codes outside every range are dropped.
long_df = long_df.withColumn("SIC_Code", F.expr("""
    CASE WHEN sic2 BETWEEN 1 AND 3 THEN 'A'
         WHEN sic2 BETWEEN 5 AND 39 THEN 'B-E'
         WHEN sic2 BETWEEN 41 AND 43 THEN 'F'
         WHEN sic2 BETWEEN 45 AND 56 THEN 'G-I'
         WHEN sic2 BETWEEN 58 AND 63 THEN 'J'
         WHEN sic2 BETWEEN 64 AND 66 THEN 'K'
         WHEN sic2 = 68 THEN 'L'
         WHEN sic2 BETWEEN 69 AND 82 THEN 'M-N'
         WHEN sic2 BETWEEN 84 AND 88 THEN 'O-Q'
         WHEN sic2 BETWEEN 90 AND 98 THEN 'R-T'
         WHEN sic2 = 99 THEN 'U'
         ELSE NULL END
""")).filter(F.col("SIC_Code").isNotNull())

# w_soc4_SIC = share of each SOC's census count that falls in each sector.
totals = long_df.groupBy("soc4").agg(F.sum("n").alias("total_soc"))
weights_df = long_df.groupBy("soc4", "SIC_Code").agg(F.sum("n").alias("n_sic")) \
                 .join(totals, "soc4").withColumn("w_soc4_SIC", F.col("n_sic") / F.col("total_soc")).cache()

sector_summaries = []

# (occupation-level count column, temporary weighted alias) in output order.
_weighted_cols = [
    ("total_jobs", "w_total"),
    ("data_entry_jobs", "w_entry"),
    ("database_jobs", "w_db"),
    ("data_analytics_jobs", "w_ana"),
    ("any_data_intensive_jobs", "w_any"),
]
# (share column, numerator column); every share is a percentage of total_jobs.
_share_cols = [
    ("total_data_share", "any_data_intensive_jobs"),
    ("data_entry_share", "data_entry_jobs"),
    ("database_share", "database_jobs"),
    ("data_analytics_share", "data_analytics_jobs"),
]

for year, occ_df in occupation_summaries.items():
    # Years that could not be classified are stored as None.
    if occ_df is None:
        continue

    # SOC codes without census weights keep a null SIC_Code and a weight of 0.
    occ_weighted = occ_df.join(weights_df, "soc4", "left").fillna(0, subset=["w_soc4_SIC"])

    weighted = occ_weighted.select(
        "SIC_Code",
        *[(F.col(src) * F.col("w_soc4_SIC")).alias(tmp) for src, tmp in _weighted_cols]
    )

    # Sum the weighted counts per sector, restoring the original column names.
    sec_sum = weighted.groupBy("SIC_Code").agg(
        *[F.sum(tmp).alias(src) for src, tmp in _weighted_cols]
    )

    for share_name, numerator in _share_cols:
        sec_sum = sec_sum.withColumn(
            share_name,
            F.when(F.col("total_jobs") > 0, 100 * F.col(numerator) / F.col("total_jobs")).otherwise(0.0)
        )
    sec_sum = sec_sum.withColumn("year", F.lit(year))

    sector_summaries.append(sec_sum)

print("✓ Sectors Mapped.")

--- Phase 4: Applying Census Weights ---
✓ Sectors Mapped.


In [5]:
# ==========================================
# CELL 5: PHASE 5 - ECONOMIC VALUATION
# ==========================================
print("--- Phase 5: Calculating Economic Valuation ---")

if len(sector_summaries) == 0:
    raise ValueError("No sector data generated. Check Cell 3 output.")

# Stack the per-year sector summaries into one frame.
full_sector_df = None
for yearly_sector_df in sector_summaries:
    if full_sector_df is None:
        full_sector_df = yearly_sector_df
    else:
        full_sector_df = full_sector_df.unionByName(yearly_sector_df)

# SUT rows for SUT_YEAR only, with the sector code normalised (trimmed, upper-cased).
sut_all = spark.read.option("header", True).csv(SUT_CSV)
sut_df = sut_all.filter(F.col("year") == SUT_YEAR).select(
    F.upper(F.trim("SIC_Code")).alias("SIC_Code"),
    F.col("GVA_basic_prices").cast("double"),
    F.col("COMP_EMP").cast("double"),
)

# Literal map column: sector code -> alpha.
alpha_pairs = []
for sic_code, alpha_value in ALPHA_MAP.items():
    alpha_pairs.append(F.lit(sic_code))
    alpha_pairs.append(F.lit(alpha_value))
alpha_expr = F.create_map(alpha_pairs)

# The inner join keeps only sectors present in the SUT table; a sector code
# missing from ALPHA_MAP falls back to ALPHA_ECONOMY_AVG.
valued = (
    full_sector_df
    .withColumn("SIC_Code", F.upper(F.trim("SIC_Code")))
    .join(sut_df, "SIC_Code", "inner")
    .withColumn("alpha_low", F.lit(ALPHA_LOW))
    .withColumn("alpha_sector", F.coalesce(alpha_expr[F.col("SIC_Code")], F.lit(ALPHA_ECONOMY_AVG)))
)

# investment = alpha * COMP_EMP * (share / 100)
for inv_col, alpha_col, share_col in [
    ("inv_entry", "alpha_sector", "data_entry_share"),
    ("inv_db", "alpha_sector", "database_share"),
    ("inv_ana", "alpha_sector", "data_analytics_share"),
    ("inv_low_tot", "alpha_low", "total_data_share"),
]:
    valued = valued.withColumn(inv_col, F.col(alpha_col) * F.col("COMP_EMP") * (F.col(share_col)/100))

valued = valued.withColumn("total_investment_sector", F.col("inv_entry") + F.col("inv_db") + F.col("inv_ana"))

# Investment as a percentage of GVA; 0.0 where GVA is not positive.
for pct_col, inv_col in [
    ("inv_share_gva_sector", "total_investment_sector"),
    ("inv_share_gva_low", "inv_low_tot"),
]:
    valued = valued.withColumn(
        pct_col,
        F.when(F.col("GVA_basic_prices")>0, (F.col(inv_col)/F.col("GVA_basic_prices"))*100).otherwise(0.0)
    )

valued = valued.cache()
print("✓ Valuation Complete.")

--- Phase 5: Calculating Economic Valuation ---
✓ Valuation Complete.


In [6]:
# ==========================================
# CELL 6: PHASE 6 - VISUALIZATIONS
# ==========================================
import pandas as pd
import plotly.express as px

print("--- Phase 6: Generating Visualizations ---")
pdf = valued.toPandas().sort_values(["year", "SIC_Code"])

if not pdf.empty:
    # Economy-wide totals per year, then investment as a percentage of total GVA.
    summed_cols = ["inv_low_tot", "total_investment_sector", "GVA_basic_prices"]
    econ = pdf.groupby("year")[summed_cols].sum().reset_index()
    econ["Low Scenario %"] = (econ["inv_low_tot"] / econ["GVA_basic_prices"]) * 100
    econ["Sector Scenario %"] = (econ["total_investment_sector"] / econ["GVA_basic_prices"]) * 100

    fig1 = px.line(
        econ, x="year", y=["Low Scenario %", "Sector Scenario %"],
        title="1. Economy-wide Data Investment as % of GVA", markers=True,
    )
    fig1.show()

    fig2 = px.line(
        econ, x="year", y=["inv_low_tot", "total_investment_sector"],
        title="2. Absolute Data Investment Value", markers=True,
    )
    fig2.show()

    fig3 = px.line(
        pdf, x="year", y="inv_share_gva_sector", color="SIC_Code",
        title="3. Sector Data Investment % of GVA", markers=True,
    )
    fig3.show()
else:
    print("WARNING: Valuation DataFrame is empty. Cannot plot.")

# Collect the occupation summaries of every year that was classified.
occ_frames = []
for occ_summary in occupation_summaries.values():
    if occ_summary is not None:
        occ_frames.append(occ_summary.toPandas())

if occ_frames:
    all_occ_df = pd.concat(occ_frames, ignore_index=True)
    if not all_occ_df.empty:
        top_n = 20
        # Highest total_data_share first within each year, then keep top_n per year.
        ranked = all_occ_df.sort_values(["year","total_data_share"], ascending=[True,False])
        top_each_year = ranked.groupby("year").head(top_n)

        category_labels = {
            "data_entry_share": "Data Entry",
            "database_share": "Database",
            "data_analytics_share": "Data Analytics",
        }
        melted = top_each_year.melt(
            id_vars=["year","soc4"],
            value_vars=["data_entry_share", "database_share", "data_analytics_share"],
            var_name="Category", value_name="Share"
        )
        melted["Category"] = melted["Category"].map(category_labels)

        fig4 = px.bar(
            melted, x="soc4", y="Share", color="Category",
            facet_col="year", facet_col_wrap=2,
            title=f"4. Top {top_n} Occupations by Data Intensity", height=900, barmode="stack",
        )
        fig4.show()

--- Phase 6: Generating Visualizations ---


                                                                                

In [7]:
# ==========================================
# CELL 7: PHASE 7 - AUDIT LOG EXPORT
# ==========================================
import datetime

LOG_FILE = "pipeline_audit_log_FINAL.txt"

def dump_df(df, name, f, limit=50):
    """Write a titled, plain-text preview of ``df`` to the open text file ``f``.

    Args:
        df: Object exposing ``columns`` and ``limit(n).toPandas()`` (a Spark
            DataFrame in this notebook), or ``None``.
        name: Title written in the section banner.
        f: Writable text file object.
        limit: Maximum number of rows converted and written.

    A ``None`` frame is recorded as ``[MISSING OR EMPTY]``. Any exception raised
    while rendering or writing the rows is written into the log, not raised.
    """
    banner = "=" * 50
    f.write(f"\n{banner}\nDATASET: {name}\n{banner}\n")

    if df is None:
        f.write("[MISSING OR EMPTY]\n")
        return

    f.write(f"Columns: {df.columns}\n")
    try:
        preview_text = df.limit(limit).toPandas().to_string()
        f.write(preview_text)
        f.write("\n")
    except Exception as e:
        f.write(f"Error dumping: {e}\n")

# Write the audit log. Upstream results are looked up through globals() so a
# failed earlier cell yields a "[MISSING OR EMPTY]" section instead of a
# NameError that aborts the audit. utf-8 is set explicitly because the dumped
# noun chunks may contain non-ASCII text.
with open(LOG_FILE, "w", encoding="utf-8") as f:
    f.write(f"OECD PIPELINE FINAL AUDIT - {datetime.datetime.now()}\n")

    vocabulary = globals().get("oecd_vocabulary")
    if vocabulary is not None:
        top_vocab = vocabulary.orderBy(F.col("relative_share").desc())

        # Print Top 30 to console for immediate reviewer sanity-check
        print("\n=== TOP 30 OECD DICTIONARY TERMS ===")
        try:
            print(top_vocab.limit(30).toPandas().to_string(index=False))
        except Exception as preview_err:
            # An empty dictionary prints "Empty DataFrame" above without raising;
            # reaching this branch means the preview itself failed.
            print(f"Could not preview dictionary: {preview_err}")
        print("====================================\n")

        dump_df(top_vocab, "Learned OECD Vocabulary", f, limit=50)

    dump_df(globals().get("weights_df"), "Census Weights", f)

    sample_year = 2024
    f.write(f"\n\n>>> YEAR {sample_year} SNAPSHOTS <<<\n")

    yearly_occ = globals().get("occupation_summaries") or {}
    if yearly_occ.get(sample_year) is not None:
        dump_df(yearly_occ[sample_year], f"Occupation Summary {sample_year}", f)

    # First sector summary that has at least one row for the sample year
    # (each candidate triggers a Spark count).
    yearly_sectors = globals().get("sector_summaries") or []
    sec_2024 = next((sec for sec in yearly_sectors if sec.filter(F.col("year") == sample_year).count() > 0), None)
    dump_df(sec_2024, f"Sector Summary {sample_year}", f)
    dump_df(globals().get("valued"), "Final Valuation Table", f)

print(f"Data successfully dumped to: {LOG_FILE}")


=== TOP 30 OECD DICTIONARY TERMS ===
Empty DataFrame
Columns: [noun_chunk, relative_share, avg_sim, global_count]
Index: []





Data successfully dumped to: pipeline_audit_log_FINAL.txt


                                                                                

In [8]:
#export as a python script for reproducibility and audit purposes

!jupyter nbconvert --to script Test_1.2.ipynb --output OECD

[NbConvertApp] Converting notebook Test_1.2.ipynb to script
[NbConvertApp] Writing 22792 bytes to OECD.py
