In [1]:
# ==========================================
# CELL 1: PHASE 1 - HEAVY NLP EXTRACTION (ARRAY PACKED & SAMPLED)
# ==========================================
import os
import gc
import spacy
import pandas as pd
import logging
from typing import Optional, Literal, Dict
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# --- LOGGER SETUP ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- CONFIGURATION ---
BASE_PATH = "/Users/saurabhkumar/Desktop/OECD_PYSPARK_LOCAL/data/parquet_OECD"
PARQUET_SOURCE = os.path.join(BASE_PATH, "part-00000-6f2787d8-9f9c-4b9b-9903-fc9d83e3d0c0-c000.snappy.parquet")
YEARS = [2020, 2021, 2022, 2023, 2024, 2025]
FORCE_RECOMPUTE_NLP = False 

# 1.0 = All 60 Million records. 0.1 = Random 10% sample. 0.01 = Random 1% sample.
SAMPLE_FRACTION = 1.0  

def get_nlp_path(year):
    return os.path.join(BASE_PATH, "processed_data", str(year), "noun_chunks_packed")

# --- RDSA SPARK SESSION BUILDER ---
def create_spark_session(
    app_name: Optional[str] = None,
    size: Optional[Literal["small", "medium", "large", "extra-large"]] = None,
    extra_configs: Optional[Dict[str, str]] = None,
) -> SparkSession:
    try:
        if size:
            size = size.lower()
            valid_sizes = ["small", "medium", "large", "extra-large"]
            if size not in valid_sizes:
                msg = f"Invalid '{size=}'. If specified must be one of {valid_sizes}."
                raise ValueError(msg)

        logger.info(
            (f"Creating a '{size}' Spark session..." if size else "Creating a basic Spark session...")
        )

        if app_name:
            builder = SparkSession.builder.appName(f"{app_name}")
        else:
            builder = SparkSession.builder

        # fmt: off
        if size == "small":
            builder = (
                builder.config("spark.executor.memory", "1g")
                .config("spark.executor.cores", 1)
                .config("spark.dynamicAllocation.maxExecutors", 3)
                .config("spark.sql.shuffle.partitions", 12)
            )
        elif size == "medium":
            builder = (
                builder.config("spark.executor.memory", "6g")
                .config("spark.executor.cores", 3)
                .config("spark.dynamicAllocation.maxExecutors", 3)
                .config("spark.sql.shuffle.partitions", 18)
            )
        elif size == "large":
            builder = (
                builder.config("spark.executor.memory", "10g")
                .config("spark.yarn.executor.memoryOverhead", "1g")
                .config("spark.executor.cores", 5)
                .config("spark.dynamicAllocation.maxExecutors", 5)
                .config("spark.sql.shuffle.partitions", 200)
            )
        elif size == "extra-large":
            builder = (
                builder.config("spark.executor.memory", "20g")
                .config("spark.yarn.executor.memoryOverhead", "2g")
                .config("spark.executor.cores", 5)
                .config("spark.dynamicAllocation.maxExecutors", 12)
                .config("spark.sql.shuffle.partitions", 240)
            )

        # Common configurations for all sizes
        builder = (
            builder.config("spark.dynamicAllocation.enabled", "true")
             .config("spark.dynamicAllocation.shuffleTracking.enabled", "true")
             .config("spark.sql.adaptive.enabled", "true")
             .config("spark.ui.showConsoleProgress", "false")
        ).enableHiveSupport()
        # fmt: on

        # Apply extra configurations
        if extra_configs:
            for key, value in extra_configs.items():
                builder = builder.config(key, value)

        logger.info("Spark session created successfully!")
        return builder.getOrCreate()
    except Exception as e:
        logger.error(f"An error occurred while creating the Spark session: {e}")
        raise

# --- INITIALIZE EXTRA-LARGE CLUSTER ---
spark = create_spark_session(
    app_name="OECD_Phase1_NLP_Packed",
    size="extra-large", 
    extra_configs={
        "spark.sql.execution.arrow.pyspark.enabled": "true",
        "spark.sql.execution.arrow.maxRecordsPerBatch": "5000" # Our vital memory safeguard
    }
)
spark.sparkContext.setLogLevel("ERROR")

# --- NLP UDF (Outputs Arrays) ---
noun_schema_packed = StructType([
    StructField("doc_JobID", StringType()),
    StructField("doc_BGTOcc", StringType()), 
    StructField("doc_year", IntegerType()),
    StructField("doc_month", IntegerType()),
    StructField("noun_chunks", ArrayType(StringType())),
    StructField("sim_scores", ArrayType(DoubleType()))
])

def extract_noun_chunks_packed(iterator):
    try: nlp = spacy.load("en_core_web_lg", disable=["lemmatizer", "ner"])
    except: nlp = spacy.load("en_core_web_sm")
    target = nlp("data")

    for pdf in iterator:
        rows = []
        texts = pdf["full_text"].fillna("").astype(str).tolist()

        meta = list(zip(
            pdf["job_id"].astype(str), 
            pdf["soc_2020"].astype(str),
            pdf["doc_year"],
            pdf["doc_month"]
        ))

        for i, doc in enumerate(nlp.pipe(texts, batch_size=50)):
            chunks_list = []
            sims_list = []

            for chunk in doc.noun_chunks:
                if chunk.has_vector:
                    cleaned = "".join(c for c in chunk.text if not c.isdigit()).strip()
                    if cleaned:
                        chunks_list.append(cleaned.lower())
                        sims_list.append(float(chunk.similarity(target)))

            rows.append({
                'doc_JobID': meta[i][0], 
                'doc_BGTOcc': meta[i][1],
                'doc_year': meta[i][2],
                'doc_month': meta[i][3],
                'noun_chunks': chunks_list, 
                'sim_scores': sims_list
            })
        yield pd.DataFrame(rows) if rows else pd.DataFrame(columns=noun_schema_packed.fieldNames())

# --- EXECUTION ---
for year in YEARS:
    out_path = get_nlp_path(year)
    if not FORCE_RECOMPUTE_NLP:
        try:
            if spark.read.parquet(out_path).limit(1).count() > 0:
                print(f"[SKIP] Data already extracted for {year}.")
                continue
        except: pass

    try:
        print(f"\n--- EXTRACTING TEXT FOR {year} ---")
        df_raw = spark.read.parquet(PARQUET_SOURCE) \
            .withColumn("date", F.to_date("date")) \
            .filter(F.year("date") == year) \
            .withColumn("doc_year", F.year("date")) \
            .withColumn("doc_month", F.month("date"))

        # --- SAMPLING LOGIC & LOGGING APPLIED ---
        if SAMPLE_FRACTION < 1.0:
            df_raw = df_raw.sample(False, SAMPLE_FRACTION, seed=42)
            
        advert_count = df_raw.count()
        print(f"  -> Processing {advert_count} job adverts (Sample Fraction: {SAMPLE_FRACTION})")

        if advert_count == 0: 
            print(f"  -> Skipping {year}: No records found after sampling.")
            continue

        # Adjust partitions to take full advantage of the 240 partition limit of 'extra-large'
        df_raw = df_raw.repartition(max(240, int(advert_count/10000)))
        
        chunks = df_raw.mapInPandas(extract_noun_chunks_packed, schema=noun_schema_packed)

        chunks.write.mode("overwrite").partitionBy("doc_month").parquet(out_path)
        print(f"  [DONE] Extracted NLP features for {year}.")
        
    except Exception as e:
        print(f"  [ERROR] Skipping {year} NLP extraction due to error: {e}")

INFO:__main__:Creating a 'extra-large' Spark session...
INFO:__main__:Spark session created successfully!
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/02/22 21:38:31 WARN Utils: Your hostname, Saurabhs-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 192.168.5.108 instead (on interface en0)
26/02/22 21:38:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
26/02/22 21:38:32 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'spark.executor.memoryOverhead' instead.
26/02/22 21:38:32 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprecated as of Spark 2.3 and may be removed in the future. Please use the new key 'spark.executor.memoryOverhead' instead.
26/02/22 21:38:32 WARN SparkConf: The configuration key 'spark.yarn.executor.memoryOverhead' has been deprec

[SKIP] Data already extracted for 2020.
[SKIP] Data already extracted for 2021.
[SKIP] Data already extracted for 2022.
[SKIP] Data already extracted for 2023.
[SKIP] Data already extracted for 2024.
[SKIP] Data already extracted for 2025.


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 51601)
Traceback (most recent call last):
  File "/Users/saurabhkumar/miniforge3/envs/spark_env/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/saurabhkumar/miniforge3/envs/spark_env/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/Users/saurabhkumar/miniforge3/envs/spark_env/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/saurabhkumar/miniforge3/envs/spark_env/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/Users/saurabhkumar/miniforge3/envs/spark_env/lib/python3.11/site-packages/pyspark/accumulators.py", line 303, in handle
    poll(accum_updates)
  File "/Users/saurabhkumar/miniforge3/envs/spark_env/li

In [None]:
# ==========================================
# CELL 2: PHASE 2 - OECD DICTIONARY (VALID-UNIVERSE & MAX-RF)
# ==========================================
import os
import pyspark.sql.functions as F

# --- FILE PATHS & CONFIGURATION ---
BASE_PATH = "/Users/saurabhkumar/Desktop/OECD_PYSPARK_LOCAL/data/parquet_OECD"
CENSUS_CSV = "/Users/saurabhkumar/Desktop/OECD_PYSPARK_LOCAL/data/Census.csv"
SUT_CSV = "/Users/saurabhkumar/Desktop/OECD_PYSPARK_LOCAL/data/SUT_TABLE.csv"
YEARS = [2020, 2021, 2022, 2023, 2024, 2025]

def get_nlp_path(year):
    return os.path.join(BASE_PATH, "processed_data", str(year), "noun_chunks_packed")

# STRICT OECD PARAMETERS:
REL_SHARE_THRESHOLD = 10.0  
SIM_GROUNDING = 0.35       
DATA_THRESHOLD = 3         

SUT_YEAR = 2023
ALPHA_LOW = 1.58
ALPHA_ECONOMY_AVG = 3.62
ALPHA_MAP = {
    "A": 3.62, "B-E": 6.45, "F": 6.64, "G-I": 2.95, "J": 2.97,
    "K": 3.91, "L": 3.62, "M-N": 2.79, "O-Q": 2.07, "R-T": 3.06, "U": 3.62
}

# THE FIX: Updated to strictly pure Data Science, Database, and Data Entry SOC 2020 codes
SOC_GROUPS = {
    "data_entry": ["4152"], 
    "database":   ["2134", "3133"],
    "analytics":  ["3544", "2433"]
}
ALL_ANCHOR_SOCS = [item for sublist in SOC_GROUPS.values() for item in sublist]

print("Building OECD Dictionary (Valid-Universe Denominators & Max-RF)...")
try:
    packed_chunks = None
    for year in YEARS:
        year_path = get_nlp_path(year)
        try:
            df_year = spark.read.parquet(year_path)
            if packed_chunks is None: packed_chunks = df_year
            else: packed_chunks = packed_chunks.unionByName(df_year, allowMissingColumns=True)
        except: pass

    if packed_chunks is None or packed_chunks.rdd.isEmpty():
        raise ValueError("No parquet data could be loaded.")

    # METHODOLOGY NOTE: total_jobs_economy is the TRUE size of the economy (all jobs). 
    # Used for coverage logging.
    total_jobs_economy = packed_chunks.count()
    
    # REGEX ARMOR: Log jobs with completely broken SOC codes
    missing_soc4 = packed_chunks.filter(F.regexp_extract("doc_BGTOcc", r"(\d{4})", 1) == "").count()

    # EXPLODE, FILTER SIMILARITY, FILTER BROKEN SOCS, AND CACHE
    valid_chunks = packed_chunks.select(
        "doc_JobID", "doc_BGTOcc",
        F.explode(F.arrays_zip("noun_chunks", "sim_scores")).alias("zipped")
    ).select(
        "doc_JobID", "doc_BGTOcc",
        F.regexp_extract("doc_BGTOcc", r"(\d{4})", 1).alias("soc4"),
        F.col("zipped.noun_chunks").alias("noun_chunk"),
        F.col("zipped.sim_scores").alias("sim_data")
    ).filter(
        (F.col("sim_data") >= SIM_GROUNDING) & 
        (F.col("soc4") != "")  # Regex Armor
    ).cache()

    # METHODOLOGY NOTE: valid_job_universe is strictly jobs containing NLP signals.
    # Used ONLY for Dictionary RF calculations to prevent anchor dilution.
    valid_job_universe = valid_chunks.select("doc_JobID").distinct().count()
    
    valid_anchor_jobs = valid_chunks.filter(F.col("soc4").isin(ALL_ANCHOR_SOCS)).select("doc_JobID").distinct().count()

    print(f"\n--- DATA QUALITY & COVERAGE LOG ---")
    print(f"Jobs dropped due to missing/unparseable SOC: {missing_soc4}")
    print(f"Total Jobs in Economy: {total_jobs_economy}")
    print(f"Total Economy Jobs with >= 1 valid data chunk: {valid_job_universe}")
    print(f"Total Anchor Jobs with >= 1 valid data chunk: {valid_anchor_jobs}")
    print(f"NLP Coverage Rate: {round((valid_job_universe/(total_jobs_economy if total_jobs_economy > 0 else 1))*100, 2)}%\n")

    valid_jobs_entry = valid_chunks.filter(F.col("soc4").isin(SOC_GROUPS["data_entry"])).select("doc_JobID").distinct().count()
    valid_jobs_db = valid_chunks.filter(F.col("soc4").isin(SOC_GROUPS["database"])).select("doc_JobID").distinct().count()
    valid_jobs_ana = valid_chunks.filter(F.col("soc4").isin(SOC_GROUPS["analytics"])).select("doc_JobID").distinct().count()

    # GLOBAL DOCUMENT FREQUENCY
    global_freq = valid_chunks.groupBy("noun_chunk") \
        .agg(F.countDistinct("doc_JobID").alias("global_count"), F.avg("sim_data").alias("avg_sim")) \
        .withColumn("share_economy", F.col("global_count") / (valid_job_universe if valid_job_universe > 0 else 1))

    # CALCULATE RF PER GROUP 
    entry_freq = valid_chunks.filter(F.col("soc4").isin(SOC_GROUPS["data_entry"])) \
        .groupBy("noun_chunk").agg(F.countDistinct("doc_JobID").alias("count_entry")) \
        .withColumn("share_entry", F.col("count_entry") / (valid_jobs_entry if valid_jobs_entry > 0 else 1))
    
    db_freq = valid_chunks.filter(F.col("soc4").isin(SOC_GROUPS["database"])) \
        .groupBy("noun_chunk").agg(F.countDistinct("doc_JobID").alias("count_db")) \
        .withColumn("share_db", F.col("count_db") / (valid_jobs_db if valid_jobs_db > 0 else 1))

    ana_freq = valid_chunks.filter(F.col("soc4").isin(SOC_GROUPS["analytics"])) \
        .groupBy("noun_chunk").agg(F.countDistinct("doc_JobID").alias("count_ana")) \
        .withColumn("share_ana", F.col("count_ana") / (valid_jobs_ana if valid_jobs_ana > 0 else 1))

    # JOIN AND TAKE MAXIMUM RELATIVE SHARE
    dictionary_df = global_freq \
        .join(entry_freq, "noun_chunk", "left").fillna(0) \
        .join(db_freq, "noun_chunk", "left").fillna(0) \
        .join(ana_freq, "noun_chunk", "left").fillna(0)

    dictionary_df = dictionary_df \
        .withColumn("rf_entry", F.col("share_entry") / F.col("share_economy")) \
        .withColumn("rf_db", F.col("share_db") / F.col("share_economy")) \
        .withColumn("rf_ana", F.col("share_ana") / F.col("share_economy")) \
        .withColumn("max_relative_share", F.greatest("rf_entry", "rf_db", "rf_ana"))

    # FINAL FILTER 
    oecd_vocabulary = dictionary_df.filter(
        (F.col("max_relative_share") >= REL_SHARE_THRESHOLD) & 
        (F.col("global_count") >= 500) &
        (F.col("avg_sim") >= SIM_GROUNDING)
    ).select("noun_chunk", F.col("max_relative_share").alias("relative_share"), "avg_sim", "global_count").cache()

    dict_size = oecd_vocabulary.count()
    print(f"✓ OECD Dictionary Built. Identified {dict_size} stable data-work terms.")
    
    print(f"=== TRUE TOP 30 OECD DICTIONARY TERMS (By Volume) ===")
    # THE FIX: Directly execute .show() instead of assigning it to a variable
    oecd_vocabulary.orderBy(F.col("global_count").desc()).limit(30).show(truncate=False)

    valid_chunks.unpersist()

except Exception as e:
    print(f"❌ ERROR building dictionary: {e}")
    oecd_vocabulary = None

Building OECD Dictionary (Valid-Universe Denominators & Max-RF)...

--- DATA QUALITY & COVERAGE LOG ---
Jobs dropped due to missing/unparseable SOC: 0
Total Jobs in Economy: 10000
Total Economy Jobs with >= 1 valid data chunk: 1
Total Anchor Jobs with >= 1 valid data chunk: 1
NLP Coverage Rate: 0.01%

✓ OECD Dictionary Built. Identified 0 stable data-work terms.


In [None]:
# ==========================================
# CELL 2.5: ENSEMBLE NLP POST-PROCESSING POLISH (OOM-SAFE)
# ==========================================
from pyspark.sql.functions import pandas_udf
import pyspark.sql.functions as F
import pandas as pd

print("--- Phase 2.5: Executing Ensemble NLP Polish ---")

# 1. Define the OOM-Safe Vectorized Pandas UDF
@pandas_udf("float")
def max_gold_similarity_pandas(phrases: pd.Series) -> pd.Series:
    import spacy
    # Load the medium model directly on the worker node for accurate vectors
    try: nlp_worker = spacy.load("en_core_web_md") 
    except: nlp_worker = spacy.load("en_core_web_sm")
    
    GOLD_STANDARD = [
        "data", "database", "analytics", "statistics", "software", "algorithm",
        "sql", "python", "spreadsheet", "dashboard", "cloud", "server",
        "etl", "visualization", "reporting", "infrastructure", "automation", 
        "modeling", "machine learning", "artificial intelligence"
    ]
    
    # Pre-compute the target vectors once per block to save memory
    gold_docs = [nlp_worker(g) for g in GOLD_STANDARD]
    
    results = []
    for phrase in phrases:
        if not phrase:
            results.append(0.0)
            continue
        phrase_doc = nlp_worker(str(phrase))
        
        # Calculate distance against the 20 pillars and take the max score
        max_score = max([phrase_doc.similarity(g) for g in gold_docs]) if gold_docs else 0.0
        results.append(float(max_score))
        
    return pd.Series(results)

# 2. Apply the Polish
if 'oecd_vocabulary' in globals() and oecd_vocabulary is not None:
    print(f"Pre-Polish Dictionary Size: {oecd_vocabulary.count()} terms")
    
    # Run the Vectorized UDF across the cluster
    polished_df = oecd_vocabulary.withColumn(
        "gold_sim_score", 
        max_gold_similarity_pandas(F.col("noun_chunk"))
    )
    
    # Must score >= 0.45 against at least one Gold Standard term
    FINAL_SIM_THRESHOLD = 0.45 
    
    # OVERWRITE the global variable so Cell 3 inherits the clean data automatically
    oecd_vocabulary = polished_df.filter(F.col("gold_sim_score") >= FINAL_SIM_THRESHOLD).cache()
    
    print(f"Post-Polish Dictionary Size: {oecd_vocabulary.count()} terms")
    
    print("\n=== TOP 30 SURVIVORS (By Volume) ===")
    oecd_vocabulary.orderBy(F.col("global_count").desc()).limit(30).select("noun_chunk", "global_count", "gold_sim_score").show(truncate=False)

In [3]:
# ==========================================
# CELL 3: PHASE 3 - JOB CLASSIFICATION
# ==========================================
print("--- Phase 3: Classifying Occupations ---")

def run_classification_for_year(year):
    in_path = get_nlp_path(year)
    try: 
        packed_chunks = spark.read.parquet(in_path)
    except Exception as e: 
        print(f"  [Skipping {year}] - Could not read data: {e}")
        return None

    if oecd_vocabulary is None: return None

    # Explode and filter
    year_chunks = packed_chunks.select(
        "doc_JobID", "doc_BGTOcc",
        F.explode(F.arrays_zip("noun_chunks", "sim_scores")).alias("zipped")
    ).select(
        "doc_JobID", "doc_BGTOcc",
        F.col("zipped.noun_chunks").alias("noun_chunk"),
        F.col("zipped.sim_scores").alias("sim_data")
    ).filter(F.col("sim_data") >= SIM_GROUNDING)

    # Tag jobs based on dictionary presence
    tagged_chunks = year_chunks.join(F.broadcast(oecd_vocabulary), "noun_chunk", "inner")

    job_scores = tagged_chunks.groupBy("doc_JobID", "doc_BGTOcc") \
        .agg(F.countDistinct("noun_chunk").alias("unique_data_terms")) \
        .withColumnRenamed("doc_BGTOcc", "soc_2020")

    # Regex Armor Applied Here
    job_scores = job_scores.withColumn("soc4", F.regexp_extract("soc_2020", r"(\d{4})", 1)).filter(F.col("soc4") != "")
    is_intensive = (F.col("unique_data_terms") >= DATA_THRESHOLD)

    classified = job_scores \
        .withColumn("data_entry", (F.col("soc4").isin(SOC_GROUPS["data_entry"]) & is_intensive).cast("int")) \
        .withColumn("database", (F.col("soc4").isin(SOC_GROUPS["database"]) & is_intensive).cast("int")) \
        .withColumn("data_analytics", (F.col("soc4").isin(SOC_GROUPS["analytics"]) & is_intensive).cast("int")) \
        .withColumn("any_data_intensive", is_intensive.cast("int"))

    # METHODOLOGY NOTE: We use ALL jobs here (including 0 valid chunks) because 
    # the Phase 4 Sector mapping requires the entire economy to match the SUT Tables.
    all_jobs_year = packed_chunks.select("doc_JobID", F.col("doc_BGTOcc").alias("soc_2020")).distinct() \
        .withColumn("soc4", F.regexp_extract("soc_2020", r"(\d{4})", 1)).filter(F.col("soc4") != "")

    merged = all_jobs_year.join(classified, ["doc_JobID", "soc4", "soc_2020"], "left").fillna(0)

    occ_sum = merged.groupBy("soc4").agg(
        F.count("*").alias("total_jobs"),
        F.sum("data_entry").alias("data_entry_jobs"),
        F.sum("database").alias("database_jobs"),
        F.sum("data_analytics").alias("data_analytics_jobs"),
        F.sum("any_data_intensive").alias("any_data_intensive_jobs")
    ).withColumn("year", F.lit(year))

    occ_sum = occ_sum \
        .withColumn("total_data_share", 100 * F.col("any_data_intensive_jobs") / F.col("total_jobs")) \
        .withColumn("data_entry_share", 100 * F.col("data_entry_jobs") / F.col("total_jobs")) \
        .withColumn("database_share", 100 * F.col("database_jobs") / F.col("total_jobs")) \
        .withColumn("data_analytics_share", 100 * F.col("data_analytics_jobs") / F.col("total_jobs"))

    return occ_sum

occupation_summaries = {y: run_classification_for_year(y) for y in YEARS}
print("✓ Occupations Classified.")

--- Phase 3: Classifying Occupations ---
✓ Occupations Classified.


In [4]:
# ==========================================
# CELL 4: PHASE 4 - SECTOR MAPPING
# ==========================================
print("--- Phase 4: Applying Census Weights ---")

df_census = spark.read.option("header", True).csv(CENSUS_CSV)
desc_col = df_census.columns[0]
df_census = df_census.withColumn("soc4", F.regexp_extract(F.col(desc_col), r"^(\d{4})", 1))

sic_cols = [c for c in df_census.columns if c != desc_col and c != "soc4"]
stack_expr = f"stack({len(sic_cols)}, " + ", ".join([f"'{c}', `{c}`" for c in sic_cols]) + ") as (sic_col, count_raw)"
long_df = df_census.select("soc4", F.expr(stack_expr))

long_df = long_df.withColumn("sic2", F.regexp_extract("sic_col", r"^(\d{2})", 1).cast("int")) \
                 .withColumn("n", F.regexp_replace("count_raw", ",", "").cast("long")).filter(F.col("n") > 0)

long_df = long_df.withColumn("SIC_Code", F.expr("""
    CASE WHEN sic2 BETWEEN 1 AND 3 THEN 'A' WHEN sic2 BETWEEN 5 AND 39 THEN 'B-E' WHEN sic2 BETWEEN 41 AND 43 THEN 'F' WHEN sic2 BETWEEN 45 AND 56 THEN 'G-I' WHEN sic2 BETWEEN 58 AND 63 THEN 'J' WHEN sic2 BETWEEN 64 AND 66 THEN 'K' WHEN sic2 = 68 THEN 'L' WHEN sic2 BETWEEN 69 AND 82 THEN 'M-N' WHEN sic2 BETWEEN 84 AND 88 THEN 'O-Q' WHEN sic2 BETWEEN 90 AND 98 THEN 'R-T' WHEN sic2 = 99 THEN 'U' ELSE NULL END
""")).filter(F.col("SIC_Code").isNotNull())

totals = long_df.groupBy("soc4").agg(F.sum("n").alias("total_soc"))
weights_df = long_df.groupBy("soc4", "SIC_Code").agg(F.sum("n").alias("n_sic")) \
                 .join(totals, "soc4").withColumn("w_soc4_SIC", F.col("n_sic") / F.col("total_soc")).cache()

sector_summaries = []
for year, occ_df in occupation_summaries.items():
    if occ_df is None: continue

    joined = occ_df.join(weights_df, "soc4", "left").fillna(0, subset=["w_soc4_SIC"])

    weighted = joined.select("SIC_Code",
        (F.col("total_jobs") * F.col("w_soc4_SIC")).alias("w_total"),
        (F.col("data_entry_jobs") * F.col("w_soc4_SIC")).alias("w_entry"),
        (F.col("database_jobs") * F.col("w_soc4_SIC")).alias("w_db"),
        (F.col("data_analytics_jobs") * F.col("w_soc4_SIC")).alias("w_ana"),
        (F.col("any_data_intensive_jobs") * F.col("w_soc4_SIC")).alias("w_any")
    )

    sec_sum = weighted.groupBy("SIC_Code").agg(
        F.sum("w_total").alias("total_jobs"), F.sum("w_entry").alias("data_entry_jobs"),
        F.sum("w_db").alias("database_jobs"), F.sum("w_ana").alias("data_analytics_jobs"), 
        F.sum("w_any").alias("any_data_intensive_jobs")
    )

    sec_sum = sec_sum \
        .withColumn("total_data_share", F.when(F.col("total_jobs") > 0, 100 * F.col("any_data_intensive_jobs") / F.col("total_jobs")).otherwise(0.0)) \
        .withColumn("data_entry_share", F.when(F.col("total_jobs") > 0, 100 * F.col("data_entry_jobs") / F.col("total_jobs")).otherwise(0.0)) \
        .withColumn("database_share", F.when(F.col("total_jobs") > 0, 100 * F.col("database_jobs") / F.col("total_jobs")).otherwise(0.0)) \
        .withColumn("data_analytics_share", F.when(F.col("total_jobs") > 0, 100 * F.col("data_analytics_jobs") / F.col("total_jobs")).otherwise(0.0)) \
        .withColumn("year", F.lit(year))

    sector_summaries.append(sec_sum)

print("✓ Sectors Mapped.")

--- Phase 4: Applying Census Weights ---
✓ Sectors Mapped.


In [None]:
# ==========================================
# CELL 5: PHASE 5 - ECONOMIC VALUATION (SCALED TO ABSOLUTE £)
# ==========================================
print("--- Phase 5: Calculating Economic Valuation ---")

if not sector_summaries: raise ValueError("No sector data generated. Check Cell 3 output.")
full_sector_df = sector_summaries[0]
for d in sector_summaries[1:]: full_sector_df = full_sector_df.unionByName(d)

# THE FIX: Multiply the SUT columns by 1,000,000 to convert from "Millions" to true Absolute £
sut_df = spark.read.option("header", True).csv(SUT_CSV).filter(F.col("year") == SUT_YEAR) \
    .select(
        F.upper(F.trim("SIC_Code")).alias("SIC_Code"), 
        (F.col("GVA_basic_prices").cast("double") * 1000000).alias("GVA_basic_prices"), 
        (F.col("COMP_EMP").cast("double") * 1000000).alias("COMP_EMP")
    )

alpha_expr = F.create_map([F.lit(x) for i in ALPHA_MAP.items() for x in i])
valued = full_sector_df.withColumn("SIC_Code", F.upper(F.trim("SIC_Code"))) \
    .join(sut_df, "SIC_Code", "inner") \
    .withColumn("alpha_low", F.lit(ALPHA_LOW)) \
    .withColumn("alpha_sector", F.coalesce(alpha_expr[F.col("SIC_Code")], F.lit(ALPHA_ECONOMY_AVG)))

valued = valued \
    .withColumn("inv_entry", F.col("alpha_sector") * F.col("COMP_EMP") * (F.col("data_entry_share")/100)) \
    .withColumn("inv_db",    F.col("alpha_sector") * F.col("COMP_EMP") * (F.col("database_share")/100)) \
    .withColumn("inv_ana",   F.col("alpha_sector") * F.col("COMP_EMP") * (F.col("data_analytics_share")/100)) \
    .withColumn("inv_low_tot", F.col("alpha_low") * F.col("COMP_EMP") * (F.col("total_data_share")/100))

# THE FIX: Calculate using total_data_share to include the hidden data economy
valued = valued.withColumn("total_investment_sector", F.col("alpha_sector") * F.col("COMP_EMP") * (F.col("total_data_share")/100))

valued = valued \
    .withColumn("inv_share_gva_sector", F.when(F.col("GVA_basic_prices")>0, (F.col("total_investment_sector")/F.col("GVA_basic_prices"))*100).otherwise(0.0)) \
    .withColumn("inv_share_gva_low", F.when(F.col("GVA_basic_prices")>0, (F.col("inv_low_tot")/F.col("GVA_basic_prices"))*100).otherwise(0.0))

valued.cache()
print("✓ Valuation Complete.")

--- Phase 5: Calculating Economic Valuation ---
✓ Valuation Complete.


In [None]:
# ==========================================
# CELL 6a: SETUP & DICTIONARY WORD CLOUD
# ==========================================
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from wordcloud import WordCloud

print("--- Phase 6: Generating Visualizations ---")

if 'oecd_vocabulary' in globals() and oecd_vocabulary is not None:
    vocab_pdf = oecd_vocabulary.toPandas()
    if not vocab_pdf.empty:
        print("\nGenerating Dictionary Word Cloud...")
        
        # THE FIX: Strip out any hidden newline or return characters that crash the image library
        vocab_pdf['noun_chunk'] = vocab_pdf['noun_chunk'].astype(str).str.replace(r'\n|\r', ' ', regex=True)
        
        # Dictionary linking words to their Relative Share importance
        word_freq = dict(zip(vocab_pdf['noun_chunk'], vocab_pdf['relative_share']))
        
        wc = WordCloud(width=1200, height=500, background_color='white', colormap='viridis', max_words=100)
        wc.generate_from_frequencies(word_freq)
        
        plt.figure(figsize=(16, 8))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.title("OECD Data Dictionary (Sized by Specificity to Data-Work)", fontsize=22, pad=20)
        plt.tight_layout()
        plt.show()
    else:
        print("Dictionary is empty. Cannot generate Word Cloud.")

--- Phase 6: Generating Visualizations ---
Dictionary is empty. Cannot generate Word Cloud.


In [7]:
# ==========================================
# CELL 6b: TOTAL UK GVA vs. DATA INVESTMENT (ABSOLUTE £)
# ==========================================
pdf = valued.toPandas()

if pdf.empty:
    print("WARNING: Valuation DataFrame is empty. Cannot plot economic charts.")
else:
    pdf['year_str'] = pdf['year'].astype(str)
    econ = pdf.groupby("year_str")[["inv_low_tot", "total_investment_sector", "GVA_basic_prices"]].sum().reset_index()

    fig_macro_abs = go.Figure()
    
    # Total Economy (Gray)
    fig_macro_abs.add_trace(go.Bar(
        x=econ['year_str'], y=econ['GVA_basic_prices'], 
        name='Total UK Economy GVA', marker_color='#E5E5E5'
    ))
    
    # Sector Alpha Map Investment (Dark Blue)
    fig_macro_abs.add_trace(go.Bar(
        x=econ['year_str'], y=econ['total_investment_sector'], 
        name='Data Investment (Sector Alpha Map)', marker_color='#1f77b4'
    ))

    # Conservative Alpha Investment (Light Blue)
    fig_macro_abs.add_trace(go.Bar(
        x=econ['year_str'], y=econ['inv_low_tot'], 
        name='Data Investment (Conservative Alpha: 1.58)', marker_color='#a6cee3'
    ))
    
    fig_macro_abs.update_layout(
        title="Total UK Gross Value Added (GVA) vs. Data Investment Scenarios",
        xaxis_title="Year",
        yaxis_title="Absolute Value",
        yaxis=dict(tickprefix="£"),
        barmode='overlay', # Overlays the smaller bars inside the larger GVA bar
        template="plotly_white",
        legend=dict(x=0.01, y=0.99)
    )
    fig_macro_abs.show()

In [8]:
# ==========================================
# CELL 6c: GVA vs. DATA INVESTMENT BY SECTOR
# ==========================================
if not pdf.empty:
    fig_sec_abs = px.bar(
        pdf, x="year_str", y=["GVA_basic_prices", "total_investment_sector"],
        facet_col="SIC_Code", facet_col_wrap=4, barmode="group",
        title="GVA vs. Data Investment by Industry Sector (Absolute £)",
        labels={'value': 'Absolute Value', 'variable': 'Metric', 'year_str': 'Year'},
        template="plotly_white", height=800,
        color_discrete_map={"GVA_basic_prices": "#E5E5E5", "total_investment_sector": "#1f77b4"}
    )
    
    # Rename legend for presentation
    newnames = {'GVA_basic_prices': 'Total Sector GVA', 'total_investment_sector': 'Data Investment (Sector Alpha)'}
    fig_sec_abs.for_each_trace(lambda t: t.update(name = newnames[t.name],
                                                  legendgroup = newnames[t.name],
                                                  hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])))
    
    fig_sec_abs.update_yaxes(matches=None, tickprefix="£") # Allows Y-axis to scale independently for small vs large sectors
    fig_sec_abs.show()

In [9]:
# ==========================================
# CELL 6d: DATA INVESTMENT AS % OF UK ECONOMY
# ==========================================
if not pdf.empty:
    econ["Conservative % (Alpha 1.58)"] = (econ["inv_low_tot"] / econ["GVA_basic_prices"]) * 100
    econ["Sector Map %"] = (econ["total_investment_sector"] / econ["GVA_basic_prices"]) * 100

    fig_macro_pct = px.line(
        econ, x="year_str", y=["Sector Map %", "Conservative % (Alpha 1.58)"], 
        title="Data Investment as a Percentage of Total UK Economy (GVA)", 
        markers=True, template="plotly_white",
        labels={'value': '% of Total GVA', 'variable': 'Valuation Scenario', 'year_str': 'Year'},
        color_discrete_sequence=["#00008b", "#a6cee3"]
    )
    
    fig_macro_pct.update_traces(line=dict(width=4), marker=dict(size=10))
    fig_macro_pct.update_yaxes(ticksuffix=" %")
    fig_macro_pct.show()

In [10]:
# ==========================================
# CELL 6e: DATA INVESTMENT AS % OF SECTOR GVA
# ==========================================
if not pdf.empty:
    fig_sec_pct = px.line(
        pdf, x="year_str", y="inv_share_gva_sector", color="SIC_Code",
        title="Data Investment Intensity: Percentage of Sector GVA Over Time",
        labels={'inv_share_gva_sector': '% of Sector GVA', 'SIC_Code': 'Industry (SIC)', 'year_str': 'Year'},
        markers=True, template="plotly_white", height=600
    )
    
    fig_sec_pct.update_traces(line=dict(width=3), marker=dict(size=8))
    fig_sec_pct.update_yaxes(ticksuffix=" %")
    fig_sec_pct.show()

In [11]:
# ==========================================
# CELL 6f: DATA INTENSITY BY INDUSTRY (HEATMAP)
# ==========================================
if not pdf.empty:
    # Pivot the data for a heatmap (Rows = SIC, Columns = Year, Values = Data Share)
    heatmap_data = pdf.pivot(index='SIC_Code', columns='year_str', values='total_data_share')
    
    fig_heat = px.imshow(
        heatmap_data, 
        title="Heatmap: Percentage of Workforce in Data-Intensive Roles by Sector",
        labels=dict(x="Year", y="Industry Sector (SIC)", color="% of Workforce"),
        color_continuous_scale="Blues", aspect="auto", template="plotly_white", height=600
    )
    
    fig_heat.update_traces(text=heatmap_data.round(2).astype(str) + "%", texttemplate="%{text}")
    fig_heat.show()

In [12]:
# ==========================================
# CELL 6g: OVERALL WORKFORCE DATA INTENSITY TREND
# ==========================================
if not pdf.empty:
    intensity_df = pdf.groupby("year_str")[["any_data_intensive_jobs", "total_jobs"]].sum().reset_index()
    intensity_df["Overall Intensity %"] = (intensity_df["any_data_intensive_jobs"] / intensity_df["total_jobs"]) * 100

    fig_intensity = px.line(
        intensity_df, x="year_str", y="Overall Intensity %", 
        title="UK Workforce Trend: Percentage of All Jobs Requiring Data Skills", 
        markers=True, template="plotly_white", color_discrete_sequence=["#2ca02c"]
    )
    
    fig_intensity.update_traces(line=dict(width=4), marker=dict(size=12))
    fig_intensity.update_yaxes(ticksuffix=" %")
    fig_intensity.update_layout(xaxis_title="Year", yaxis_title="% of Total UK Jobs")
    fig_intensity.show()

In [13]:
# ==========================================
# CELL 6h: TOP 20 OCCUPATIONS BY YEAR
# ==========================================
occ_frames = [df.toPandas() for y, df in occupation_summaries.items() if df is not None]

if occ_frames:
    all_occ_df = pd.concat(occ_frames, ignore_index=True)
    if not all_occ_df.empty:
        all_occ_df['soc4'] = all_occ_df['soc4'].astype(str)
        all_occ_df['year_str'] = all_occ_df['year'].astype(str)
        
        years_to_plot = sorted(all_occ_df['year_str'].unique())
        
        for plot_year in years_to_plot:
            year_data = all_occ_df[all_occ_df['year_str'] == plot_year]
            
            # Isolate the top 20 for this specific year
            top_20 = year_data.nlargest(20, "total_data_share").sort_values("total_data_share", ascending=True)

            melted = top_20.melt(
                id_vars=["soc4"], 
                value_vars=["data_entry_share", "database_share", "data_analytics_share"],
                var_name="Category", value_name="Share"
            )
            melted["Category"] = melted["Category"].map({
                "data_entry_share": "Data Entry", 
                "database_share": "Database", 
                "data_analytics_share": "Data Analytics"
            })

            fig_soc = px.bar(
                melted, x="Share", y="soc4", color="Category", orientation='h',
                title=f"Top 20 Data-Intensive Occupations in {plot_year}",
                labels={'Share': '% of Job Adverts Requiring Data Skills', 'soc4': 'SOC Code'},
                template="plotly_white", barmode="stack", height=600,
                color_discrete_map={"Data Entry": "#a6cee3", "Database": "#1f77b4", "Data Analytics": "#00008b"}
            )
            
            fig_soc.update_xaxes(ticksuffix=" %")
            fig_soc.update_layout(yaxis=dict(type='category', dtick=1)) 
            fig_soc.show()

In [14]:
# ==========================================
# CELL 7: PHASE 7 - AUDIT LOG EXPORT
# ==========================================
import datetime

LOG_FILE = "pipeline_audit_log_FINAL.txt"

def dump_df(df, name, f, limit=50):
    f.write(f"\n{'='*50}\nDATASET: {name}\n{'='*50}\n")
    if df is None:
        f.write("[MISSING OR EMPTY]\n")
        return
    f.write(f"Columns: {df.columns}\n")
    try:
        f.write(df.limit(limit).toPandas().to_string())
        f.write("\n")
    except Exception as e:
        f.write(f"Error dumping: {e}\n")

with open(LOG_FILE, "w") as f:
    f.write(f"OECD PIPELINE FINAL AUDIT - {datetime.datetime.now()}\n")

    if 'oecd_vocabulary' in globals() and oecd_vocabulary is not None:
        top_vocab = oecd_vocabulary.orderBy(F.col("relative_share").desc())
        
        # NEW: Print Top 30 to console for immediate reviewer sanity-check
        print("\n=== TOP 30 OECD DICTIONARY TERMS ===")
        try:
            print(top_vocab.limit(30).toPandas().to_string(index=False))
        except:
            print("Dictionary is empty.")
        print("====================================\n")
            
        dump_df(top_vocab, "Learned OECD Vocabulary", f, limit=50)

    dump_df(weights_df, "Census Weights", f)

    sample_year = 2024
    f.write(f"\n\n>>> YEAR {sample_year} SNAPSHOTS <<<\n")

    if sample_year in occupation_summaries and occupation_summaries[sample_year] is not None:
        dump_df(occupation_summaries[sample_year], f"Occupation Summary {sample_year}", f)

    sec_2024 = next((sec for sec in sector_summaries if sec.filter(F.col("year") == sample_year).count() > 0), None)
    dump_df(sec_2024, f"Sector Summary {sample_year}", f)
    dump_df(valued, "Final Valuation Table", f)

print(f"Data successfully dumped to: {LOG_FILE}")


=== TOP 30 OECD DICTIONARY TERMS ===
Empty DataFrame
Columns: [noun_chunk, relative_share, avg_sim, global_count]
Index: []

Data successfully dumped to: pipeline_audit_log_FINAL.txt


In [15]:
# ==========================================
# CELL 6i: DATA VOLUME BY INDUSTRY (HEATMAP - ABSOLUTE COUNTS)
# ==========================================
if not pdf.empty:
    # Pivot the data for a heatmap (Rows = SIC, Columns = Year, Values = Absolute Job Count)
    heatmap_counts = pdf.pivot(index='SIC_Code', columns='year_str', values='any_data_intensive_jobs')
    
    fig_heat_counts = px.imshow(
        heatmap_counts, 
        title="Heatmap: Total Volume of Data-Intensive Job Adverts by Sector",
        labels=dict(x="Year", y="Industry Sector (SIC)", color="Job Count"),
        color_continuous_scale="Blues", 
        aspect="auto", 
        template="plotly_white", 
        height=600,
        text_auto=",.0f"  # Automatically formats numbers with commas and no decimals
    )
    
    fig_heat_counts.show()

In [16]:
# ==========================================
# CELL 6j: TOP 20 OCCUPATIONS BY YEAR (ABSOLUTE COUNTS)
# ==========================================
occ_frames = [df.toPandas() for y, df in occupation_summaries.items() if df is not None]

if occ_frames:
    all_occ_df = pd.concat(occ_frames, ignore_index=True)
    if not all_occ_df.empty:
        all_occ_df['soc4'] = all_occ_df['soc4'].astype(str)
        all_occ_df['year_str'] = all_occ_df['year'].astype(str)
        
        years_to_plot = sorted(all_occ_df['year_str'].unique())
        
        for plot_year in years_to_plot:
            year_data = all_occ_df[all_occ_df['year_str'] == plot_year]
            
            # Isolate the top 20 for this specific year by the absolute number of data jobs
            top_20 = year_data.nlargest(20, "any_data_intensive_jobs").sort_values("any_data_intensive_jobs", ascending=True)

            melted = top_20.melt(
                id_vars=["soc4"], 
                value_vars=["data_entry_jobs", "database_jobs", "data_analytics_jobs"],
                var_name="Category", value_name="Job Count"
            )
            melted["Category"] = melted["Category"].map({
                "data_entry_jobs": "Data Entry", 
                "database_jobs": "Database", 
                "data_analytics_jobs": "Data Analytics"
            })

            fig_soc_counts = px.bar(
                melted, x="Job Count", y="soc4", color="Category", orientation='h',
                title=f"Top 20 Occupations by Data Job Volume in {plot_year}",
                labels={'Job Count': 'Number of Data Job Adverts', 'soc4': 'SOC Code'},
                template="plotly_white", barmode="stack", height=600,
                color_discrete_map={"Data Entry": "#a6cee3", "Database": "#1f77b4", "Data Analytics": "#00008b"}
            )
            
            # Formats the X-axis numbers to include commas (e.g., 15,000) for better readability
            fig_soc_counts.update_layout(
                yaxis=dict(type='category', dtick=1),
                xaxis=dict(tickformat=",.0f")
            ) 
            fig_soc_counts.show()

In [None]:
# ==========================================
# CELL 6k: ALPHA SENSITIVITY RANGE (OECD SCENARIOS)
# ==========================================
import pandas as pd
import plotly.graph_objects as go

print("Generating Alpha Sensitivity Chart...")

# Build a Pandas DataFrame directly from your configuration variables
alpha_data = []
for sector, alpha_val in ALPHA_MAP.items():
    alpha_data.append({"Sector": sector, "Alpha": alpha_val, "Scenario": "Upper Bound (UK-Specific)"})
    alpha_data.append({"Sector": sector, "Alpha": ALPHA_LOW, "Scenario": "Lower Bound (Conservative)"})

alpha_df = pd.DataFrame(alpha_data)

# Create a grouped bar chart
fig_alpha = px.bar(
    alpha_df, x="Sector", y="Alpha", color="Scenario", barmode="group",
    title="Alpha (α) Capital Markup: Lower Bound vs. Upper Bound Scenarios",
    labels={"Alpha": "Multiplier (α)", "Sector": "Industry Sector (SIC)"},
    template="plotly_white", height=600,
    color_discrete_map={
        "Upper Bound (UK-Specific)": "#1f77b4", 
        "Lower Bound (Conservative)": "#a6cee3"
    }
)

# Add a horizontal line to show the Economy-Wide Average
fig_alpha.add_hline(
    y=ALPHA_ECONOMY_AVG, line_dash="dot", line_color="black", 
    annotation_text=f"Economy-Wide Average ({ALPHA_ECONOMY_AVG})", 
    annotation_position="top left"
)

fig_alpha.update_layout(yaxis=dict(ticksuffix="x"))
fig_alpha.show()

In [3]:
#converting the script to using the nbconvert
!jupyter nbconvert --to script Test_1.4.ipynb --output oecd_pipeline_final

[NbConvertApp] Converting notebook Test_1.4.ipynb to script
[NbConvertApp] Writing 38920 bytes to oecd_pipeline_final.py


In [None]:
# #I need this as a chart
# 3.4 The Sensitivity Range (OECD Scenarios)
# Because the Alpha markup is a macroeconomic estimate, we present our final findings as a range to provide stakeholders with a transparent "Confidence Band":

# Lower Bound (Conservative): A flat multiplier of 1.58 applied to all sectors, acting as an internationally recognized benchmark (utilized by Statistics Canada) representing the absolute minimum capital footprint of an office worker.

# Upper Bound (UK-Specific): Our calculated, highly specific UK markups derived directly from the 2025 Blue Book (ranging from 2.07 up to 6.65, with outliers capped at 3.62).

26/02/23 03:07:03 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:359)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:131)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:707)
	at org.apache.spark.storage.BlockManagerMasterE