In [None]:
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sedona.register import SedonaRegistrator
from pyspark.storagelevel import StorageLevel

# ==========================================
# FULL RUN CONFIGURATION (Optimized)
# ==========================================
# NOTE(review): the captured notebook output shows a pre-created session
# ("SparkSession available as 'spark'"). In that setup getOrCreate() presumably
# returns the existing session, so launch-time settings such as executor
# memory/cores may not take effect -- confirm against the cluster UI.
spark_conf = {
    # Request as much memory as the system lets us have.
    "spark.executor.memory": "6g",
    "spark.driver.memory": "6g",
    "spark.executor.cores": "2",
    "spark.executor.instances": "2",

    # Sedona & Kryo (required for speed)
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.kryo.registrator": "org.apache.sedona.core.serde.SedonaKryoRegistrator",
    "spark.sql.adaptive.enabled": "true",

    # Raise the automatic broadcast-join threshold to 200 MiB.
    # 200 * 1024 * 1024 = 209715200 bytes (the previous value, 204857600,
    # was ~195 MiB and did not match the intended 200MB).
    "spark.sql.autoBroadcastJoinThreshold": "209715200",
}

spark = SparkSession.builder \
    .appName("Query5_Full_Broadcast") \
    .config(map=spark_conf) \
    .getOrCreate()

# Register Sedona's ST_* SQL functions and geometry types on this session.
SedonaRegistrator.registerAll(spark)

base_path = "s3a://groups-bucket-dblab-905418150721/group36/processed_data"

print("--- 1. Loading & Optimizing Census Blocks ---")
blocks_raw = spark.read.parquet(f"{base_path}/census_blocks_geo.parquet")

# The property columns are arrays, so take element 0 of each, and project
# only the fields the query needs.
comm_col = F.upper(F.trim(F.col("features.properties.COMM")[0])).alias("COMM")
pop_col = F.col("features.properties.POP20")[0].cast("long").alias("total_pop")
zcta_col = (
    F.col("features.properties.ZCTA20")[0]
    .cast("long")
    .cast("string")
    .alias("zcta")
)
geometry_col = F.expr("ST_GeomFromGeoJSON(to_json(features.geometry))").alias("geometry")
blocks = blocks_raw.select(comm_col, pop_col, zcta_col, geometry_col)

# CRITICAL: cache the blocks and force materialization with count().
# The author's intent is that Spark knows the table is small so the
# broadcast join works well.
blocks.persist(StorageLevel.MEMORY_AND_DISK)
n_blocks = blocks.count()
print(f"Blocks Loaded & Cached: {n_blocks}")

blocks.createOrReplaceTempView("blocks")

print("--- 2. Loading Crimes (Full Dataset) ---")
crimes_raw = spark.read.parquet(f"{base_path}/crime_data_clean.parquet")

# Keep only the years of interest and ONLY the coordinates, to make the
# DataFrame as light as possible.
years_of_interest = F.col("Year").isin(2020, 2021)
lon_col = F.col("LON").cast("decimal(24,20)")
lat_col = F.col("LAT").cast("decimal(24,20)")
crimes = crimes_raw.where(years_of_interest).select(lon_col, lat_col)

crimes.createOrReplaceTempView("crimes")
print("Crimes view created (Lazy Load).")

print("--- 3. Loading Income ---")
income = spark.read.parquet(f"{base_path}/income_data.parquet")
# Normalise the zip code to a plain string so it can be matched against blocks.zcta.
income_view = income.select(
    F.col("Zip Code").cast("long").cast("string").alias("zip_code"),
    F.col("Estimated Median Income").alias("median_income"),
)
income_view.createOrReplaceTempView("income")

# ==========================================
# EXECUTION (The Moment of Truth)
# ==========================================
print("--- 4. Executing Spatial Join with BROADCAST ---")
start_time = time.time()

# The /*+ BROADCAST(b) */ hint explicitly asks Spark to broadcast the blocks
# side of the spatial join instead of relying on the automatic threshold.
#
# CrimeCounts : crimes per community (point-in-polygon against block geometry)
# AreaStats   : population and average median income per community
# Final SELECT: yearly crimes (total / 2 years) per 1,000 residents
query = """
WITH CrimeCounts AS (
    SELECT /*+ BROADCAST(b) */
        b.COMM,
        COUNT(*) as total_crimes
    FROM crimes c, blocks b
    WHERE ST_Contains(b.geometry, ST_Point(c.LON, c.LAT))
    GROUP BY b.COMM
),
AreaStats AS (
    SELECT
        b.COMM,
        SUM(b.total_pop) as population,
        AVG(i.median_income) as avg_income
    FROM blocks b
    LEFT JOIN income i ON b.zcta = i.zip_code
    GROUP BY b.COMM
)
SELECT
    a.COMM,
    a.avg_income,
    a.population,
    (c.total_crimes / 2.0) / (a.population / 1000.0) as crime_rate_per_1k
FROM AreaStats a
JOIN CrimeCounts c ON a.COMM = c.COMM
WHERE a.population > 0 AND a.avg_income IS NOT NULL
"""

# try/finally guarantees the cached blocks are released even when the join
# or the statistics fail (e.g. an executor is lost mid-job).
try:
    # Execute
    result_df = spark.sql(query)

    print("Calculating results (this might take a minute)...")
    results = result_df.collect()

    print(f"Processing finished. Found {len(results)} communities.")

    if len(results) > 0:
        # Rebuild a small DataFrame from the collected rows so the statistics
        # below do not re-run the expensive spatial join.
        res_df = spark.createDataFrame(results)

        # Correlation over all communities
        corr = res_df.stat.corr("avg_income", "crime_rate_per_1k")

        # These sorts still run as Spark jobs, but only over the handful of
        # per-community rows held in res_df, so they are cheap.
        top10_high = res_df.orderBy(F.col("avg_income").desc()).limit(10)
        top10_low = res_df.orderBy(F.col("avg_income").asc()).limit(10)

        try:
            c_high = top10_high.stat.corr("avg_income", "crime_rate_per_1k")
            c_low = top10_low.stat.corr("avg_income", "crime_rate_per_1k")
        except Exception as exc:
            # Report the failure instead of silently printing a fake 0.0,
            # which would read as "no correlation".
            print(f"WARNING: top-10 correlation failed: {exc}")
            c_high = c_low = float("nan")

        print("\n" + "="*50)
        print(f"RESULTS (FULL DATASET)")
        print("="*50)
        print(f"Global Correlation:          {corr:.5f}")
        print(f"Correlation (Richest 10):    {c_high:.5f}")
        print(f"Correlation (Poorest 10):    {c_low:.5f}")
        print("-" * 50)
        print(f"Total Time: {time.time() - start_time:.2f} seconds")

        print("\nSample Data:")
        res_df.select("COMM", "avg_income", "crime_rate_per_1k").show(5)
    else:
        print("No data returned.")
finally:
    # Cleanup
    blocks.unpersist()

In [None]:
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sedona.register import SedonaRegistrator

# Conservative settings so that the job is sure to run
spark_conf = {
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.kryo.registrator": "org.apache.sedona.core.serde.SedonaKryoRegistrator",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.shuffle.partitions": "200",
    "spark.executor.instances": "1",
    "spark.executor.cores": "1",
    "spark.executor.memory": "4g",
}

spark = (
    SparkSession.builder
    .appName("Query5_Final_Result")
    .config(map=spark_conf)
    .getOrCreate()
)
SedonaRegistrator.registerAll(spark)
base_path = "s3a://groups-bucket-dblab-905418150721/group36/processed_data"

# Blocks: the property columns are arrays, so element 0 is taken from each.
blocks_source = spark.read.parquet(f"{base_path}/census_blocks_geo.parquet")
blocks = blocks_source.select(
    F.col("features.properties.COMM")[0].alias("COMM"),
    F.col("features.properties.POP20")[0].cast("long").alias("pop"),
    F.col("features.properties.ZCTA20")[0].cast("long").cast("string").alias("zcta"),
    F.expr("ST_GeomFromGeoJSON(to_json(features.geometry))").alias("geometry"),
)
blocks.createOrReplaceTempView("blocks")

# Crimes: 10% sample (fixed seed 42) of records from 2020 onwards, coordinates only.
crimes_source = spark.read.parquet(f"{base_path}/crime_data_clean.parquet")
crimes = (
    crimes_source
    .where(F.col("Year") >= 2020)
    .sample(withReplacement=False, fraction=0.10, seed=42)
    .select("LON", "LAT")
)
crimes.createOrReplaceTempView("crimes")

# Income: zip code normalised to a string so it joins against blocks.zcta.
income_source = spark.read.parquet(f"{base_path}/income_data.parquet")
income_view = income_source.select(
    F.col("Zip Code").cast("long").cast("string").alias("zip_code"),
    F.col("Estimated Median Income").alias("income"),
)
income_view.createOrReplaceTempView("income")

# Query: per-community crime count (scaled x10 to undo the 10% sample),
# joined with population / average income, then collected to the driver.
start = time.time()
rate_query = """
WITH CrimeCounts AS (
    SELECT /*+ BROADCAST(b) */ b.COMM, COUNT(*) * 10 as crimes FROM crimes c, blocks b
    WHERE ST_Contains(b.geometry, ST_Point(CAST(c.LON AS Decimal(24,20)), CAST(c.LAT AS Decimal(24,20))))
    GROUP BY b.COMM
),
Stats AS (
    SELECT b.COMM, SUM(b.pop) as pop, AVG(i.income) as income FROM blocks b
    LEFT JOIN income i ON b.zcta = i.zip_code GROUP BY b.COMM
)
SELECT s.COMM, s.income, (c.crimes/s.pop) as rate 
FROM Stats s JOIN CrimeCounts c ON s.COMM = c.COMM 
WHERE s.pop > 500 AND s.income IS NOT NULL
"""
res = spark.sql(rate_query).collect()

# Nothing is printed when the query returns no rows.
if len(res) > 0:
    df = spark.createDataFrame(res)
    global_corr = df.stat.corr('income', 'rate')
    print(f"Global Correlation: {global_corr:.5f}")
    print(f"Time: {time.time()-start:.2f}s")

In [None]:
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sedona.register import SedonaRegistrator

# Minimal settings so the session does not grab many resources
builder = SparkSession.builder.appName("Query5_ProofOfConcept")
for conf_key, conf_value in (
    ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
    ("spark.kryo.registrator", "org.apache.sedona.core.serde.SedonaKryoRegistrator"),
    ("spark.executor.memory", "4g"),
):
    builder = builder.config(conf_key, conf_value)
spark = builder.getOrCreate()

SedonaRegistrator.registerAll(spark)
base_path = "s3a://groups-bucket-dblab-905418150721/group36/processed_data"

print("--- Loading Data (Tiny Sample) ---")

# Blocks: element 0 of each array-typed property, plus the parsed geometry.
block_columns = [
    F.col("features.properties.COMM")[0].alias("COMM"),
    F.col("features.properties.POP20")[0].cast("long").alias("pop"),
    F.col("features.properties.ZCTA20")[0].cast("long").cast("string").alias("zcta"),
    F.expr("ST_GeomFromGeoJSON(to_json(features.geometry))").alias("geometry"),
]
blocks = spark.read.parquet(f"{base_path}/census_blocks_geo.parquet").select(*block_columns)
blocks.createOrReplaceTempView("blocks")

# Crimes -> 0.1% SAMPLE (very small so that it runs immediately)
crimes_all = spark.read.parquet(f"{base_path}/crime_data_clean.parquet")
crimes_recent = crimes_all.where(F.col("Year") >= 2020)
crimes = crimes_recent.sample(withReplacement=False, fraction=0.001, seed=42).select("LON", "LAT")
crimes.createOrReplaceTempView("crimes")

# Income
income_all = spark.read.parquet(f"{base_path}/income_data.parquet")
income_all.select(
    F.col("Zip Code").cast("long").cast("string").alias("zip_code"),
    F.col("Estimated Median Income").alias("income"),
).createOrReplaceTempView("income")

print("--- Executing Logic ---")
start = time.time()

# The query: raw crime hits per community joined with community stats.
sample_query = """
WITH CrimeCounts AS (
    SELECT /*+ BROADCAST(b) */ b.COMM, COUNT(*) as crimes 
    FROM crimes c, blocks b
    WHERE ST_Contains(b.geometry, ST_Point(CAST(c.LON AS Decimal(24,20)), CAST(c.LAT AS Decimal(24,20))))
    GROUP BY b.COMM
),
Stats AS (
    SELECT b.COMM, SUM(b.pop) as pop, AVG(i.income) as income FROM blocks b
    LEFT JOIN income i ON b.zcta = i.zip_code GROUP BY b.COMM
)
SELECT s.COMM, s.income, (c.crimes) as total_hits 
FROM Stats s JOIN CrimeCounts c ON s.COMM = c.COMM 
WHERE s.pop > 0 AND s.income IS NOT NULL
"""
# Capped at 100 rows so that the run finishes.
res = spark.sql(sample_query).limit(100).collect()

print(f"Time: {time.time()-start:.2f}s")

if not res:
    print("No intersection found in this tiny sample (Expected).")
else:
    df = spark.createDataFrame(res)
    # Just to see whether anything comes out
    df.show(5)
    print("Code executed successfully on sample.")

In [None]:
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sedona.register import SedonaRegistrator

# Basic settings: one executor with two cores, Kryo + Sedona serialization,
# adaptive query execution on.
spark_conf = {
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.kryo.registrator": "org.apache.sedona.core.serde.SedonaKryoRegistrator",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.shuffle.partitions": "200",
    "spark.executor.memory": "4g",
    "spark.executor.cores": "2",
    "spark.executor.instances": "1",
}

spark = (
    SparkSession.builder
    .appName("Query5_Smart_Grid")
    .config(map=spark_conf)
    .getOrCreate()
)
SedonaRegistrator.registerAll(spark)
base_path = "s3a://groups-bucket-dblab-905418150721/group36/processed_data"

print("--- 1. Loading Blocks ---")
# The property columns are arrays, hence getItem(0) on each of them.
blocks = spark.read.parquet(f"{base_path}/census_blocks_geo.parquet").select(
    F.col("features.properties.COMM").getItem(0).alias("COMM"),
    F.col("features.properties.POP20").getItem(0).cast("long").alias("total_pop"),
    F.col("features.properties.ZCTA20").getItem(0).cast("long").cast("string").alias("zcta"),
    F.expr("ST_GeomFromGeoJSON(to_json(features.geometry))").alias("geometry")
)
blocks.createOrReplaceTempView("blocks")

print("--- 2. Smart Aggregation of Crimes ---")
# INSTEAD OF FEEDING EVERY CRIME INTO THE JOIN:
# 1. Filter the years
# 2. Round the coordinates to 3 decimals (~110 m precision)
# 3. Group by and count BEFORE the join
# NOTE(review): snapping points to a grid can move a crime across a block
# boundary, so the per-community counts are an approximation.
crimes_raw = spark.read.parquet(f"{base_path}/crime_data_clean.parquet")

# Restrict to exactly 2020 and 2021: the downstream query divides the total
# by 2.0 to get a yearly figure, so an open-ended `Year >= 2020` filter would
# inflate the rate with any later years present in the data. This also
# matches the filter used by the full-dataset run.
crime_grid = crimes_raw.filter(F.col("Year").isin(2020, 2021)) \
    .withColumn("lat_grid", F.round(F.col("LAT"), 3)) \
    .withColumn("lon_grid", F.round(F.col("LON"), 3)) \
    .groupBy("lat_grid", "lon_grid") \
    .agg(F.count("*").alias("crime_count"))

# This DataFrame is MUCH smaller than the original
crime_grid.createOrReplaceTempView("crime_grid")

count_grid = crime_grid.count()
print(f"Compressed 2M+ crimes into {count_grid} grid points.")

print("--- 3. Loading Income ---")
spark.read.parquet(f"{base_path}/income_data.parquet").select(
    F.col("Zip Code").cast("long").cast("string").alias("zip_code"),
    F.col("Estimated Median Income").alias("median_income")
).createOrReplaceTempView("income")

print("--- 4. Executing Optimized Spatial Join ---")
start_time = time.time()

# Join the (few) grid points against the blocks. This is much faster than the
# raw join because there are fewer points to test.
#
# LocalCrime     : crimes per community, summing the per-grid-point counts
# EnrichedBlocks : population and average median income per community
# Final SELECT   : yearly crimes (total / 2 years) per resident; the LEFT JOIN
#                  plus COALESCE keeps communities with no matched crimes.
metrics_query = """
WITH LocalCrime AS (
    SELECT /*+ BROADCAST(b) */ 
        b.COMM,
        SUM(g.crime_count) as total_crimes -- Αθροίζουμε τα counts των grid points
    FROM crime_grid g, blocks b
    WHERE ST_Contains(b.geometry, ST_Point(CAST(g.lon_grid AS Decimal(24,20)), CAST(g.lat_grid AS Decimal(24,20))))
    GROUP BY b.COMM
),
EnrichedBlocks AS (
    SELECT 
        b.COMM,
        SUM(b.total_pop) as population,
        AVG(i.median_income) as avg_income
    FROM blocks b
    LEFT JOIN income i ON b.zcta = i.zip_code
    GROUP BY b.COMM
)
SELECT 
    e.COMM,
    e.avg_income,
    e.population,
    (COALESCE(c.total_crimes, 0) / 2.0) / NULLIF(e.population, 0) as crime_rate
FROM EnrichedBlocks e
LEFT JOIN LocalCrime c ON e.COMM = c.COMM
WHERE e.population > 0 AND e.avg_income IS NOT NULL
"""

final_df = spark.sql(metrics_query)
results = final_df.collect()

if len(results) > 0:
    # Rebuild a small DataFrame from the collected rows so the statistics
    # below do not re-run the spatial join.
    res_df = spark.createDataFrame(results)

    corr_all = res_df.stat.corr("avg_income", "crime_rate")

    # These sorts are still Spark jobs, but only over the per-community rows.
    top10_high = res_df.orderBy(F.col("avg_income").desc()).limit(10)
    top10_low = res_df.orderBy(F.col("avg_income").asc()).limit(10)

    try:
        corr_high = top10_high.stat.corr("avg_income", "crime_rate")
        corr_low = top10_low.stat.corr("avg_income", "crime_rate")
    except Exception as exc:
        # Report the failure instead of silently printing a fake 0.0,
        # which would read as "no correlation".
        print(f"WARNING: top-10 correlation failed: {exc}")
        corr_high = corr_low = float("nan")

    print("\n" + "="*40)
    print(f"RESULTS (Smart Grid Aggregation)")
    print("="*40)
    print(f"Correlation (Global):          {corr_all:.5f}")
    print(f"Correlation (Top 10 Richest):  {corr_high:.5f}")
    print(f"Correlation (Top 10 Poorest):  {corr_low:.5f}")
    print("-" * 40)
    print(f"Time: {time.time() - start_time:.2f} s")

    res_df.select("COMM", "avg_income", "crime_rate").show(5)
else:
    print("No data found.")

In [None]:
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sedona.register import SedonaRegistrator

# "Safe mode" settings
spark = (
    SparkSession.builder
    .appName("Query5_Final_Sampling")
    .config("spark.executor.memory", "4g")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryo.registrator", "org.apache.sedona.core.serde.SedonaKryoRegistrator")
    .getOrCreate()
)

SedonaRegistrator.registerAll(spark)
base_path = "s3a://groups-bucket-dblab-905418150721/group36/processed_data"

print("--- 1. Loading Blocks (Array Fix) ---")
blocks_parquet = spark.read.parquet(f"{base_path}/census_blocks_geo.parquet")
# Each property is an array column; take its first element.
community = F.col("features.properties.COMM")[0].alias("COMM")
population = F.col("features.properties.POP20")[0].cast("long").alias("pop")
zcta = F.col("features.properties.ZCTA20")[0].cast("long").cast("string").alias("zcta")
geometry = F.expr("ST_GeomFromGeoJSON(to_json(features.geometry))").alias("geometry")
blocks = blocks_parquet.select(community, population, zcta, geometry)
blocks.createOrReplaceTempView("blocks")

print("--- 2. Loading Crimes (10% SAMPLE) ---")
# THIS IS THE KEY: take a 10% sample (no replacement, seed 42)
crimes_parquet = spark.read.parquet(f"{base_path}/crime_data_clean.parquet")
crimes = (
    crimes_parquet
    .where(F.col("Year") >= 2020)
    .sample(False, 0.10, 42)
    .select("LON", "LAT")
)

crimes.createOrReplaceTempView("crimes")
sample_size = crimes.count()
print(f"Sample size: {sample_size} records")

print("--- 3. Loading Income ---")
income_parquet = spark.read.parquet(f"{base_path}/income_data.parquet")
income_parquet.select(
    F.col("Zip Code").cast("long").cast("string").alias("zip_code"),
    F.col("Estimated Median Income").alias("income"),
).createOrReplaceTempView("income")

print("--- 4. Executing Spatial Join ---")
start_time = time.time()

# The query: sampled crime counts per community (scaled x10), joined with
# per-community population and average income.
query = """
WITH CrimeCounts AS (
    SELECT /*+ BROADCAST(b) */ 
        b.COMM,
        COUNT(*) * 10 as est_crimes -- Scaling back up (x10)
    FROM crimes c, blocks b
    WHERE ST_Contains(b.geometry, ST_Point(CAST(c.LON AS Decimal(24,20)), CAST(c.LAT AS Decimal(24,20))))
    GROUP BY b.COMM
),
AreaStats AS (
    SELECT 
        b.COMM,
        SUM(b.pop) as total_pop,
        AVG(i.income) as avg_income
    FROM blocks b
    LEFT JOIN income i ON b.zcta = i.zip_code
    GROUP BY b.COMM
)
SELECT 
    a.COMM,
    a.avg_income,
    (c.est_crimes / a.total_pop) as crime_rate
FROM AreaStats a
JOIN CrimeCounts c ON a.COMM = c.COMM
WHERE a.total_pop > 500 AND a.avg_income IS NOT NULL
"""

result_df = spark.sql(query)
results = result_df.collect()

if not results:
    print("No data found.")
else:
    # Rebuild a DataFrame from the collected rows to compute the correlation.
    df_res = spark.createDataFrame(results)
    corr = df_res.stat.corr("avg_income", "crime_rate")

    print("\n" + "="*40)
    print(f"RESULTS (10% Sample)")
    print(f"Correlation: {corr:.5f}")
    print("-" * 40)
    print(f"Time: {time.time() - start_time:.2f} s")

In [1]:
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sedona.register import SedonaRegistrator

# 1. Clean configuration (dynamic allocation)
# Let Spark manage memory the way it sees fit.
spark_conf = {
    "spark.dynamicAllocation.enabled": "true",  # <--- the key, if it works for others
    "spark.shuffle.service.enabled": "true",
    "spark.sql.adaptive.enabled": "true",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.kryo.registrator": "org.apache.sedona.core.serde.SedonaKryoRegistrator",
}

spark = (
    SparkSession.builder
    .appName("Query5_Clean_Run")
    .config(map=spark_conf)
    .getOrCreate()
)

SedonaRegistrator.registerAll(spark)
base_path = "s3a://groups-bucket-dblab-905418150721/group36/processed_data"

print("--- Loading Data ---")

# BLOCKS: only the array fix (first element of each property array)
blocks_input = spark.read.parquet(f"{base_path}/census_blocks_geo.parquet")
blocks = blocks_input.select(
    F.col("features.properties.COMM")[0].alias("COMM"),
    F.col("features.properties.POP20")[0].cast("long").alias("total_pop"),
    F.col("features.properties.ZCTA20")[0].cast("long").cast("string").alias("zcta"),
    F.expr("ST_GeomFromGeoJSON(to_json(features.geometry))").alias("geometry"),
)
blocks.createOrReplaceTempView("blocks")

# CRIMES: 20% sample (enough to run, enough for results)
crimes_input = spark.read.parquet(f"{base_path}/crime_data_clean.parquet")
crimes = (
    crimes_input
    .where(F.col("Year") >= 2020)
    .sample(withReplacement=False, fraction=0.2, seed=42)
    .select("LON", "LAT")
)
crimes.createOrReplaceTempView("crimes")

# INCOME
income_input = spark.read.parquet(f"{base_path}/income_data.parquet")
income_input.select(
    F.col("Zip Code").cast("long").cast("string").alias("zip_code"),
    F.col("Estimated Median Income").alias("income"),
).createOrReplaceTempView("income")

print("--- Executing Join ---")
start_time = time.time()

# The previous single-level query joined blocks to crimes BEFORE aggregating,
# so each block row was repeated once per matching crime. That made
# SUM(total_pop) and AVG(income) weighted by the number of crimes in each
# block and produced a wrong crime_rate. Crimes and area statistics are now
# aggregated separately per community and joined afterwards.
#
# CrimeCounts : sampled crimes per community, scaled x5 to undo the 20% sample
# AreaStats   : population and average income per community
#
# The BROADCAST hint asks Spark to broadcast the blocks side of the spatial
# join. NOTE(review): the captured run of this cell died with container exit
# code 137; whether the hint alone avoids that needs to be confirmed on the
# cluster.
query = """
WITH CrimeCounts AS (
    SELECT /*+ BROADCAST(b) */
        b.COMM,
        COUNT(*) * 5 AS est_crimes -- x5 because of the 20% sample
    FROM crimes c, blocks b
    WHERE ST_Contains(b.geometry, ST_Point(CAST(c.LON AS Decimal(24,20)), CAST(c.LAT AS Decimal(24,20))))
    GROUP BY b.COMM
),
AreaStats AS (
    SELECT
        b.COMM,
        SUM(b.total_pop) AS population,
        AVG(i.income) AS avg_income
    FROM blocks b
    LEFT JOIN income i ON b.zcta = i.zip_code
    GROUP BY b.COMM
)
SELECT
    a.COMM,
    a.avg_income,
    c.est_crimes / a.population AS crime_rate
FROM AreaStats a
JOIN CrimeCounts c ON a.COMM = c.COMM
WHERE a.population > 0 AND a.avg_income IS NOT NULL
"""

final_df = spark.sql(query)
results = final_df.collect()

print(f"Time: {time.time() - start_time:.2f}s")

if results:
    # Rebuild a small DataFrame from the collected rows so the correlation
    # does not re-run the spatial join.
    df = spark.createDataFrame(results)
    print(f"Correlation: {df.stat.corr('avg_income', 'crime_rate'):.5f}")
    df.show(5)
else:
    print("No results.")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1758,application_1765289937462_1742,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
An error occurred while calling o425.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 8.0 failed 4 times, most recent failure: Lost task 3.3 in stage 8.0 (TID 17) (ip-192-168-1-156.eu-central-1.compute.internal executor 5): ExecutorLostFailure (executor 5 exited caused by one of the running tasks) Reason: Container from a bad node: container_1765289937462_1742_01_000008 on host: ip-192-168-1-156.eu-central-1.compute.internal. Exit status: 137. Diagnostics: [2025-12-15 15:40:22.951]Container killed on request. Exit code is 137
[2025-12-15 15:40:22.951]Container exited with a non-zero exit code 137. 
[2025-12-15 15:40:22.951]Killed by external signal
.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:3083)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:3019)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$ab