In [1]:
import time
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sedona.register import SedonaRegistrator

# Session settings for this run: Kryo serialization with Sedona's Kryo
# registrator, adaptive query execution, and dynamic allocation together with
# the shuffle service.
# NOTE(review): if a Spark session already exists when this cell runs (e.g. a
# kernel that pre-creates `spark`), getOrCreate() returns that session and
# startup-only settings such as the serializer presumably do not take effect
# -- confirm, and move them to the kernel/session configuration if needed.
spark_conf = {
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    "spark.kryo.registrator": "org.apache.sedona.core.serde.SedonaKryoRegistrator",
    "spark.sql.adaptive.enabled": "true",
    "spark.dynamicAllocation.enabled": "true",
    "spark.shuffle.service.enabled": "true",
}

# Apply every setting to the builder one key at a time, then obtain the session.
session_builder = SparkSession.builder.appName("Query5_Clean_Run")
for conf_key, conf_value in spark_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()

# Make Sedona's spatial SQL functions (ST_*) available on this session.
SedonaRegistrator.registerAll(spark)

# Root location of the pre-processed parquet datasets read below.
base_path = "s3a://groups-bucket-dblab-905418150721/group36/processed_data"

print("--- Loading Data ---")

# CENSUS BLOCKS: take the first element of each property array and build a
# Sedona geometry from the GeoJSON geometry; exposed as the `blocks` view.
# NOTE(review): `features.geometry` is serialized with to_json as-is; if
# `features` is an array this yields a JSON array -- confirm that
# ST_GeomFromGeoJSON receives a single geometry object.
blocks_raw = spark.read.parquet(f"{base_path}/census_blocks_geo.parquet")
block_columns = [
    F.col("features.properties.COMM")[0].alias("COMM"),
    F.col("features.properties.POP20")[0].cast("long").alias("total_pop"),
    # long -> string round trip normalizes the ZCTA so it can be matched
    # against the income zip code, which gets the same treatment below.
    F.col("features.properties.ZCTA20")[0].cast("long").cast("string").alias("zcta"),
    F.expr("ST_GeomFromGeoJSON(to_json(features.geometry))").alias("geometry"),
]
blocks = blocks_raw.select(*block_columns)
blocks.createOrReplaceTempView("blocks")

# CRIMES: records from 2020 onwards, 20% sample without replacement (seed 42),
# keeping only the coordinates; exposed as the `crimes` view.
crimes_all = spark.read.parquet(f"{base_path}/crime_data_clean.parquet")
crimes = (
    crimes_all
    .where(F.col("Year") >= 2020)
    .sample(withReplacement=False, fraction=0.2, seed=42)
    .select("LON", "LAT")
)
crimes.createOrReplaceTempView("crimes")

# INCOME: zip code (normalized like `zcta`) and estimated median income;
# exposed as the `income` view.
# NOTE(review): assumes "Estimated Median Income" is already numeric -- TODO confirm.
income_by_zip = spark.read.parquet(f"{base_path}/income_data.parquet").select(
    F.col("Zip Code").cast("long").cast("string").alias("zip_code"),
    F.col("Estimated Median Income").alias("income"),
)
income_by_zip.createOrReplaceTempView("income")

print("--- Executing Join ---")
start_time = time.time()

# Per-community (COMM) average income and crimes per resident.
#
# Population/income and crime counts are aggregated in separate CTEs and only
# then joined on COMM. Joining blocks to crimes first and aggregating afterwards
# repeats every block row once per matching crime, which inflates
# SUM(total_pop), weights AVG(income) by crime count, and drops communities
# that have no sampled crime at all.
#
# Both CTEs keep the `total_pop > 0` filter so that the set of blocks
# contributing crimes is the same set that contributes population.
# The `<=>` (null-safe equality) join keeps a NULL COMM group matched to itself.
query = """
WITH comm_stats AS (
    SELECT
        b.COMM,
        AVG(i.income) AS avg_income,
        SUM(b.total_pop) AS total_pop
    FROM blocks b
    LEFT JOIN income i ON b.zcta = i.zip_code
    WHERE b.total_pop > 0
    GROUP BY b.COMM
),
comm_crimes AS (
    SELECT
        b.COMM,
        COUNT(c.LON) AS sampled_crimes
    FROM blocks b
    JOIN crimes c ON ST_Contains(b.geometry, ST_Point(CAST(c.LON AS Decimal(24,20)), CAST(c.LAT AS Decimal(24,20))))
    WHERE b.total_pop > 0
    GROUP BY b.COMM
)
SELECT
    s.COMM,
    s.avg_income,
    (COALESCE(k.sampled_crimes, 0) * 5) / s.total_pop AS crime_rate -- x5 because crimes is a 20% sample
FROM comm_stats s
LEFT JOIN comm_crimes k ON s.COMM <=> k.COMM
WHERE s.avg_income IS NOT NULL
"""

# The full result (one row per community) is collected to the driver; the
# elapsed time printed below therefore covers the whole query execution.
final_df = spark.sql(query)
results = final_df.collect()

print(f"Time: {time.time() - start_time:.2f}s")

if results:
    # Rebuild a small DataFrame from the collected rows so the correlation and
    # the preview do not re-run the spatial join. The explicit schema avoids
    # type inference on the collected values.
    df = spark.createDataFrame(results, schema=final_df.schema)
    print(f"Correlation: {df.stat.corr('avg_income', 'crime_rate'):.5f}")
    df.show(5)
else:
    print("No results.")

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1758,application_1765289937462_1742,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
An error occurred while calling o425.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 8.0 failed 4 times, most recent failure: Lost task 3.3 in stage 8.0 (TID 17) (ip-192-168-1-156.eu-central-1.compute.internal executor 5): ExecutorLostFailure (executor 5 exited caused by one of the running tasks) Reason: Container from a bad node: container_1765289937462_1742_01_000008 on host: ip-192-168-1-156.eu-central-1.compute.internal. Exit status: 137. Diagnostics: [2025-12-15 15:40:22.951]Container killed on request. Exit code is 137
[2025-12-15 15:40:22.951]Container exited with a non-zero exit code 137. 
[2025-12-15 15:40:22.951]Killed by external signal
.
Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:3083)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:3019)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$ab