In [3]:
from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator

# Start Spark, or reuse the session if one is already running.
# Kryo serialization with Sedona's registrator is configured for Sedona's
# geometry types.
spark = (
    SparkSession.builder
    .appName("SedonaQuery4_Fix")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryo.registrator", "org.apache.sedona.core.serde.SedonaKryoRegistrator")
    .getOrCreate()
)

# Make Sedona's ST_* SQL functions available on this session.
SedonaRegistrator.registerAll(spark)

# Input locations (Parquet files under the group's processed-data prefix).
bucket_path = "s3a://groups-bucket-dblab-905418150721/group36/processed_data"
crimes_path = f"{bucket_path}/crime_data_clean.parquet"
stations_path = f"{bucket_path}/police_stations.parquet"

# Load both datasets as DataFrames.
crimes_df = spark.read.parquet(crimes_path)
stations_df = spark.read.parquet(stations_path)

# Expose them to Spark SQL under the names used by the query below.
crimes_df.createOrReplaceTempView("crimes")
stations_df.createOrReplaceTempView("stations")

# Query 4: for every crime find the closest police station, then report per
# division the number of crimes assigned to it and their mean distance.
#
# The distance is ST_Distance on points built from (LON, LAT) and (X, Y), so it
# is expressed in the units of those columns.
# NOTE(review): presumably these are degrees of longitude/latitude, which would
# make average_distance a value in degrees rather than km — confirm.
query = """
WITH CalculatedDistances AS (
    SELECT
        c.DR_NO AS crime_id,  -- crimes are identified by DR_NO
        s.DIVISION,
        /* Points are built on the fly as (x, y) = (LON, LAT) for the crime
           and (X, Y) for the station. */
        ST_Distance(
            ST_Point(CAST(c.LON AS Decimal(24,20)), CAST(c.LAT AS Decimal(24,20))),
            ST_Point(CAST(s.X AS Decimal(24,20)), CAST(s.Y AS Decimal(24,20)))
        ) AS distance
    FROM crimes c
    CROSS JOIN stations s
    -- Rows with a missing coordinate would yield a NULL distance. NULLs sort
    -- first under ORDER BY ... ASC, so such a row would win rn = 1 and be
    -- reported as the "nearest" station. Exclude them up front.
    WHERE c.LON IS NOT NULL AND c.LAT IS NOT NULL
      AND s.X IS NOT NULL AND s.Y IS NOT NULL
),
NearestStation AS (
    SELECT
        crime_id,
        division,
        distance,
        -- Rank the stations of each crime by distance; rn = 1 is the nearest
        ROW_NUMBER() OVER (PARTITION BY crime_id ORDER BY distance ASC) AS rn
    FROM CalculatedDistances
)
SELECT
    division,
    COUNT(crime_id) AS incidents_count,
    AVG(distance) AS average_distance
FROM NearestStation
WHERE rn = 1
GROUP BY division
ORDER BY incidents_count DESC
"""

# Build the (lazy) result DataFrame; nothing runs until an action is called.
result = spark.sql(query)

print("Execution Plan:")
result.explain()

print("\nResults:")
result.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Execution Plan:
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [incidents_count#208L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(incidents_count#208L DESC NULLS LAST, 1000), ENSURE_REQUIREMENTS, [plan_id=159]
      +- HashAggregate(keys=[division#199], functions=[count(crime_id#210), avg(distance#211)], schema specialized)
         +- Exchange hashpartitioning(division#199, 1000), ENSURE_REQUIREMENTS, [plan_id=156]
            +- HashAggregate(keys=[division#199], functions=[partial_count(crime_id#210), partial_avg(distance#211)], schema specialized)
               +- Project [crime_id#210, division#199, distance#211]
                  +- Filter (rn#212 = 1)
                     +- Window [row_number() windowspecdefinition(crime_id#210, distance#211 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rn#212], [crime_id#210], [distance#211 ASC NULLS FIRST]
                        +- WindowGroupLimit [crime_id#210], [d

In [None]:
# 1 core and 2 GB memory


In [4]:
import time
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from sedona.register import SedonaRegistrator



# Benchmark configuration: 1 executor core, 2 GB executor memory.
# NOTE(review): if a SparkSession already exists when this cell runs (for
# example one pre-created by the notebook kernel), getOrCreate() returns that
# session and launch-time settings such as executor cores/memory and driver
# memory may not actually be applied, even though spark.conf.get() echoes
# them back. Confirm the real executor resources in the Spark UI.
conf = SparkConf()
conf.set("spark.executor.cores", "1")
# Set executor memory explicitly so this run does not depend on whatever
# default the cluster or an earlier session happens to provide.
conf.set("spark.executor.memory", "2g")
conf.set("spark.driver.memory", "4g")

# Sedona and serializer settings
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.set("spark.kryo.registrator", "org.apache.sedona.core.serde.SedonaKryoRegistrator")
conf.set("spark.sql.adaptive.enabled", "true")

spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Sedona_Query4_Benchmark")
    .getOrCreate()
)

# Make Sedona's ST_* SQL functions available on this session.
SedonaRegistrator.registerAll(spark)

# Input locations (Parquet files under the group's processed-data prefix).
bucket_path = "s3a://groups-bucket-dblab-905418150721/group36/processed_data"
crimes_path = f"{bucket_path}/crime_data_clean.parquet"
stations_path = f"{bucket_path}/police_stations.parquet"

crimes_df = spark.read.parquet(crimes_path)
stations_df = spark.read.parquet(stations_path)

# Expose both DataFrames to Spark SQL under the names used by the query.
crimes_df.createOrReplaceTempView("crimes")
stations_df.createOrReplaceTempView("stations")

# For every crime find the closest police station, then report per division
# the number of crimes assigned to it and their mean distance.
# NOTE(review): ST_Distance works in the units of the input coordinates;
# presumably LON/LAT are degrees, so average_distance is in degrees — confirm.
query = """
WITH CalculatedDistances AS (
    SELECT
        c.DR_NO AS crime_id,
        s.DIVISION,
        -- Distance between crime and station (geometries built on the fly)
        ST_Distance(
            ST_Point(CAST(c.LON AS Decimal(24,20)), CAST(c.LAT AS Decimal(24,20))),
            ST_Point(CAST(s.X AS Decimal(24,20)), CAST(s.Y AS Decimal(24,20)))
        ) AS distance
    FROM crimes c
    CROSS JOIN stations s
    -- Rows with a missing coordinate would yield a NULL distance. NULLs sort
    -- first under ORDER BY ... ASC, so such a row would win rn = 1 and be
    -- reported as the "nearest" station. Exclude them up front.
    WHERE c.LON IS NOT NULL AND c.LAT IS NOT NULL
      AND s.X IS NOT NULL AND s.Y IS NOT NULL
),
NearestStation AS (
    SELECT
        crime_id,
        division,
        distance,
        -- Window function: rn = 1 marks the nearest station of each crime
        ROW_NUMBER() OVER (PARTITION BY crime_id ORDER BY distance ASC) AS rn
    FROM CalculatedDistances
)
SELECT
    division,
    COUNT(crime_id) AS incidents_count,
    AVG(distance) AS average_distance
FROM NearestStation
WHERE rn = 1
GROUP BY division
ORDER BY incidents_count DESC
"""

# The banner is ASCII-only: the recorded output of this cell rendered the
# previous non-ASCII text as '?' characters.
print(f"Starting execution with: {spark.conf.get('spark.executor.cores')} Cores / {spark.conf.get('spark.executor.memory')} RAM...")

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# wall-clock adjustments. The timed region covers planning plus the show()
# action, which is what actually triggers the job.
start_time = time.perf_counter()

result = spark.sql(query)
result.show(21, truncate=False)  # up to 21 rows, intended to list every division

end_time = time.perf_counter()
duration = end_time - start_time

print("="*40)
print(f"Time Taken: {duration:.2f} seconds")
print("="*40)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

???????? ? ???????? ??: 1 Cores / 2g RAM...
+----------------+---------------+--------------------+
|division        |incidents_count|average_distance    |
+----------------+---------------+--------------------+
|HOLLYWOOD       |212904         |0.020411675771719335|
|VAN NUYS        |209295         |0.028528263619184445|
|WILSHIRE        |198499         |0.026313844768144096|
|SOUTHWEST       |186976         |0.021580585521813554|
|OLYMPIC         |172255         |0.017360212779429156|
|NORTH HOLLYWOOD |171399         |0.026109524669723497|
|77TH STREET     |166133         |0.016591314129548024|
|PACIFIC         |158098         |0.03752607609746353 |
|CENTRAL         |155274         |0.009876237391677836|
|RAMPART         |150293         |0.014737200211313192|
|SOUTHEAST       |143597         |0.024347696551238952|
|TOPANGA         |139462         |0.03242844288214283 |
|WEST VALLEY     |129467         |0.02889458383639828 |
|HARBOR          |127073         |0.034682430516097096|
|WES

In [None]:
# 2 cores and 4 GB memory

In [1]:
import time
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from sedona.register import SedonaRegistrator



# Benchmark configuration: 2 executor cores, 4 GB executor memory.
# NOTE(review): if a SparkSession already exists when this cell runs (for
# example one pre-created by the notebook kernel), getOrCreate() returns that
# session and launch-time settings such as executor cores/memory and driver
# memory may not actually be applied, even though spark.conf.get() echoes
# them back. Confirm the real executor resources in the Spark UI.
conf = SparkConf()
conf.set("spark.executor.cores", "2")
conf.set("spark.executor.memory", "4g")
conf.set("spark.driver.memory", "4g")

# Sedona and serializer settings
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.set("spark.kryo.registrator", "org.apache.sedona.core.serde.SedonaKryoRegistrator")
conf.set("spark.sql.adaptive.enabled", "true")

spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Sedona_Query4_Benchmark")
    .getOrCreate()
)

# Make Sedona's ST_* SQL functions available on this session.
SedonaRegistrator.registerAll(spark)

# Input locations (Parquet files under the group's processed-data prefix).
bucket_path = "s3a://groups-bucket-dblab-905418150721/group36/processed_data"
crimes_path = f"{bucket_path}/crime_data_clean.parquet"
stations_path = f"{bucket_path}/police_stations.parquet"

crimes_df = spark.read.parquet(crimes_path)
stations_df = spark.read.parquet(stations_path)

# Expose both DataFrames to Spark SQL under the names used by the query.
crimes_df.createOrReplaceTempView("crimes")
stations_df.createOrReplaceTempView("stations")

# For every crime find the closest police station, then report per division
# the number of crimes assigned to it and their mean distance.
# NOTE(review): ST_Distance works in the units of the input coordinates;
# presumably LON/LAT are degrees, so average_distance is in degrees — confirm.
query = """
WITH CalculatedDistances AS (
    SELECT
        c.DR_NO AS crime_id,
        s.DIVISION,
        -- Distance between crime and station (geometries built on the fly)
        ST_Distance(
            ST_Point(CAST(c.LON AS Decimal(24,20)), CAST(c.LAT AS Decimal(24,20))),
            ST_Point(CAST(s.X AS Decimal(24,20)), CAST(s.Y AS Decimal(24,20)))
        ) AS distance
    FROM crimes c
    CROSS JOIN stations s
    -- Rows with a missing coordinate would yield a NULL distance. NULLs sort
    -- first under ORDER BY ... ASC, so such a row would win rn = 1 and be
    -- reported as the "nearest" station. Exclude them up front.
    WHERE c.LON IS NOT NULL AND c.LAT IS NOT NULL
      AND s.X IS NOT NULL AND s.Y IS NOT NULL
),
NearestStation AS (
    SELECT
        crime_id,
        division,
        distance,
        -- Window function: rn = 1 marks the nearest station of each crime
        ROW_NUMBER() OVER (PARTITION BY crime_id ORDER BY distance ASC) AS rn
    FROM CalculatedDistances
)
SELECT
    division,
    COUNT(crime_id) AS incidents_count,
    AVG(distance) AS average_distance
FROM NearestStation
WHERE rn = 1
GROUP BY division
ORDER BY incidents_count DESC
"""

# The banner is ASCII-only: the recorded output of this cell rendered the
# previous non-ASCII text as '?' characters.
print(f"Starting execution with: {spark.conf.get('spark.executor.cores')} Cores / {spark.conf.get('spark.executor.memory')} RAM...")

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# wall-clock adjustments. The timed region covers planning plus the show()
# action, which is what actually triggers the job.
start_time = time.perf_counter()

result = spark.sql(query)
result.show(21, truncate=False)  # up to 21 rows, intended to list every division

end_time = time.perf_counter()
duration = end_time - start_time

print("="*40)
print(f"Time Taken: {duration:.2f} seconds")
print("="*40)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1154,application_1765289937462_1147,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

???????? ? ???????? ??: 2 Cores / 4g RAM...
+----------------+---------------+--------------------+
|division        |incidents_count|average_distance    |
+----------------+---------------+--------------------+
|HOLLYWOOD       |212904         |0.02041167577171934 |
|VAN NUYS        |209295         |0.02852826361918445 |
|WILSHIRE        |198499         |0.026313844768144093|
|SOUTHWEST       |186976         |0.02158058552181356 |
|OLYMPIC         |172255         |0.017360212779429156|
|NORTH HOLLYWOOD |171399         |0.026109524669723484|
|77TH STREET     |166133         |0.016591314129548024|
|PACIFIC         |158098         |0.03752607609746355 |
|CENTRAL         |155274         |0.009876237391677834|
|RAMPART         |150293         |0.014737200211313192|
|SOUTHEAST       |143597         |0.024347696551238956|
|TOPANGA         |139462         |0.03242844288214284 |
|WEST VALLEY     |129467         |0.028894583836398265|
|HARBOR          |127073         |0.03468243051609709 |
|WES

In [None]:
# 4 cores and 8 GB memory


In [1]:
import time
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from sedona.register import SedonaRegistrator



# Benchmark configuration: 4 executor cores, 8 GB executor memory.
# NOTE(review): if a SparkSession already exists when this cell runs (for
# example one pre-created by the notebook kernel), getOrCreate() returns that
# session and launch-time settings such as executor cores/memory and driver
# memory may not actually be applied, even though spark.conf.get() echoes
# them back. Confirm the real executor resources in the Spark UI.
conf = SparkConf()
conf.set("spark.executor.cores", "4")
conf.set("spark.executor.memory", "8g")
conf.set("spark.driver.memory", "4g")

# Sedona and serializer settings
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
conf.set("spark.kryo.registrator", "org.apache.sedona.core.serde.SedonaKryoRegistrator")
conf.set("spark.sql.adaptive.enabled", "true")

spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Sedona_Query4_Benchmark")
    .getOrCreate()
)

# Make Sedona's ST_* SQL functions available on this session.
SedonaRegistrator.registerAll(spark)

# Input locations (Parquet files under the group's processed-data prefix).
bucket_path = "s3a://groups-bucket-dblab-905418150721/group36/processed_data"
crimes_path = f"{bucket_path}/crime_data_clean.parquet"
stations_path = f"{bucket_path}/police_stations.parquet"

crimes_df = spark.read.parquet(crimes_path)
stations_df = spark.read.parquet(stations_path)

# Expose both DataFrames to Spark SQL under the names used by the query.
crimes_df.createOrReplaceTempView("crimes")
stations_df.createOrReplaceTempView("stations")

# For every crime find the closest police station, then report per division
# the number of crimes assigned to it and their mean distance.
# NOTE(review): ST_Distance works in the units of the input coordinates;
# presumably LON/LAT are degrees, so average_distance is in degrees — confirm.
query = """
WITH CalculatedDistances AS (
    SELECT
        c.DR_NO AS crime_id,
        s.DIVISION,
        -- Distance between crime and station (geometries built on the fly)
        ST_Distance(
            ST_Point(CAST(c.LON AS Decimal(24,20)), CAST(c.LAT AS Decimal(24,20))),
            ST_Point(CAST(s.X AS Decimal(24,20)), CAST(s.Y AS Decimal(24,20)))
        ) AS distance
    FROM crimes c
    CROSS JOIN stations s
    -- Rows with a missing coordinate would yield a NULL distance. NULLs sort
    -- first under ORDER BY ... ASC, so such a row would win rn = 1 and be
    -- reported as the "nearest" station. Exclude them up front.
    WHERE c.LON IS NOT NULL AND c.LAT IS NOT NULL
      AND s.X IS NOT NULL AND s.Y IS NOT NULL
),
NearestStation AS (
    SELECT
        crime_id,
        division,
        distance,
        -- Window function: rn = 1 marks the nearest station of each crime
        ROW_NUMBER() OVER (PARTITION BY crime_id ORDER BY distance ASC) AS rn
    FROM CalculatedDistances
)
SELECT
    division,
    COUNT(crime_id) AS incidents_count,
    AVG(distance) AS average_distance
FROM NearestStation
WHERE rn = 1
GROUP BY division
ORDER BY incidents_count DESC
"""

# The banner is ASCII-only: the recorded output of this cell rendered the
# previous non-ASCII text as '?' characters.
print(f"Starting execution with: {spark.conf.get('spark.executor.cores')} Cores / {spark.conf.get('spark.executor.memory')} RAM...")

# perf_counter() is monotonic, so the measured interval cannot be skewed by
# wall-clock adjustments. The timed region covers planning plus the show()
# action, which is what actually triggers the job.
start_time = time.perf_counter()

result = spark.sql(query)
result.show(21, truncate=False)  # up to 21 rows, intended to list every division

end_time = time.perf_counter()
duration = end_time - start_time

print("="*40)
print(f"Time Taken: {duration:.2f} seconds")
print("="*40)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1156,application_1765289937462_1149,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

???????? ? ???????? ??: 4 Cores / 8g RAM...
+----------------+---------------+--------------------+
|division        |incidents_count|average_distance    |
+----------------+---------------+--------------------+
|HOLLYWOOD       |212904         |0.020411675771719345|
|VAN NUYS        |209295         |0.028528263619184455|
|WILSHIRE        |198499         |0.026313844768144103|
|SOUTHWEST       |186976         |0.021580585521813557|
|OLYMPIC         |172255         |0.01736021277942916 |
|NORTH HOLLYWOOD |171399         |0.0261095246697235  |
|77TH STREET     |166133         |0.016591314129548024|
|PACIFIC         |158098         |0.03752607609746355 |
|CENTRAL         |155274         |0.009876237391677836|
|RAMPART         |150293         |0.014737200211313192|
|SOUTHEAST       |143597         |0.024347696551238952|
|TOPANGA         |139462         |0.03242844288214285 |
|WEST VALLEY     |129467         |0.028894583836398265|
|HARBOR          |127073         |0.03468243051609708 |
|WES

In [2]:
# Report how many partitions back the crimes DataFrame's underlying RDD.
num_partitions = crimes_df.rdd.getNumPartitions()
print("Number of partitions:", num_partitions)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Number of partitions: 5