#### Useful Links 
- Spark History Server : http://83.212.73.248:18080/
- Hadoop YARN (scheduler) : http://83.212.73.248:8088/cluster
- HDFS : http://83.212.73.248:9870/dfshealth.html#tab-overview

#### Useful Commands : 
- Connect to okeanos-master (from local) : `$ ssh user@snf-40202.ok-kno.grnetcloud.net `
    - Password : 'Rand0m'
- Connect to okeanos-worker (from okeanos-master) : `$ ssh okeanos-worker`
- Open Jupyter Notebook : `$ jupyter notebook --ip 83.212.73.248 --port 8888`

#### Thinks to do :
- Make the data Csv to Parquet
- Make those columns the type we want
- Write the Queries (!)
- Benchmark and optimize them etc.
- Balance the data onto HDFS across the two datanodes

### Full HDFS path is here : hdfs://okeanos-master:54310/csv_data/
and contains :  
     
     1.  hdfs://okeanos-master:54310/csv_data/LAPD_Police_Stations.csv
     2.  hdfs://okeanos-master:54310/csv_data/crime_data_2019.csv 
     3.  hdfs://okeanos-master:54310/csv_data/crime_data_2023.csv
     4.  hdfs://okeanos-master:54310/csv_data/revgecoding.csv 
     5.  hdfs://okeanos-master:54310/csv_data/income/
         1. LA_income_2015.csv
         2. LA_income_2017.csv
         3. LA_income_2019.csv
         4. LA_income_2021.csv

In [1]:
# Pyspark Imports
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import *
from pyspark.sql.functions import to_date
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from operator import add
import geopy.distance
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, rank
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window
import math

In [2]:
# initialize sparkSession, make the data from csv to parquet,
spark = SparkSession \
    .builder \
    .appName("4 Executors") \
    .config("spark.driver.memory", "1g") \
    .config("spark.executor.memory", "1g") \
    .config("spark.executor.instances", "4") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/12 15:48:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/01/12 15:48:34 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [3]:
# load data into memory, do the necessary joins etc. here
crime_data = spark.read.parquet("hdfs://okeanos-master:54310/parquet/crime_data_*.parquet")
revge = spark.read.parquet("hdfs://okeanos-master:54310/parquet/revgecoding.parquet")
# only 2015 income data needed
income = spark.read \
            .parquet("hdfs://okeanos-master:54310/parquet/income/LA_income_2015.parquet")
lapd_stations = spark.read.parquet("hdfs://okeanos-master:54310/parquet/LAPD_Police_Stations.parquet")

                                                                                

In [4]:
#crime_data.show()

In [5]:
#crime_data.printSchema()

#### Change Column Types 
Στην εκφωνηση λέει : Διατηρώντας τα αρχικά ονόματα στηλών 

εννοωντας οτι δεν μπορουμε να κανουμε το 'Date Rptd' -> 'Date_Rptd' ?
- Date Rptd: date
- DATE OCC: date
- Vict Age: integer
- LAT: double
- LON: double

In [6]:
# code for column type changing
crime_data = crime_data.withColumn("Date Rptd", to_timestamp("Date Rptd", 'MM/dd/yyyy hh:mm:ss a')) \
    .withColumn("DATE OCC", to_timestamp("DATE OCC", 'MM/dd/yyyy hh:mm:ss a')) \
    .withColumn("Vict Age", col("Vict Age").cast("int")) \
    .withColumn("LAT", col("LAT").cast("double")) \
    .withColumn("LON", col("LON").cast("double")) \
    .withColumn("Premis_Desc", col("Premis Desc"))

# 1st Query :
        find
            for each year
                the 3 months with the biggest crime count

        year | month | crime_total (count)  + #order
        dataframe.show()

        SELECT  YEAR(date_rptd) as year,
                MONTH(date_rptd) as month,
                COUNT(*) as crime_total,
                ROW_NUMBER() OVER (PARTITION BY year ORDER BY crime_total) as '#'
        GROUP BY YEAR(date_rptd), MONTH(date_rptd)
        SORT BY year ASC, crime_total DESp    GROUP BY police_station_name
                ORDER BY #

    2 :

In [7]:
# code for first query in SQL API
def query_1_SQL_API():
    start_time = time.time()
    crime_data.createOrReplaceTempView("crime_data")
    
    query = """
        SELECT * FROM (
            SELECT 
                year(`Date Rptd`) AS year,
                month(`Date Rptd`) AS month,
                COUNT(*) AS crime_total,
                ROW_NUMBER() OVER (PARTITION BY year(`Date Rptd`) ORDER BY COUNT(*) DESC) AS rank
            FROM 
                crime_data
            GROUP BY 
                year(`Date Rptd`), month(`Date Rptd`)
        ) ranked_data
        WHERE rank <= 3
        ORDER BY year, rank
    """
    
    
    result_df = spark.sql(query)
    
    result_df.show()
    end_time = time.time()
    
    result_df.explain()

    return end_time - start_time

In [8]:
# code for first query in Dataframe API
def query_1_Dataframe_API():
    start_time = time.time()
    crime_counts = crime_data.withColumn("year", F.year("Date Rptd")) \
                          .withColumn("month", F.month("Date Rptd")) \
                          .groupBy("year", "month") \
                          .agg(F.count("*").alias("crime_total"))
    
    window_spec = Window.partitionBy("year").orderBy(F.desc("crime_total"))
    
    ranked_crime = crime_counts.withColumn("rank", F.row_number().over(window_spec))
    
    result_df = ranked_crime.filter("rank <= 3").orderBy("year", "rank")
    
    result_df.show()
    end_time = time.time()

    result_df.explain()

    return end_time - start_time

 # 2nd Query :


            SELECT street,
                   CASE
                      WHEN HOUR('Date Rptd') BETWEEN 5 AND 11 THEN 'Morning'
                      WHEN HOUR('Date Rptd') BETWEEN 12 AND 16 THEN 'Noon'
                      WHEN HOUR('Date Rptd') BETWEEN 17 AND 20 THEN 'Afternoon'
                      ELSE 'Night'
                    END AS time_group,
                    COUNT(*) as count
            WHERE 'Prem Desc'='STREET'
            GROUP BY time_group
            ORDER BY count

In [9]:
# write code for 2nd query here for Dataframe/SQL API
def query_2_Dataframe_API():
    start_time = time.time()
    filtered_df = crime_data.filter(crime_data['Premis_Desc'] == 'STREET')

    time_group_df = filtered_df.withColumn("time_group",
                                       # TIME OCC is in 24 hour military time integer values
                                      F.when((F.col('TIME OCC').between(500, 1159)), 'Morning')
                                      .when((F.col('TIME OCC').between(1200, 1659)), 'Noon')
                                      .when((F.col('TIME OCC').between(1700, 2059)), 'Afternoon')
                                      .otherwise('Night'))

    result_df = time_group_df.groupBy("time_group").agg(F.count("*").alias("count"))

    result_df = result_df.orderBy(col("count").desc())
    result_df.show()
    end_time = time.time()
    # call explain() method in order
    # to see the query's physical plan
    # and improve the RDD query
    result_df.explain()
    return end_time - start_time

In [10]:
# write code for 2nd query here for RDD API
def time_segs(row):
    if 500 <= int(row['TIME OCC']) <= 1159:
        return 'Morning'
    elif 1200 <= int(row['TIME OCC']) <= 1659:
        return 'Noon'
    elif 1700 <= int(row['TIME OCC']) <= 2059:
        return 'Afternoon'
    else:
        return 'Night'

In [11]:
def query_2_rdd(): 
    start_time = time.time()
    crime_data_rdd = crime_data.rdd.filter(lambda x: x['Premis_Desc'] == 'STREET') \
                                .map(lambda x: (time_segs(x),1)) \
                                .reduceByKey(lambda k1,k2: k1+k2) \
                                .sortBy(lambda x: x[1], ascending = False)
    
    result = crime_data_rdd.collect()
    for time_of_day, count in result:
        print(f"{time_of_day}: {count}")
        
    end_time = time.time()
    return end_time - start_time

In [12]:
def map_time_group(row):
    time_occ = int(row)
    if 500 <= time_occ <= 1159:
        return 'Morning'
    elif 1200 <= time_occ <= 1659:
        return 'Noon'
    elif 1700 <= time_occ <= 2059:
        return 'Afternoon'
    else:
        return 'Night'
        
def query_2_rdd_new():
    start_time = time.time()

    crime_data_rdd = crime_data.rdd.filter(lambda x: x['Premis_Desc'] == 'STREET') \
                                .map(lambda x: (map_time_group(x['TIME OCC']), 1)) \
                                .reduceByKey(add).sortBy(lambda x: x[1], ascending = False)

    result = crime_data_rdd.collect()
    for time_of_day, count in result:
        print(f"{time_of_day}: {count}")
        
    end_time = time.time()
    return end_time - start_time

In [13]:
#query_2_rdd()

In [14]:
#spark.stop()

# 3rd Query :

        find the 3 zip codes with min and max household income
                    |
                    |
                    v
        // filter(remove) victimless crimes
                    |
                    |
                    v
        select vict_desc, COUNT(*) as count
        where year=2015
        group by vict_desc
        order by count DESC

In [15]:
# write code for 3rd query here
def query_3(method = 'CONTINUE'):
    start_time = time.time()
    #crime_data_2015 = crime_data.filter(year(col('Date Rptd')) == 2015)
    if method == 'BROADCAST':
        crime_data_join_revge = crime_data.join(broadcast(revge), ['LAT', 'LON'], 'inner') \
            .withColumnRenamed('ZIPcode', 'Zip Code') \
            .withColumn("Zip Code", col("Zip Code").cast("int")) \
            .filter((col('Vict Descent') != 'X') & (col('Vict Sex') != 'X'))
        #crime_data_join_revge.explain()
        
    elif method in ['MERGE', 'SHUFFLE_HASH', 'SHUFFLE_REPLICATE_NL']:
        crime_data_join_revge = crime_data.hint(method).join(revge, ['LAT', 'LON'], 'inner') \
            .withColumnRenamed('ZIPcode', 'Zip Code') \
            .withColumn("Zip Code", col("Zip Code").cast("int")) \
            .filter((col('Vict Descent') != 'X') & (col('Vict Sex') != 'X'))
        #crime_data_join_revge.explain()
        
    elif method == 'CONTINUE':
        crime_data_join_revge = crime_data.join(revge, ['LAT', 'LON'], 'inner') \
            .withColumnRenamed('ZIPcode', 'Zip Code') \
            .withColumn("Zip Code", col("Zip Code").cast("int")) \
            .filter((col('Vict Descent') != 'X') & (col('Vict Sex') != 'X'))
        
    else:
        return None

    #crime_data_join_revge = crime_data.join(revge, ['LAT', 'LON'], 'inner') \
    #    .withColumnRenamed('ZIPcode', 'Zip Code') \
    #    .withColumn("Zip Code", col("Zip Code").cast("int")) \
    #    .filter((col('Vict Descent') != 'X') & (col('Vict Sex') != 'X'))
    
    crime_data_join_income = crime_data_join_revge.join(income, 'Zip Code', 'inner') \
                                .withColumn('Estimated Median Income', 
                                            regexp_replace(col('Estimated Median Income'), '[$,]', '')) \
                                .withColumn('Estimated Median Income', 
                                            col('Estimated Median Income') \
                                .cast('double'))
    
    max_income_zip_codes = crime_data_join_income.groupBy('Zip Code') \
                            .agg({'Estimated Median Income': 'max'}) \
                            .withColumnRenamed('max(Estimated Median Income)', 'MaxIncome') \
                            .orderBy(col('MaxIncome').desc()) \
                            .limit(3)
    
    min_income_zip_codes = crime_data_join_income.groupBy('Zip Code') \
                            .agg({'Estimated Median Income': 'min'}) \
                            .withColumnRenamed('min(Estimated Median Income)', 'MinIncome') \
                            .orderBy(col('MinIncome')) \
                            .limit(3)
    
    zip_codes = min_income_zip_codes.union(max_income_zip_codes)
    
    zip_codes_list = [row['Zip Code'] for row in zip_codes.collect()]
    
    result = crime_data_join_income \
                .filter(col('Zip Code').isin(zip_codes_list)) \
                .filter(year(col('Date Rptd')) == 2015) \
                .groupBy('Vict Descent') \
                .count() \
                .withColumnRenamed('count', '#') \
                .orderBy(col('#').desc())
    
    result.show()
    end_time = time.time()
    result.explain()
    print(f'Method : {method} | Time {end_time - start_time}')
    return end_time - start_time

In [16]:
#query_3("SHUFFLE_HASH")

In [17]:
#spark.stop()

In [18]:
#for method in ['BROADCAST','MERGE', 'SHUFFLE_HASH']:
#    query_3(method)

# 4th Query :
    1 :
        a)
                make_extra_columns() : distance of police stations from crime
                join crime_table with LA Police Stations on police_station
                and add column of police station
                for each row compute distance from two coordinates                         
                put this computed distance in the column named 'distance'

                SELECT year, SUM(distance)/# as average_distance
                                , COUNT(*) as #
                FROM ...
                WHERE WEAPON < 200
                GROUP BY year
                ORDER BY #


        b)
                SELECT police_station_name as division,
                       SUM(distance)/# as average_distance,
                       COUNT(*) as #
                FROM ...
                WHERE weapon NOT NULL
                GROUP BY police_station_name
                ORDER BY #

    2 :

In [19]:
# calculate distance on a sphere (as earth is not flat)
def haversine(lon1, lat1, lon2, lat2):
    R = 6371
    dLat = math.radians(lat2 - lat1)
    dLon = math.radians(lon2 - lon1)
    a = math.sin(dLat / 2) * math.sin(dLat / 2) + \
        math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dLon / 2) * math.sin(dLon / 2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c
    return distance

haversine_udf = udf(haversine, DoubleType())

In [20]:
# write code for 4th query here
# 1a)
def query_4_1a(method = 'CONTINUE'):
    start_time = time.time()
    lapd_stations_new = lapd_stations.withColumnRenamed('PREC','AREA')
    
    if method == 'BROADCAST':
        crime_data_join_stations = crime_data.withColumnRenamed('AREA ', 'AREA') \
                                    .join(broadcast(lapd_stations_new), 'AREA', 'inner')
    elif method in ['MERGE', 'SHUFFLE_HASH', 'SHUFFLE_REPLICATE_NL']:
        crime_data_join_stations = crime_data.withColumnRenamed('AREA ', 'AREA') \
                                    .hint(method).join(lapd_stations_new, 'AREA', 'inner')
    elif method == 'CONTINUE':
        crime_data_join_stations = crime_data.withColumnRenamed('AREA ', 'AREA') \
                                    .join(lapd_stations_new, 'AREA', 'inner')
    else:
        return None
    
    crime_data_join_stations = crime_data_join_stations.withColumn('distance',
                                    haversine_udf(col("LON"), col("LAT"), col("X"), col("Y")))
    
    crime_data_join_stations = crime_data_join_stations.withColumn('Weapon Used Cd', col('Weapon Used Cd').cast('int')) \
                                    .filter(col('Weapon Used Cd') < 200) \
                                    .withColumn('year', F.year('Date Rptd'))
    
    result = crime_data_join_stations.groupBy('year') \
        .agg((F.sum('distance') / F.count('*')).alias('average_distance'),
            F.count('*').alias('#')) \
       .orderBy(F.col('year'))
    
    result.show()
    end_time = time.time()
    result.explain()
    print(f'Method : {method} | Time {end_time - start_time}')
    return end_time - start_time

In [21]:
# 1b)
def query_4_1b(method = 'CONTINUE'):
    start_time = time.time()
    lapd_stations_new = lapd_stations.withColumnRenamed('PREC','AREA')
    
    if method == 'BROADCAST':
        crime_data_join_stations = crime_data.withColumnRenamed('AREA ', 'AREA') \
                                    .join(broadcast(lapd_stations_new), 'AREA', 'inner')
    elif method in ['MERGE', 'SHUFFLE_HASH', 'SHUFFLE_REPLICATE_NL']:
        crime_data_join_stations = crime_data.withColumnRenamed('AREA ', 'AREA') \
                                    .hint(method).join(lapd_stations_new, 'AREA', 'inner')
    elif method == 'CONTINUE':
        crime_data_join_stations = crime_data.withColumnRenamed('AREA ', 'AREA') \
                                    .join(lapd_stations_new, 'AREA', 'inner')
    else:
        return None
    
    # distance of LAT and LON using Spark functions
    crime_data_join_stations = crime_data_join_stations.withColumn('distance',
                                    haversine_udf(col("LON"), col("LAT"), col("X"), col("Y"))) # Earths's radius
    
    crime_data_join_stations = crime_data_join_stations.withColumn('Weapon Used Cd', col('Weapon Used Cd').cast('int')) \
                                    .filter(F.col('Weapon Used Cd').isNotNull()) \
                                    .withColumn('year', F.year('Date Rptd'))
    
    result = crime_data_join_stations.groupBy('AREA NAME') \
        .agg(
            (F.sum('distance') / F.count('*')).alias('average_distance'),
            F.count('*').alias('#')
        ) \
        .orderBy(F.col('#').desc()) \
        .withColumnRenamed('AREA NAME', 'division')
    
    result.show()
    end_time = time.time()
    result.explain()
    print(f'Method : {method} | Time {end_time - start_time}')
    return end_time - start_time

In [22]:
crime_data.printSchema()

root
 |-- DR_NO: integer (nullable = true)
 |-- Date Rptd: timestamp (nullable = true)
 |-- DATE OCC: timestamp (nullable = true)
 |-- TIME OCC: integer (nullable = true)
 |-- AREA : integer (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: integer (nullable = true)
 |-- Part 1-2: integer (nullable = true)
 |-- Crm Cd: integer (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: integer (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: integer (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: integer (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: integer (nullable = true)
 |-- Crm Cd 2: integer (nullable = true)
 |-- Crm Cd 3: integer (nullable = true)
 |-- Crm Cd 4: integer (nullable = true)

In [23]:
# 2a)
def query_4_2a(method = 'CONTINUE'):
    start_time = time.time()
    
    if method == 'BROADCAST':
        combined_data = crime_data.hint(method).crossJoin(broadcast(lapd_stations))        
    elif method in ['MERGE', 'SHUFFLE_HASH', 'SHUFFLE_REPLICATE_NL']:
        combined_data = crime_data.hint(method).crossJoin(lapd_stations)
    elif method == 'CONTINUE':
        combined_data = crime_data.crossJoin(lapd_stations)
    else:
        return None

    # filter for DR_NO < 200  
    # or Catalyst Optimizer does it by itself 
    
    combined_data = combined_data.withColumn("closest_distance", haversine_udf(col("LON"), col("LAT"), col("X"), col("Y")))
    
    windowSpec = Window.partitionBy("DR_NO").orderBy("closest_distance")
    closest_stations = combined_data.withColumn("rank", rank().over(windowSpec)).filter(col("rank") == 1)
    
    final_data = closest_stations.select(col("DR_NO"), col("DIVISION").alias("closest_station"), col("closest_distance"))
    
    result = crime_data.join(final_data, "DR_NO")
    crime_data_join_stations = result.withColumn('Weapon Used Cd', col('Weapon Used Cd').cast('int')) \
                                    .filter(col('Weapon Used Cd') < 200) \
                                    .withColumn('year', F.year('Date Rptd'))
    
    result = crime_data_join_stations.groupBy('year') \
        .agg((F.sum('closest_distance') / F.count('*')).alias('average_distance'),
            F.count('*').alias('#')) \
       .orderBy(F.col('year'))

    
    result.show()
    end_time = time.time()
    result.explain()
    print(f'Method : {method} | Time {end_time - start_time}')
    return end_time - start_time

In [24]:
# 2b)
def query_4_2b(method = 'CONTINUE'):
    start_time = time.time()
    
    if method == 'BROADCAST':
        combined_data = crime_data.hint(method).crossJoin(broadcast(lapd_stations))
    elif method in ['MERGE', 'SHUFFLE_HASH', 'SHUFFLE_REPLICATE_NL']:
        combined_data = crime_data.hint(method).crossJoin(lapd_stations)        
    elif method == 'CONTINUE':
        combined_data = crime_data.crossJoin(lapd_stations)
    else:
        return None

    
    combined_data = combined_data.withColumn("closest_distance", haversine_udf(col("LON"), col("LAT"), col("X"), col("Y")))
    
    windowSpec = Window.partitionBy("DR_NO").orderBy("closest_distance")
    closest_stations = combined_data.withColumn("rank", rank().over(windowSpec)).filter(col("rank") == 1)
    
    final_data = closest_stations.select(col("DR_NO"), col("DIVISION").alias("closest_station"), col("closest_distance"))
    
    result = crime_data.join(final_data, "DR_NO")
    crime_data_join_stations = result.withColumn('Weapon Used Cd', col('Weapon Used Cd').cast('int')) \
                                    .filter(F.col('Weapon Used Cd').isNotNull()) \
                                    .withColumn('year', F.year('Date Rptd'))
    
    result = crime_data_join_stations.groupBy('AREA NAME') \
        .agg(
            (F.sum('closest_distance') / F.count('*')).alias('average_distance'),
            F.count('*').alias('#')
        ) \
        .orderBy(F.col('#').desc()) \
        .withColumnRenamed('AREA NAME', 'division')
    
    result.show()
    end_time = time.time()
    result.explain()
    print(f'Method : {method} | Time {end_time - start_time}')
    return end_time - start_time

# Query 1 on 4 Executors

In [22]:
 crime_data.cache()

DataFrame[DR_NO: int, Date Rptd: timestamp, DATE OCC: timestamp, TIME OCC: int, AREA : int, AREA NAME: string, Rpt Dist No: int, Part 1-2: int, Crm Cd: int, Crm Cd Desc: string, Mocodes: string, Vict Age: int, Vict Sex: string, Vict Descent: string, Premis Cd: int, Premis Desc: string, Weapon Used Cd: int, Weapon Desc: string, Status: string, Status Desc: string, Crm Cd 1: int, Crm Cd 2: int, Crm Cd 3: int, Crm Cd 4: int, LOCATION: string, Cross Street: string, LAT: double, LON: double, Premis_Desc: string]

In [23]:
query_1_Dataframe_API()

                                                                                

+----+-----+-----------+----+
|year|month|crime_total|rank|
+----+-----+-----------+----+
|2010|    3|      17595|   1|
|2010|    7|      17520|   2|
|2010|    5|      17338|   3|
|2011|    8|      17139|   1|
|2011|    5|      17050|   2|
|2011|    3|      16951|   3|
|2012|    8|      17696|   1|
|2012|   10|      17477|   2|
|2012|    5|      17391|   3|
|2013|    8|      17329|   1|
|2013|    7|      16714|   2|
|2013|    5|      16671|   3|
|2014|    7|      14059|   1|
|2014|   10|      14031|   2|
|2014|    9|      13799|   3|
|2015|    8|      18951|   1|
|2015|   10|      18916|   2|
|2015|    7|      18528|   3|
|2016|    8|      19779|   1|
|2016|   10|      19615|   2|
+----+-----+-----------+----+
only showing top 20 rows

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [year#514 ASC NULLS FIRST, rank#614 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(year#514 ASC NULLS FIRST, rank#614 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=308]
  

26.40231966972351

In [24]:
query_1_SQL_API()

+----+-----+-----------+----+
|year|month|crime_total|rank|
+----+-----+-----------+----+
|2010|    3|      17595|   1|
|2010|    7|      17520|   2|
|2010|    5|      17338|   3|
|2011|    8|      17139|   1|
|2011|    5|      17050|   2|
|2011|    3|      16951|   3|
|2012|    8|      17696|   1|
|2012|   10|      17477|   2|
|2012|    5|      17391|   3|
|2013|    8|      17329|   1|
|2013|    7|      16714|   2|
|2013|    5|      16671|   3|
|2014|    7|      14059|   1|
|2014|   10|      14031|   2|
|2014|    9|      13799|   3|
|2015|    8|      18951|   1|
|2015|   10|      18916|   2|
|2015|    7|      18528|   3|
|2016|    8|      19779|   1|
|2016|   10|      19615|   2|
+----+-----+-----------+----+
only showing top 20 rows

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [year#1798 ASC NULLS FIRST, rank#1801 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(year#1798 ASC NULLS FIRST, rank#1801 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=614

2.1631157398223877

# Query 2 on 4 Executors 

In [25]:
query_2_Dataframe_API()



+----------+------+
|time_group| count|
+----------+------+
|     Night|237605|
| Afternoon|187306|
|      Noon|148180|
|   Morning|123846|
+----------+------+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#3056L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(count#3056L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [plan_id=732]
      +- HashAggregate(keys=[time_group#2994], functions=[count(1)])
         +- Exchange hashpartitioning(time_group#2994, 200), ENSURE_REQUIREMENTS, [plan_id=729]
            +- HashAggregate(keys=[time_group#2994], functions=[partial_count(1)])
               +- Project [CASE WHEN ((TIME OCC#3 >= 500) AND (TIME OCC#3 <= 1159)) THEN Morning WHEN ((TIME OCC#3 >= 1200) AND (TIME OCC#3 <= 1659)) THEN Noon WHEN ((TIME OCC#3 >= 1700) AND (TIME OCC#3 <= 2059)) THEN Afternoon ELSE Night END AS time_group#2994]
                  +- Filter (isnotnull(Premis_Desc#339) AND (Premis_Desc#339 = STREET))
                     +- InMemoryTa

                                                                                

1.5946252346038818

In [26]:
query_2_rdd()

                                                                                

Night: 237605
Afternoon: 187306
Noon: 148180
Morning: 123846


27.94545888900757

# Query 3 on 4 Executors

In [27]:
query_3()



+------------+----+
|Vict Descent|   #|
+------------+----+
|           H|1556|
|           B|1092|
|           W|1002|
|           O| 484|
|           A| 116|
|           K|   7|
|           J|   3|
|           I|   3|
|           C|   2|
|           F|   1|
+------------+----+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [##6037L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(##6037L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [plan_id=1841]
      +- HashAggregate(keys=[Vict Descent#13], functions=[count(1)])
         +- Exchange hashpartitioning(Vict Descent#13, 200), ENSURE_REQUIREMENTS, [plan_id=1838]
            +- HashAggregate(keys=[Vict Descent#13], functions=[partial_count(1)])
               +- Project [Vict Descent#13]
                  +- BroadcastHashJoin [Zip Code#4292], [Zip Code#62], Inner, BuildRight, false
                     :- Project [Vict Descent#13, cast(ZIPcode#58 as int) AS Zip Code#4292]
                     :  +- BroadcastHas

                                                                                

11.564699172973633

In [18]:
for method in ['BROADCAST','MERGE', 'SHUFFLE_HASH', 'SHUFFLE_REPLICATE_NL']:
    query_3(method)

                                                                                

+------------+----+
|Vict Descent|   #|
+------------+----+
|           H|1556|
|           B|1092|
|           W|1002|
|           O| 484|
|           A| 116|
|           K|   7|
|           I|   3|
|           J|   3|
|           C|   2|
|           F|   1|
+------------+----+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [##726L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(##726L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [plan_id=1004]
      +- HashAggregate(keys=[Vict Descent#13], functions=[count(1)])
         +- Exchange hashpartitioning(Vict Descent#13, 200), ENSURE_REQUIREMENTS, [plan_id=1001]
            +- HashAggregate(keys=[Vict Descent#13], functions=[partial_count(1)])
               +- Project [Vict Descent#13]
                  +- BroadcastHashJoin [Zip Code#431], [Zip Code#62], Inner, BuildRight, false
                     :- Project [Vict Descent#13, cast(ZIPcode#58 as int) AS Zip Code#431]
                     :  +- BroadcastHashJoi

                                                                                

+------------+----+
|Vict Descent|   #|
+------------+----+
|           H|1556|
|           B|1092|
|           W|1002|
|           O| 484|
|           A| 116|
|           K|   7|
|           J|   3|
|           I|   3|
|           C|   2|
|           F|   1|
+------------+----+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [##1095L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(##1095L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [plan_id=2379]
      +- HashAggregate(keys=[Vict Descent#13], functions=[count(1)])
         +- Exchange hashpartitioning(Vict Descent#13, 200), ENSURE_REQUIREMENTS, [plan_id=2376]
            +- HashAggregate(keys=[Vict Descent#13], functions=[partial_count(1)])
               +- Project [Vict Descent#13]
                  +- BroadcastHashJoin [Zip Code#801], [Zip Code#62], Inner, BuildRight, false
                     :- Project [Vict Descent#13, cast(ZIPcode#58 as int) AS Zip Code#801]
                     :  +- SortMergeJoin 

                                                                                

+------------+----+
|Vict Descent|   #|
+------------+----+
|           H|1556|
|           B|1092|
|           W|1002|
|           O| 484|
|           A| 116|
|           K|   7|
|           J|   3|
|           I|   3|
|           C|   2|
|           F|   1|
+------------+----+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [##1464L DESC NULLS LAST], true, 0
   +- Exchange rangepartitioning(##1464L DESC NULLS LAST, 200), ENSURE_REQUIREMENTS, [plan_id=3426]
      +- HashAggregate(keys=[Vict Descent#13], functions=[count(1)])
         +- Exchange hashpartitioning(Vict Descent#13, 200), ENSURE_REQUIREMENTS, [plan_id=3423]
            +- HashAggregate(keys=[Vict Descent#13], functions=[partial_count(1)])
               +- Project [Vict Descent#13]
                  +- BroadcastHashJoin [Zip Code#1170], [Zip Code#62], Inner, BuildRight, false
                     :- Project [Vict Descent#13, cast(ZIPcode#58 as int) AS Zip Code#1170]
                     :  +- ShuffledHash

ERROR:root:KeyboardInterrupt while sending command.                 (0 + 0) / 6]
Traceback (most recent call last):
  File "/home/user/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/user/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

[Stage 69:>                 (0 + 4) / 6][Stage 70:>                 (0 + 0) / 6]

# Query 4 on 4 Executors

In [29]:
query_4_1a()



+----+------------------+----+
|year|  average_distance|   #|
+----+------------------+----+
|2010|4.3255933001101114|8162|
|2011|2.7909872168227423|7225|
|2012| 37.45827620685533|6539|
|2013| 2.830553808457538|5851|
|2014|11.043993584711998|4559|
|2015| 2.706546019966876|6729|
|2016| 2.718165310899851|8094|
|2017|4.3382539597541765|7781|
|2018|2.7360981635514983|7414|
|2019| 2.741344160752832|7135|
|2020|2.3272432584812806|  46|
|2021|37.382419651864836|2553|
|2022|2.9681689538445584|  44|
|2023| 3.687948331136995|   8|
+----+------------------+----+

Method : CONTINUE | Time 5.425380706787109


                                                                                

5.425380706787109

In [30]:
query_4_1b()



+-----------+------------------+-----+
|   division|  average_distance|    #|
+-----------+------------------+-----+
|77th Street| 6.913894657106163|68326|
|  Southeast|12.025486164617545|57011|
|  Southwest| 6.551382718135036|54271|
|     Newton|5.2361202832288685|43355|
|    Central| 3.166545740156361|40488|
|    Rampart| 7.047477633659231|38908|
|    Olympic|16.976357519281617|37199|
|  Hollywood| 6.751386163782782|32995|
|    Mission|20.666096061806797|32334|
| Hollenbeck|12.702801855459152|29901|
|    Pacific|12.047038740457603|29074|
|N Hollywood|12.204292484887183|28702|
|     Harbor| 7.460827048756396|28540|
|   Foothill|14.305736282276778|27415|
|  Northeast| 9.093017340351224|27230|
|   Wilshire| 7.054111333900138|26772|
|   Van Nuys|  8.87387830707935|26089|
|    Topanga| 8.057323863764594|24822|
| Devonshire|13.826966000368188|23013|
|West Valley|10.031539994619983|22791|
+-----------+------------------+-----+
only showing top 20 rows

Method : CONTINUE | Time 2.42521405220

                                                                                

2.4252140522003174

In [31]:
query_4_2a()



+----+------------------+-----+
|year|  average_distance|    #|
+----+------------------+-----+
|2010|3.9757578208226914| 8162|
|2011|2.4590852830727155| 7225|
|2012| 37.10529402034977| 6539|
|2013| 2.459919591596096| 5851|
|2014|10.659354948051488| 4559|
|2015|2.3889866668876283| 6729|
|2016|2.4268077758998485| 8094|
|2017| 4.006458967994438| 7781|
|2018|2.4123576588935918| 7414|
|2019|2.4311257325392552| 7135|
|2020| 8.300434266099332| 8496|
|2021|  32.0668887686691|17410|
|2022|2.3181733400905924|10139|
|2023|2.2683855904462207| 8955|
+----+------------------+-----+

Method : CONTINUE | Time 95.02895522117615


                                                                                

95.02895522117615

In [32]:
query_4_2b()



+-----------+------------------+-----+
|   division|  average_distance|    #|
+-----------+------------------+-----+
|77th Street|12.567122296795125|94853|
|  Southeast|25.601298738797524|87905|
|  Southwest| 9.329186662576602|72814|
|    Central|23.269811961478677|63606|
|     Newton|13.433184721292792|61408|
|    Olympic|   36.348197450529|60925|
|    Rampart|19.629568008429306|55881|
|  Hollywood|27.671748021149348|51255|
|    Mission| 34.80052558861396|48956|
|    Pacific|24.825461959674055|43019|
|   Foothill|29.797575113580432|41625|
| Hollenbeck| 19.52127635437576|41540|
|N Hollywood| 17.69148313156268|41151|
|     Harbor|13.982027989409763|40854|
|    Topanga| 6.096105827278324|39337|
|   Wilshire|15.736618712905988|37930|
|  Northeast|12.367411075059353|37334|
| Devonshire|23.046036251700617|36902|
|   Van Nuys|19.788057065178123|36264|
|West Valley| 14.71804024306935|34005|
+-----------+------------------+-----+
only showing top 20 rows

Method : CONTINUE | Time 92.9777445793

                                                                                

92.97774457931519

In [33]:
for method in ['BROADCAST','MERGE', 'SHUFFLE_HASH']:
    query_4_1a(method)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [AREA#22943, DR_NO#0, Date Rptd#194, DATE OCC#223, TIME OCC#3, AREA NAME#5, Rpt Dist No#6, Part 1-2#7, Crm Cd#8, Crm Cd Desc#9, Mocodes#10, Vict Age#252, Vict Sex#12, Vict Descent#13, Premis Cd#14, Premis Desc#15, Weapon Used Cd#16, Weapon Desc#17, Status#18, Status Desc#19, Crm Cd 1#20, Crm Cd 2#21, Crm Cd 3#22, Crm Cd 4#23, ... 10 more fields]
   +- BroadcastHashJoin [AREA#22943], [AREA#22936], Inner, BuildRight, false
      :- Project [DR_NO#0, Date Rptd#194, DATE OCC#223, TIME OCC#3, AREA #4 AS AREA#22943, AREA NAME#5, Rpt Dist No#6, Part 1-2#7, Crm Cd#8, Crm Cd Desc#9, Mocodes#10, Vict Age#252, Vict Sex#12, Vict Descent#13, Premis Cd#14, Premis Desc#15, Weapon Used Cd#16, Weapon Desc#17, Status#18, Status Desc#19, Crm Cd 1#20, Crm Cd 2#21, Crm Cd 3#22, Crm Cd 4#23, ... 5 more fields]
      :  +- Filter isnotnull(AREA #4)
      :     +- InMemoryTableScan [AREA #4, AREA NAME#5, Crm Cd#8, Crm Cd 1#20, Crm Cd 2#21, Crm

                                                                                

+----+------------------+----+
|year|  average_distance|   #|
+----+------------------+----+
|2010|4.3255933001101114|8162|
|2011|2.7909872168227423|7225|
|2012| 37.45827620685533|6539|
|2013| 2.830553808457538|5851|
|2014|11.043993584711998|4559|
|2015| 2.706546019966876|6729|
|2016| 2.718165310899851|8094|
|2017|4.3382539597541765|7781|
|2018|2.7360981635514983|7414|
|2019| 2.741344160752832|7135|
|2020|2.3272432584812806|  46|
|2021|37.382419651864836|2553|
|2022|2.9681689538445584|  44|
|2023| 3.687948331136995|   8|
+----+------------------+----+

Method : BROADCAST | Time 1.3421380519866943
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [AREA#24349, DR_NO#0, Date Rptd#194, DATE OCC#223, TIME OCC#3, AREA NAME#5, Rpt Dist No#6, Part 1-2#7, Crm Cd#8, Crm Cd Desc#9, Mocodes#10, Vict Age#252, Vict Sex#12, Vict Descent#13, Premis Cd#14, Premis Desc#15, Weapon Used Cd#16, Weapon Desc#17, Status#18, Status Desc#19, Crm Cd 1#20, Crm Cd 2#21, Crm Cd 3#22, Crm Cd 4#23, .

                                                                                

+----+------------------+----+
|year|  average_distance|   #|
+----+------------------+----+
|2010| 4.325593300110116|8162|
|2011|2.7909872168227494|7225|
|2012| 37.45827620685538|6539|
|2013| 2.830553808457538|5851|
|2014|11.043993584711966|4559|
|2015|2.7065460199668823|6729|
|2016| 2.718165310899849|8094|
|2017| 4.338253959754184|7781|
|2018|2.7360981635515005|7414|
|2019|2.7413441607528366|7135|
|2020|  2.32724325848128|  46|
|2021| 37.38241965186492|2553|
|2022|2.9681689538445593|  44|
|2023|3.6879483311369956|   8|
+----+------------------+----+

Method : MERGE | Time 1.4247801303863525
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [AREA#25756, DR_NO#0, Date Rptd#194, DATE OCC#223, TIME OCC#3, AREA NAME#5, Rpt Dist No#6, Part 1-2#7, Crm Cd#8, Crm Cd Desc#9, Mocodes#10, Vict Age#252, Vict Sex#12, Vict Descent#13, Premis Cd#14, Premis Desc#15, Weapon Used Cd#16, Weapon Desc#17, Status#18, Status Desc#19, Crm Cd 1#20, Crm Cd 2#21, Crm Cd 3#22, Crm Cd 4#23, ... 1

                                                                                

In [34]:
for method in ['BROADCAST','MERGE', 'SHUFFLE_HASH']:
    query_4_1b(method)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [AREA#27163, DR_NO#0, Date Rptd#194, DATE OCC#223, TIME OCC#3, AREA NAME#5, Rpt Dist No#6, Part 1-2#7, Crm Cd#8, Crm Cd Desc#9, Mocodes#10, Vict Age#252, Vict Sex#12, Vict Descent#13, Premis Cd#14, Premis Desc#15, Weapon Used Cd#16, Weapon Desc#17, Status#18, Status Desc#19, Crm Cd 1#20, Crm Cd 2#21, Crm Cd 3#22, Crm Cd 4#23, ... 10 more fields]
   +- BroadcastHashJoin [AREA#27163], [AREA#27156], Inner, BuildRight, false
      :- Project [DR_NO#0, Date Rptd#194, DATE OCC#223, TIME OCC#3, AREA #4 AS AREA#27163, AREA NAME#5, Rpt Dist No#6, Part 1-2#7, Crm Cd#8, Crm Cd Desc#9, Mocodes#10, Vict Age#252, Vict Sex#12, Vict Descent#13, Premis Cd#14, Premis Desc#15, Weapon Used Cd#16, Weapon Desc#17, Status#18, Status Desc#19, Crm Cd 1#20, Crm Cd 2#21, Crm Cd 3#22, Crm Cd 4#23, ... 5 more fields]
      :  +- Filter isnotnull(AREA #4)
      :     +- InMemoryTableScan [AREA #4, AREA NAME#5, Crm Cd#8, Crm Cd 1#20, Crm Cd 2#21, Crm

                                                                                

+-----------+------------------+-----+
|   division|  average_distance|    #|
+-----------+------------------+-----+
|77th Street| 6.913894657106162|68326|
|  Southeast|12.025486164617545|57011|
|  Southwest| 6.551382718135037|54271|
|     Newton| 5.236120283228869|43355|
|    Central|3.1665457401563613|40488|
|    Rampart| 7.047477633659229|38908|
|    Olympic| 16.97635751928162|37199|
|  Hollywood| 6.751386163782781|32995|
|    Mission|20.666096061806797|32334|
| Hollenbeck| 12.70280185545915|29901|
|    Pacific|12.047038740457603|29074|
|N Hollywood|12.204292484887182|28702|
|     Harbor| 7.460827048756397|28540|
|   Foothill|14.305736282276778|27415|
|  Northeast| 9.093017340351224|27230|
|   Wilshire| 7.054111333900138|26772|
|   Van Nuys|  8.87387830707935|26089|
|    Topanga| 8.057323863764594|24822|
| Devonshire|13.826966000368186|23013|
|West Valley|10.031539994619987|22791|
+-----------+------------------+-----+
only showing top 20 rows

Method : BROADCAST | Time 1.9325966835

                                                                                

+-----------+------------------+-----+
|   division|  average_distance|    #|
+-----------+------------------+-----+
|77th Street| 6.913894657106623|68326|
|  Southeast|12.025486164617854|57011|
|  Southwest| 6.551382718135446|54271|
|     Newton| 5.236120283228738|43355|
|    Central|3.1665457401567365|40488|
|    Rampart| 7.047477633659364|38908|
|    Olympic| 16.97635751928094|37199|
|  Hollywood| 6.751386163783306|32995|
|    Mission|20.666096061806485|32334|
| Hollenbeck| 12.70280185545926|29901|
|    Pacific|12.047038740456975|29074|
|N Hollywood|12.204292484887173|28702|
|     Harbor| 7.460827048756571|28540|
|   Foothill|14.305736282276577|27415|
|  Northeast| 9.093017340351318|27230|
|   Wilshire| 7.054111333900119|26772|
|   Van Nuys|  8.87387830707965|26089|
|    Topanga| 8.057323863764802|24822|
| Devonshire|13.826966000368254|23013|
|West Valley|10.031539994619937|22791|
+-----------+------------------+-----+
only showing top 20 rows

Method : MERGE | Time 2.58216118812561



+-----------+------------------+-----+
|   division|  average_distance|    #|
+-----------+------------------+-----+
|77th Street| 6.913894657106429|68326|
|  Southeast|12.025486164617966|57011|
|  Southwest| 6.551382718135404|54271|
|     Newton| 5.236120283228695|43355|
|    Central| 3.166545740156838|40488|
|    Rampart|7.0474776336593425|38908|
|    Olympic|  16.9763575192807|37199|
|  Hollywood| 6.751386163783392|32995|
|    Mission|20.666096061805494|32334|
| Hollenbeck|12.702801855459127|29901|
|    Pacific|12.047038740457138|29074|
|N Hollywood|12.204292484887345|28702|
|     Harbor| 7.460827048756491|28540|
|   Foothill|14.305736282276635|27415|
|  Northeast|  9.09301734035126|27230|
|   Wilshire| 7.054111333900124|26772|
|   Van Nuys| 8.873878307079364|26089|
|    Topanga| 8.057323863764784|24822|
| Devonshire|13.826966000368301|23013|
|West Valley|10.031539994620017|22791|
+-----------+------------------+-----+
only showing top 20 rows

Method : SHUFFLE_HASH | Time 3.8759887

                                                                                

In [25]:
for method in ['BROADCAST','SHUFFLE_REPLICATE_NL']:
    query_4_2a(method)

24/01/12 15:49:35 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- BroadcastNestedLoopJoin BuildRight, Cross
   :- Project [DR_NO#0, gettimestamp(Date Rptd#1, MM/dd/yyyy hh:mm:ss a, TimestampType, Some(Europe/Athens), false) AS Date Rptd#80, gettimestamp(DATE OCC#2, MM/dd/yyyy hh:mm:ss a, TimestampType, Some(Europe/Athens), false) AS DATE OCC#110, TIME OCC#3, AREA #4, AREA NAME#5, Rpt Dist No#6, Part 1-2#7, Crm Cd#8, Crm Cd Desc#9, Mocodes#10, Vict Age#11, Vict Sex#12, Vict Descent#13, Premis Cd#14, Premis Desc#15, Weapon Used Cd#16, Weapon Desc#17, Status#18, Status Desc#19, Crm Cd 1#20, Crm Cd 2#21, Crm Cd 3#22, Crm Cd 4#23, ... 5 more fields]
   :  +- FileScan parquet [DR_NO#0,Date Rptd#1,DATE OCC#2,TIME OCC#3,AREA #4,AREA NAME#5,Rpt Dist No#6,Part 1-2#7,Crm Cd#8,Crm Cd Desc#9,Mocodes#10,Vict Age#11,Vict Sex#12,Vict Descent#13,Premis Cd#14,Premis Desc#15,Weapon Used Cd#16,Weapon Desc#17,Status#18,Status Desc#19,Crm Cd 1#20,Crm Cd 2#21,Crm Cd 3#22,Crm Cd 4#23,... 4 more fields] Batched: true

                                                                                

+----+------------------+-----+
|year|  average_distance|    #|
+----+------------------+-----+
|2010|3.9757578208226914| 8162|
|2011|2.4590852830727146| 7225|
|2012|37.105294020349774| 6539|
|2013| 2.459919591596096| 5851|
|2014|10.659354948051488| 4559|
|2015| 2.388986666887628| 6729|
|2016|2.4268077758998485| 8094|
|2017| 4.006458967994438| 7781|
|2018|2.4123576588935918| 7414|
|2019|2.4311257325392552| 7135|
|2020| 8.300434266099332| 8496|
|2021|  32.0668887686691|17410|
|2022|2.3181733400905924|10139|
|2023| 2.268385590446221| 8955|
+----+------------------+-----+

Method : BROADCAST | Time 113.53692102432251
== Physical Plan ==
CartesianProduct
:- *(1) Project [DR_NO#0, gettimestamp(Date Rptd#1, MM/dd/yyyy hh:mm:ss a, TimestampType, Some(Europe/Athens), false) AS Date Rptd#80, gettimestamp(DATE OCC#2, MM/dd/yyyy hh:mm:ss a, TimestampType, Some(Europe/Athens), false) AS DATE OCC#110, TIME OCC#3, AREA #4, AREA NAME#5, Rpt Dist No#6, Part 1-2#7, Crm Cd#8, Crm Cd Desc#9, Mocodes#10, 



+----+------------------+-----+
|year|  average_distance|    #|
+----+------------------+-----+
|2010|3.9757578208226914| 8162|
|2011|2.4590852830727146| 7225|
|2012|37.105294020349774| 6539|
|2013| 2.459919591596096| 5851|
|2014|10.659354948051488| 4559|
|2015| 2.388986666887628| 6729|
|2016|2.4268077758998485| 8094|
|2017| 4.006458967994438| 7781|
|2018|2.4123576588935918| 7414|
|2019|2.4311257325392552| 7135|
|2020| 8.300434266099332| 8496|
|2021|  32.0668887686691|17410|
|2022|2.3181733400905924|10139|
|2023| 2.268385590446221| 8955|
+----+------------------+-----+

Method : SHUFFLE_REPLICATE_NL | Time 94.36919569969177


----------------------------------------                                        
Exception occurred during processing of request from ('127.0.0.1', 44218)
Traceback (most recent call last):
  File "/usr/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/home/user/opt/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/home/user/opt/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
  File "/home/user/opt/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates = read_

In [None]:
for method in ['BROADCAST','MERGE', 'SHUFFLE_HASH']:
    query_4_2b(method)

In [None]:
spark.stop()