In [1]:
# Pyspark Imports
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import *
from pyspark.sql.functions import to_date
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import geopy.distance
import time 
import math

In [2]:
# initialize sparkSession, make the data from csv to parquet,
spark = SparkSession \
    .builder \
    .appName("4 Executors") \
    .config("spark.driver.cores", "1") \
    .config("spark.driver.memory", "1g") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.cores", "1") \
    .config("spark.executor.memory", "1g") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/08 20:57:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/01/08 20:57:27 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [3]:
# load data into memory, do the necessary joins etc. here
crime_data = spark.read.parquet("hdfs://okeanos-master:54310/parquet/crime_data_*.parquet")
revge = spark.read.parquet("hdfs://okeanos-master:54310/parquet/revgecoding.parquet")
# only 2015 income data needed
income = spark.read \
            .parquet("hdfs://okeanos-master:54310/parquet/income/LA_income_2015.parquet")
lapd_stations = spark.read.parquet("hdfs://okeanos-master:54310/parquet/LAPD_Police_Stations.parquet")

                                                                                

In [4]:
crime_data = crime_data.withColumn("Date Rptd", to_timestamp("Date Rptd", 'MM/dd/yyyy hh:mm:ss a')) \
    .withColumn("DATE OCC", to_timestamp("DATE OCC", 'MM/dd/yyyy hh:mm:ss a')) \
    .withColumn("Vict Age", col("Vict Age").cast("int")) \
    .withColumn("LAT", col("LAT").cast("double")) \
    .withColumn("LON", col("LON").cast("double")) \
    .withColumn("Premis_Desc", col("Premis Desc"))

In [5]:
# calculate distance on a sphere (as earth is not flat)
def haversine(lon1, lat1, lon2, lat2):
    R = 6371
    dLat = math.radians(lat2 - lat1)
    dLon = math.radians(lon2 - lon1)
    a = math.sin(dLat / 2) * math.sin(dLat / 2) + \
        math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dLon / 2) * math.sin(dLon / 2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c
    return distance

haversine_udf = udf(haversine, DoubleType())

In [13]:
def query_4_2a(method = 'CONTINUE'):
    start_time = time.time()

    crime_data_filtered = crime_data.withColumn('Weapon Used Cd', col('Weapon Used Cd').cast('int')) \
                                    .filter(col('Weapon Used Cd') < 200) \
                                    .withColumn('year', F.year('Date Rptd'))
    crime_data_filtered.count()

    if method == 'BROADCAST':
        combined_data = crime_data_filtered.hint(method).crossJoin(broadcast(lapd_stations))
        #combined_data.explain()
    elif method in ['MERGE', 'SHUFFLE_HASH', 'SHUFFLE_REPLICATE_NL']:
        combined_data = crime_data_filtered.hint(method).crossJoin(lapd_stations)
        #combined_data.explain()
    elif method == 'CONTINUE':
        combined_data = crime_data_filtered.crossJoin(lapd_stations)
    else:
        return None
        
    combined_data = combined_data.withColumn("closest_distance", haversine_udf(col("LON"), col("LAT"), col("X"), col("Y")))
    
    windowSpec = Window.partitionBy("DR_NO").orderBy("closest_distance")
    closest_stations = combined_data.withColumn("rank", rank().over(windowSpec)).filter(col("rank") == 1)
    
    final_data = closest_stations.select(col("DR_NO"), col("DIVISION").alias("closest_station"), col("closest_distance"))
    
    joined_with_stations = crime_data_filtered.join(final_data, "DR_NO")
    
    result = joined_with_stations.groupBy('year') \
        .agg((F.sum('closest_distance') / F.count('*')).alias('average_distance'),
            F.count('*').alias('#')) \
       .orderBy(F.col('year'))

    
    result.show()
    end_time = time.time()
    result.explain()
    print(f'Method : {method} | Time {end_time - start_time}')
    return end_time - start_time

In [7]:
for method in ['BROADCAST','MERGE', 'SHUFFLE_HASH', 'SHUFFLE_REPLICATE_NL']:
    query_4_2a(method)

                                                                                

+----+------------------+-----+
|year|  average_distance|    #|
+----+------------------+-----+
|2010| 3.975757820822703| 8162|
|2011|2.4590852830727146| 7225|
|2012|37.105294020349675| 6539|
|2013| 2.459919591596103| 5851|
|2014|10.659354948051519| 4559|
|2015|2.3889866668876247| 6729|
|2016| 2.426807775899854| 8094|
|2017| 4.006458967994435| 7781|
|2018|2.4123576588935935| 7414|
|2019| 2.431125732539258| 7135|
|2020| 8.300434266099346| 8496|
|2021| 32.06688876866874|17410|
|2022|2.3181733400905906|10139|
|2023|2.2683855904462287| 8955|
+----+------------------+-----+

Method : BROADCAST | Time 21.68978476524353


24/01/08 21:04:21 WARN HintErrorLogger: Hint (strategy=merge) is not supported in the query: no equi-join keys.
                                                                                

+----+------------------+-----+
|year|  average_distance|    #|
+----+------------------+-----+
|2010| 3.975757820822703| 8162|
|2011|2.4590852830727146| 7225|
|2012|37.105294020349675| 6539|
|2013| 2.459919591596103| 5851|
|2014|10.659354948051519| 4559|
|2015|2.3889866668876247| 6729|
|2016| 2.426807775899854| 8094|
|2017| 4.006458967994435| 7781|
|2018|2.4123576588935935| 7414|
|2019| 2.431125732539258| 7135|
|2020| 8.300434266099346| 8496|
|2021| 32.06688876866874|17410|
|2022|2.3181733400905906|10139|
|2023|2.2683855904462287| 8955|
+----+------------------+-----+

Method : MERGE | Time 9.586205005645752


24/01/08 21:04:31 WARN HintErrorLogger: Hint (strategy=shuffle_hash) is not supported in the query: no equi-join keys.
                                                                                

+----+------------------+-----+
|year|  average_distance|    #|
+----+------------------+-----+
|2010| 3.975757820822703| 8162|
|2011|2.4590852830727146| 7225|
|2012|37.105294020349675| 6539|
|2013| 2.459919591596103| 5851|
|2014|10.659354948051519| 4559|
|2015|2.3889866668876247| 6729|
|2016| 2.426807775899854| 8094|
|2017| 4.006458967994435| 7781|
|2018|2.4123576588935935| 7414|
|2019| 2.431125732539258| 7135|
|2020| 8.300434266099346| 8496|
|2021| 32.06688876866874|17410|
|2022|2.3181733400905906|10139|
|2023|2.2683855904462287| 8955|
+----+------------------+-----+

Method : SHUFFLE_HASH | Time 7.744536876678467


In [8]:
# 2b)
def query_4_2b(method = 'CONTINUE'):
    start_time = time.time()

    crime_data_filtered = crime_data.withColumn('Weapon Used Cd', col('Weapon Used Cd').cast('int')) \
                                  .filter(F.col('Weapon Used Cd').isNotNull()) \
                                  .withColumn('year', F.year('Date Rptd'))
    
    if method == 'BROADCAST':
        combined_data = crime_data_filtered.hint(method).crossJoin(broadcast(lapd_stations))
    elif method in ['MERGE', 'SHUFFLE_HASH', 'SHUFFLE_REPLICATE_NL']:
        combined_data = crime_data_filtered.hint(method).crossJoin(lapd_stations)    
    elif method == 'CONTINUE':
        combined_data = crime_data_filtered.crossJoin(lapd_stations)
    else:
        return None

    
    combined_data = combined_data.withColumn("closest_distance", haversine_udf(col("LON"), col("LAT"), col("X"), col("Y")))
    
    windowSpec = Window.partitionBy("DR_NO").orderBy("closest_distance")
    closest_stations = combined_data.withColumn("rank", rank().over(windowSpec)).filter(col("rank") == 1)
    
    final_data = closest_stations.select(col("DR_NO"), col("DIVISION").alias("closest_station"), col("closest_distance"))
    
    joined_result = crime_data_filtered.join(final_data, "DR_NO")
   # crime_data_join_stations = result.withColumn('Weapon Used Cd', col('Weapon Used Cd').cast('int')) \
   #                               .filter(F.col('Weapon Used Cd').isNotNull()) \
   #                                 .withColumn('year', F.year('Date Rptd'))
    
    result = joined_result.groupBy('AREA NAME') \
        .agg(
            (F.sum('closest_distance') / F.count('*')).alias('average_distance'),
            F.count('*').alias('#')
        ) \
        .orderBy(F.col('#').desc()) \
        .withColumnRenamed('AREA NAME', 'division')
    
    result.show()
    end_time = time.time()
    result.explain()
    print(f'Method : {method} | Time {end_time - start_time}')
    return end_time - start_time

In [9]:
for method in ['BROADCAST','MERGE', 'SHUFFLE_HASH', 'SHUFFLE_REPLICATE_NL']:
    query_4_2b(method)

                                                                                

+-----------+------------------+-----+
|   division|  average_distance|    #|
+-----------+------------------+-----+
|77th Street|  12.5671222967951|94853|
|  Southeast| 25.60129873879751|87905|
|  Southwest|   9.3291866625766|72814|
|    Central|23.269811961478677|63606|
|     Newton| 13.43318472129279|61408|
|    Olympic|36.348197450528986|60925|
|    Rampart|19.629568008429306|55881|
|  Hollywood|27.671748021149362|51255|
|    Mission|34.800525588613944|48956|
|    Pacific| 24.82546195967402|43019|
|   Foothill|29.797575113580425|41625|
| Hollenbeck| 19.52127635437576|41540|
|N Hollywood|17.691483131562677|41151|
|     Harbor|13.982027989409769|40854|
|    Topanga|6.0961058272783255|39337|
|   Wilshire|15.736618712905988|37930|
|  Northeast|12.367411075059353|37334|
| Devonshire|23.046036251700613|36902|
|   Van Nuys| 19.78805706517812|36264|
|West Valley|14.718040243069364|34005|
+-----------+------------------+-----+
only showing top 20 rows

Method : BROADCAST | Time 36.237014055

24/01/08 21:08:10 WARN HintErrorLogger: Hint (strategy=merge) is not supported in the query: no equi-join keys.
                                                                                

+-----------+------------------+-----+
|   division|  average_distance|    #|
+-----------+------------------+-----+
|77th Street|  12.5671222967951|94853|
|  Southeast| 25.60129873879751|87905|
|  Southwest|   9.3291866625766|72814|
|    Central|23.269811961478677|63606|
|     Newton| 13.43318472129279|61408|
|    Olympic|36.348197450528986|60925|
|    Rampart|19.629568008429306|55881|
|  Hollywood|27.671748021149362|51255|
|    Mission|34.800525588613944|48956|
|    Pacific| 24.82546195967402|43019|
|   Foothill|29.797575113580425|41625|
| Hollenbeck| 19.52127635437576|41540|
|N Hollywood|17.691483131562677|41151|
|     Harbor|13.982027989409769|40854|
|    Topanga|6.0961058272783255|39337|
|   Wilshire|15.736618712905988|37930|
|  Northeast|12.367411075059353|37334|
| Devonshire|23.046036251700613|36902|
|   Van Nuys| 19.78805706517812|36264|
|West Valley|14.718040243069364|34005|
+-----------+------------------+-----+
only showing top 20 rows

Method : MERGE | Time 34.7565786838531

24/01/08 21:08:45 WARN HintErrorLogger: Hint (strategy=shuffle_hash) is not supported in the query: no equi-join keys.

+-----------+------------------+-----+
|   division|  average_distance|    #|
+-----------+------------------+-----+
|77th Street|  12.5671222967951|94853|
|  Southeast| 25.60129873879751|87905|
|  Southwest|   9.3291866625766|72814|
|    Central|23.269811961478677|63606|
|     Newton|13.433184721292792|61408|
|    Olympic|36.348197450528986|60925|
|    Rampart|19.629568008429306|55881|
|  Hollywood|27.671748021149362|51255|
|    Mission|34.800525588613944|48956|
|    Pacific| 24.82546195967402|43019|
|   Foothill|29.797575113580432|41625|
| Hollenbeck| 19.52127635437576|41540|
|N Hollywood|17.691483131562677|41151|
|     Harbor|13.982027989409769|40854|
|    Topanga|6.0961058272783255|39337|
|   Wilshire|15.736618712905988|37930|
|  Northeast|12.367411075059353|37334|
| Devonshire|23.046036251700617|36902|
|   Van Nuys| 19.78805706517812|36264|
|West Valley|14.718040243069364|34005|
+-----------+------------------+-----+
only showing top 20 rows

Method : SHUFFLE_HASH | Time 33.326755

                                                                                