In [1]:
# Pyspark Imports
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import *
from pyspark.sql.functions import to_date
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import geopy.distance
import time 
import math

In [2]:
# initialize sparkSession, make the data from csv to parquet,
spark = SparkSession \
    .builder \
    .appName("4 Executors") \
    .config("spark.driver.cores", "1") \
    .config("spark.driver.memory", "1g") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.cores", "1") \
    .config("spark.executor.memory", "1g") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/29 01:22:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/29 01:22:13 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [3]:
# load data into memory, do the necessary joins etc. here
crime_data = spark.read.parquet("hdfs://okeanos-master:54310/parquet/crime_data_*.parquet")
revge = spark.read.parquet("hdfs://okeanos-master:54310/parquet/revgecoding.parquet")
# only 2015 income data needed
income = spark.read \
            .parquet("hdfs://okeanos-master:54310/parquet/income/LA_income_2015.parquet")
lapd_stations = spark.read.parquet("hdfs://okeanos-master:54310/parquet/LAPD_Police_Stations.parquet")

                                                                                

In [4]:
crime_data = crime_data.withColumn("Date Rptd", to_timestamp("Date Rptd", 'MM/dd/yyyy hh:mm:ss a')) \
    .withColumn("DATE OCC", to_timestamp("DATE OCC", 'MM/dd/yyyy hh:mm:ss a')) \
    .withColumn("Vict Age", col("Vict Age").cast("int")) \
    .withColumn("LAT", col("LAT").cast("double")) \
    .withColumn("LON", col("LON").cast("double")) \
    .withColumn("Premis_Desc", col("Premis Desc"))

In [5]:
# calculate distance on a sphere (as earth is not flat)
def haversine(lon1, lat1, lon2, lat2):
    R = 6371
    dLat = math.radians(lat2 - lat1)
    dLon = math.radians(lon2 - lon1)
    a = math.sin(dLat / 2) * math.sin(dLat / 2) + \
        math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dLon / 2) * math.sin(dLon / 2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c
    return distance

haversine_udf = udf(haversine, DoubleType())

In [6]:
# write code for 4th query here
# 1a)
def query_4_1a(method = 'CONTINUE'):
    start_time = time.time()
    lapd_stations_new = lapd_stations.withColumnRenamed('PREC','AREA')

    if method == 'BROADCAST':
        crime_data_join_stations = crime_data.withColumnRenamed('AREA ', 'AREA') \
                                    .join(broadcast(lapd_stations_new), 'AREA', 'inner')
    elif method in ['MERGE', 'SHUFFLE_HASH', 'SHUFFLE_REPLICATE_NL']:
        crime_data_join_stations = crime_data.withColumnRenamed('AREA ', 'AREA') \
                                    .hint(method).join(lapd_stations_new, 'AREA', 'inner')
    elif method == 'CONTINUE':
        crime_data_join_stations = crime_data.withColumnRenamed('AREA ', 'AREA') \
                                    .join(lapd_stations_new, 'AREA', 'inner')
    else:
        return None
    
    crime_data_join_stations = crime_data_join_stations.withColumn('distance',
                                    haversine_udf(col("LON"), col("LAT"), col("X"), col("Y")))
    
    crime_data_join_stations = crime_data_join_stations.withColumn('Weapon Used Cd', col('Weapon Used Cd').cast('int')) \
                                    .filter(col('Weapon Used Cd') < 200) \
                                    .withColumn('year', F.year('Date Rptd'))
    
    result = crime_data_join_stations.groupBy('year') \
        .agg((F.sum('distance') / F.count('*')).alias('average_distance'),
            F.count('*').alias('#')) \
       .orderBy(F.col('year'))
    
    result.show()
    end_time = time.time()
    result.explain()
    print(f'Method : {method} | Time {end_time - start_time}')
    return end_time - start_time

In [7]:
# 1b)
def query_4_1b(method = 'CONTINUE'):
    start_time = time.time()
    lapd_stations_new = lapd_stations.withColumnRenamed('PREC','AREA')
    
    if method == 'BROADCAST':
        crime_data_join_stations = crime_data.withColumnRenamed('AREA ', 'AREA') \
                                    .join(broadcast(lapd_stations_new), 'AREA', 'inner')
    elif method in ['MERGE', 'SHUFFLE_HASH', 'SHUFFLE_REPLICATE_NL']:
        crime_data_join_stations = crime_data.withColumnRenamed('AREA ', 'AREA') \
                                    .hint(method).join(lapd_stations_new, 'AREA', 'inner')
    elif method == 'CONTINUE':
        crime_data_join_stations = crime_data.withColumnRenamed('AREA ', 'AREA') \
                                    .join(lapd_stations_new, 'AREA', 'inner')
    else:
        return None
    
    # distance of LAT and LON using Spark functions
    crime_data_join_stations = crime_data_join_stations.withColumn('distance',
                                    haversine_udf(col("LON"), col("LAT"), col("X"), col("Y"))) # Earths's radius
    
    crime_data_join_stations = crime_data_join_stations.withColumn('Weapon Used Cd', col('Weapon Used Cd').cast('int')) \
                                    .filter(F.col('Weapon Used Cd').isNotNull()) \
                                    .withColumn('year', F.year('Date Rptd'))
    
    result = crime_data_join_stations.groupBy('AREA NAME') \
        .agg(
            (F.sum('distance') / F.count('*')).alias('average_distance'),
            F.count('*').alias('#')
        ) \
        .orderBy(F.col('#').desc()) \
        .withColumnRenamed('AREA NAME', 'division')
    
    result.show()
    end_time = time.time()
    result.explain()
    print(f'Method : {method} | Time {end_time - start_time}')
    return end_time - start_time

In [8]:
query_4_1a('MERGE')

                                                                                

+----+------------------+----+
|year|  average_distance|   #|
+----+------------------+----+
|2010|11267.275001828906|8162|
|2011|11201.857685426628|7225|
|2012|11240.673360849738|6539|
|2013|11021.787174628653|5851|
|2014|10812.255951708757|5932|
|2015| 11007.17077223872|6766|
|2016|10992.021316171948|8094|
|2017|11014.024161391715|7784|
|2018|10914.353043897538|7415|
|2019|11296.989281965622|7135|
|2020|11366.081363090094|  47|
|2021| 8905.090912770953|   8|
|2022|11385.108009854534|   8|
|2023|10138.925377481022|   6|
+----+------------------+----+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [year#401 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(year#401 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=351]
      +- HashAggregate(keys=[year#401], functions=[sum(distance#329), count(1)])
         +- Exchange hashpartitioning(year#401, 200), ENSURE_REQUIREMENTS, [plan_id=348]
            +- HashAggregate(keys=[year#401], functions=[partial_sum(di

29.053653717041016