#### Useful Links 
- Spark History Server : http://83.212.73.248:18080/
- Hadoop YARN (scheduler) : http://83.212.73.248:8088/cluster
- HDFS : http://83.212.73.248:9870/dfshealth.html#tab-overview

#### Useful Commands : 
- Connect to okeanos-master (from local) : `$ ssh user@snf-40202.ok-kno.grnetcloud.net `
    - Password : 'Rand0m'
- Connect to okeanos-worker (from okeanos-master) : `$ ssh okeanos-worker`
- Open Jupyter Notebook : `$ jupyter notebook --ip 83.212.73.248 --port 8888`

#### Things to do :
- Convert the data from CSV to Parquet
- Cast the columns to the types we want
- Write the Queries (!)
- Benchmark and optimize them etc.
- Balance the data onto HDFS across the two datanodes

### Full HDFS path is here : hdfs://okeanos-master:54310/csv_data/
and contains :  
     
     1.  hdfs://okeanos-master:54310/csv_data/LAPD_Police_Stations.csv
     2.  hdfs://okeanos-master:54310/csv_data/crime_data_2019.csv 
     3.  hdfs://okeanos-master:54310/csv_data/crime_data_2023.csv
     4.  hdfs://okeanos-master:54310/csv_data/revgecoding.csv 
     5.  hdfs://okeanos-master:54310/csv_data/income/
         1. LA_income_2015.csv
         2. LA_income_2017.csv
         3. LA_income_2019.csv
         4. LA_income_2021.csv

In [1]:
# Pyspark Imports
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import *
from pyspark.sql.functions import to_date
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import geopy.distance
import time 
import math

In [2]:
# Initialize the SparkSession used by every cell below.
# The executor count lives in one place so the application name (shown in the
# Spark History Server / YARN UI) always matches the number of executors
# actually requested. Previously the name said "2 Executors" while the
# configuration requested 4.
# NOTE(review): 2 is taken from the application name and the
# "Query 3 on 2 Executors" heading — confirm this is the intended count.
NUM_EXECUTORS = 2

spark = SparkSession \
    .builder \
    .appName(f"{NUM_EXECUTORS} Executors") \
    .config("spark.driver.cores", "1") \
    .config("spark.driver.memory", "1g") \
    .config("spark.executor.instances", str(NUM_EXECUTORS)) \
    .config("spark.executor.cores", "1") \
    .config("spark.executor.memory", "1g") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/29 01:19:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/29 01:19:53 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [3]:
# Create DataFrames over the Parquet datasets stored on HDFS.
# Any joins between them are done later, inside the queries.
lapd_stations = (
    spark.read
    .parquet("hdfs://okeanos-master:54310/parquet/LAPD_Police_Stations.parquet")
)

# Of the income files, only the 2015 one is read.
income = (
    spark.read
    .parquet("hdfs://okeanos-master:54310/parquet/income/LA_income_2015.parquet")
)

revge = (
    spark.read
    .parquet("hdfs://okeanos-master:54310/parquet/revgecoding.parquet")
)

# The glob picks up every crime_data_*.parquet dataset under /parquet.
crime_data = (
    spark.read
    .parquet("hdfs://okeanos-master:54310/parquet/crime_data_*.parquet")
)

                                                                                

In [4]:
# Give the crime columns usable types: parse both date columns as timestamps,
# make the victim age an int and the coordinates doubles, and copy
# "Premis Desc" into a column whose name has no space ("Premis_Desc").
_crime_ts_format = 'MM/dd/yyyy hh:mm:ss a'

crime_data = (
    crime_data
    .withColumn("Date Rptd", to_timestamp("Date Rptd", _crime_ts_format))
    .withColumn("DATE OCC", to_timestamp("DATE OCC", _crime_ts_format))
    .withColumn("Vict Age", col("Vict Age").cast("int"))
    .withColumn("LAT", col("LAT").cast("double"))
    .withColumn("LON", col("LON").cast("double"))
    .withColumn("Premis_Desc", col("Premis Desc"))
)

In [5]:
# Column type conversion for the crime data.
# NOTE(review): this applies exactly the same conversions as the cell
# directly above it — confirm whether both cells are needed.

# Parse the two date columns as timestamps.
for _date_column in ("Date Rptd", "DATE OCC"):
    crime_data = crime_data.withColumn(
        _date_column,
        to_timestamp(_date_column, 'MM/dd/yyyy hh:mm:ss a'),
    )

# Cast the numeric columns.
for _column_name, _column_type in (("Vict Age", "int"),
                                   ("LAT", "double"),
                                   ("LON", "double")):
    crime_data = crime_data.withColumn(
        _column_name,
        col(_column_name).cast(_column_type),
    )

# Copy "Premis Desc" into a column whose name has no space.
crime_data = crime_data.withColumn("Premis_Desc", col("Premis Desc"))

# 3rd Query :

        find the 3 zip codes with min and max household income
                    |
                    |
                    v
        // filter(remove) victimless crimes
                    |
                    |
                    v
        select vict_desc, COUNT(*) as count
        where year=2015
        group by vict_desc
        order by count DESC

In [6]:
# write code for 3rd query here
def query_3(method = 'CONTINUE'):
    """Run and time query 3 with the requested join strategy.

    Joins the crime data with the reverse-geocoding table (on LAT/LON) and
    the income table (on zip code), picks the three zip codes with the
    lowest and the three with the highest estimated median income, and
    shows, for crimes reported in 2015 in those zip codes, the number of
    crimes per victim descent in descending order.

    Reads the notebook-level DataFrames ``crime_data``, ``revge`` and
    ``income`` and the ``spark`` session. Prints the result table, the
    configured executor count, the physical plan and the timing.

    Args:
        method: Join strategy for the crime/reverse-geocoding join.
            'BROADCAST' broadcasts ``revge``; 'MERGE', 'SHUFFLE_HASH' and
            'SHUFFLE_REPLICATE_NL' are passed as a hint on ``crime_data``;
            'CONTINUE' adds no hint and leaves the choice to Spark.

    Returns:
        Elapsed wall-clock seconds from the start of the query definition
        until ``result.show()`` has finished (``explain()`` is not timed),
        or ``None`` if ``method`` is not one of the values above.
    """
    hinted_strategies = ('MERGE', 'SHUFFLE_HASH', 'SHUFFLE_REPLICATE_NL')
    if method not in ('BROADCAST', 'CONTINUE') + hinted_strategies:
        return None

    start_time = time.time()

    # Only the join itself depends on the strategy; everything after it is
    # shared by all strategies.
    join_keys = ['LAT', 'LON']
    if method == 'BROADCAST':
        located_crimes = crime_data.join(broadcast(revge), join_keys, 'inner')
    elif method in hinted_strategies:
        located_crimes = crime_data.hint(method).join(revge, join_keys, 'inner')
    else:
        located_crimes = crime_data.join(revge, join_keys, 'inner')

    # Rename/cast the zip code so it can be joined with the income table and
    # drop rows whose victim descent or sex is 'X' (rows where either value
    # is null are dropped by this comparison as well).
    crime_data_join_revge = located_crimes \
        .withColumnRenamed('ZIPcode', 'Zip Code') \
        .withColumn("Zip Code", col("Zip Code").cast("int")) \
        .filter((col('Vict Descent') != 'X') & (col('Vict Sex') != 'X'))

    # Strip '$' and ',' from the income text before casting it to a number.
    crime_data_join_income = crime_data_join_revge.join(income, 'Zip Code', 'inner') \
        .withColumn('Estimated Median Income',
                    regexp_replace(col('Estimated Median Income'), '[$,]', '')) \
        .withColumn('Estimated Median Income',
                    col('Estimated Median Income').cast('double'))

    max_income_zip_codes = crime_data_join_income.groupBy('Zip Code') \
        .agg({'Estimated Median Income': 'max'}) \
        .withColumnRenamed('max(Estimated Median Income)', 'MaxIncome') \
        .orderBy(col('MaxIncome').desc()) \
        .limit(3)

    # An ascending sort puts NULLs first by default, so a zip code whose
    # income could not be parsed would be ranked as "lowest income".
    # Sort NULLs last so only real income values compete for the bottom 3.
    min_income_zip_codes = crime_data_join_income.groupBy('Zip Code') \
        .agg({'Estimated Median Income': 'min'}) \
        .withColumnRenamed('min(Estimated Median Income)', 'MinIncome') \
        .orderBy(col('MinIncome').asc_nulls_last()) \
        .limit(3)

    # The union is positional; only the 'Zip Code' column is used afterwards.
    zip_codes = min_income_zip_codes.union(max_income_zip_codes)
    zip_codes_list = [row['Zip Code'] for row in zip_codes.collect()]

    # NOTE(review): the year filter uses the report date ('Date Rptd'), not
    # the occurrence date ('DATE OCC') — confirm this is intended.
    result = crime_data_join_income \
        .filter(col('Zip Code').isin(zip_codes_list)) \
        .filter(year(col('Date Rptd')) == 2015) \
        .groupBy('Vict Descent') \
        .count() \
        .withColumnRenamed('count', '#') \
        .orderBy(col('#').desc())

    result.show()

    end_time = time.time()
    elapsed = end_time - start_time

    num_executors = spark.conf.get("spark.executor.instances")
    print(f'{num_executors} Executors')

    # Printed after the timer stops so plan output does not affect the timing.
    result.explain()
    print(f'Method : {method} | Time {elapsed}')

    return elapsed

# Query 3 on 2 Executors

In [None]:
# Run query 3 with the 'BROADCAST' strategy (the reverse-geocoding table is
# broadcast in the first join); prints the result, plan and elapsed time.
query_3("BROADCAST")



In [None]:
# Stop the SparkSession created at the top of the notebook.
spark.stop()